|
|
|
""" |
|
Custom evaluation tasks for lighteval |
|
|
|
This file generally creates just a TASKS_TABLE and a TASKS_GROUPS dict, which are then imported by LightEval.
|
""" |
|
import re |
|
from dataclasses import asdict |
|
from typing import List, Tuple
|
|
|
from custom_evaluation_utils import CustomEvaluationTask, Metrics, NEEDS_GENERATION_ONLY
|
from lighteval.tasks.requests import Doc |
|
|
|
|
|
LETTER_INDICES = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"] |
|
|
|
|
|
_TASKS_STRINGS: List[Tuple[CustomEvaluationTask, str]] = [] |
|
_TASKS: List[CustomEvaluationTask] = [] |
|
|
|
|
|
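# Common-sense reasoning tasks (evaluated 0-shot; see COMMON_SENSE_REASONING_STRING below).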
COMMON_SENSE_REASONING_TASKS = [ |
|
CustomEvaluationTask( |
|
name="hellaswag", |
|
prompt_function="hellaswag_prompt", |
|
hf_repo="hellaswag", |
|
hf_subset="default", |
|
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], |
|
), |
|
CustomEvaluationTask( |
|
name="winogrande", |
|
prompt_function="winogrande", |
|
hf_repo="winogrande", |
|
hf_subset="winogrande_xl", |
|
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], |
|
), |
|
CustomEvaluationTask( |
|
name="piqa", |
|
prompt_function="piqa_harness", |
|
hf_repo="piqa", |
|
hf_subset="plain_text", |
|
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], |
|
), |
|
CustomEvaluationTask( |
|
name="siqa", |
|
prompt_function="siqa_prompt", |
|
hf_repo="lighteval/siqa", |
|
hf_subset="default", |
|
hf_avail_splits=["train", "validation"], |
|
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], |
|
), |
|
CustomEvaluationTask( |
|
name="openbookqa", |
|
prompt_function="openbookqa", |
|
hf_repo="openbookqa", |
|
hf_subset="main", |
|
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], |
|
), |
|
CustomEvaluationTask( |
|
name="arc:easy", |
|
prompt_function="arc", |
|
hf_repo="ai2_arc", |
|
hf_subset="ARC-Easy", |
|
evaluation_splits=["test"], |
|
generation_size=1, |
|
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], |
|
), |
|
CustomEvaluationTask( |
|
name="arc:challenge", |
|
prompt_function="arc", |
|
hf_repo="ai2_arc", |
|
hf_subset="ARC-Challenge", |
|
evaluation_splits=["test"], |
|
generation_size=1, |
|
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], |
|
), |
|
CustomEvaluationTask( |
|
name="commonsense_qa", |
|
prompt_function="commonsense_qa_prompt", |
|
hf_repo="commonsense_qa", |
|
hf_subset="default", |
|
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], |
|
), |
|
] |
|
|
|
|
|
def commonsense_qa_prompt(line, task_name: str = None): |
|
return Doc( |
|
task_name=task_name, |
|
query=line["question"], |
|
choices=[f" {c}" for c in line["choices"]["text"]], |
|
gold_index=LETTER_INDICES.index(line["answerKey"].strip()), |
|
instruction="", |
|
) |
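# Illustrative example (hypothetical row, not taken from the actual dataset): a line such as
#   {"question": "Where would you put a plate after washing it?",
#    "choices": {"text": ["cupboard", "sink", "dishwasher"]},
#    "answerKey": "A"}
# becomes Doc(query="Where would you put a plate after washing it?",
#             choices=[" cupboard", " sink", " dishwasher"], gold_index=0).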
|
|
|
|
|
def siqa_prompt(line, task_name: str = None): |
|
return Doc( |
|
task_name=task_name, |
|
query=line["context"] + " " + line["question"], |
|
choices=[f" {c}" for c in [line["answerA"], line["answerB"], line["answerC"]]], |
|
gold_index=int(line["label"]) - 1, |
|
instruction="", |
|
) |
|
|
|
|
|
def hellaswag_prompt(line, task_name: str = None): |
|
def preprocess(text): |
|
"""Comes from AiHarness""" |
|
|
|
|
|
text = text.replace(" [title]", ". ") |
|
text = re.sub("\\[.*?\\]", "", text) |
|
text = text.replace(" ", " ") |
|
return text |
|
|
|
ctx = f"{line['ctx_a']} {line['ctx_b'].capitalize()} " |
|
return Doc( |
|
task_name=task_name, |
|
query=preprocess(line["activity_label"] + ": " + ctx), |
|
choices=[" " + preprocess(ending) for ending in line["endings"]], |
|
        gold_index=int(line["label"]) if line["label"] != "" else -1,  # -1 when the example has no gold label (test split)
|
|
|
) |
|
|
|
|
|
|
|
COMMON_SENSE_REASONING_STRING = [(t, f"custom|{t.name}|0|1") for t in COMMON_SENSE_REASONING_TASKS] |
|
_TASKS_STRINGS.extend(COMMON_SENSE_REASONING_STRING) |
|
_TASKS += COMMON_SENSE_REASONING_TASKS |
|
|
|
|
|
|
|
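# World-knowledge tasks (evaluated 5-shot; see WORLD_KNOWLEDGE_STRING below).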
WORLD_KNOWLEDGE_TASKS = [ |
|
CustomEvaluationTask( |
|
name="trivia_qa", |
|
prompt_function="triviaqa", |
|
hf_repo="trivia_qa", |
|
hf_subset="rc.nocontext", |
|
metric=[Metrics.quasi_exact_match2], |
|
generation_size=20, |
|
stop_sequence=["\n", ".", ","], |
|
), |
|
CustomEvaluationTask( |
|
name="natural_questions", |
|
prompt_function="natural_questions_prompt", |
|
hf_repo="lighteval/natural_questions_clean", |
|
hf_subset="default", |
|
metric=[Metrics.quasi_exact_match2], |
|
generation_size=20, |
|
stop_sequence=["\n", ".", ","], |
|
), |
|
] |
|
|
|
|
|
def natural_questions_prompt(line, task_name: str = None): |
|
return Doc( |
|
task_name=task_name, |
|
query=line["question"] + "?\nAnswer: ", |
|
choices=[line["short_answers"]], |
|
gold_index=0, |
|
instruction="", |
|
) |
|
|
|
|
|
WORLD_KNOWLEDGE_STRING = [(t, f"custom|{t.name}|5|1") for t in WORLD_KNOWLEDGE_TASKS] |
|
|
|
_TASKS_STRINGS.extend(WORLD_KNOWLEDGE_STRING) |
|
_TASKS += WORLD_KNOWLEDGE_TASKS |
|
|
|
|
|
|
|
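# Reading-comprehension tasks (evaluated 0-shot; see READING_COMP_STRING below).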
READING_COMP_TASKS = [ |
|
CustomEvaluationTask( |
|
name="super_glue:boolq", |
|
prompt_function="boolq_prompt", |
|
hf_repo="super_glue", |
|
hf_subset="boolq", |
|
metric=[Metrics.target_perplexity], |
|
), |
|
CustomEvaluationTask( |
|
name="quac", |
|
prompt_function="quac", |
|
hf_repo="lighteval/quac_helm", |
|
hf_subset="default", |
|
metric=[Metrics.quasi_exact_match2], |
|
generation_size=20, |
|
stop_sequence=["\n", ".", ","], |
|
), |
|
] |
|
|
|
|
|
def boolq_prompt(line, task_name: str = None): |
|
return Doc( |
|
task_name=task_name, |
|
query=f"{line['passage']}\nQuestion: {line['question'].capitalize()}?\nAnswer:", |
|
choices=[" No", " Yes"], |
|
gold_index=int(line["label"]), |
|
) |
|
|
|
|
|
READING_COMP_STRING = [(t, f"custom|{t.name}|0|1") for t in READING_COMP_TASKS] |
|
_TASKS_STRINGS.extend(READING_COMP_STRING) |
|
_TASKS += READING_COMP_TASKS |
|
|
|
|
|
|
|
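# Math tasks: the MATH subsets are evaluated 4-shot and GSM8K 8-shot
# (see MATH_STRING and GSM8K_STRING below).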
class CustomMathEvaluationTask(CustomEvaluationTask): |
|
"""Custom class for math tasks with all the defaults set""" |
|
|
|
def __init__( |
|
self, |
|
name, |
|
prompt_function="math", |
|
hf_repo="lighteval/MATH", |
|
hf_subset=None, |
|
metric=[Metrics.math_quasi_exact_match], |
|
hf_avail_splits=None, |
|
evaluation_splits=["test"], |
|
few_shots_split=None, |
|
few_shots_select=None, |
|
suite=["custom"], |
|
generation_size=40, |
|
stop_sequence=None, |
|
output_regex=None, |
|
frozen=False, |
|
): |
|
super().__init__( |
|
name=name, |
|
prompt_function=prompt_function, |
|
hf_repo=hf_repo, |
|
hf_subset=hf_subset, |
|
metric=metric, |
|
hf_avail_splits=hf_avail_splits, |
|
evaluation_splits=evaluation_splits, |
|
few_shots_split=few_shots_split, |
|
few_shots_select=few_shots_select, |
|
suite=suite, |
|
generation_size=generation_size, |
|
stop_sequence=stop_sequence, |
|
output_regex=output_regex, |
|
frozen=frozen, |
|
) |
|
|
|
|
|
MATH_TASKS = [ |
|
CustomMathEvaluationTask(name="math:algebra", hf_subset="algebra"), |
|
CustomMathEvaluationTask(name="math:counting_and_probability", hf_subset="counting_and_probability"), |
|
CustomMathEvaluationTask(name="math:geometry", hf_subset="geometry"), |
|
CustomMathEvaluationTask(name="math:intermediate_algebra", hf_subset="intermediate_algebra"), |
|
CustomMathEvaluationTask(name="math:number_theory", hf_subset="number_theory"), |
|
CustomMathEvaluationTask(name="math:prealgebra", hf_subset="prealgebra"), |
|
CustomMathEvaluationTask(name="math:precalculus", hf_subset="precalculus"), |
|
] |
|
GSM8K = CustomEvaluationTask( |
|
name="gsm8k", |
|
prompt_function="gsm8k", |
|
hf_repo="gsm8k", |
|
hf_subset="main", |
|
hf_avail_splits=["train", "test"], |
|
evaluation_splits=["test"], |
|
metric=[Metrics.perfect_exact_match], |
|
generation_size=10, |
|
stop_sequence=["\n"], |
|
) |
|
|
|
|
|
MATH_STRING = [(t, f"custom|{t.name}|4|1") for t in MATH_TASKS] |
|
GSM8K_STRING = [(GSM8K, f"custom|{GSM8K.name}|8|1")] |
|
_TASKS_STRINGS.extend(MATH_STRING) |
|
_TASKS_STRINGS.extend(GSM8K_STRING) |
|
_TASKS += MATH_TASKS + [GSM8K] |
|
|
|
|
|
|
|
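# MMLU tasks (evaluated 0-shot; see MMLU_STRING below).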
class CustomMMLUEvaluationTask(CustomEvaluationTask): |
|
def __init__( |
|
self, |
|
name, |
|
prompt_function="mmlu_prompt", |
|
hf_repo="lighteval/mmlu", |
|
hf_subset=None, |
|
|
|
metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], |
|
hf_avail_splits=None, |
|
evaluation_splits=["test"], |
|
few_shots_split="dev", |
|
few_shots_select=None, |
|
suite=None, |
|
generation_size=-1, |
|
stop_sequence=None, |
|
output_regex=None, |
|
frozen=False, |
|
): |
|
super().__init__( |
|
name=name, |
|
prompt_function=prompt_function, |
|
hf_repo=hf_repo, |
|
hf_subset=hf_subset, |
|
metric=metric, |
|
hf_avail_splits=hf_avail_splits, |
|
evaluation_splits=evaluation_splits, |
|
few_shots_split=few_shots_split, |
|
few_shots_select=few_shots_select, |
|
suite=suite, |
|
generation_size=generation_size, |
|
stop_sequence=stop_sequence, |
|
output_regex=output_regex, |
|
frozen=frozen, |
|
) |
|
|
|
|
|
MMLU_TASKS = [ |
|
CustomMMLUEvaluationTask(name="mmlu:abstract_algebra", hf_subset="abstract_algebra"), |
|
CustomMMLUEvaluationTask(name="mmlu:anatomy", hf_subset="anatomy"), |
|
CustomMMLUEvaluationTask(name="mmlu:astronomy", hf_subset="astronomy"), |
|
CustomMMLUEvaluationTask(name="mmlu:business_ethics", hf_subset="business_ethics"), |
|
CustomMMLUEvaluationTask(name="mmlu:clinical_knowledge", hf_subset="clinical_knowledge"), |
|
CustomMMLUEvaluationTask(name="mmlu:college_biology", hf_subset="college_biology"), |
|
CustomMMLUEvaluationTask(name="mmlu:college_chemistry", hf_subset="college_chemistry"), |
|
CustomMMLUEvaluationTask(name="mmlu:college_computer_science", hf_subset="college_computer_science"), |
|
CustomMMLUEvaluationTask(name="mmlu:college_mathematics", hf_subset="college_mathematics"), |
|
CustomMMLUEvaluationTask(name="mmlu:college_medicine", hf_subset="college_medicine"), |
|
CustomMMLUEvaluationTask(name="mmlu:college_physics", hf_subset="college_physics"), |
|
CustomMMLUEvaluationTask(name="mmlu:computer_security", hf_subset="computer_security"), |
|
CustomMMLUEvaluationTask(name="mmlu:conceptual_physics", hf_subset="conceptual_physics"), |
|
CustomMMLUEvaluationTask(name="mmlu:econometrics", hf_subset="econometrics"), |
|
CustomMMLUEvaluationTask(name="mmlu:electrical_engineering", hf_subset="electrical_engineering"), |
|
CustomMMLUEvaluationTask(name="mmlu:elementary_mathematics", hf_subset="elementary_mathematics"), |
|
CustomMMLUEvaluationTask(name="mmlu:formal_logic", hf_subset="formal_logic"), |
|
CustomMMLUEvaluationTask(name="mmlu:global_facts", hf_subset="global_facts"), |
|
CustomMMLUEvaluationTask(name="mmlu:high_school_biology", hf_subset="high_school_biology"), |
|
CustomMMLUEvaluationTask(name="mmlu:high_school_chemistry", hf_subset="high_school_chemistry"), |
|
CustomMMLUEvaluationTask(name="mmlu:high_school_computer_science", hf_subset="high_school_computer_science"), |
|
CustomMMLUEvaluationTask(name="mmlu:high_school_european_history", hf_subset="high_school_european_history"), |
|
CustomMMLUEvaluationTask(name="mmlu:high_school_geography", hf_subset="high_school_geography"), |
|
CustomMMLUEvaluationTask( |
|
name="mmlu:high_school_government_and_politics", hf_subset="high_school_government_and_politics" |
|
), |
|
CustomMMLUEvaluationTask(name="mmlu:high_school_macroeconomics", hf_subset="high_school_macroeconomics"), |
|
CustomMMLUEvaluationTask(name="mmlu:high_school_mathematics", hf_subset="high_school_mathematics"), |
|
CustomMMLUEvaluationTask(name="mmlu:high_school_microeconomics", hf_subset="high_school_microeconomics"), |
|
CustomMMLUEvaluationTask(name="mmlu:high_school_physics", hf_subset="high_school_physics"), |
|
CustomMMLUEvaluationTask(name="mmlu:high_school_psychology", hf_subset="high_school_psychology"), |
|
CustomMMLUEvaluationTask(name="mmlu:high_school_statistics", hf_subset="high_school_statistics"), |
|
CustomMMLUEvaluationTask(name="mmlu:high_school_us_history", hf_subset="high_school_us_history"), |
|
CustomMMLUEvaluationTask(name="mmlu:high_school_world_history", hf_subset="high_school_world_history"), |
|
CustomMMLUEvaluationTask(name="mmlu:human_aging", hf_subset="human_aging"), |
|
CustomMMLUEvaluationTask(name="mmlu:human_sexuality", hf_subset="human_sexuality"), |
|
CustomMMLUEvaluationTask(name="mmlu:international_law", hf_subset="international_law"), |
|
CustomMMLUEvaluationTask(name="mmlu:jurisprudence", hf_subset="jurisprudence"), |
|
CustomMMLUEvaluationTask(name="mmlu:logical_fallacies", hf_subset="logical_fallacies"), |
|
CustomMMLUEvaluationTask(name="mmlu:machine_learning", hf_subset="machine_learning"), |
|
CustomMMLUEvaluationTask(name="mmlu:management", hf_subset="management"), |
|
CustomMMLUEvaluationTask(name="mmlu:marketing", hf_subset="marketing"), |
|
CustomMMLUEvaluationTask(name="mmlu:medical_genetics", hf_subset="medical_genetics"), |
|
CustomMMLUEvaluationTask(name="mmlu:miscellaneous", hf_subset="miscellaneous"), |
|
CustomMMLUEvaluationTask(name="mmlu:moral_disputes", hf_subset="moral_disputes"), |
|
CustomMMLUEvaluationTask(name="mmlu:moral_scenarios", hf_subset="moral_scenarios"), |
|
CustomMMLUEvaluationTask(name="mmlu:nutrition", hf_subset="nutrition"), |
|
CustomMMLUEvaluationTask(name="mmlu:philosophy", hf_subset="philosophy"), |
|
CustomMMLUEvaluationTask(name="mmlu:prehistory", hf_subset="prehistory"), |
|
CustomMMLUEvaluationTask(name="mmlu:professional_accounting", hf_subset="professional_accounting"), |
|
CustomMMLUEvaluationTask(name="mmlu:professional_law", hf_subset="professional_law"), |
|
CustomMMLUEvaluationTask(name="mmlu:professional_medicine", hf_subset="professional_medicine"), |
|
CustomMMLUEvaluationTask(name="mmlu:professional_psychology", hf_subset="professional_psychology"), |
|
CustomMMLUEvaluationTask(name="mmlu:public_relations", hf_subset="public_relations"), |
|
CustomMMLUEvaluationTask(name="mmlu:security_studies", hf_subset="security_studies"), |
|
CustomMMLUEvaluationTask(name="mmlu:sociology", hf_subset="sociology"), |
|
CustomMMLUEvaluationTask(name="mmlu:us_foreign_policy", hf_subset="us_foreign_policy"), |
|
CustomMMLUEvaluationTask(name="mmlu:virology", hf_subset="virology"), |
|
CustomMMLUEvaluationTask(name="mmlu:world_religions", hf_subset="world_religions"), |
|
] |
|
|
|
|
|
def mmlu_harness(line, task_name: str = None): |
|
topic = line["subject"] |
|
prompt = f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n" |
|
prompt += line["question"] + "\n" |
|
prompt += "".join([f"{key}. {choice}\n" for key, choice in zip(LETTER_INDICES, line["choices"])]) |
|
prompt += "Answer:" |
|
|
|
gold_ix = LETTER_INDICES.index(line["answer"]) if isinstance(line["answer"], str) else line["answer"] |
|
"__few_shots" in line and line["__few_shots"] is True |
|
|
|
return Doc( |
|
task_name=task_name, |
|
query=prompt, |
|
choices=[" A", " B", " C", " D"], |
|
target_for_fewshot_sorting=[" A", " B", " C", " D"][gold_ix], |
|
gold_index=gold_ix, |
|
instruction=f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n", |
|
) |
|
|
|
|
|
def mmlu_prompt(line, task_name: str = None): |
|
"""MMLU prompt without letters""" |
|
topic = line["subject"] |
|
prompt = f"The following are questions about {topic.replace('_', ' ')}.\nQuestion: " |
|
prompt += line["question"] + "\nAnswer:" |
|
|
|
return Doc( |
|
task_name=task_name, |
|
query=prompt, |
|
choices=[f" {c}" for c in line["choices"]], |
|
gold_index=line["answer"], |
|
instruction=f"The following are questions about {topic.replace('_', ' ')}.\n", |
|
) |
|
|
|
|
|
|
|
MMLU_STRING = [(t, f"custom|{t.name}|0|1") for t in MMLU_TASKS] |
|
_TASKS_STRINGS.extend(MMLU_STRING) |
|
_TASKS += MMLU_TASKS |
|
|
|
|
|
|
|
|
|
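# BIG-Bench Hard (BBH) tasks (evaluated 0-shot; see BBH_STRING below).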
class CustomBBHEvaluationTask(CustomEvaluationTask): |
|
def __init__( |
|
self, |
|
name, |
|
prompt_function="bbh_prompt", |
|
hf_repo="lighteval/big_bench_hard", |
|
hf_subset=None, |
|
metric=[Metrics.exact_match], |
|
hf_avail_splits=["train"], |
|
evaluation_splits=["train"], |
|
few_shots_split="train", |
|
few_shots_select=None, |
|
suite=None, |
|
generation_size=4, |
|
stop_sequence=None, |
|
output_regex=None, |
|
frozen=False, |
|
): |
|
super().__init__( |
|
name=name, |
|
prompt_function=prompt_function, |
|
hf_repo=hf_repo, |
|
hf_subset=hf_subset, |
|
metric=metric, |
|
hf_avail_splits=hf_avail_splits, |
|
evaluation_splits=evaluation_splits, |
|
few_shots_split=few_shots_split, |
|
few_shots_select=few_shots_select, |
|
suite=suite, |
|
generation_size=generation_size, |
|
stop_sequence=stop_sequence, |
|
output_regex=output_regex, |
|
frozen=frozen, |
|
) |
|
|
|
|
|
BBH_TASKS = [ |
|
CustomBBHEvaluationTask(name="bbh:boolean_expressions", hf_subset="boolean_expressions"), |
|
CustomBBHEvaluationTask(name="bbh:causal_judgement", hf_subset="causal_judgement"), |
|
CustomBBHEvaluationTask(name="bbh:date_understanding", hf_subset="date_understanding"), |
|
CustomBBHEvaluationTask(name="bbh:disambiguation_qa", hf_subset="disambiguation_qa"), |
|
CustomBBHEvaluationTask(name="bbh:dyck_languages", hf_subset="dyck_languages"), |
|
CustomBBHEvaluationTask(name="bbh:formal_fallacies", hf_subset="formal_fallacies"), |
|
CustomBBHEvaluationTask(name="bbh:geometric_shapes", hf_subset="geometric_shapes"), |
|
CustomBBHEvaluationTask(name="bbh:hyperbaton", hf_subset="hyperbaton"), |
|
CustomBBHEvaluationTask(name="bbh:logical_deduction_five_objects", hf_subset="logical_deduction_five_objects"), |
|
CustomBBHEvaluationTask(name="bbh:logical_deduction_seven_objects", hf_subset="logical_deduction_seven_objects"), |
|
CustomBBHEvaluationTask(name="bbh:logical_deduction_three_objects", hf_subset="logical_deduction_three_objects"), |
|
CustomBBHEvaluationTask(name="bbh:movie_recommendation", hf_subset="movie_recommendation"), |
|
CustomBBHEvaluationTask(name="bbh:multistep_arithmetic_two", hf_subset="multistep_arithmetic_two"), |
|
CustomBBHEvaluationTask(name="bbh:navigate", hf_subset="navigate"), |
|
CustomBBHEvaluationTask(name="bbh:object_counting", hf_subset="object_counting"), |
|
CustomBBHEvaluationTask(name="bbh:penguins_in_a_table", hf_subset="penguins_in_a_table"), |
|
CustomBBHEvaluationTask(name="bbh:reasoning_about_colored_objects", hf_subset="reasoning_about_colored_objects"), |
|
CustomBBHEvaluationTask(name="bbh:ruin_names", hf_subset="ruin_names"), |
|
CustomBBHEvaluationTask( |
|
name="bbh:salient_translation_error_detection", hf_subset="salient_translation_error_detection" |
|
), |
|
CustomBBHEvaluationTask(name="bbh:snarks", hf_subset="snarks"), |
|
CustomBBHEvaluationTask(name="bbh:sports_understanding", hf_subset="sports_understanding"), |
|
CustomBBHEvaluationTask(name="bbh:temporal_sequences", hf_subset="temporal_sequences"), |
|
CustomBBHEvaluationTask( |
|
name="bbh:tracking_shuffled_objects_five_objects", hf_subset="tracking_shuffled_objects_five_objects" |
|
), |
|
CustomBBHEvaluationTask( |
|
name="bbh:tracking_shuffled_objects_seven_objects", hf_subset="tracking_shuffled_objects_seven_objects" |
|
), |
|
CustomBBHEvaluationTask( |
|
name="bbh:tracking_shuffled_objects_three_objects", hf_subset="tracking_shuffled_objects_three_objects" |
|
), |
|
CustomBBHEvaluationTask(name="bbh:web_of_lies", hf_subset="web_of_lies"), |
|
CustomBBHEvaluationTask(name="bbh:word_sorting", hf_subset="word_sorting"), |
|
] |
|
|
|
|
|
def bbh_prompt(line, task_name: str = None): |
|
return Doc( |
|
task_name=task_name, |
|
query=line["input"] + "\nAnswer: ", |
|
choices=[line["target"]], |
|
gold_index=0, |
|
) |
|
|
|
|
|
|
|
BBH_STRING = [(t, f"custom|{t.name}|0|1") for t in BBH_TASKS] |
|
_TASKS_STRINGS.extend(BBH_STRING) |
|
_TASKS += BBH_TASKS |
|
|
|
|
|
|
|
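# AGIEval (English) tasks (evaluated 0-shot; see AGIEVAL_STRING below).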
class CustomAGIEvalEvaluationTask(CustomEvaluationTask): |
|
def __init__( |
|
self, |
|
name, |
|
prompt_function="agi_eval_prompt_no_letters", |
|
hf_repo="lighteval/agi_eval_en", |
|
hf_subset=None, |
|
|
|
metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace], |
|
hf_avail_splits=["train", "validation"], |
|
evaluation_splits=["train"], |
|
few_shots_split="validation", |
|
few_shots_select=None, |
|
suite=None, |
|
generation_size=-1, |
|
stop_sequence=None, |
|
output_regex=None, |
|
frozen=False, |
|
): |
|
super().__init__( |
|
name=name, |
|
prompt_function=prompt_function, |
|
hf_repo=hf_repo, |
|
hf_subset=hf_subset, |
|
metric=metric, |
|
hf_avail_splits=hf_avail_splits, |
|
evaluation_splits=evaluation_splits, |
|
few_shots_split=few_shots_split, |
|
few_shots_select=few_shots_select, |
|
suite=suite, |
|
generation_size=generation_size, |
|
stop_sequence=stop_sequence, |
|
output_regex=output_regex, |
|
frozen=frozen, |
|
) |
|
|
|
|
|
AGIEVAL_TASKS = [ |
|
CustomAGIEvalEvaluationTask(name="agi_eval:aqua_rat", hf_subset="aqua_rat"), |
|
CustomAGIEvalEvaluationTask(name="agi_eval:logiqa-en", hf_subset="logiqa-en"), |
|
CustomAGIEvalEvaluationTask(name="agi_eval:lsat-ar", hf_subset="lsat-ar"), |
|
CustomAGIEvalEvaluationTask(name="agi_eval:lsat-lr", hf_subset="lsat-lr"), |
|
CustomAGIEvalEvaluationTask(name="agi_eval:lsat-rc", hf_subset="lsat-rc"), |
|
CustomAGIEvalEvaluationTask( |
|
name="agi_eval:math", |
|
hf_subset="math", |
|
prompt_function="agi_eval_math_prompt", |
|
metric=[Metrics.exact_match, Metrics.quasi_exact_match2], |
|
generation_size=40, |
|
), |
|
CustomAGIEvalEvaluationTask(name="agi_eval:sat-en", hf_subset="sat-en"), |
|
CustomAGIEvalEvaluationTask(name="agi_eval:sat-math", hf_subset="sat-math"), |
|
] |
|
|
|
|
|
def agi_eval_math_prompt(line, task_name: str = None): |
|
return Doc( |
|
task_name=task_name, |
|
query=line["question"], |
|
choices=[line["answer"]], |
|
gold_index=0, |
|
instruction="", |
|
) |
|
|
|
|
|
def agi_eval_prompt(line, task_name: str = None): |
|
cleaned_options = [o.replace("(", "").replace(")", " ") for o in line["options"]] |
|
prompt = "The following are multiple choice questions (with answers).\n\n" |
|
prompt += line["question"] + "\n" + "\n".join(cleaned_options) + "\n" |
|
prompt += "Answer: " |
|
|
|
    if line["label"]:
        # Multiple-choice subsets: the gold answer is given as a letter label.
        choices = LETTER_INDICES[: len(line["options"])]
        gold_index = LETTER_INDICES.index(line["label"].strip())
    else:
        # Some subsets provide the answer text directly instead of a letter label.
        choices = [line["answer"]]
        gold_index = 0

    return Doc(
        task_name=task_name,
        query=prompt,
        choices=choices,
        gold_index=gold_index,
        instruction="The following are multiple choice questions (with answers).\n\n",
    )
|
|
|
|
|
def agi_eval_prompt_no_letters(line, task_name: str = None): |
|
cleaned_options = [ |
|
" " + o.replace("(A)", "").replace("(B)", "").replace("(C)", "").replace("(D)", "").replace("(E)", "") |
|
for o in line["options"] |
|
] |
|
|
|
    output = Doc(
        task_name=task_name,
        query=line["question"],
|
choices=cleaned_options, |
|
gold_index=LETTER_INDICES.index(line["label"].strip()), |
|
instruction="", |
|
) |
|
|
|
return output |
|
|
|
|
|
|
|
AGIEVAL_STRING = [(t, f"custom|{t.name}|0|1") for t in AGIEVAL_TASKS] |
|
_TASKS_STRINGS.extend(AGIEVAL_STRING) |
|
_TASKS += AGIEVAL_TASKS |
|
|
|
|
|
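# Aggregation: build the task table and the named task groups that LightEval imports.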
def has_generative_metrics(task: CustomEvaluationTask) -> bool: |
|
for metric in task.metric: |
|
if metric in NEEDS_GENERATION_ONLY: |
|
return True |
|
return False |
|
|
|
|
|
EARLY_SIGNAL_TASKS = ",".join([t[1] for t in COMMON_SENSE_REASONING_STRING] + [t[1] for t in MMLU_STRING]) |
|
|
|
|
|
TASKS_TABLE = [asdict(task) for task in _TASKS] |
|
|
|
TASKS_GROUPS = { |
|
"all": ",".join(t[1] for t in _TASKS_STRINGS), |
|
"early-signal": EARLY_SIGNAL_TASKS, |
|
"non-generatives": ",".join(t for k, t in _TASKS_STRINGS if not has_generative_metrics(k)), |
|
"generatives": ",".join(t for k, t in _TASKS_STRINGS if has_generative_metrics(k)), |
|
} |
|
|
|
if __name__ == "__main__": |
|
print(t["name"] for t in TASKS_TABLE) |
|
print(len(TASKS_TABLE)) |
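    # Quick sanity check (illustrative): every registered task string should point to a task
    # defined in TASKS_TABLE. The "custom|name|fewshot|truncation" format is assumed here.
    defined_names = {task["name"] for task in TASKS_TABLE}
    for _, task_string in _TASKS_STRINGS:
        assert task_string.split("|")[1] in defined_names, task_string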
|
|