import os
import torch
from dataclasses import dataclass
from enum import Enum
from src.envs import CACHE_PATH
@dataclass
class Task:
    """Descriptor for a single evaluation benchmark and how to report it."""

    # Benchmark identifier passed to the evaluation backend (e.g. "mmlu", "selfcheckgpt").
    benchmark: str
    # Key of the metric to read from the benchmark's results (e.g. "acc", "rougeL").
    metric: str
    # Human-readable column name used when displaying the result.
    col_name: str
    # Number of few-shot examples included in the prompt (0 = zero-shot).
    num_fewshot: int
class Tasks(Enum):
    """Registry of benchmarks the backend evaluates.

    Only the uncommented members are active. The commented-out entries are
    kept deliberately as a record of previously used / candidate benchmarks
    together with their metric and few-shot settings.
    """

    # task0 = Task("nq_open", "em", "NQ Open", 64) # 64, as in the ATLAS paper
    # task1 = Task("triviaqa", "em", "TriviaQA", 64) # 64, as in the ATLAS paper
    # task11 = Task("nq8", "em", "NQ Open 8", 8)
    # task12 = Task("tqa8", "em", "TriviaQA 8", 8)
    # TruthfulQA is intended as a zero-shot benchmark [5, 47]. https://owainevans.github.io/pdfs/truthfulQA_lin_evans.pdf
    # task2 = Task("truthfulqa_gen", "rougeL_acc", "TruthfulQA Gen", 0)
    # task3 = Task("truthfulqa_mc1", "acc", "TruthfulQA MC1", 0)
    # task4 = Task("truthfulqa_mc2", "acc", "TruthfulQA MC2", 0)
    # task5 = Task("halueval_qa", "acc", "HaluEval QA", 0)
    # task6 = Task("halueval_dialogue", "acc", "HaluEval Dialogue", 0)
    # task7 = Task("halueval_summarization", "acc", "HaluEval Summarization", 0)
    # task8 = Task("xsum", "rougeL", "XSum", 2)
    # task9 = Task("cnndm", "rougeL", "CNN/DM", 2)
    # task8_1 = Task("xsum_v2", "rougeL", "XSum", 0)
    # task9_1 = Task("cnndm_v2", "rougeL", "CNN/DM", 0)
    # task10 = Task("memo-trap", "acc", "memo-trap", 0)
    # task10_2 = Task("memo-trap_v2", "acc", "memo-trap", 0)
    # task13 = Task("ifeval", "prompt_level_strict_acc", "IFEval", 0)

    # Active: SelfCheckGPT, zero-shot, reported via its "max-selfcheckgpt" metric.
    task14 = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT", 0)

    # task15 = Task("fever10", "acc", "FEVER", 16)
    # task15_1 = Task("fever11", "acc", "FEVER", 8)
    # task16 = Task("squadv2", "exact", "SQuADv2", 4)
    # task17 = Task("truefalse_cieacf", "acc", "TrueFalse", 8)
    # task18 = Task("faithdial_hallu", "acc", "FaithDial", 8)
    # task19 = Task("faithdial_hallu_v2", "acc", "FaithDial", 8)
    # task20 = Task("race", "acc", "RACE", 0)

    # Active: MMLU, 5-shot, accuracy.
    task21 = Task("mmlu", "acc", "MMLU", 5)
# Backend working directories, all rooted at the shared cache directory.
EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
EVAL_REQUESTS_PATH_BACKEND_SYNC = os.path.join(CACHE_PATH, "eval-queue-bk-sync")
EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")

# Evaluate on GPU when one is visible to torch, otherwise fall back to CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Optional cap on the number of evaluated examples; must remain None outside of testing.
LIMIT = None