Spaces:
Runtime error
Runtime error
File size: 2,151 Bytes
894c4b4 6c79b12 894c4b4 90dff75 73d1e6e c639c51 a6af742 be7e092 bcdca08 73d1e6e bcdca08 23a137b 73d1e6e e598f52 73d1e6e fd7beec e598f52 1591f9d 894c4b4 5999035 f21645c 9aa52c9 60529bb 5ca644e 39b4e9f 62679c8 21eac98 62ea587 efa0391 a117804 efa0391 7c35ca5 6c79b12 894c4b4 b1a5839 894c4b4 7e68bad 894c4b4 f9d415e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
import os
import torch
from dataclasses import dataclass
from enum import Enum
from src.envs import CACHE_PATH
@dataclass
class Task:
    """Description of one evaluation task: what to run and how to report it.

    Instances are plain records with the four fields below; the dataclass
    decorator supplies ``__init__``, ``__repr__`` and ``__eq__``.
    """

    # Identifier string of the benchmark (e.g. "nq_open").
    benchmark: str
    # Name of the metric associated with the benchmark (e.g. "em").
    metric: str
    # Human-readable label for the task (e.g. "NQ Open");
    # presumably used as a display/column name -- confirm against callers.
    col_name: str
    # Number of few-shot examples for the task (0 for the zero-shot entries).
    num_fewshot: int
class Tasks(Enum):
    """Registry of evaluation tasks; each member's value is a ``Task`` record.

    Member names and definition order are kept exactly as they were, since
    iteration over the enum follows definition order.
    """

    # Open-domain QA.
    task0 = Task(
        benchmark="nq_open", metric="em", col_name="NQ Open", num_fewshot=64
    )  # 64, as in the ATLAS paper
    task1 = Task(
        benchmark="triviaqa", metric="em", col_name="TriviaQA", num_fewshot=64
    )  # 64, as in the ATLAS paper
    task11 = Task(benchmark="nq8", metric="em", col_name="NQ Open 8", num_fewshot=8)
    task12 = Task(benchmark="tqa8", metric="em", col_name="TriviaQA 8", num_fewshot=8)

    # TruthfulQA is intended as a zero-shot benchmark [5, 47].
    # https://owainevans.github.io/pdfs/truthfulQA_lin_evans.pdf
    task2 = Task(
        benchmark="truthfulqa_gen",
        metric="rougeL_acc",
        col_name="TruthfulQA Gen",
        num_fewshot=0,
    )
    task3 = Task(
        benchmark="truthfulqa_mc1", metric="acc", col_name="TruthfulQA MC1", num_fewshot=0
    )
    task4 = Task(
        benchmark="truthfulqa_mc2", metric="acc", col_name="TruthfulQA MC2", num_fewshot=0
    )

    # HaluEval.
    task5 = Task(
        benchmark="halueval_qa", metric="acc", col_name="HaluEval QA", num_fewshot=0
    )
    task6 = Task(
        benchmark="halueval_dialogue",
        metric="acc",
        col_name="HaluEval Dialogue",
        num_fewshot=0,
    )
    task7 = Task(
        benchmark="halueval_summarization",
        metric="acc",
        col_name="HaluEval Summarization",
        num_fewshot=0,
    )

    # Summarization (2-shot entries followed by the "_v2" 0-shot entries).
    task8 = Task(benchmark="xsum", metric="rougeL", col_name="XSum", num_fewshot=2)
    task9 = Task(benchmark="cnndm", metric="rougeL", col_name="CNN/DM", num_fewshot=2)
    task8_1 = Task(benchmark="xsum_v2", metric="rougeL", col_name="XSum", num_fewshot=0)
    task9_1 = Task(benchmark="cnndm_v2", metric="rougeL", col_name="CNN/DM", num_fewshot=0)

    # memo-trap.
    task10 = Task(
        benchmark="memo-trap", metric="acc", col_name="memo-trap", num_fewshot=0
    )
    task10_2 = Task(
        benchmark="memo-trap_v2", metric="acc", col_name="memo-trap", num_fewshot=0
    )

    # IFEval and SelfCheckGPT.
    task13 = Task(
        benchmark="ifeval",
        metric="prompt_level_strict_acc",
        col_name="IFEval",
        num_fewshot=0,
    )
    task14 = Task(
        benchmark="selfcheckgpt",
        metric="max-selfcheckgpt",
        col_name="SelfCheckGPT",
        num_fewshot=0,
    )

    # FEVER.
    task15 = Task(benchmark="fever10", metric="acc", col_name="FEVER", num_fewshot=16)
    task15_1 = Task(benchmark="fever11", metric="acc", col_name="FEVER", num_fewshot=8)

    # SQuADv2, TrueFalse, FaithDial, RACE.
    task16 = Task(benchmark="squadv2", metric="exact", col_name="SQuADv2", num_fewshot=4)
    task17 = Task(
        benchmark="truefalse_cieacf", metric="acc", col_name="TrueFalse", num_fewshot=8
    )
    task18 = Task(
        benchmark="faithdial_hallu", metric="acc", col_name="FaithDial", num_fewshot=8
    )
    task19 = Task(
        benchmark="faithdial_hallu_v2", metric="acc", col_name="FaithDial", num_fewshot=8
    )
    task20 = Task(benchmark="race", metric="acc", col_name="RACE", num_fewshot=0)
# NUM_FEWSHOT = 64  # Change with your few shot (disabled)

# Backend directories, both located under CACHE_PATH.
EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")

# Use CUDA when torch reports it as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Testing; needs to be None
LIMIT = None
|