Spaces:
Runtime error
Runtime error
File size: 2,129 Bytes
894c4b4 6c79b12 894c4b4 90dff75 bcdca08 73d1e6e be7e092 bcdca08 73d1e6e bcdca08 23a137b 73d1e6e e598f52 73d1e6e e598f52 894c4b4 7e267bf 5999035 f21645c 9aa52c9 53c755d 39b4e9f 62679c8 21eac98 62ea587 efa0391 a117804 efa0391 7c35ca5 6c79b12 894c4b4 b1a5839 894c4b4 7e68bad 894c4b4 f9d415e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
import os
import torch
from dataclasses import dataclass
from enum import Enum
from src.envs import CACHE_PATH
@dataclass
class Task:
    """Settings that identify one benchmark task and how it is reported.

    Instances are created positionally, so the field order below is part of
    the constructor contract and must not be rearranged.
    """

    # Task key as it appears in the results JSON file.
    benchmark: str
    # Metric key as it appears in the results JSON file.
    metric: str
    # Name to display in the leaderboard.
    col_name: str
    # Few-shot setting associated with the task.
    num_fewshot: int
class Tasks(Enum):
    """Registry of benchmark tasks; each member's value is a ``Task``.

    Member names and their declaration order are kept stable because code
    iterating over the enum sees members in this order.
    """

    # Each entry gives: task key in the json file, metric key in the json
    # file, name to display in the leaderboard, and the few-shot setting.
    # Disabled examples kept for reference:
    # task0 = Task("anli_r1", "acc", "ANLI")
    # task1 = Task("logiqa", "acc_norm", "LogiQA")

    # Open-domain QA: 64 shots, as in the ATLAS paper.
    task0 = Task(benchmark="nq_open", metric="em", col_name="NQ Open", num_fewshot=64)
    task1 = Task(benchmark="triviaqa", metric="em", col_name="TriviaQA", num_fewshot=64)

    # TruthfulQA is intended as a zero-shot benchmark [5, 47].
    # https://owainevans.github.io/pdfs/truthfulQA_lin_evans.pdf
    task2 = Task(benchmark="truthfulqa_gen", metric="rougeL_acc", col_name="TruthfulQA Gen", num_fewshot=0)
    task3 = Task(benchmark="truthfulqa_mc1", metric="acc", col_name="TruthfulQA MC1", num_fewshot=0)
    task4 = Task(benchmark="truthfulqa_mc2", metric="acc", col_name="TruthfulQA MC2", num_fewshot=0)

    task5 = Task(benchmark="halueval_qa", metric="acc", col_name="HaluEval QA", num_fewshot=0)
    task6 = Task(benchmark="halueval_dialogue", metric="acc", col_name="HaluEval Dialogue", num_fewshot=0)
    task7 = Task(benchmark="halueval_summarization", metric="acc", col_name="HaluEval Summarization", num_fewshot=0)

    task8 = Task(benchmark="xsum", metric="rougeL", col_name="XSum", num_fewshot=2)
    task9 = Task(benchmark="cnndm", metric="rougeL", col_name="CNN/DM", num_fewshot=2)

    task10 = Task(benchmark="memo-trap", metric="acc", col_name="memo-trap", num_fewshot=0)

    task11 = Task(benchmark="nq8", metric="em", col_name="NQ Open 8", num_fewshot=8)
    task12 = Task(benchmark="tqa8", metric="em", col_name="TriviaQA 8", num_fewshot=8)

    task13 = Task(benchmark="ifeval", metric="prompt_level_strict_acc", col_name="IFEval", num_fewshot=0)
    task14 = Task(benchmark="selfcheckgpt", metric="max-selfcheckgpt", col_name="SelfCheckGPT", num_fewshot=0)

    task15 = Task(benchmark="fever10", metric="acc", col_name="FEVER", num_fewshot=16)
    task16 = Task(benchmark="squadv2", metric="exact", col_name="SQuADv2", num_fewshot=4)
    task17 = Task(benchmark="truefalse_cieacf", metric="acc", col_name="TrueFalse", num_fewshot=8)

    # NOTE(review): task18 and task19 share the display name "FaithDial";
    # confirm that consumers keyed on col_name tolerate the duplicate.
    task18 = Task(benchmark="faithdial_hallu", metric="acc", col_name="FaithDial", num_fewshot=8)
    task19 = Task(benchmark="faithdial_hallu_v2", metric="acc", col_name="FaithDial", num_fewshot=8)

    task20 = Task(benchmark="race", metric="acc", col_name="RACE", num_fewshot=0)
# NUM_FEWSHOT = 64 # Change with your few shot

# Backend directories for the evaluation queue and results, both placed
# directly under CACHE_PATH.
EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")

# Use the GPU when torch reports CUDA as available; otherwise run on CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Testing knob; needs to be None.
LIMIT = None
|