pminervini committed
Commit 6dcc9f8
1 Parent(s): 4d089a8
src/backend/run_eval_suite.py CHANGED
@@ -9,7 +9,7 @@ from src.backend.tasks.xsum.task_v2 import XSumv2
 from src.backend.tasks.cnndm.task import CNNDM
 from src.backend.tasks.cnndm.task_v2 import CNNDMv2

-from src.backend.tasks.selfcheckgpt.task import SelfCheckGpt
+from src.backend.tasks.selfcheckgpt.task import SelfCheckGPT


 def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_size, device, use_cache=None, limit=None, max_nb_samples=100) -> dict:
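The backend entry point keeps the same signature; only the SelfCheckGPT import is renamed. A hypothetical call mirroring the signature above (the EvalRequest instance and the task names are placeholders; how the request is constructed is not shown in this commit):

# Sketch only: assumes an already-built EvalRequest for the model under test.
results = run_evaluation(
    eval_request=eval_request,                            # src.backend EvalRequest instance
    task_names=["selfcheckgpt", "xsum_v2", "cnndm_v2"],   # registry names from the YAML files below
    num_fewshot=0,
    batch_size=1,
    device="cuda:0",
    limit=10,            # cap examples for a quick smoke run
    max_nb_samples=100,  # default from the signature above
)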
src/backend/tasks/__init__.py CHANGED
@@ -4,4 +4,4 @@ from src.backend.tasks.xsum.task_v2 import XSumv2
 from src.backend.tasks.cnndm.task import CNNDM
 from src.backend.tasks.cnndm.task_v2 import CNNDMv2

-from src.backend.tasks.selfcheckgpt.task import SelfCheckGpt
+from src.backend.tasks.selfcheckgpt.task import SelfCheckGPT
src/backend/tasks/cnndm/cnndm.yaml ADDED
@@ -0,0 +1,2 @@
+task: cnndm
+class: !function task.CNNDM
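Each new YAML file names a task and points the harness at the Python class implementing it; the !function tag resolves the dotted path relative to the YAML file's directory, which is why the @register_task decorators are commented out in the task modules below. A minimal sketch of how this task directory might be wired into an evaluation run, assuming an lm-eval version that exposes TaskManager(include_path=...) and simple_evaluate(..., task_manager=...); the model and limit are placeholders:

# Sketch only: adjust to the installed lm-eval version.
import lm_eval
from lm_eval.tasks import TaskManager

# Point the task manager at the directory holding the new *.yaml files.
task_manager = TaskManager(include_path="src/backend/tasks")

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=gpt2",   # placeholder model
    tasks=["cnndm"],                # name from the task: field above
    limit=8,                        # small sample for a smoke test
    task_manager=task_manager,
)
print(results["results"])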
src/backend/tasks/cnndm/task.py CHANGED
@@ -59,14 +59,14 @@ def rouge(refs, preds):
     return {type: result[type].mid.fmeasure * 100 for type in rouge_types}


-@register_task("cnndm")
+# @register_task("cnndm")
 class CNNDM(Task):
     VERSION = 0
     DATASET_PATH = "cnn_dailymail"
     DATASET_NAME = "3.0.0"

-    def __init__(self, data_dir=None, cache_dir=None, download_mode=None, config=None):
-        super().__init__(data_dir=data_dir, cache_dir=cache_dir, download_mode=download_mode, config=config)
+    def __init__(self):
+        super().__init__(config={'metadata': {'version': self.VERSION}})
         self.factkb_tokenizer = None
         self.factkb_model = None
         self.bert_score = None
src/backend/tasks/cnndm/task_v2.py CHANGED
@@ -59,14 +59,14 @@ def rouge(refs, preds):
     return {type: result[type].mid.fmeasure * 100 for type in rouge_types}


-@register_task("cnndm_v2")
+# @register_task("cnndm_v2")
 class CNNDMv2(Task):
     VERSION = 0
     DATASET_PATH = "cnn_dailymail"
     DATASET_NAME = "3.0.0"

-    def __init__(self, data_dir=None, cache_dir=None, download_mode=None, config=None):
-        super().__init__(data_dir=data_dir, cache_dir=cache_dir, download_mode=download_mode, config=config)
+    def __init__(self):
+        super().__init__(config={'metadata': {'version': self.VERSION}})
         self.factkb_tokenizer = None
         self.factkb_model = None
         self.bert_score = None
src/backend/tasks/cnndm/xsum_v2.yaml ADDED
@@ -0,0 +1,2 @@
+task: cnndm_v2
+class: !function task_v2.CNNDM_v2
src/backend/tasks/selfcheckgpt/selfcheckgpt.yaml ADDED
@@ -0,0 +1,2 @@
+task: selfcheckgpt
+class: !function task.SelfCheckGPT
src/backend/tasks/selfcheckgpt/task.py CHANGED
@@ -12,15 +12,15 @@ import spacy
 from selfcheckgpt.modeling_selfcheck import SelfCheckMQAG, SelfCheckNLI, SelfCheckBERTScore, SelfCheckNgram


-@register_task("selfcheckgpt")
-class SelfCheckGpt(Task):
+# @register_task("selfcheckgpt")
+class SelfCheckGPT(Task):
     VERSION = 0.0
     DATASET_PATH = "potsawee/wiki_bio_gpt3_hallucination"
     DATASET_NAME = None
     OUTPUT_TYPE = 'generate_until'

-    def __init__(self, data_dir=None, cache_dir=None, download_mode=None, config=None):
-        super().__init__(data_dir=data_dir, cache_dir=cache_dir, download_mode=download_mode, config=config)
+    def __init__(self):
+        super().__init__(config={'metadata': {'version': self.VERSION}})
         # these end tokens are hard coded because of the current limitaion of the llm-eval.
         self.generation_kwargs = {"until": ["\n\n", "<unk>", "<|im_end|>", "</s>"], "max_length": 512}
         self.generation_kwargs_sampling_number = 5 # the number of sampling for self-consistence
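With this change the task constructors take no arguments and pass the base class only a minimal config carrying the task version; dataset paths and generation settings remain class attributes. A hypothetical smoke test of the new constructor, assuming the selfcheckgpt and spacy dependencies are installed and the dataset is reachable (instantiation triggers the download through the base Task):

# Sketch only: requires the selfcheckgpt package and access to
# the potsawee/wiki_bio_gpt3_hallucination dataset.
from src.backend.tasks.selfcheckgpt.task import SelfCheckGPT

task = SelfCheckGPT()  # no data_dir/cache_dir/download_mode arguments anymore

print(task.VERSION)                      # 0.0, also forwarded as metadata to the base class
print(task.OUTPUT_TYPE)                  # 'generate_until'
print(task.generation_kwargs["until"])   # ['\n\n', '<unk>', '<|im_end|>', '</s>']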
src/backend/tasks/xsum/task.py CHANGED
@@ -1,6 +1,5 @@
-from lm_eval.api.task import Task
+from lm_eval.api.task import Task, ConfigurableTask
 from lm_eval.api.instance import Instance
-from lm_eval.api.registry import register_task
 from lm_eval.api.metrics import mean

 import torch
@@ -51,14 +50,14 @@ def rouge(refs, preds):
     return {type: result[type].mid.fmeasure * 100 for type in rouge_types}


-@register_task("xsum")
+# @register_task("xsum")
 class XSum(Task):
     VERSION = 0
     DATASET_PATH = "EdinburghNLP/xsum"
     DATASET_NAME = None

-    def __init__(self, data_dir=None, cache_dir=None, download_mode=None, config=None):
-        super().__init__(data_dir=data_dir, cache_dir=cache_dir, download_mode=download_mode, config=config)
+    def __init__(self):
+        super().__init__(config={'metadata': {'version': self.VERSION}})
         self.factkb_tokenizer = None
         self.factkb_model = None
         self.bert_score = None
src/backend/tasks/xsum/task_v2.py CHANGED
@@ -1,4 +1,4 @@
-from lm_eval.api.task import Task
+from lm_eval.api.task import ConfigurableTask, Task, TaskConfig
 from lm_eval.api.instance import Instance
 from lm_eval.api.registry import register_task
 from lm_eval.api.metrics import mean
@@ -51,14 +51,14 @@ def rouge(refs, preds):
     return {type: result[type].mid.fmeasure * 100 for type in rouge_types}


-@register_task("xsum_v2")
-class XSumv2(Task):
+# @register_task("xsum_v2")
+class XSumv2(ConfigurableTask):
     VERSION = 0
     DATASET_PATH = "EdinburghNLP/xsum"
     DATASET_NAME = None

-    def __init__(self, data_dir=None, cache_dir=None, download_mode=None, config=None):
-        super().__init__(data_dir=data_dir, cache_dir=cache_dir, download_mode=download_mode, config=config)
+    def __init__(self):
+        super().__init__(config={'metadata': {'version': self.VERSION}})
         self.factkb_tokenizer = None
         self.factkb_model = None
         self.bert_score = None
@@ -92,15 +92,14 @@ class XSumv2(Task):
     def test_docs(self):
         return self.dataset["test"]

-    def prompt(self):
+    def custom_prompt(self):
         res = "Provide a summary of the provided document."
         return res

     def doc_to_text(self, doc):
-        return f'{self.prompt()}\n\nDocument: {doc["document"]}\nSummary:'
+        return f'{self.custom_prompt()}\n\nDocument: {doc["document"]}\nSummary:'

-    @staticmethod
-    def should_decontaminate():
+    def should_decontaminate(self):
         return True

     def doc_to_decontamination_query(self, doc):
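After the rename to custom_prompt, doc_to_text composes the fixed instruction with the record's document field. For a record shaped like those in EdinburghNLP/xsum, it renders roughly as below (the document text is invented for illustration):

# Illustrative only: reproduces the string doc_to_text builds after this change.
doc = {"document": "Police were called to the scene shortly after midnight ..."}

prompt = (
    "Provide a summary of the provided document."
    f'\n\nDocument: {doc["document"]}\nSummary:'
)
print(prompt)
# Provide a summary of the provided document.
#
# Document: Police were called to the scene shortly after midnight ...
# Summary: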
src/backend/tasks/xsum/xsum.yaml ADDED
@@ -0,0 +1,2 @@
+task: xsum
+class: !function task.XSum
src/backend/tasks/xsum/xsum_v2.yaml ADDED
@@ -0,0 +1,2 @@
+task: xsum_v2
+class: !function task_v2.XSumv2