Commit 6dcc9f8
pminervini committed
Parent(s): 4d089a8

update
- src/backend/run_eval_suite.py +1 -1
- src/backend/tasks/__init__.py +1 -1
- src/backend/tasks/cnndm/cnndm.yaml +2 -0
- src/backend/tasks/cnndm/task.py +3 -3
- src/backend/tasks/cnndm/task_v2.py +3 -3
- src/backend/tasks/cnndm/xsum_v2.yaml +2 -0
- src/backend/tasks/selfcheckgpt/selfcheckgpt.yaml +2 -0
- src/backend/tasks/selfcheckgpt/task.py +4 -4
- src/backend/tasks/xsum/task.py +4 -5
- src/backend/tasks/xsum/task_v2.py +8 -9
- src/backend/tasks/xsum/xsum.yaml +2 -0
- src/backend/tasks/xsum/xsum_v2.yaml +2 -0
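
Taken together, the commit moves every backend task from decorator-based registration (the `@register_task(...)` lines, now commented out) to YAML-based registration via the newly added `*.yaml` files, and passes the task version explicitly through the constructor. A minimal sketch of the resulting pattern, assuming the lm-eval `Task` base class accepts a `config` dict carrying version metadata; the class and dataset names below are illustrative, not part of the commit:

    # sketch.py -- hypothetical task following the pattern applied in this commit
    from lm_eval.api.task import Task

    class ExampleTask(Task):
        VERSION = 0
        DATASET_PATH = "EdinburghNLP/xsum"  # any Hugging Face dataset path
        DATASET_NAME = None

        def __init__(self):
            # The version now travels in the config instead of being attached
            # by a @register_task decorator at import time.
            super().__init__(config={'metadata': {'version': self.VERSION}})

The matching YAML placed next to the module then mirrors the added files: a `task:` name plus a `class: !function module.ClassName` entry.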
src/backend/run_eval_suite.py
CHANGED
@@ -9,7 +9,7 @@ from src.backend.tasks.xsum.task_v2 import XSumv2
 from src.backend.tasks.cnndm.task import CNNDM
 from src.backend.tasks.cnndm.task_v2 import CNNDMv2
 
-from src.backend.tasks.selfcheckgpt.task import
+from src.backend.tasks.selfcheckgpt.task import SelfCheckGPT
 
 
 def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_size, device, use_cache=None, limit=None, max_nb_samples=100) -> dict:
src/backend/tasks/__init__.py
CHANGED
@@ -4,4 +4,4 @@ from src.backend.tasks.xsum.task_v2 import XSumv2
 from src.backend.tasks.cnndm.task import CNNDM
 from src.backend.tasks.cnndm.task_v2 import CNNDMv2
 
-from src.backend.tasks.selfcheckgpt.task import
+from src.backend.tasks.selfcheckgpt.task import SelfCheckGPT
src/backend/tasks/cnndm/cnndm.yaml
ADDED
@@ -0,0 +1,2 @@
+task: cnndm
+class: !function task.CNNDM
src/backend/tasks/cnndm/task.py
CHANGED
@@ -59,14 +59,14 @@ def rouge(refs, preds):
     return {type: result[type].mid.fmeasure * 100 for type in rouge_types}
 
 
-@register_task("cnndm")
+# @register_task("cnndm")
 class CNNDM(Task):
     VERSION = 0
     DATASET_PATH = "cnn_dailymail"
     DATASET_NAME = "3.0.0"
 
-    def __init__(self
-        super().__init__(
+    def __init__(self):
+        super().__init__(config={'metadata': {'version': self.VERSION}})
         self.factkb_tokenizer = None
         self.factkb_model = None
         self.bert_score = None
src/backend/tasks/cnndm/task_v2.py
CHANGED
@@ -59,14 +59,14 @@ def rouge(refs, preds):
     return {type: result[type].mid.fmeasure * 100 for type in rouge_types}
 
 
-@register_task("cnndm_v2")
+# @register_task("cnndm_v2")
 class CNNDMv2(Task):
     VERSION = 0
     DATASET_PATH = "cnn_dailymail"
     DATASET_NAME = "3.0.0"
 
-    def __init__(self
-        super().__init__(
+    def __init__(self):
+        super().__init__(config={'metadata': {'version': self.VERSION}})
         self.factkb_tokenizer = None
         self.factkb_model = None
         self.bert_score = None
src/backend/tasks/cnndm/xsum_v2.yaml
ADDED
@@ -0,0 +1,2 @@
+task: cnndm_v2
+class: !function task_v2.CNNDM_v2
src/backend/tasks/selfcheckgpt/selfcheckgpt.yaml
ADDED
@@ -0,0 +1,2 @@
+task: selfcheckgpt
+class: !function task.SelfCheckGPT
src/backend/tasks/selfcheckgpt/task.py
CHANGED
@@ -12,15 +12,15 @@ import spacy
 from selfcheckgpt.modeling_selfcheck import SelfCheckMQAG, SelfCheckNLI, SelfCheckBERTScore, SelfCheckNgram
 
 
-@register_task("selfcheckgpt")
-class
+# @register_task("selfcheckgpt")
+class SelfCheckGPT(Task):
     VERSION = 0.0
     DATASET_PATH = "potsawee/wiki_bio_gpt3_hallucination"
     DATASET_NAME = None
     OUTPUT_TYPE = 'generate_until'
 
-    def __init__(self
-        super().__init__(
+    def __init__(self):
+        super().__init__(config={'metadata': {'version': self.VERSION}})
         # these end tokens are hard coded because of the current limitaion of the llm-eval.
         self.generation_kwargs = {"until": ["\n\n", "<unk>", "<|im_end|>", "</s>"], "max_length": 512}
         self.generation_kwargs_sampling_number = 5  # the number of sampling for self-consistence
src/backend/tasks/xsum/task.py
CHANGED
@@ -1,6 +1,5 @@
-from lm_eval.api.task import Task
+from lm_eval.api.task import Task, ConfigurableTask
 from lm_eval.api.instance import Instance
-from lm_eval.api.registry import register_task
 from lm_eval.api.metrics import mean
 
 import torch
@@ -51,14 +50,14 @@ def rouge(refs, preds):
     return {type: result[type].mid.fmeasure * 100 for type in rouge_types}
 
 
-@register_task("xsum")
+# @register_task("xsum")
 class XSum(Task):
     VERSION = 0
     DATASET_PATH = "EdinburghNLP/xsum"
     DATASET_NAME = None
 
-    def __init__(self
-        super().__init__(
+    def __init__(self):
+        super().__init__(config={'metadata': {'version': self.VERSION}})
         self.factkb_tokenizer = None
         self.factkb_model = None
         self.bert_score = None
src/backend/tasks/xsum/task_v2.py
CHANGED
@@ -1,4 +1,4 @@
-from lm_eval.api.task import Task
+from lm_eval.api.task import ConfigurableTask, Task, TaskConfig
 from lm_eval.api.instance import Instance
 from lm_eval.api.registry import register_task
 from lm_eval.api.metrics import mean
@@ -51,14 +51,14 @@ def rouge(refs, preds):
     return {type: result[type].mid.fmeasure * 100 for type in rouge_types}
 
 
-@register_task("xsum_v2")
-class XSumv2(Task):
+# @register_task("xsum_v2")
+class XSumv2(ConfigurableTask):
     VERSION = 0
     DATASET_PATH = "EdinburghNLP/xsum"
     DATASET_NAME = None
 
-    def __init__(self
-        super().__init__(
+    def __init__(self):
+        super().__init__(config={'metadata': {'version': self.VERSION}})
         self.factkb_tokenizer = None
         self.factkb_model = None
         self.bert_score = None
@@ -92,15 +92,14 @@ class XSumv2(Task):
     def test_docs(self):
         return self.dataset["test"]
 
-    def
+    def custom_prompt(self):
         res = "Provide a summary of the provided document."
         return res
 
     def doc_to_text(self, doc):
-        return f'{self.
+        return f'{self.custom_prompt()}\n\nDocument: {doc["document"]}\nSummary:'
 
-
-    def should_decontaminate():
+    def should_decontaminate(self):
         return True
 
     def doc_to_decontamination_query(self, doc):
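
For a concrete sense of the new prompt format, here is what the rewritten doc_to_text produces for a toy document (illustration only, derived directly from the diff above):

    # toy illustration of the prompt built by the new doc_to_text
    doc = {"document": "The quick brown fox jumps over the lazy dog."}
    prompt = (
        "Provide a summary of the provided document.\n\n"
        f"Document: {doc['document']}\n"
        "Summary:"
    )
    # -> "Provide a summary of the provided document.\n\n
    #     Document: The quick brown fox jumps over the lazy dog.\nSummary:"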
src/backend/tasks/xsum/xsum.yaml
ADDED
@@ -0,0 +1,2 @@
+task: xsum
+class: !function task.XSum
src/backend/tasks/xsum/xsum_v2.yaml
ADDED
@@ -0,0 +1,2 @@
+task: xsum_v2
+class: !function task_v2.XSumv2