pminervini committed on
Commit
a6af742
·
1 Parent(s): efa0391
src/backend/envs.py CHANGED
@@ -17,13 +17,10 @@ class Task:
17
 
18
 
19
  class Tasks(Enum):
20
- # task_key in the json file, metric_key in the json file, name to display in the leaderboard
21
- # task0 = Task("anli_r1", "acc", "ANLI")
22
- # task1 = Task("logiqa", "acc_norm", "LogiQA")
23
  task0 = Task("nq_open", "em", "NQ Open", 64) # 64, as in the ATLAS paper
24
  task1 = Task("triviaqa", "em", "TriviaQA", 64) # 64, as in the ATLAS paper
25
- # TruthfulQA is intended as a zero-shot benchmark [5, 47]. https://owainevans.github.io/pdfs/truthfulQA_lin_evans.pdf
26
 
 
27
  task2 = Task("truthfulqa_gen", "rougeL_acc", "TruthfulQA Gen", 0)
28
  task3 = Task("truthfulqa_mc1", "acc", "TruthfulQA MC1", 0)
29
  task4 = Task("truthfulqa_mc2", "acc", "TruthfulQA MC2", 0)
 
17
 
18
 
19
  class Tasks(Enum):
 
 
 
20
  task0 = Task("nq_open", "em", "NQ Open", 64) # 64, as in the ATLAS paper
21
  task1 = Task("triviaqa", "em", "TriviaQA", 64) # 64, as in the ATLAS paper
 
22
 
23
+ # TruthfulQA is intended as a zero-shot benchmark [5, 47]. https://owainevans.github.io/pdfs/truthfulQA_lin_evans.pdf
24
  task2 = Task("truthfulqa_gen", "rougeL_acc", "TruthfulQA Gen", 0)
25
  task3 = Task("truthfulqa_mc1", "acc", "TruthfulQA MC1", 0)
26
  task4 = Task("truthfulqa_mc2", "acc", "TruthfulQA MC2", 0)
src/backend/tasks/selfcheckgpt/task.py CHANGED
@@ -18,6 +18,7 @@ class SelfCheckGpt(Task):
18
  DATASET_PATH = "potsawee/wiki_bio_gpt3_hallucination"
19
  DATASET_NAME = None
20
  OUTPUT_TYPE = 'generate_until'
 
21
  def __init__(self, data_dir=None, cache_dir=None, download_mode=None, config=None):
22
  super().__init__(data_dir=data_dir, cache_dir=cache_dir, download_mode=download_mode, config=config)
23
  self.generation_kwargs = {"temperature": 0.0, "do_sample": False}
@@ -54,7 +55,8 @@ class SelfCheckGpt(Task):
54
  doc_text = doc["wiki_bio_text"]
55
  doc_text = doc_text.split()
56
  doc_text = " ".join(doc_text[:5])
57
- doc_text = f"Please generating a Wikipedia passage starting with: {doc_text}\n"
 
58
  return doc_text
59
 
60
  def doc_to_target(self, doc):
@@ -82,35 +84,32 @@ class SelfCheckGpt(Task):
82
  sentences = self.selfcheckgpt_nlp(response_temperature_0)
83
  sentences = [sent.text.strip() for sent in sentences.sents]
84
  if self.selfcheckgpt_type == 'SelfCheckNgram':
85
- selfcheckgpt_scores = self.selfcheckgpt.predict(
86
- sentences = sentences,
87
- passage = response_temperature_0,
88
- sampled_passages = other_responses,
89
- )
90
- return {'avg-selfcheckgpt': selfcheckgpt_scores['doc_level']['avg_neg_logprob'],
91
- 'max-selfcheckgpt': selfcheckgpt_scores['doc_level']['avg_max_neg_logprob']}
92
 
93
  elif self.selfcheckgpt_type == 'SelfCheckBERTScore':
94
  selfcheckgpt_scores = self.selfcheckgpt.predict(sentences=sentences, sampled_passages=other_responses)
95
  elif self.selfcheckgpt_type == 'SelfCheckMQAG':
96
  selfcheckgpt_scores = self.selfcheckgpt.predict(
97
- sentences = sentences,
98
- passage = response_temperature_0,
99
- sampled_passages = other_responses,
100
- num_questions_per_sent = 5, # number of questions to be drawn
101
- scoring_method = 'bayes_with_alpha', # options = 'counting', 'bayes', 'bayes_with_alpha'
102
- beta1 = 0.8, beta2 = 0.8, # additional params depending on scoring_method
103
- )
104
  elif self.selfcheckgpt_type == 'SelfCheckNLI':
105
- selfcheckgpt_scores = self.selfcheckgpt.predict(
106
- sentences = sentences,
107
- sampled_passages = other_responses,
108
- )
109
 
110
  if len(selfcheckgpt_scores) == 0:
111
  self.SelfCheckNLI_error_cnt += 1
112
  print(f"SelfCheckNLI Warning.SelfCheckNLI_error_cnt:{self.SelfCheckNLI_error_cnt}. This instance is marked as hallucinated with 1.0.")
113
- result = {'avg-selfcheckgpt': 1.0, 'max-selfcheckgpt': 1.0}
 
 
 
114
 
115
  else:
116
  threshold = 0.5
 
18
  DATASET_PATH = "potsawee/wiki_bio_gpt3_hallucination"
19
  DATASET_NAME = None
20
  OUTPUT_TYPE = 'generate_until'
21
+
22
  def __init__(self, data_dir=None, cache_dir=None, download_mode=None, config=None):
23
  super().__init__(data_dir=data_dir, cache_dir=cache_dir, download_mode=download_mode, config=config)
24
  self.generation_kwargs = {"temperature": 0.0, "do_sample": False}
 
55
  doc_text = doc["wiki_bio_text"]
56
  doc_text = doc_text.split()
57
  doc_text = " ".join(doc_text[:5])
58
+ # prompt = f"This is a passage from Wikipedia about {context}:\n\n"
59
+ doc_text = f"Please generate a Wikipedia passage starting with: {doc_text}\n"
60
  return doc_text
61
 
62
  def doc_to_target(self, doc):
 
84
  sentences = self.selfcheckgpt_nlp(response_temperature_0)
85
  sentences = [sent.text.strip() for sent in sentences.sents]
86
  if self.selfcheckgpt_type == 'SelfCheckNgram':
87
+ selfcheckgpt_scores = self.selfcheckgpt.predict(sentences=sentences, passage=response_temperature_0, sampled_passages=other_responses)
88
+ return {
89
+ 'avg-selfcheckgpt': selfcheckgpt_scores['doc_level']['avg_neg_logprob'],
90
+ 'max-selfcheckgpt': selfcheckgpt_scores['doc_level']['avg_max_neg_logprob']
91
+ }
 
 
92
 
93
  elif self.selfcheckgpt_type == 'SelfCheckBERTScore':
94
  selfcheckgpt_scores = self.selfcheckgpt.predict(sentences=sentences, sampled_passages=other_responses)
95
  elif self.selfcheckgpt_type == 'SelfCheckMQAG':
96
  selfcheckgpt_scores = self.selfcheckgpt.predict(
97
+ sentences=sentences,
98
+ passage=response_temperature_0,
99
+ sampled_passages=other_responses,
100
+ num_questions_per_sent=5, # number of questions to be drawn
101
+ scoring_method='bayes_with_alpha', # options = 'counting', 'bayes', 'bayes_with_alpha'
102
+ beta1=0.8, beta2=0.8) # additional params depending on scoring_method
 
103
  elif self.selfcheckgpt_type == 'SelfCheckNLI':
104
+ selfcheckgpt_scores = self.selfcheckgpt.predict(sentences=sentences, sampled_passages=other_responses)
 
 
 
105
 
106
  if len(selfcheckgpt_scores) == 0:
107
  self.SelfCheckNLI_error_cnt += 1
108
  print(f"SelfCheckNLI Warning.SelfCheckNLI_error_cnt:{self.SelfCheckNLI_error_cnt}. This instance is marked as hallucinated with 1.0.")
109
+ result = {
110
+ 'avg-selfcheckgpt': 1.0,
111
+ 'max-selfcheckgpt': 1.0
112
+ }
113
 
114
  else:
115
  threshold = 0.5