brunneis committed on
Commit
bd9f032
1 Parent(s): 5a0ccc4

Update scores

Browse files
Files changed (2) hide show
  1. src/about.py +9 -8
  2. src/leaderboard/read_evals.py +2 -1
src/about.py CHANGED
@@ -17,14 +17,15 @@ class Task:
17
  # ---------------------------------------------------
18
  class Tasks(Enum):
19
  # task_key, metric_key, title
20
- task00 = Task("naive_judge", "score", "NaïveJudge (11)")
21
- task01 = Task("human_eval_solidity", "score", "HumanEval for Solidity (9)")
22
- task02 = Task("rouge1", "score", "ROUGE-unigrams")
23
- task03 = Task("rouge2", "score", "ROUGE-bigrams")
24
- task04 = Task("rougeL", "score", "ROUGE-Longest Common Subsequence")
25
- task05 = Task("rougeLsum", "score", "ROUGE-Lsum")
26
- task06 = Task("bleu", "score", "Bleu")
27
- task07 = Task("brevity_penalty", "score", "Brevity Penalty")
 
28
  # ---------------------------------------------------
29
 
30
  # Your leaderboard name
 
17
  # ---------------------------------------------------
18
  class Tasks(Enum):
19
  # task_key, metric_key, title
20
+ task00 = Task("naive_judge", "score", "NaïveJudge")
21
+ task01 = Task("human_eval_solidity_pass@1", "score", "HumanEval for Solidity (pass@1)")
22
+ task02 = Task("human_eval_solidity_pass@3", "score", "HumanEval for Solidity (pass@3)")
23
+ task03 = Task("rouge1", "score", "ROUGE-unigrams")
24
+ task04 = Task("rouge2", "score", "ROUGE-bigrams")
25
+ task05 = Task("rougeL", "score", "ROUGE-Longest Common Subsequence")
26
+ task06 = Task("rougeLsum", "score", "ROUGE-Lsum")
27
+ task07 = Task("bleu", "score", "Bleu")
28
+ task08 = Task("brevity_penalty", "score", "Brevity Penalty")
29
  # ---------------------------------------------------
30
 
31
  # Your leaderboard name
src/leaderboard/read_evals.py CHANGED
@@ -117,7 +117,8 @@ class EvalResult:
117
  # average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
118
  solbench = (
119
  self.results.get('naive_judge', 0) * 0.3 +
120
- self.results.get('human_eval_solidity', 0) * 0.7
 
121
  )
122
  data_dict = {
123
  "eval_name": self.eval_name, # not a column, just a save name,
 
117
  # average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
118
  solbench = (
119
  self.results.get('naive_judge', 0) * 0.3 +
120
+ self.results.get('human_eval_solidity_pass@1', 0) * 0.5 +
121
+ self.results.get('human_eval_solidity_pass@3', 0) * 0.2
122
  )
123
  data_dict = {
124
  "eval_name": self.eval_name, # not a column, just a save name,