apsys committed
Commit b47b51e
1 Parent(s): 7ec1b66

Added result screener + UI format

app.py CHANGED

@@ -68,21 +68,21 @@ def init_leaderboard(dataframe):
             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
             label="Select Columns to Display:",
         ),
-        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
+        search_columns=[AutoEvalColumn.model.name],
         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
         filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-            ColumnFilter(
-                AutoEvalColumn.params.name,
-                type="slider",
-                min=0.01,
-                max=150,
-                label="Select the number of parameters (B)",
-            ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-            ),
+            # ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
+            # ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+            # ColumnFilter(
+            #     AutoEvalColumn.params.name,
+            #     type="slider",
+            #     min=0.01,
+            #     max=150,
+            #     label="Select the number of parameters (B)",
+            # ),
+            # ColumnFilter(
+            #     AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
+            # ),
         ],
         bool_checkboxgroup_label="Hide models",
         interactive=False,
@@ -172,7 +172,7 @@ with demo:
                         interactive=True,
                     )
                     base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-                    ans_file = gr.File(label="Arena Hard Answer File", file_types=[".json"])
+                    ans_file = gr.File(label="Arena Hard Answer File", file_types=["json","jsonl"])

             submit_button = gr.Button("Submit Eval")
             submission_result = gr.Markdown()
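
The app-side change narrows leaderboard search to the model column, comments out the stock filters (model type, precision, parameter count, hub availability), and lets the Arena Hard answer upload accept both .json and .jsonl files. The submit wiring is outside this hunk, so the snippet below is only a minimal sketch of how such an upload typically reaches a handler; handle_submission and the uploads/ directory are illustrative assumptions, not code from this commit.

# A minimal sketch, not the committed wiring: how the uploaded Arena Hard
# answer file could reach a submit handler.
import os
import shutil

import gradio as gr

def handle_submission(model_name: str, answers_path):
    # With type="filepath" (the Gradio 4 default), gr.File passes the path of a
    # temporary copy of the upload, or None when nothing was selected.
    if not answers_path:
        return "No Arena Hard answer file attached."
    os.makedirs("uploads", exist_ok=True)
    dest = os.path.join("uploads", f"{model_name.replace('/', '_')}_answers.jsonl")
    shutil.copy(answers_path, dest)
    return f"Stored answers for {model_name} at {dest}."

with gr.Blocks() as demo:
    model_name = gr.Textbox(label="Model name")
    ans_file = gr.File(label="Arena Hard Answer File", file_types=["json", "jsonl"])
    submit_button = gr.Button("Submit Eval")
    submission_result = gr.Markdown()
    submit_button.click(handle_submission, inputs=[model_name, ans_file], outputs=submission_result)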
src/about.py CHANGED

@@ -13,7 +13,7 @@ class Task:
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
     task0 = Task("arenahard", "score", "score")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
+    # task1 = Task("logiqa", "acc_norm", "LogiQA")

 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
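
With LogiQA commented out, the Tasks enum carries a single Arena Hard entry, so every `for task in Tasks` loop downstream yields exactly one score column. A small self-contained sketch of what that resolves to; the Task field names follow the leaderboard template's dataclass:

# Context sketch for the change above, assuming the template's Task dataclass.
from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str   # task_key in the results json
    metric: str      # metric_key in the results json
    col_name: str    # name displayed on the leaderboard

class Tasks(Enum):
    task0 = Task("arenahard", "score", "score")

print([t.value.col_name for t in Tasks])  # -> ['score']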
src/display/utils.py CHANGED

@@ -22,23 +22,29 @@ class ColumnContent:

 ## Leaderboard columns
 auto_eval_column_dict = []
+# ['results', 'model', 'score', 'lower', 'upper', 'avg_tokens', 'std_tokens', 'lc_score']
 # Init
 # auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
-auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("model", "markdown", True, never_hidden=True)])
 #Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+# auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+# auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
+auto_eval_column_dict.append(["lower", ColumnContent, ColumnContent("lower", "number", True)])
+auto_eval_column_dict.append(["upper", ColumnContent, ColumnContent("upper", "number", True)])
+auto_eval_column_dict.append(["avg_tokens", ColumnContent, ColumnContent("avg_tokens", "number", True)])
+auto_eval_column_dict.append(["std_tokens", ColumnContent, ColumnContent("std_tokens", "number", True)])
+auto_eval_column_dict.append(["lc_score", ColumnContent, ColumnContent("lc_score", "number", True)])
+# auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+# auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+# auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
+# auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+# auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+# auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+# auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+# auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])

 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
@@ -52,6 +58,7 @@ class EvalQueueColumn: # Queue column
     precision = ColumnContent("precision", "str", True)
     weight_type = ColumnContent("weight_type", "str", "Original")
     status = ColumnContent("status", "str", True)
+    answers_file = ColumnContent("answers_file", "str", True)

 ## All the model information that we might need
 @dataclass
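
The new column set mirrors the record schema noted in the added comment (['results', 'model', 'score', 'lower', 'upper', 'avg_tokens', 'std_tokens', 'lc_score']). As a rough illustration only, one leaderboard row is presumably shaped like the dict below; the values and the reading of lower/upper/lc_score are assumptions, not data from the repo.

# Hypothetical Arena Hard leaderboard record matching the new columns.
example_record = {
    "model": "org/my-model",   # rendered as a markdown link column
    "score": 42.3,             # Tasks.task0 -> "score"
    "lower": 40.1,             # presumably the lower confidence bound
    "upper": 44.6,             # presumably the upper confidence bound
    "avg_tokens": 512.0,       # average answer length in tokens
    "std_tokens": 96.0,        # standard deviation of answer length
    "lc_score": 39.8,          # presumably the length-controlled score
}
# Displayed columns are exactly the ColumnContent entries appended above:
# ["model", "score", "lower", "upper", "avg_tokens", "std_tokens", "lc_score"]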
src/leaderboard/read_evals.py CHANGED

@@ -18,14 +18,14 @@ class EvalResult:
     """
     eval_name: str # org_model_precision (uid)
     full_model: str # org/model (path on hub)
-    org: str
+    org: str
     model: str
     revision: str # commit hash, "" if main
     results: dict
     precision: Precision = Precision.Unknown
     model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
     weight_type: WeightType = WeightType.Original # Original or Adapter
-    architecture: str = "Unknown"
+    architecture: str = "Unknown"
     license: str = "?"
     likes: int = 0
     num_params: int = 0
@@ -85,7 +85,7 @@ class EvalResult:
             org=org,
             model=model,
             results=results,
-            precision=precision,
+            precision=precision,
             revision= config.get("model_sha", ""),
             still_on_hub=still_on_hub,
             architecture=architecture
@@ -172,25 +172,25 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
         for file in files:
             model_result_filepaths.append(os.path.join(root, file))

-    eval_results = {}
-    for model_result_filepath in model_result_filepaths:
-        # Creation of result
-        eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        eval_result.update_with_request_file(requests_path)
-
-        # Store results of same eval together
-        eval_name = eval_result.eval_name
-        if eval_name in eval_results.keys():
-            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
-        else:
-            eval_results[eval_name] = eval_result
-
-    results = []
-    for v in eval_results.values():
-        try:
-            v.to_dict() # we test if the dict version is complete
-            results.append(v)
-        except KeyError: # not all eval values present
-            continue
-
-    return results
+    # eval_results = {}
+    # for model_result_filepath in model_result_filepaths:
+    #     # Creation of result
+    #     eval_result = EvalResult.init_from_json_file(model_result_filepath)
+    #     eval_result.update_with_request_file(requests_path)
+
+    #     # Store results of same eval together
+    #     eval_name = eval_result.eval_name
+    #     if eval_name in eval_results.keys():
+    #         eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
+    #     else:
+    #         eval_results[eval_name] = eval_result
+
+    # results = []
+    # for v in eval_results.values():
+    #     try:
+    #         v.to_dict() # we test if the dict version is complete
+    #         results.append(v)
+    #     except KeyError: # not all eval values present
+    #         continue
+
+    return model_result_filepaths#results
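
After this change get_raw_eval_results short-circuits: it returns the list of result-file paths it walked instead of parsed EvalResult objects (note that the `-> list[EvalResult]` annotation no longer matches what is returned). A minimal sketch of the new contract, with the request-file merging dropped as in the commit; collect_result_files and the .json filter are illustrative assumptions, not code from the repo.

# Sketch of the new contract: discover result files and let the caller load them.
import json
import os

def collect_result_files(results_path: str) -> list[str]:
    model_result_filepaths = []
    for root, _dirs, files in os.walk(results_path):
        for file in files:
            if file.endswith(".json"):  # assumption: only JSON result files are kept
                model_result_filepaths.append(os.path.join(root, file))
    return model_result_filepaths

# Caller side (see src/populate.py): load the first discovered file.
# paths = collect_result_files("./eval-results")
# records = json.load(open(paths[0]))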
src/populate.py CHANGED

@@ -6,19 +6,22 @@ import pandas as pd
 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results
+from src.envs import RESULTS_REPO


 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_eval_results(results_path, requests_path)
-    all_data_json = [v.to_dict() for v in raw_data]
-
-    df = pd.DataFrame.from_records(all_data_json,columns=cols)
-    df['model']="nothing"
+    # all_data_json = [v.to_dict() for v in raw_data]
+    # print(raw_data)
+    df = pd.DataFrame.from_records(json.load(open(raw_data[0])))
+    print(list(df.columns))
+    # df['model']="nothing"
     # df.columns = cols
     # df.iloc[0]= create dummy
-    # df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-    # df = df[cols].round(decimals=2)
+    # print(dir(AutoEvalColumn))
+    df = df.sort_values(by=[AutoEvalColumn.task0.name], ascending=False)
+    df = df[cols].round(decimals=2)

     # filter out if any of the benchmarks have not been produced
     # df = df[has_no_nan_values(df, benchmark_cols)]
@@ -27,25 +30,29 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:

 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
     """Creates the different dataframes for the evaluation queues requestes"""
-    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
+    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".") and not entry.endswith(".jsonl")]
     all_evals = []

     for entry in entries:
-        if ".json" in entry:
+        if ".json" in entry and 'toeval' not in entry:
             file_path = os.path.join(save_path, entry)
             with open(file_path) as fp:
+                print(file_path)
                 data = json.load(fp)

             data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
             data[EvalQueueColumn.revision.name] = data.get("revision", "main")

             all_evals.append(data)
-        elif ".md" not in entry:
+        elif ".md" not in entry and 'toeval' not in entry and 'results' not in entry:
             # this is a folder
             sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
             for sub_entry in sub_entries:
+                if 'toeval' in sub_entry:
+                    continue
                 file_path = os.path.join(save_path, entry, sub_entry)
                 with open(file_path) as fp:
+                    # print(file_path)
                     data = json.load(fp)

                 data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
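
get_leaderboard_df now builds the table straight from the first file returned by get_raw_eval_results, sorts on the task0 column ("score"), and rounds the selected columns; multiple result files are not merged yet. A standalone sketch of that behaviour with the repo plumbing stripped out; the helper name, file name, and column list are illustrative assumptions.

# Minimal sketch of the new get_leaderboard_df behaviour (assumptions noted above).
import json
import pandas as pd

def leaderboard_from_results_file(path: str, cols: list[str]) -> pd.DataFrame:
    # Only one results file is read, mirroring raw_data[0] in the committed code.
    with open(path) as fp:
        records = json.load(fp)
    df = pd.DataFrame.from_records(records)
    df = df.sort_values(by=["score"], ascending=False)  # AutoEvalColumn.task0.name == "score"
    return df[cols].round(decimals=2)

# Example (hypothetical file and columns):
# df = leaderboard_from_results_file(
#     "eval-results/arena_hard_results.json",
#     ["model", "score", "lower", "upper", "avg_tokens", "std_tokens", "lc_score"],
# )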
src/submission/submit.py CHANGED

@@ -16,7 +16,7 @@ USERS_TO_SUBMISSION_DATES = None

 def add_new_eval(
     model: str,
-    base_model: str,
+    user_name: str,
     revision: str,
     precision: str,
     weight_type: str,
@@ -28,7 +28,7 @@
     if not REQUESTED_MODELS:
         REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)

-    user_name = ""
+    user_name = user_name
     model_path = model
     if "/" in model:
         user_name = model.split("/")[0]
@@ -75,10 +75,15 @@

     # Seems good, creating the eval
     print("Adding new eval")
+    OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
+    os.makedirs(OUT_DIR, exist_ok=True)
+    out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
+    out_path_upload = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}_toeval.json"
+

     eval_entry = {
         "model": model,
-        "base_model": base_model,
+        "user_name": user_name,
         "revision": revision,
         "precision": precision,
         "weight_type": weight_type,
@@ -87,8 +92,9 @@
         "model_type": model_type,
         "likes": "",
         "params": "",
-        "license": license,
+        "license": "",
         "private": False,
+        "answers_file": str(out_path_upload),
     }

     # Check for duplicate submission
@@ -96,10 +102,6 @@
         return styled_warning("This model has been already submitted.")

     print("Creating eval file")
-    OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
-    os.makedirs(OUT_DIR, exist_ok=True)
-    out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
-    out_path_upload = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}_toeval.json"

     with open(out_path, "w") as f:
         f.write(json.dumps(eval_entry))
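
The request file written by add_new_eval now records the submitting user_name, blanks the license field, and stores answers_file, the *_toeval.json companion path where the uploaded Arena Hard answers are expected to land. Roughly, the JSON written to out_path looks like the dict below; the concrete values are placeholders, EVAL_REQUESTS_PATH is assumed to be "./eval-queue", and fields outside the shown hunks (e.g. status, submitted_time) are omitted.

# Placeholder illustration of an eval request entry for a float16 / Original submission.
example_eval_entry = {
    "model": "my-org/my-model",
    "user_name": "my-org",
    "revision": "main",
    "precision": "float16",
    "weight_type": "Original",
    "model_type": "fine-tuned",
    "likes": "",
    "params": "",
    "license": "",
    "private": False,
    "answers_file": "./eval-queue/my-org/my-model_eval_request_False_float16_Original_toeval.json",
}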