apsys committed on
Commit: d78ed99
Parent: 521f99a

more ref, ci, debug

app.py CHANGED
@@ -60,6 +60,7 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS,
 def init_leaderboard(dataframe):
     # if dataframe is None or dataframe.empty:
     #     raise ValueError("Leaderboard DataFrame is empty or None.")
+    # print(dataframe.columns)
     return Leaderboard(
         value=dataframe,
         datatype=[c.type for c in fields(AutoEvalColumn)],
@@ -102,45 +103,45 @@ with demo:
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
         with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-            with gr.Column():
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                with gr.Column():
-                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-
-                    with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
+            # with gr.Column():
+            #     with gr.Row():
+            #         gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+
+            #     with gr.Column():
+            #         with gr.Accordion(
+            #             f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+            #             open=False,
+            #         ):
+            #             with gr.Row():
+            #                 finished_eval_table = gr.components.Dataframe(
+            #                     value=finished_eval_queue_df,
+            #                     headers=EVAL_COLS,
+            #                     datatype=EVAL_TYPES,
+            #                     row_count=5,
+            #                 )
+            #         with gr.Accordion(
+            #             f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+            #             open=False,
+            #         ):
+            #             with gr.Row():
+            #                 running_eval_table = gr.components.Dataframe(
+            #                     value=running_eval_queue_df,
+            #                     headers=EVAL_COLS,
+            #                     datatype=EVAL_TYPES,
+            #                     row_count=5,
+            #                 )
+
+            #         with gr.Accordion(
+            #             f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+            #             open=False,
+            #         ):
+            #             with gr.Row():
+            #                 pending_eval_table = gr.components.Dataframe(
+            #                     value=pending_eval_queue_df,
+            #                     headers=EVAL_COLS,
+            #                     datatype=EVAL_TYPES,
+            #                     row_count=5,
+            #                 )
             with gr.Row():
                 gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
 
@@ -171,7 +172,7 @@ with demo:
                         value="Original",
                         interactive=True,
                     )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+                    base_model_name_textbox = gr.Textbox(label="Организация")
                     ans_file = gr.File(label="Arena Hard Answer File", file_types=["json","jsonl"])
 
             submit_button = gr.Button("Submit Eval")
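The net effect of the app.py changes is a much leaner Submit tab: the finished/running/pending evaluation-queue accordions are commented out rather than deleted, a debug print of the dataframe columns is added to init_leaderboard, and the former base-model textbox is repurposed as an organization field (the new label "Организация" is Russian for "Organization"). Below is a minimal, self-contained sketch of roughly what the tab reduces to after this commit; it is an illustration only, and the surrounding Blocks/Tabs scaffolding and launch call are assumptions, not the project's actual app.py.

import gradio as gr

# Rough sketch of the Submit tab after this commit (illustration only; the
# real app renders more fields and wires the button to a submission callback).
with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
            with gr.Row():
                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
            # "Организация" = "Organization"; label kept verbatim from the diff.
            base_model_name_textbox = gr.Textbox(label="Организация")
            ans_file = gr.File(label="Arena Hard Answer File", file_types=["json", "jsonl"])
            submit_button = gr.Button("Submit Eval")

if __name__ == "__main__":
    demo.launch()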
src/display/utils.py CHANGED
@@ -32,8 +32,9 @@ for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
 # auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append(["lower", ColumnContent, ColumnContent("lower", "number", True)])
-auto_eval_column_dict.append(["upper", ColumnContent, ColumnContent("upper", "number", True)])
+auto_eval_column_dict.append(["CI", ColumnContent, ColumnContent("95% CI", "string", True)])
+auto_eval_column_dict.append(["lower", ColumnContent, ColumnContent("lower", "number", False)])
+auto_eval_column_dict.append(["upper", ColumnContent, ColumnContent("upper", "number", False)])
 auto_eval_column_dict.append(["avg_tokens", ColumnContent, ColumnContent("avg_tokens", "number", True)])
 auto_eval_column_dict.append(["std_tokens", ColumnContent, ColumnContent("std_tokens", "number", True)])
 auto_eval_column_dict.append(["lc_score", ColumnContent, ColumnContent("lc_score", "number", True)])
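The utils.py change adds a visible string column, "95% CI", to the auto-generated column dataclass, while demoting the raw lower/upper bounds to hidden-by-default number columns. A small sketch of how these list entries become class attributes follows; the ColumnContent field layout (name, type, displayed_by_default, plus hidden/never_hidden flags) is assumed from the standard Hugging Face leaderboard template and is not shown in this diff.

from dataclasses import dataclass, make_dataclass

# Assumed field layout from the standard leaderboard template (not part of this diff).
@dataclass
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

auto_eval_column_dict = []
# Shown in the table: a pre-formatted CI string such as "+1.3 / -1.1".
auto_eval_column_dict.append(["CI", ColumnContent, ColumnContent("95% CI", "string", True)])
# Kept for computation but not displayed by default in the UI.
auto_eval_column_dict.append(["lower", ColumnContent, ColumnContent("lower", "number", False)])
auto_eval_column_dict.append(["upper", ColumnContent, ColumnContent("upper", "number", False)])

# Each [attribute_name, type, default] entry becomes a class attribute.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
print(AutoEvalColumn.CI.name)  # -> 95% CI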
src/leaderboard/read_evals.py CHANGED
@@ -167,9 +167,10 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
         try:
             files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
         except dateutil.parser._parser.ParserError:
-            files = [files[-1]]
+            files = [sorted(files)[-1]]
 
-        for file in files:
+
+        for file in [files[-1]]:
             model_result_filepaths.append(os.path.join(root, file))
 
     # eval_results = {}
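The read_evals.py change tightens how result files are picked per model directory: if the timestamp-based sort raises, the lexicographically last filename is kept, and in all cases only the last file is loaded instead of every file. A standalone sketch of that selection logic follows; the helper name and example filenames are hypothetical, and a broad except is used here to keep the sketch dependency-free where the original catches dateutil's ParserError.

import os

def pick_latest_result_files(root: str, files: list[str]) -> list[str]:
    """Sketch of the per-directory selection after this commit: try the
    timestamp-based sort used in the diff, fall back to plain lexicographic
    order, and keep only the newest file either way."""
    try:
        files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
    except Exception:  # the original code catches dateutil.parser._parser.ParserError
        files = [sorted(files)[-1]]
    return [os.path.join(root, f) for f in [files[-1]]]

# Example: the newest of two timestamped result files wins.
print(pick_latest_result_files("/tmp", ["results_2024-01-01T00:00:00.json",
                                        "results_2024-02-01T00:00:00.json"]))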
src/populate.py CHANGED
@@ -16,11 +16,18 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     # print(raw_data)
     df = pd.DataFrame.from_records(json.load(open(raw_data[0])))
     print(list(df.columns))
+    df['95% CI'] = " "
     # df['model']="nothing"
     # df.columns = cols
     # df.iloc[0]= create dummy
     # print(dir(AutoEvalColumn))
     df = df.sort_values(by=[AutoEvalColumn.task0.name], ascending=False)
+    decimal = 1
+    for i,row in df.iterrows():
+        if 'lower' not in row:
+            continue
+        interval = '+'+str(round(row['upper'] - row['score'], decimal))+' / '+str(round(row['lower'] - row['score'], decimal))
+        df.at[i,'95% CI'] = interval
     df = df[cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
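The populate.py addition is what fills the new column: for each row it renders the 95% confidence interval as offsets around the point score, rounded to one decimal, e.g. "+1.3 / -1.1" (the lower offset is negative, so the sign falls out of the subtraction itself). The 'if 'lower' not in row' guard skips result files that carry no bounds, leaving the placeholder string set earlier. A hypothetical standalone helper equivalent to the loop body, with the 'score', 'lower', and 'upper' names taken from the diff:

def format_ci(score: float, lower: float, upper: float, decimal: int = 1) -> str:
    """Render a 95% CI as offsets around the score, matching the loop in the diff."""
    return '+' + str(round(upper - score, decimal)) + ' / ' + str(round(lower - score, decimal))

# Example: score 50.0 with bounds [48.9, 51.3] -> '+1.3 / -1.1'
print(format_ci(score=50.0, lower=48.9, upper=51.3))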