Andrea Seveso committed
Commit f728b4f
Parent: 539e451

Colored dataframe

Files changed (2)
  1. app.py +6 -3
  2. src/about.py +11 -0
app.py CHANGED
@@ -12,6 +12,8 @@ from src.about import (
     LLM_BENCHMARKS_TEXT,
     TITLE,
     EVALUATION_QUEUE_TEXT,
+    QUESTION_FORMAT_TEXT,
+    MACRO_AREA_TEXT,
 )
 from src.display.css_html_js import custom_css
 from src.display.utils import (
@@ -129,11 +131,13 @@ def filter_models(

 def get_macro_area_data():
     dataset = pd.read_csv("src/macro_area.csv", sep=',')
+    dataset = dataset.style.highlight_max(color='lightgreen', axis=0)
     return dataset


 def get_question_format_data():
     dataset = pd.read_csv("src/question_format.csv", sep=',')
+    dataset = dataset.style.highlight_max(color='lightgreen', axis=0)
     return dataset


@@ -233,13 +237,12 @@ with demo:
         with gr.TabItem('In Depth Evaluation'):

             gr.Markdown('''# In Depth Evaluation''')
-            gr.Markdown('''Question Format evaluation''')
+            gr.Markdown(QUESTION_FORMAT_TEXT)
             gr.Dataframe(get_question_format_data())

         with gr.TabItem('Evaluation by Macro Area'):
             gr.Markdown('''# Macro Area evaluation''')
-            gr.Markdown(
-                '''This table shows the evaluation of the models by macro area. The evaluation is based on the following metrics:''')
+            gr.Markdown(MACRO_AREA_TEXT)
             gr.Dataframe(get_macro_area_data())

         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
src/about.py CHANGED
@@ -28,8 +28,11 @@ TITLE = """<h1 align="center" id="space-title">👩‍🏫Invalsi Leaderboard
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
 Welcome to the <a href="https://crispresearch.it/"><b>CRISP Bicocca</b></a> Invalsi Leaderboard page!
+
 We adapted the INVALSI benchmark for automated LLM evaluation, which involves rigorous adaptation of the test format to suit automated processing while retaining the essence of the original tests. In this leaderboard, we provide a detailed assessment of current LLMs, offering a crucial reference point for the academic community.
+
 Researchers are invited to submit their models for ongoing evaluation, ensuring the benchmark remains a current and valuable resource.
+
 For more information on the benchmark, please refer to our arXiv paper <a href="https://arxiv.org/abs/SOON"><b>here</b></a> and read the "About" section.
 """

@@ -244,3 +247,11 @@ This is a leaderboard for Open LLMs, and we'd love for as many people as possibl
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 CITATION_BUTTON_TEXT = r"""
 """
+
+QUESTION_FORMAT_TEXT = """
+Question Format evaluation
+"""
+
+MACRO_AREA_TEXT = """
+This table shows the evaluation of the models by macro area. The evaluation is based on the following metrics:
+"""