Commit f728b4f by Andrea Seveso (parent: 539e451)

Colored dataframe

Files changed:
- app.py (+6, -3)
- src/about.py (+11, -0)

app.py CHANGED

@@ -12,6 +12,8 @@ from src.about import (
     LLM_BENCHMARKS_TEXT,
     TITLE,
     EVALUATION_QUEUE_TEXT,
+    QUESTION_FORMAT_TEXT,
+    MACRO_AREA_TEXT,
 )
 from src.display.css_html_js import custom_css
 from src.display.utils import (

@@ -129,11 +131,13 @@ def filter_models(
 
 def get_macro_area_data():
     dataset = pd.read_csv("src/macro_area.csv", sep=',')
+    dataset = dataset.style.highlight_max(color='lightgreen', axis=0)
     return dataset
 
 
 def get_question_format_data():
     dataset = pd.read_csv("src/question_format.csv", sep=',')
+    dataset = dataset.style.highlight_max(color='lightgreen', axis=0)
     return dataset
 
 

@@ -233,13 +237,12 @@ with demo:
         with gr.TabItem('In Depth Evaluation'):
 
             gr.Markdown('''# In Depth Evaluation''')
-            gr.Markdown(
+            gr.Markdown(QUESTION_FORMAT_TEXT)
             gr.Dataframe(get_question_format_data())
 
         with gr.TabItem('Evaluation by Macro Area'):
             gr.Markdown('''# Macro Area evaluation''')
-            gr.Markdown(
-                '''This table shows the evaluation of the models by macro area. The evaluation is based on the following metrics:''')
+            gr.Markdown(MACRO_AREA_TEXT)
             gr.Dataframe(get_macro_area_data())
 
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
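
The coloring comes from pandas' Styler API: DataFrame.style.highlight_max(color='lightgreen', axis=0) returns a Styler that shades the per-column maximum, and that Styler is passed straight to gr.Dataframe. Below is a minimal standalone sketch of the pattern; the table contents, column names, and tab title are invented for illustration, and it assumes a Gradio version recent enough to render pandas Styler objects in gr.Dataframe.

    # Minimal sketch of the "colored dataframe" pattern used in this commit. The CSV
    # columns and scores are invented; only the highlight_max call mirrors the change.
    import pandas as pd
    import gradio as gr


    def get_demo_data():
        # Stand-in for pd.read_csv("src/macro_area.csv", sep=',').
        df = pd.DataFrame({
            "Model": ["model-a", "model-b", "model-c"],
            "Italian": [0.71, 0.83, 0.78],
            "Mathematics": [0.64, 0.69, 0.72],
        })
        # Styler.highlight_max with axis=0 shades the best value in each column;
        # subset= keeps the non-numeric "Model" column out of the max computation.
        return df.style.highlight_max(
            color="lightgreen", axis=0, subset=["Italian", "Mathematics"]
        )


    with gr.Blocks() as demo:
        with gr.Tabs():
            with gr.TabItem("Evaluation by Macro Area"):
                gr.Markdown("# Macro Area evaluation")
                # Recent Gradio versions render a pandas Styler, keeping the highlight.
                gr.Dataframe(get_demo_data())

    if __name__ == "__main__":
        demo.launch()

Note that the commit applies highlight_max to the whole frame; if the leaderboard CSVs contain non-numeric columns such as model names, restricting the styling with subset, as in the sketch, may be needed depending on the pandas version.
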
src/about.py CHANGED

@@ -28,8 +28,11 @@ TITLE = """<h1 align="center" id="space-title">👩🏫Invalsi Leaderboard
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
 Welcome into <a href="https://crispresearch.it/"><b>CRISP Bicocca</b></a> Invalsi Leaderboard page!
+
 We adapted the INVALSI benchmark for automated LLM evaluation, which involves rigorous adaptation of the test format to suit automated processing while retaining the essence of the original tests. In this leaderboard, we provide a detailed assessment of current LLMs, offering a crucial reference point for the academic community.
+
 Researchers are invited to submit their models for ongoing evaluation, ensuring the benchmark remains a current and valuable resource.
+
 For more information on the benchmark, please refer to our arXiv paper <a href="https://arxiv.org/abs/SOON"><b>here</b></a> and read the "About" section.
 """
 

@@ -244,3 +247,11 @@ This is a leaderboard for Open LLMs, and we'd love for as many people as possibl
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 CITATION_BUTTON_TEXT = r"""
 """
+
+QUESTION_FORMAT_TEXT = """
+Question Format evaluation
+"""
+
+MACRO_AREA_TEXT = """
+This table shows the evaluation of the models by macro area. The evaluation is based on the following metrics:
+"""