Commit 539e451
Andrea Seveso committed
1 Parent(s): 0e13466

Additional tabs

Files changed (3):
  1. app.py +22 -0
  2. src/macro_area.csv +20 -0
  3. src/question_format.csv +20 -0
app.py CHANGED
@@ -127,6 +127,16 @@ def filter_models(
     return filtered_df
 
 
+def get_macro_area_data():
+    dataset = pd.read_csv("src/macro_area.csv", sep=',')
+    return dataset
+
+
+def get_question_format_data():
+    dataset = pd.read_csv("src/question_format.csv", sep=',')
+    return dataset
+
+
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
@@ -220,6 +230,18 @@ with demo:
                 queue=True,
             )
 
+        with gr.TabItem('In Depth Evaluation'):
+
+            gr.Markdown('''# In Depth Evaluation''')
+            gr.Markdown('''Question Format evaluation''')
+            gr.Dataframe(get_question_format_data())
+
+        with gr.TabItem('Evaluation by Macro Area'):
+            gr.Markdown('''# Macro Area evaluation''')
+            gr.Markdown(
+                '''This table shows the evaluation of the models by macro area. The evaluation is based on the following metrics:''')
+            gr.Dataframe(get_macro_area_data())
+
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
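For orientation, here is a minimal, self-contained sketch of the tab pattern this commit introduces. It is a hypothetical standalone app, not the real app.py (which nests these tabs alongside its existing leaderboard tabs), and it assumes gradio and pandas are installed and the two CSVs exist at these paths:

import gradio as gr
import pandas as pd


def get_question_format_data():
    # Plain read, as in the commit: the CSV's first row becomes the header.
    return pd.read_csv("src/question_format.csv")


def get_macro_area_data():
    return pd.read_csv("src/macro_area.csv")


with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("In Depth Evaluation"):
            gr.Markdown("# In Depth Evaluation")
            gr.Markdown("Question Format evaluation")
            gr.Dataframe(get_question_format_data())
        with gr.TabItem("Evaluation by Macro Area"):
            gr.Markdown("# Macro Area evaluation")
            gr.Dataframe(get_macro_area_data())

demo.launch()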
src/macro_area.csv ADDED
@@ -0,0 +1,20 @@
+Sezione,Comprensione del testo,Comprensione del testo,Comprensione del testo,Riflessione sulla lingua,Riflessione sulla lingua,Riflessione sulla lingua,Riflessione sulla lingua,Riflessione sulla lingua,Riflessione sulla lingua
+MacroAspetto,Localizzare e individuare informazioni all’interno del testo,"Ricostruire il significato del testo, a livello locale o globale","Riflettere sul contenuto o sulla forma del testo, a livello locale o globale, e valutarli",Formazione delle parole,Lessico e semantica,Morfologia,Ortografia,Sintassi,Testualità e pragmatica
+Model,,,,,,,,,
+LLaMAntino-3-ANITA-8B-Inst-DPO-ITA,60.2,63.1,78.8,28.6,37.9,16.7,0.0,26.3,50.0
+Minerva-3B-base-v1.0,4.6,3.9,9.1,28.6,3.4,4.2,0.0,5.3,0.0
+claude-3-haiku,78.7,86.0,75.8,71.4,65.5,62.5,0.0,57.9,83.3
+claude-3-opus,91.7,91.6,78.8,100.0,82.8,75.0,50.0,89.5,83.3
+claude-3-sonnet,87.0,90.5,75.8,100.0,62.1,75.0,0.0,52.6,100.0
+command-r-plus,74.1,80.4,81.8,71.4,65.5,66.7,0.0,57.9,83.3
+gemini-flash-1.5,83.3,85.5,81.8,85.7,62.1,83.3,25.0,63.2,66.7
+gemini-pro,78.7,82.1,81.8,71.4,51.7,70.8,0.0,68.4,66.7
+gemini-pro-1.5,90.7,87.7,84.8,57.1,55.2,58.3,25.0,63.2,33.3
+gpt-3.5-turbo-0125,61.1,64.8,63.6,42.9,55.2,58.3,0.0,47.4,83.3
+gpt-4-turbo,77.8,82.1,75.8,71.4,82.8,75.0,50.0,73.7,100.0
+gpt-4o,64.8,69.8,51.5,100.0,69.0,87.5,0.0,89.5,100.0
+llama-3-70b-instruct,83.3,85.5,75.8,71.4,55.2,33.3,0.0,47.4,50.0
+llama-3-8b-instruct,48.2,53.6,63.6,14.3,34.5,29.2,0.0,31.6,50.0
+mistral-7b-instruct:nitro,51.8,59.2,51.5,28.6,37.9,29.2,0.0,31.6,33.3
+mixtral-8x7b-instruct,74.1,77.1,69.7,42.9,37.9,50.0,0.0,52.6,50.0
+zefiro-7b-base-ITA,50.0,49.7,48.5,57.1,20.7,16.7,0.0,26.3,50.0
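A note on this file's layout: it has two column-header rows (Sezione, the section: reading comprehension vs. reflection on language; MacroAspetto, the macro-aspect within each section) followed by an index-name row (Model). That is the same on-disk format pandas writes for a DataFrame with MultiIndex columns, so the structure can be recovered on read; the committed get_macro_area_data reads it flat instead, which displays the MacroAspetto and Model rows as the first two data rows. A sketch of the structured read, assuming pandas' usual to_csv/read_csv round-trip behavior:

import pandas as pd

# header=[0, 1] turns the Sezione/MacroAspetto rows into a column
# MultiIndex; index_col=0 consumes the "Model" row as the index name.
df = pd.read_csv("src/macro_area.csv", header=[0, 1], index_col=0)

# Per-model average over the six "Riflessione sulla lingua" sub-columns.
print(df["Riflessione sulla lingua"].mean(axis=1).round(1))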
src/question_format.csv ADDED
@@ -0,0 +1,20 @@
+Grado,2,2,5,5,5,5,5,6,6,6,8,8,8,10,10,10,10,13,13
+Type,MC,MCC,CL,MC,MCC,RB,RU,MC,MCC,RU,MC,MCC,RU,CL,MC,MCC,RU,MC,MCC
+Model,,,,,,,,,,,,,,,,,,,
+LLaMAntino-3-ANITA-8B-Inst-DPO-ITA,71.9,0.0,0.0,70.0,14.3,0.0,16.7,67.6,42.9,22.2,55.6,100.0,50.0,0.0,64.6,23.1,57.1,45.2,0.0
+Minerva-3B-base-v1.0,0.0,0.0,0.0,13.3,0.0,0.0,0.0,0.0,0.0,0.0,8.6,0.0,0.0,0.0,6.2,0.0,0.0,4.8,0.0
+claude-3-haiku,100.0,50.0,0.0,91.7,28.6,0.0,33.3,84.5,57.1,77.8,85.2,100.0,75.0,50.0,75.0,46.2,64.3,71.4,12.5
+claude-3-opus,100.0,100.0,100.0,98.3,71.4,100.0,33.3,93.0,85.7,88.9,93.8,0.0,100.0,50.0,85.4,61.5,71.4,90.5,25.0
+claude-3-sonnet,100.0,100.0,100.0,96.7,85.7,100.0,50.0,88.7,57.1,66.7,87.6,0.0,75.0,50.0,81.2,53.8,64.3,78.6,12.5
+command-r-plus,90.6,0.0,100.0,88.3,14.3,0.0,50.0,80.3,57.1,66.7,85.2,0.0,100.0,50.0,79.2,46.2,57.1,61.9,12.5
+gemini-flash-1.5,90.6,0.0,0.0,86.7,71.4,100.0,33.3,93.0,85.7,88.9,88.9,0.0,100.0,50.0,81.2,38.5,50.0,81.0,0.0
+gemini-pro,96.9,0.0,0.0,90.0,14.3,0.0,16.7,80.3,71.4,66.7,88.9,0.0,100.0,0.0,79.2,46.2,64.3,69.0,0.0
+gemini-pro-1.5,96.9,0.0,0.0,90.0,42.9,100.0,33.3,87.3,42.9,77.8,87.6,0.0,100.0,50.0,79.2,46.2,85.7,85.7,12.5
+gpt-3.5-turbo-0125,84.4,0.0,0.0,73.3,14.3,0.0,50.0,53.5,42.9,44.4,67.9,0.0,75.0,50.0,68.8,46.2,71.4,52.4,0.0
+gpt-4-turbo,100.0,100.0,100.0,91.7,71.4,100.0,66.7,63.4,100.0,88.9,92.6,0.0,100.0,50.0,87.5,61.5,50.0,64.3,12.5
+gpt-4o,78.1,100.0,100.0,83.3,71.4,100.0,66.7,66.2,85.7,77.8,80.2,0.0,100.0,0.0,68.8,38.5,78.6,38.1,12.5
+llama-3-70b-instruct,96.9,0.0,0.0,90.0,14.3,0.0,33.3,87.3,71.4,66.7,79.0,0.0,75.0,0.0,68.8,46.2,71.4,76.2,0.0
+llama-3-8b-instruct,65.6,0.0,0.0,66.7,0.0,0.0,16.7,57.8,28.6,11.1,42.0,0.0,0.0,0.0,54.2,15.4,28.6,57.1,0.0
+mistral-7b-instruct:nitro,71.9,0.0,0.0,66.7,0.0,0.0,16.7,59.2,14.3,33.3,50.6,0.0,25.0,0.0,50.0,23.1,28.6,57.1,0.0
+mixtral-8x7b-instruct,96.9,0.0,0.0,76.7,14.3,0.0,16.7,80.3,57.1,55.6,71.6,0.0,75.0,0.0,68.8,30.8,57.1,69.0,0.0
+zefiro-7b-base-ITA,56.2,0.0,0.0,55.0,0.0,0.0,16.7,56.3,0.0,33.3,43.2,0.0,0.0,0.0,41.7,15.4,42.9,54.8,0.0
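This file follows the same layout: Grado (school grade) and Type (question format) form a two-row column header over an index-name row (Model); the Type abbreviations are kept as-is since the commit does not expand them. A sketch of slicing by question type once the MultiIndex is recovered, assuming pandas picks up the level names Grado/Type from the first header column as it does for its own to_csv output:

import pandas as pd

df = pd.read_csv("src/question_format.csv", header=[0, 1], index_col=0)

# All columns whose Type level is "MC", across every grade.
mc = df.xs("MC", axis=1, level="Type")
print(mc.mean(axis=1).sort_values(ascending=False).head(3))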