DanielePoterti commited on
Commit
6fe9760
·
verified ·
1 Parent(s): 0f6dbd0
Files changed (2) hide show
  1. src/macro_area.csv +6 -2
  2. src/question_format.csv +6 -2
src/macro_area.csv CHANGED
@@ -10,17 +10,21 @@ claude-3-opus,91.7,91.6,78.8,100.0,82.8,75.0,50.0,89.5,83.3
10
  claude-3-sonnet,87.0,90.5,75.8,100.0,62.1,75.0,0.0,52.6,100.0
11
  claude-3.5-sonnet:beta,92.6,95.0,84.8,100.0,93.1,87.5,25.0,94.7,83.3
12
  command-r-plus,74.1,80.4,81.8,71.4,65.5,66.7,0.0,57.9,83.3
 
13
  gemini-flash-1.5,83.3,85.5,81.8,85.7,62.1,83.3,25.0,63.2,66.7
14
  gemini-pro,78.7,82.1,81.8,71.4,51.7,70.8,0.0,68.4,66.7
15
  gemini-pro-1.5,90.7,87.7,84.8,57.1,55.2,58.3,25.0,63.2,33.3
16
  gemma-2-9b-it,75.9,82.7,66.7,71.4,51.7,58.3,0.0,57.9,83.3
17
  gpt-3.5-turbo-0125,61.1,64.8,63.6,42.9,55.2,58.3,0.0,47.4,83.3
18
- gpt-4-turbo,77.8,82.1,75.8,71.4,82.8,75.0,50.0,73.7,100.0
19
- gpt-4o,64.8,69.8,51.5,100.0,69.0,87.5,0.0,89.5,100.0
 
20
  llama-3-70b-instruct,83.3,85.5,75.8,71.4,55.2,33.3,0.0,47.4,50.0
21
  llama-3-8b-instruct,48.2,53.6,63.6,14.3,34.5,29.2,0.0,31.6,50.0
 
22
  maestrale-chat-v0.4-beta,62.0,61.4,60.6,42.9,44.8,33.3,0.0,15.8,50.0
23
  mistral-7b-instruct:nitro,51.8,59.2,51.5,28.6,37.9,29.2,0.0,31.6,33.3
 
24
  mixtral-8x22b-instruct,74.1,76.0,72.7,28.6,44.8,66.7,0.0,31.6,66.7
25
  mixtral-8x7b-instruct,74.1,77.1,69.7,42.9,37.9,50.0,0.0,52.6,50.0
26
  modello-italia-9b,28.7,28.5,30.3,14.3,10.3,16.7,0.0,10.5,50.0
 
10
  claude-3-sonnet,87.0,90.5,75.8,100.0,62.1,75.0,0.0,52.6,100.0
11
  claude-3.5-sonnet:beta,92.6,95.0,84.8,100.0,93.1,87.5,25.0,94.7,83.3
12
  command-r-plus,74.1,80.4,81.8,71.4,65.5,66.7,0.0,57.9,83.3
13
+ dolphin-llama-3-70b,82.4,84.9,78.8,85.7,55.2,50.0,0.0,68.4,83.3
14
  gemini-flash-1.5,83.3,85.5,81.8,85.7,62.1,83.3,25.0,63.2,66.7
15
  gemini-pro,78.7,82.1,81.8,71.4,51.7,70.8,0.0,68.4,66.7
16
  gemini-pro-1.5,90.7,87.7,84.8,57.1,55.2,58.3,25.0,63.2,33.3
17
  gemma-2-9b-it,75.9,82.7,66.7,71.4,51.7,58.3,0.0,57.9,83.3
18
  gpt-3.5-turbo-0125,61.1,64.8,63.6,42.9,55.2,58.3,0.0,47.4,83.3
19
+ gpt-4-turbo,86.1,89.9,81.8,71.4,86.2,79.2,50.0,73.7,100.0
20
+ gpt-4o,75.0,76.0,63.6,85.7,75.9,79.2,25.0,94.7,100.0
21
+ gpt-4o-mini,80.6,86.0,81.8,85.7,55.2,70.8,0.0,57.9,83.3
22
  llama-3-70b-instruct,83.3,85.5,75.8,71.4,55.2,33.3,0.0,47.4,50.0
23
  llama-3-8b-instruct,48.2,53.6,63.6,14.3,34.5,29.2,0.0,31.6,50.0
24
+ llama-3.1-405b-instruct,85.2,87.7,84.8,100.0,82.8,83.3,50.0,84.2,100.0
25
  maestrale-chat-v0.4-beta,62.0,61.4,60.6,42.9,44.8,33.3,0.0,15.8,50.0
26
  mistral-7b-instruct:nitro,51.8,59.2,51.5,28.6,37.9,29.2,0.0,31.6,33.3
27
+ mistral-nemo,64.8,71.0,57.6,28.6,44.8,33.3,0.0,47.4,83.3
28
  mixtral-8x22b-instruct,74.1,76.0,72.7,28.6,44.8,66.7,0.0,31.6,66.7
29
  mixtral-8x7b-instruct,74.1,77.1,69.7,42.9,37.9,50.0,0.0,52.6,50.0
30
  modello-italia-9b,28.7,28.5,30.3,14.3,10.3,16.7,0.0,10.5,50.0
src/question_format.csv CHANGED
@@ -10,17 +10,21 @@ claude-3-opus,100.0,100.0,100.0,98.3,71.4,100.0,33.3,93.0,85.7,88.9,93.8,0.0,100
10
  claude-3-sonnet,100.0,100.0,100.0,96.7,85.7,100.0,50.0,88.7,57.1,66.7,87.6,0.0,75.0,50.0,81.2,53.8,64.3,78.6,12.5
11
  claude-3.5-sonnet:beta,100.0,100.0,100.0,100.0,85.7,100.0,50.0,97.2,100.0,88.9,95.1,100.0,100.0,50.0,93.8,69.2,50.0,92.9,62.5
12
  command-r-plus,90.6,0.0,100.0,88.3,14.3,0.0,50.0,80.3,57.1,66.7,85.2,0.0,100.0,50.0,79.2,46.2,57.1,61.9,12.5
 
13
  gemini-flash-1.5,90.6,0.0,0.0,86.7,71.4,100.0,33.3,93.0,85.7,88.9,88.9,0.0,100.0,50.0,81.2,38.5,50.0,81.0,0.0
14
  gemini-pro,96.9,0.0,0.0,90.0,14.3,0.0,16.7,80.3,71.4,66.7,88.9,0.0,100.0,0.0,79.2,46.2,64.3,69.0,0.0
15
  gemini-pro-1.5,96.9,0.0,0.0,90.0,42.9,100.0,33.3,87.3,42.9,77.8,87.6,0.0,100.0,50.0,79.2,46.2,85.7,85.7,12.5
16
  gemma-2-9b-it,96.9,50.0,0.0,88.3,14.3,0.0,33.3,83.1,42.9,66.7,77.8,0.0,50.0,0.0,79.2,61.5,57.1,61.9,12.5
17
  gpt-3.5-turbo-0125,84.4,0.0,0.0,73.3,14.3,0.0,50.0,53.5,42.9,44.4,67.9,0.0,75.0,50.0,68.8,46.2,71.4,52.4,0.0
18
- gpt-4-turbo,100.0,100.0,100.0,91.7,71.4,100.0,66.7,63.4,100.0,88.9,92.6,0.0,100.0,50.0,87.5,61.5,50.0,64.3,12.5
19
- gpt-4o,78.1,100.0,100.0,83.3,71.4,100.0,66.7,66.2,85.7,77.8,80.2,0.0,100.0,0.0,68.8,38.5,78.6,38.1,12.5
 
20
  llama-3-70b-instruct,96.9,0.0,0.0,90.0,14.3,0.0,33.3,87.3,71.4,66.7,79.0,0.0,75.0,0.0,68.8,46.2,71.4,76.2,0.0
21
  llama-3-8b-instruct,65.6,0.0,0.0,66.7,0.0,0.0,16.7,57.8,28.6,11.1,42.0,0.0,0.0,0.0,54.2,15.4,28.6,57.1,0.0
 
22
  maestrale-chat-v0.4-beta,65.6,0.0,0.0,66.7,14.3,0.0,0.0,62.0,0.0,33.3,60.5,0.0,25.0,0.0,62.5,23.1,35.7,71.4,0.0
23
  mistral-7b-instruct:nitro,71.9,0.0,0.0,66.7,0.0,0.0,16.7,59.2,14.3,33.3,50.6,0.0,25.0,0.0,50.0,23.1,28.6,57.1,0.0
 
24
  mixtral-8x22b-instruct,81.2,0.0,0.0,73.3,42.9,0.0,50.0,78.9,42.9,44.4,74.1,0.0,100.0,0.0,72.9,38.5,57.1,69.0,12.5
25
  mixtral-8x7b-instruct,96.9,0.0,0.0,76.7,14.3,0.0,16.7,80.3,57.1,55.6,71.6,0.0,75.0,0.0,68.8,30.8,57.1,69.0,0.0
26
  modello-italia-9b,25.0,0.0,0.0,31.7,0.0,0.0,0.0,29.6,0.0,22.2,27.2,0.0,25.0,0.0,35.4,0.0,7.1,33.3,0.0
 
10
  claude-3-sonnet,100.0,100.0,100.0,96.7,85.7,100.0,50.0,88.7,57.1,66.7,87.6,0.0,75.0,50.0,81.2,53.8,64.3,78.6,12.5
11
  claude-3.5-sonnet:beta,100.0,100.0,100.0,100.0,85.7,100.0,50.0,97.2,100.0,88.9,95.1,100.0,100.0,50.0,93.8,69.2,50.0,92.9,62.5
12
  command-r-plus,90.6,0.0,100.0,88.3,14.3,0.0,50.0,80.3,57.1,66.7,85.2,0.0,100.0,50.0,79.2,46.2,57.1,61.9,12.5
13
+ dolphin-llama-3-70b,100.0,50.0,100.0,91.7,28.6,0.0,50.0,93.0,71.4,44.4,81.5,0.0,75.0,50.0,70.8,53.8,64.3,69.0,12.5
14
  gemini-flash-1.5,90.6,0.0,0.0,86.7,71.4,100.0,33.3,93.0,85.7,88.9,88.9,0.0,100.0,50.0,81.2,38.5,50.0,81.0,0.0
15
  gemini-pro,96.9,0.0,0.0,90.0,14.3,0.0,16.7,80.3,71.4,66.7,88.9,0.0,100.0,0.0,79.2,46.2,64.3,69.0,0.0
16
  gemini-pro-1.5,96.9,0.0,0.0,90.0,42.9,100.0,33.3,87.3,42.9,77.8,87.6,0.0,100.0,50.0,79.2,46.2,85.7,85.7,12.5
17
  gemma-2-9b-it,96.9,50.0,0.0,88.3,14.3,0.0,33.3,83.1,42.9,66.7,77.8,0.0,50.0,0.0,79.2,61.5,57.1,61.9,12.5
18
  gpt-3.5-turbo-0125,84.4,0.0,0.0,73.3,14.3,0.0,50.0,53.5,42.9,44.4,67.9,0.0,75.0,50.0,68.8,46.2,71.4,52.4,0.0
19
+ gpt-4-turbo,100.0,100.0,100.0,91.7,71.4,100.0,66.7,87.3,100.0,100.0,92.6,100.0,100.0,50.0,89.6,53.8,57.1,81.0,12.5
20
+ gpt-4o,81.2,100.0,100.0,80.0,71.4,100.0,50.0,81.7,85.7,88.9,84.0,0.0,100.0,50.0,79.2,38.5,64.3,61.9,12.5
21
+ gpt-4o-mini,93.8,0.0,0.0,93.3,42.9,0.0,50.0,88.7,71.4,66.7,82.7,0.0,75.0,50.0,81.2,69.2,50.0,71.4,12.5
22
  llama-3-70b-instruct,96.9,0.0,0.0,90.0,14.3,0.0,33.3,87.3,71.4,66.7,79.0,0.0,75.0,0.0,68.8,46.2,71.4,76.2,0.0
23
  llama-3-8b-instruct,65.6,0.0,0.0,66.7,0.0,0.0,16.7,57.8,28.6,11.1,42.0,0.0,0.0,0.0,54.2,15.4,28.6,57.1,0.0
24
+ llama-3.1-405b-instruct,100.0,100.0,100.0,96.7,71.4,0.0,83.3,91.6,100.0,77.8,91.4,100.0,75.0,0.0,87.5,61.5,50.0,81.0,12.5
25
  maestrale-chat-v0.4-beta,65.6,0.0,0.0,66.7,14.3,0.0,0.0,62.0,0.0,33.3,60.5,0.0,25.0,0.0,62.5,23.1,35.7,71.4,0.0
26
  mistral-7b-instruct:nitro,71.9,0.0,0.0,66.7,0.0,0.0,16.7,59.2,14.3,33.3,50.6,0.0,25.0,0.0,50.0,23.1,28.6,57.1,0.0
27
+ mistral-nemo,81.2,0.0,0.0,75.0,14.3,0.0,50.0,66.2,28.6,44.4,66.7,0.0,75.0,0.0,60.4,30.8,57.1,64.3,0.0
28
  mixtral-8x22b-instruct,81.2,0.0,0.0,73.3,42.9,0.0,50.0,78.9,42.9,44.4,74.1,0.0,100.0,0.0,72.9,38.5,57.1,69.0,12.5
29
  mixtral-8x7b-instruct,96.9,0.0,0.0,76.7,14.3,0.0,16.7,80.3,57.1,55.6,71.6,0.0,75.0,0.0,68.8,30.8,57.1,69.0,0.0
30
  modello-italia-9b,25.0,0.0,0.0,31.7,0.0,0.0,0.0,29.6,0.0,22.2,27.2,0.0,25.0,0.0,35.4,0.0,7.1,33.3,0.0