Upload 8 files
Browse files- ITOG.csv +26 -0
- Leaderboard.py +198 -0
- main.py +46 -0
- oblzn.csv +27 -0
- provokac.csv +27 -0
- setup.cfg +2 -0
- streamlit_app.py +23 -0
- vidvopr.csv +27 -0
ITOG.csv
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,NUM_Q_multich_EM,NUM_Q_multich_CC,NUM_Q_multich_PM,NUM_Q_onech_EM,NUM_Q_seq_EM,NUM_Q_seq_CC,NUM_Q_seq_PM,NUM_Q_map_EM,NUM_Q_map_CC,NUM_Q_map_PM,OPEN_Q_EM,OPEN_Q_F1,OPEN_Q_LR,LEADERBOARD
|
2 |
+
qwen2:72b-instruct-q4_0,55.59,69.97,71.88,85.66,62.01,62.12,62.12,31.4,34.38,36.87,8.93,43.03,57.1,52.39
|
3 |
+
GigaChat_Pro,63.29,66.23,75.44,76.33,52.06,52.06,52.06,11.59,11.59,15.88,40.89,47.64,64.51,48.43
|
4 |
+
yandexgpt_pro,47.28,49.35,75.05,84.88,41.75,41.75,43.3,5.58,5.58,7.3,15.46,50.76,63.7,40.9
|
5 |
+
GigaChat_Plus,51.2,55.56,68.03,69.27,34.02,34.02,34.02,6.01,6.01,8.15,32.99,40.2,58.88,38.33
|
6 |
+
GigaChat_Lite,51.2,55.56,68.03,69.27,34.02,34.02,34.02,6.01,6.01,8.15,32.65,40.37,58.92,38.32
|
7 |
+
gemma2:9b-instruct-q4_0,40.65,52.63,63.85,76.58,29.31,30.48,35.03,4.31,4.89,9.49,24.45,32.04,49.76,34.88
|
8 |
+
llama3:70b-instruct-q4_0,33.5,59.86,59.18,78.14,16.61,17.88,28.41,4.14,5.22,7.25,10.32,44.83,60.49,32.76
|
9 |
+
yandexgpt_lite,7.3,8.06,54.08,76.16,19.59,19.59,24.48,1.29,1.29,6.65,31.96,49.83,64.93,28.09
|
10 |
+
llama3.1:70b-instruct-q4_0,26.43,50.33,61.55,24.28,8.36,15.66,26.56,1.49,2.24,10.15,14.31,48.9,63.83,27.24
|
11 |
+
qwen2:7b-instruct-q4_0,10.39,13.02,54.71,66.22,19.58,20.53,21.27,1.91,2.32,7.5,3.74,11.44,32.79,20.42
|
12 |
+
ilyagusev/saiga_llama3,2.09,10.0,47.38,63.04,6.88,6.88,18.1,0.0,0.0,2.82,6.9,21.94,40.68,17.44
|
13 |
+
phi3:14b-medium-4k-instruct-q4_0,0.04,0.15,57.14,56.63,2.75,4.87,36.3,0.0,0.33,9.07,5.26,16.85,31.65,17.0
|
14 |
+
mistral:7b-instruct-v0.3-q4_0,0.0,0.02,44.55,46.65,0.0,0.0,26.98,0.0,0.0,3.94,2.47,11.6,26.0,12.48
|
15 |
+
solar:10.7b-instruct-v1-q4_0,0.0,0.26,48.67,46.3,0.0,0.21,18.25,0.0,0.08,3.23,3.42,11.43,23.18,11.93
|
16 |
+
random,4.04,7.6,32.7,24.51,14.07,14.07,14.07,0.83,0.83,3.23,,,,11.6
|
17 |
+
wavecut/vikhr:7b-instruct_0.4-Q4_1,0.0,0.02,36.12,34.32,0.95,2.75,6.24,0.0,0.0,0.58,10.01,19.1,34.3,11.11
|
18 |
+
llama3:8b-instruct-q4_0,0.32,1.0,47.4,26.49,0.53,1.48,4.07,0.0,0.08,2.28,0.06,17.11,35.27,10.47
|
19 |
+
mixtral:8x7b-instruct-v0.1-q4_0,0.0,11.81,51.05,8.67,0.11,16.93,19.47,0.0,1.66,10.48,0.63,5.19,9.85,10.45
|
20 |
+
yi:9b,4.09,11.51,34.87,25.9,8.99,10.26,12.7,0.0,0.25,1.41,0.63,3.46,15.14,9.94
|
21 |
+
gemma:7b-instruct-v1.1-q4_0,2.4,12.13,30.96,27.28,4.97,11.75,5.61,0.83,0.83,2.53,0.06,6.03,23.68,9.93
|
22 |
+
llama3.1:8b-instruct-q4_0,0.06,0.62,46.74,8.41,0.0,1.9,6.35,0.0,0.08,4.64,0.44,19.23,39.63,9.85
|
23 |
+
qwen:7b,0.0,0.02,30.09,37.09,8.99,10.05,11.96,0.0,0.0,1.33,0.0,2.44,17.1,9.16
|
24 |
+
gemma2:27b-instruct-q4_0,1.45,6.9,36.8,22.15,2.54,8.57,8.57,0.0,0.08,0.79,1.65,8.24,15.15,8.68
|
25 |
+
yi:6b,0.6,8.94,21.59,12.62,0.32,9.74,1.75,0.17,0.83,0.41,0.19,2.29,10.27,5.36
|
26 |
+
llama2:13b,0.0,0.04,28.24,0.0,0.0,0.11,2.75,0.0,0.0,0.12,0.06,5.17,13.06,3.81
|
Leaderboard.py
ADDED
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import streamlit as st
|
3 |
+
|
4 |
+
def load_data():
    """Read the four leaderboard CSV files and return them as DataFrames.

    The files are opened by bare file name, so they are resolved against
    the current working directory of the process.

    Returns:
        A 4-tuple ``(df_itog, df_oblzn, df_vidvopr, df_provokac)`` holding
        the contents of ``ITOG.csv``, ``oblzn.csv``, ``vidvopr.csv`` and
        ``provokac.csv``, in that order.
    """
    csv_names = ("ITOG.csv", "oblzn.csv", "vidvopr.csv", "provokac.csv")
    # Order of csv_names fixes the order of the returned tuple.
    return tuple(pd.read_csv(csv_name) for csv_name in csv_names)
|
11 |
+
|
12 |
+
def select_table(tables):
    """Show a table picker and return the table the user chose.

    Args:
        tables: Mapping from the option labels offered in the select box
            to the objects to return for them.

    Returns:
        ``tables[label]`` for the label currently selected; the first
        option ("ITOG") is preselected.
    """
    table_labels = ["ITOG", "Область знаний", "Вид вопроса", "Провокационность"]
    chosen_label = st.selectbox("Выберите таблицу для отображения:", table_labels, index=0)
    return tables[chosen_label]
|
20 |
+
|
21 |
+
def filter_itog_columns(df):
    """Let the user choose which columns of the ITOG table to display.

    Args:
        df: Table with a ``Model`` column plus any number of other columns.

    Returns:
        ``df`` restricted to ``Model`` followed by the columns picked in
        the multiselect; only ``Model`` when nothing is picked.
    """
    st.markdown("### Фильтрация по отдельным колонкам (кроме Model, которая всегда видна):")

    # "Model" is never offered as an option because it is always shown.
    optional_columns = [name for name in df.columns if name != "Model"]
    picked = st.multiselect(
        "Выберите колонки для отображения:",
        options=optional_columns,
    )

    return df[["Model", *picked]]
|
32 |
+
|
33 |
+
def filter_oblzn_columns(df):
    """Restrict the knowledge-area table to the areas the user selects.

    Args:
        df: Table with a ``Model`` column and columns whose names contain
            a knowledge-area code (``GEO``, ``HIST``, ``SOC``, ``POL``).

    Returns:
        ``df`` unchanged when nothing or the catch-all entry is selected;
        otherwise ``Model`` plus every column whose name contains the code
        of a selected area (substring match).
    """
    st.markdown("### Фильтрация по областям знаний:")

    area_labels = {
        "GEO": "География",
        "HIST": "История",
        "SOC": "Обществознание (социология)",
        "POL": "Политология и основы нац.безопасности",
        "ALL": "Все области",
    }

    chosen = st.multiselect("Выберите области знаний:", list(area_labels.values()))

    # No selection, or the catch-all entry: show the whole table.
    if not chosen or "Все области" in chosen:
        return df

    chosen_codes = [code for code, label in area_labels.items() if label in chosen]

    matching = []
    for column in df.columns:
        if any(code in column for code in chosen_codes):
            matching.append(column)

    return df[["Model", *matching]]
|
60 |
+
|
61 |
+
|
62 |
+
def filter_vidvopr_columns(df):
    """Restrict the question-type table to the types the user selects.

    Args:
        df: Table with a ``Model`` column and columns whose names contain
            a question-type code (``multich``, ``onech``, ``seq``, ``map``).

    Returns:
        ``df`` unchanged when nothing or the catch-all entry is selected;
        otherwise ``Model`` plus every column whose name contains the code
        of a selected type (substring match).
    """
    st.markdown("### Фильтрация по виду вопроса:")

    type_labels = {
        "multich": "multichoice (мультивыбор)",
        "onech": "one choice (вопрос с одним правильным ответом)",
        "seq": "sequence (последовательность)",
        "map": "mapping (сопоставление)",
        "ALL": "Все типы",
    }

    chosen = st.multiselect("Выберите типы вопросов:", list(type_labels.values()))

    # No selection, or the catch-all entry: show the whole table.
    if not chosen or "Все типы" in chosen:
        return df

    chosen_codes = [code for code, label in type_labels.items() if label in chosen]

    matching = []
    for column in df.columns:
        if any(code in column for code in chosen_codes):
            matching.append(column)

    return df[["Model", *matching]]
|
89 |
+
|
90 |
+
|
91 |
+
def filter_provokac_columns(df):
    """Restrict the provocativeness table to the levels the user selects.

    Args:
        df: Table with a ``Model`` column and columns whose names contain
            a level code (``PROVOC_1``, ``PROVOC_2``, ``PROVOC_3``).

    Returns:
        ``df`` unchanged when nothing or the catch-all entry is selected;
        otherwise ``Model`` plus every column whose name contains the code
        of a selected level (substring match).
    """
    st.markdown("### Фильтрация по уровню провокативности:")

    level_labels = {
        "PROVOC_1": "1ый уровень провокативности",
        "PROVOC_2": "2ой уровень провокативности",
        "PROVOC_3": "3ий уровень провокативности",
        "ALL": "Все уровни",
    }

    chosen = st.multiselect("Выберите уровни провокативности:", list(level_labels.values()))

    # No selection, or the catch-all entry: show the whole table.
    if not chosen or "Все уровни" in chosen:
        return df

    chosen_codes = [code for code, label in level_labels.items() if label in chosen]

    matching = []
    for column in df.columns:
        if any(code in column for code in chosen_codes):
            matching.append(column)

    return df[["Model", *matching]]
|
117 |
+
|
118 |
+
|
119 |
+
|
120 |
+
|
121 |
+
# Page body: pick a table, optionally filter its columns, show it and
# offer it for download as CSV.
st.title("Leaderboard")

df_itog, df_oblzn, df_vidvopr, df_provokac = load_data()

tables = {
    "ITOG": df_itog,
    "Область знаний": df_oblzn,
    "Вид вопроса": df_vidvopr,
    "Провокационность": df_provokac,
}

df_selected = select_table(tables)

if st.checkbox("Добавить фильтры"):
    # The chosen table is recognised by object identity: select_table
    # returns one of the very objects stored in `tables`.
    column_filters = (
        (df_itog, filter_itog_columns),
        (df_oblzn, filter_oblzn_columns),
        (df_vidvopr, filter_vidvopr_columns),
        (df_provokac, filter_provokac_columns),
    )
    for frame, apply_filter in column_filters:
        if df_selected is frame:
            df_selected = apply_filter(df_selected)
            break

st.dataframe(df_selected, use_container_width=True)

# The download contains exactly what is displayed (after filtering).
csv_bytes = df_selected.to_csv(index=False).encode('utf-8')
st.download_button(
    "Скачать таблицу в формате CSV",
    csv_bytes,
    "filtered_table.csv",
    "text/csv",
)
|
152 |
+
# Horizontal rule separating the table controls from the static reference text.
st.write('---')

# Static legend explaining how the column names of the tables are composed.
COLUMN_NAMING_GUIDE = """
### Описание структуры названий колонок:

#### Части, отвечающие за область знаний:
- **GEO** - география
- **HIST** - история
- **SOC** - обществознание (социология)
- **POL** - политология и основы национальной безопасности

#### Части, отвечающие за вид вопроса:
- **NUM_Q** или **_num_q_** - вопрос с числовым ответом, с делением на:
- **_multich_** - "multichoice", мультивыбор
- **_onech_** - "one choice", вопрос с одним правильным ответом
- **_seq_** - "sequence", последовательность
- **_map_** - "mapping", соответствие
- **OPEN_Q** или **_open_q_** - открытый вопрос, подразумевающий свободный письменный ответ

#### Части, отвечающие за уровень провокативности:
- **PROVOC_1** - первый уровень провокативности
- **PROVOC_2** - второй уровень провокативности
- **PROVOC_3** - третий уровень провокативности

#### Части, указывающие метрику:
- **_EM** - "exact match", ответ модели точно совпадает с правильным
- **_CC** - "contains check", ответ модели содержит правильный ответ
- **_PM** - "partially match", ответ модели частично верный
- **_F1** - метрика f1-score
- **_LR** - "levenshtein ratio", мера схожести ответа модели с эталонным, на основе расстояния Левенштейна

---

### Структура таблиц:
В таблице представлены три обобщенные вкладки по каждому срезу:
- **Область знаний**
- **Вид вопроса**
- **Уровень провокативности**

Также присутствует таблица **ИТОГ**, представляющая итоговый рейтинг. Это таблица по виду вопроса, но без колонок с метриками для мультивыбора с одним правильным ответом.
"""

st.write(COLUMN_NAMING_GUIDE)

# Footer: a heading followed by the project links, one st.write call each.
for footer_markdown in (
    "### `Ссылки/контакты`",
    "[GitHub](https://github.com/ikanam-ai/slava)",
    "[Dataset](https://huggingface.co/datasets/RANEPA-ai/SLAVA-OpenData-2800-v1)",
):
    st.write(footer_markdown)
|
198 |
+
|
main.py
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
|
3 |
+
# Landing page header: title, subtitle and a centered, clickable logo.
st.title("SLAVA")
st.write('### Бенчмарк социально-политического ландшафта и ценностного анализа')

# The HTML lines are deliberately left unindented: Markdown would treat
# lines indented by four or more spaces as a code block.
# The image's alt text used to be the placeholder "Foo"; it now names the
# image so screen readers and broken-image fallbacks show something useful.
html_code = '''
<div style="text-align: center;">
<a href="https://raw.githubusercontent.com/Y1OV/project_lab/main/data/ranepa.png">
<img src="https://raw.githubusercontent.com/Y1OV/project_lab/main/data/ranepa.png" alt="RANEPA" style="width: 50%; height: auto;">
</a>
</div>
'''

# unsafe_allow_html is required for the raw <div>/<img> markup to render;
# the markup is a fixed literal, not user input.
st.markdown(html_code, unsafe_allow_html=True)
|
15 |
+
|
16 |
+
|
17 |
+
# Static description of the benchmark shown on the landing page.
ABOUT_MARKDOWN = """

С 2024 года был разработан бенчмарк SLAVA, содержащий около 14 тысяч вопросов для российского домена из таких областей, как история, политология, социология, политическая география и основы национальной безопасности. Этот бенчмарк оценивает способности больших языковых моделей (LLM) справляться с чувствительными темами, важными для российского информационного пространства.

#### Основные задачи:
- Проверка фактических знаний LLM в российских доменах.
- Оценка чувствительности (провокативности) вопросов.
- Создание комплексной системы оценки на основе точности ответов.

#### Структура:
Вопросы делятся на следующие типы:
- Мультивыбор с одним или несколькими правильными ответами.
- Последовательности и соответствия.
- Открытые ответы.

#### Провокативность вопросов:
- **1 балл**: Низкая чувствительность — общепризнанные факты.
- **2 балла**: Средняя чувствительность — спорные темы.
- **3 балла**: Высокая чувствительность — политические и культурные вопросы, вызывающие конфликты.

#### Результаты:
Были протестированы 24 LLM, поддерживающие русский язык. Модели от компаний **GigaChat**, **YandexGPT** и **qwen2** показали наивысшую точность и способность справляться с сложными, провокативными вопросами. В то время как некоторые модели, такие как **llama2** и **mixtral**, продемонстрировали более слабые результаты.

Этот бенчмарк подчеркивает необходимость дальнейших исследований в области надежности LLM, особенно в контексте социально-политических тем, значимых для России.
"""

st.write(ABOUT_MARKDOWN)

# Footer: a heading followed by the project links, one st.write call each.
for footer_markdown in (
    "### `Ссылки/контакты`",
    "[GitHub](https://github.com/ikanam-ai/slava)",
    "[Dataset](https://huggingface.co/datasets/RANEPA-ai/SLAVA-OpenData-2800-v1)",
):
    st.write(footer_markdown)
|
oblzn.csv
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,GEO_num_q_EM,GEO_num_q_CC,GEO_num_q_PM,GEO_open_q_EM,GEO_open_q_F1,GEO_open_q_LR,GEO_avg,HIST_num_q_EM,HIST_num_q_CC,HIST_num_q_PM,HIST_open_q_EM,HIST_open_q_F1,HIST_open_q_LR,HIST_avg,SOC_num_q_EM,SOC_num_q_CC,SOC_num_q_PM,SOC_open_q_EM,SOC_open_q_F1,SOC_open_q_LR,SOC_avg,POL_num_q_EM,POL_num_q_CC,POL_num_q_PM,POL_avg,DOMAIN_avg
|
2 |
+
gemma2:27b-instruct-q4_0,15.5348837209302,36.2325581395349,34.7209302325581,1.36518771331058,6.08108161580778,12.3447098976109,17.71322521995874,9.0968443960827,29.5756256800871,26.822633297062,1.97309417040359,8.93722159294191,16.0089686098655,15.402397957740467,8.88761467889908,29.1284403669725,42.6032110091743,0.0,7.37259511525245,14.374269005848,17.061021696024387,23.7947122861586,59.0979782270607,46.8895800933126,43.26075686884396,20.51629237375588
|
3 |
+
gemma2:9b-instruct-q4_0,58.9302325581395,62.3720930232558,64.4883720930233,30.7167235494881,38.6347348301442,54.7849829351536,51.65452316486741,39.8694232861806,43.4385201305767,48.139281828074,17.6681614349776,25.4223580804077,44.1973094170404,36.45584236287616,59.6139143730887,68.4633027522936,72.887996941896,57.8947368421053,63.8700918964077,77.4327485380117,66.6937985573005,68.2737169517885,71.3841368584759,70.4510108864697,70.0362882322447,54.234945200333264
|
4 |
+
gemma:7b-instruct-v1.1-q4_0,20.8372093023256,39.2093023255814,42.046511627907,0.0,3.94542088889186,19.0238907849829,20.843722488281458,9.88030467899891,19.8258977149075,23.8737758433079,0.0896860986547085,4.38105088097796,23.3417040358744,13.565403208786895,12.6911314984709,32.0336391437309,43.243501529052,0.0,20.3867021011607,33.8479532163743,23.70048791479813,27.6827371695179,54.7433903576983,55.6765163297045,46.03421461897357,23.179063120386648
|
5 |
+
ilyagusev/saiga_llama3,37.8139534883721,40.8372093023256,49.3023255813954,15.358361774744,35.1715192074092,52.7337883959044,38.53619295835845,26.2241566920566,27.7040261153428,37.1490750816104,2.51121076233184,15.1855722725022,34.7470852017937,23.920187687606255,26.5863914373089,32.2438837920489,56.5271406727829,21.0526315789474,43.3031923570162,58.7485380116959,39.74362964163337,60.8087091757387,63.2970451010886,63.9191290824261,62.67496111975114,38.15356881356391
|
6 |
+
llama2:13b,0.0,31.6279069767442,4.69767441860465,0.341296928327645,9.40460676527417,20.5904436860068,11.11032146249291,0.0,19.0424374319913,5.61479869423286,0.0,4.03632452572392,11.5237668161435,6.702887911348597,0.0,9.53746177370031,20.3172782874618,0.0,5.28747736387109,10.2046783625731,7.55781596460105,0.0,49.3001555209953,5.05443234836703,18.11819595645411,9.837178090477034
|
7 |
+
llama3.1:70b-instruct-q4_0,17.2558139534884,60.6511627906977,65.3255813953489,5.11945392491468,59.2786046255473,67.2184300341297,45.80817445402112,16.7791077257889,46.8335146898803,51.2622415669206,16.1434977578475,43.4389491770808,61.27533632287,39.288774540064686,28.3448012232416,64.7362385321101,71.1678134556575,18.1286549707602,66.7182357754662,74.672514619883,53.96137642951976,17.8849144634526,72.6283048211509,71.850699844479,54.12130637636083,47.46256531765313
|
8 |
+
llama3.1:8b-instruct-q4_0,4.6046511627907,31.8139534883721,32.1860465116279,0.0,32.1790002422677,49.0546075085324,24.973043152265134,3.13384113166485,24.3960826985854,30.4678998911861,0.62780269058296,14.4408672828017,36.1085201793722,18.195835645698867,3.26834862385321,24.8853211009174,51.5768348623853,0.0,28.2554385814658,46.4093567251462,25.732549982294653,11.5085536547434,57.0762052877138,51.1664074650078,39.917055469155,25.388559004238896
|
9 |
+
llama3:70b-instruct-q4_0,57.3953488372093,63.4883720930233,63.9069767441861,5.80204778156997,60.7482833489855,68.3788395904437,53.28664473256964,40.7181719260065,46.5070729053319,49.2491838955386,10.6726457399103,40.3680070949795,58.322869955157,40.9729919194873,52.3891437308869,70.7759938837921,67.8516819571865,15.7894736842105,46.6919490734894,61.1052631578947,52.43391758124334,68.895800933126,70.6065318818041,71.3841368584759,70.29548989113533,51.95465690824798
|
10 |
+
llama3:8b-instruct-q4_0,15.8139534883721,35.5348837209302,45.7674418604651,0.0,31.6684961159729,48.0511945392492,29.472661620831584,10.5114254624592,25.5495103373232,33.8737758433079,0.0896860986547085,12.4341412479717,31.7417040358744,19.033373837598518,11.2767584097859,27.1215596330275,57.7025993883792,0.0,22.6456467258,36.4152046783626,25.860294805892533,20.99533437014,62.6749611197512,62.5194401244168,48.729911871436,28.20893891429732
|
11 |
+
mistral:7b-instruct-v0.3-q4_0,25.4883720930233,29.953488372093,44.4883720930233,6.8259385665529,14.7143576600266,29.5290102389079,25.166589837271164,18.759521218716,23.8302502720348,34.7007616974973,0.717488789237668,8.34350531701643,24.0941704035874,18.407616283014935,18.4059633027523,23.7576452599388,52.3318042813456,6.4327485380117,27.5101026274299,32.4035087719298,26.806962130234684,49.4556765163297,57.3872472783826,58.0093312597201,54.950751684810804,27.959012597978905
|
12 |
+
mixtral:8x7b-instruct-v0.1-q4_0,5.53488372093023,48.7906976744186,52.3953488372093,0.341296928327645,3.94842331452323,6.73037542662116,19.62350431700503,3.5038084874864,37.9542981501632,42.3612622415669,0.717488789237668,4.68070314515325,10.1865470852018,16.5673513164682,3.45948012232416,37.5955657492355,59.7094801223242,0.584795321637427,10.6595347898574,12.9590643274854,20.827986738810683,6.22083981337481,72.1617418351477,61.1975116640747,46.52669777086573,22.937768930776222
|
13 |
+
phi3:14b-medium-4k-instruct-q4_0,34.2325581395349,43.8139534883721,59.5813953488372,12.9692832764505,30.2658027774082,44.8976109215017,37.62676732535076,24.069640914037,33.6235038084875,46.9314472252448,2.69058295964126,12.3929007725114,27.7345291479821,24.573767471317343,21.5596330275229,30.2752293577982,65.7874617737003,8.7719298245614,22.9512670175593,34.4678362573099,30.63555954307533,50.5443234836703,68.7402799377916,66.5629860031104,61.94919647485742,35.37448359347776
|
14 |
+
qwen2:72b-instruct-q4_0,80.5581395348837,83.953488372093,83.7441860465116,1.70648464163823,60.9664006973775,66.8259385665529,62.95910630984282,60.1088139281828,65.310119695321,65.4733405875952,10.6726457399103,37.226795170081,53.6,48.73195252018172,68.5015290519878,79.434250764526,78.3925840978593,9.94152046783626,50.1081604264453,63.2748538011696,58.27548310163737,68.5847589424572,74.805598755832,71.2286158631415,71.53965785381023,58.781820245304864
|
15 |
+
qwen2:7b-instruct-q4_0,45.7209302325581,46.9302325581395,54.8837209302326,6.48464163822526,12.4030115766671,33.8430034129693,33.37759005813198,27.8346028291621,30.1196953210011,37.2143634385201,2.0627802690583,7.87949319222932,29.6869955156951,22.466321760944336,35.4548929663609,37.2133027522936,65.5485474006116,9.94152046783626,33.0038691193979,51.2105263157895,38.72877650371496,62.0528771384137,63.9191290824261,64.0746500777605,63.34888543286676,36.0706088683499
|
16 |
+
qwen:7b,20.093023255814,20.8837209302326,26.5813953488372,0.0,1.97488246634156,16.8600682593857,14.39884837676851,14.6681175190424,15.4080522306855,21.7627856365615,0.0,1.34680160823358,15.7542600896861,11.49000284736818,16.532874617737,16.5519877675841,35.3211009174312,0.0,10.3653291238234,26.2690058479532,17.50671637908815,39.50233281493,39.9688958009331,40.9797822706065,40.15033696215653,18.134496024086605
|
17 |
+
solar:10.7b-instruct-v1-q4_0,23.7674418604651,34.4651162790698,46.0232558139535,1.70648464163823,8.4900835179791,17.1194539249147,21.92863933967007,16.474428726877,26.89880304679,37.0729053318825,3.04932735426009,10.4661683783297,22.9569506726457,19.48643058513083,21.6360856269113,25.9556574923547,57.559250764526,8.7719298245614,22.7690138336274,35.0643274853801,28.626044171226813,42.1461897356143,60.3421461897356,60.1866251944012,54.22498703991704,27.7581736045675
|
18 |
+
wavecut/vikhr:7b-instruct_0.4-Q4_1,17.8139534883721,23.2093023255814,28.093023255814,12.9692832764505,22.0442276930355,35.7849829351536,23.31912882906785,14.2110990206746,17.5843307943417,23.4820457018498,7.98206278026906,16.8323787548876,32.647533632287,18.78990844738496,14.2201834862385,17.6796636085627,40.9690366972477,18.1286549707602,28.8398070736436,42.5029239766082,27.05671163551015,32.5038880248834,42.7682737169518,41.9906687402799,39.08761016070503,25.34558685494728
|
19 |
+
yi:6b,7.06976744186047,26.4651162790698,17.4651162790698,0.0,1.72191372616652,8.90102389078498,10.270489602825263,3.65614798694233,22.6550598476605,13.7540805223069,0.0,2.08897265807147,10.7318385650224,8.814349930000601,6.19266055045872,23.9296636085627,26.3379204892966,1.75438596491228,4.61048579154667,9.56140350877193,12.064419985591483,18.1959564541213,36.8584758942457,30.0933125972006,28.382581648522535,12.954442955051032
|
20 |
+
yi:9b,16.046511627907,27.1162790697674,30.5813953488372,0.0,3.15355705067136,12.098976109215,14.832786534399661,9.07508161044614,19.7606093579978,22.829162132753,0.179372197309417,2.8367657031748,15.2224215246637,11.65056875439081,15.1567278287462,29.6062691131498,42.4120795107034,4.67836257309942,8.03562572777585,19.7836257309942,19.945448414078147,33.5925349922239,50.0777604976672,48.2115085536547,43.9606013478486,19.5454583933694
|
21 |
+
GigaChat_Lite,61.1872146118721,62.5570776255708,63.1278538812785,40.9090909090909,51.7079785604165,65.8030303030303,57.54870764854318,40.8845738942826,43.042071197411,45.5231930960086,26.2032085561497,33.7021080330654,53.9144385026738,40.54493221326518,63.4799235181644,66.5391969407266,73.565965583174,50.0,53.5087719298246,71.6052631578947,63.11652018829738,52.8985507246377,55.7971014492754,54.7101449275362,54.46859903381644,53.841274162003984
|
22 |
+
GigaChat_Plus,61.1872146118721,62.5570776255708,63.1278538812785,40.9090909090909,51.7298973967309,65.7424242424243,57.542259777827915,40.8845738942826,43.042071197411,45.5231930960086,26.7379679144385,33.4191279951085,53.8449197860963,40.57530898055759,63.4799235181644,66.5391969407266,73.565965583174,50.0,53.5087719298246,71.7631578947368,63.142835977771064,52.8985507246377,55.7971014492754,54.7101449275362,54.46859903381644,53.85562978658994
|
23 |
+
GigaChat_Pro,71.2328767123288,72.1461187214612,72.6027397260274,53.030303030303,62.0469396605864,73.8484848484849,67.48457711653195,49.8381877022654,51.024811218986,54.1531823085221,33.6898395721925,40.7455479303299,59.379679144385,48.138541312780156,74.3785850860421,76.7686424474187,81.357552581262,55.2631578947369,56.578947368421,73.5,69.64114756298011,55.072463768116,57.9710144927536,56.8840579710145,56.642512077294704,61.02443486598271
|
24 |
+
yandexgpt_lite,47.2602739726027,48.1735159817352,55.5936073059361,42.4242424242424,55.6932275713253,67.6363636363636,52.79687181536755,39.697950377562,40.453074433657,50.9708737864078,28.3422459893048,42.8268094822329,61.0160427807487,43.884499474985525,35.3728489483748,36.2332695984704,65.2485659655832,31.5789473684211,74.1228070175439,79.5263157894737,53.68045911464452,57.9710144927536,58.695652173913,60.8695652173913,59.17874396135263,51.41462925304969
|
25 |
+
yandexgpt_pro,63.9269406392694,64.3835616438356,68.7214611872146,22.7272727272727,44.3545700753268,56.3939393939394,53.41795761114307,56.0949298813376,56.2028047464941,62.2977346278317,7.48663101604278,47.2091634117127,61.855614973262,48.52447977611348,63.6711281070746,65.2963671128107,80.9273422562142,42.1052631578947,79.3859649122807,85.4473684210526,69.47223899455459,60.8695652173913,61.5942028985507,63.0434782608696,61.83574879227053,57.809300222270394
|
26 |
+
random,16.8372093023256,17.5813953488372,22.3720930232558,,,,18.930232558139533,13.2535364526659,13.9281828073993,20.4134929270947,,,,15.8650707290533,11.6207951070336,13.8188073394495,29.0997706422018,,,,18.17979102956163,26.1275272161742,26.905132192846,28.149300155521,27.0606531881804,20.008936876233715
|
27 |
+
Среднее значение,33.0457343102899,44.58990336625253,47.67298715089731,12.571103526734921,29.2636258910368,41.42481556176096,34.58490265246131,23.569131589728055,32.9484170331947,37.23665961195567,8.346142778350634,19.61007223781356,35.41221693324701,26.081951898966594,29.447253553657156,40.244822273288065,56.480499470825215,,,,,40.339261162575795,57.78393851285865,54.392521468659126,50.838573714697866,35.27007299907976
|
provokac.csv
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,PROVOC_1_num_q_EM,PROVOC_1_num_q_CC,PROVOC_1_num_q_PM,PROVOC_1_open_q_EM,PROVOC_1_open_q_F1,PROVOC_1_open_q_LR,PROVOC_1_avg,PROVOC_2_num_q_EM,PROVOC_2_num_q_CC,PROVOC_2_num_q_PM,PROVOC_2_open_q_EM,PROVOC_2_open_q_F1,PROVOC_2_open_q_LR,PROVOC_2_avg,PROVOC_3_num_q_EM,PROVOC_3_num_q_CC,PROVOC_3_num_q_PM,PROVOC_3_open_q_EM,PROVOC_3_open_q_F1,PROVOC_3_open_q_LR,PROVOC_3_avg,PROVOC_avg
|
2 |
+
gemma2:27b-instruct-q4_0,13.0885602583716,35.0501444841068,36.6054733979262,2.46710526315789,11.3438276640427,18.7450657894737,19.550029476179816,10.5151915455746,34.6895640686922,36.1426684280053,1.7353579175705,7.23438464320977,14.3036876355748,17.436809039771195,6.84281842818428,22.5948509485095,33.4688346883469,0.588235294117647,5.44191022355375,11.6352941176471,13.428657283393198,16.805165266448068
|
3 |
+
gemma2:9b-instruct-q4_0,55.4139044705082,60.4283528811831,64.5843957164712,27.3026315789474,34.9929958961886,52.3865131578947,49.184798950198875,55.2708058124174,60.105680317041,63.0779392338177,27.3318872017354,35.245661577345,52.6681127982647,48.950014490103534,44.2073170731707,52.4390243902439,56.8428184281843,18.4313725490196,25.6150995826153,44.0039215686275,40.25659226531022,46.13046856853753
|
4 |
+
gemma:7b-instruct-v1.1-q4_0,16.1652218255992,32.3814380418154,37.3363929967704,0.0,6.12514628005334,23.4473684210526,19.24259459421516,14.0290620871863,31.7040951122853,37.8071334214003,0.216919739696312,7.23992067543114,24.943600867679,19.323455317279723,8.87533875338753,22.9336043360434,33.6720867208672,0.0,4.83394851448634,22.8098039215686,15.520797041058843,18.028948984184574
|
5 |
+
ilyagusev/saiga_llama3,32.87438381778,35.5430902600714,49.7790243073262,6.41447368421053,20.8053473204584,38.7894736842105,30.700965512342844,33.8441215323646,36.4861294583884,49.4583883751651,9.54446854663774,25.5266537708331,44.7830802603037,33.27380699061544,19.8170731707317,26.1856368563686,45.2235772357724,5.09803921568627,20.0481873440546,39.2372549019608,25.934961454095728,29.969911319018006
|
6 |
+
llama2:13b,0.0,24.1543430222675,9.55294917559069,0.0,2.83808861437468,8.72203947368421,7.544570047652848,0.0,15.9313077939234,9.85468956406869,0.0,5.93808858848223,14.4577006507592,7.696964432872253,0.0,11.7547425474255,17.5982384823848,0.196078431372549,7.24939909960657,16.978431372549,8.962814988889738,8.068116489804948
|
7 |
+
llama3.1:70b-instruct-q4_0,22.0975692673806,59.9864014958355,65.0263471018188,18.0921052631579,49.4000831972529,65.4835526315789,46.681009826170765,21.558784676354,59.9471598414795,64.2536327608983,12.5813449023861,52.9417050541935,65.1952277657267,46.079642500173016,21.1382113821138,51.219512195122,57.1815718157182,11.3725490196078,44.6480415518083,60.621568627451,41.03024243197018,44.59696491943799
|
8 |
+
llama3.1:8b-instruct-q4_0,4.11354750977393,29.3047764745878,38.2457929627741,0.657894736842105,16.8408300317768,39.21875,21.39693195262579,4.38573315719947,31.7569352708058,41.889035667107,0.216919739696312,23.6358923040094,41.9913232104121,23.979306558205014,2.710027100271,18.5636856368564,43.4959349593496,0.392156862745098,18.0908295632752,37.9745098039216,20.20452398773648,21.860254166189094
|
9 |
+
llama3:70b-instruct-q4_0,53.7650858405575,62.0941696413395,63.9129695733469,13.1578947368421,44.7874198858453,60.8569078947368,49.762407928778025,51.7569352708058,61.9286657859974,60.5019815059445,7.15835140997831,47.0840736221898,61.0976138828633,48.25460357962985,39.5325203252033,56.3008130081301,54.0650406504065,9.80392156862745,42.8576254770552,59.5039215686275,43.677307099675005,47.23143953602762
|
10 |
+
llama3:8b-instruct-q4_0,14.4144144144144,32.296447390787,47.5437701852796,0.164473684210526,14.8289590829379,32.9309210526316,23.696497635043503,13.5799207397622,33.8705416116248,49.0885072655218,0.0,21.1064741325482,39.5336225596529,26.196511051518314,6.30081300813008,19.579945799458,44.2581300813008,0.0,16.2142171491163,34.2176470588235,20.095125516138115,23.329378067566648
|
11 |
+
mistral:7b-instruct-v0.3-q4_0,23.6613972463029,28.0469148393677,45.3340132585416,1.97368421052632,11.6339867326757,25.5740131578947,22.704001574218154,24.4121532364597,31.8361955085865,46.4861294583884,2.16919739696312,11.9370897080079,26.0325379609544,23.81221721156,12.7032520325203,16.8021680216802,41.8529810298103,3.33333333333333,11.2589813397905,26.4862745098039,18.739498377823086,21.751905721200412
|
12 |
+
mixtral:8x7b-instruct-v0.1-q4_0,5.16743158252592,43.8551759306476,52.0652728199898,1.15131578947368,6.41044460793093,11.1595394736842,19.968196700708692,3.93659180977543,44.8348745046235,54.8612945838837,0.0,4.81190475632898,9.01301518438178,19.57628013983223,1.6260162601626,32.079945799458,49.1531165311653,0.588235294117647,4.08394916230811,9.03137254901961,16.09377259937188,18.5460831466376
|
13 |
+
phi3:14b-medium-4k-instruct-q4_0,29.5427502974673,38.6367499575047,60.2583715791263,5.09868421052632,17.0255729274497,30.3042763157895,30.14440088131064,28.1638044914135,41.5852047556143,58.5204755614267,6.941431670282,19.6178110192585,35.4880694143167,31.719466152051954,16.6327913279133,22.5609756097561,52.4220867208672,3.92156862745098,14.1475921869919,29.7803921568627,23.24423443830703,28.36936715722321
|
14 |
+
qwen2:72b-instruct-q4_0,72.9389767125616,78.7013428522863,78.2168961414244,10.8552631578947,44.3525005243567,58.4539473684211,57.25315445949081,67.6882430647292,74.7424042272127,73.0515191545575,5.85683297180043,45.8644120654459,58.1301518438178,54.22226055459392,56.4363143631436,67.2086720867209,67.8184281842818,9.41176470588235,38.8817905312052,54.5607843137255,49.05295903082655,53.509458014970434
|
15 |
+
qwen2:7b-instruct-q4_0,40.3705592384838,41.9683834778174,54.3770185279619,3.78289473684211,12.2433821762303,34.2121710526316,31.159068201661185,38.1770145310436,40.105680317041,53.553500660502,4.55531453362256,12.2532095018336,33.5726681127983,30.369564609473503,23.5772357723577,25.8807588075881,50.9993224932249,2.94117647058823,9.74641760239777,30.3843137254902,23.921537478607817,28.483390096580838
|
16 |
+
qwen:7b,19.496855345912,19.9898011218766,29.7977222505524,0.0,2.53149921250892,16.6134868421053,14.738227462159204,20.2113606340819,20.5284015852048,29.8546895640687,0.0,2.9492578357003,18.6898047722343,15.372252398548332,10.6029810298103,11.0772357723577,27.10027100271,0.0,1.87065931868442,16.2372549019608,11.148067004253868,13.752848954987138
|
17 |
+
solar:10.7b-instruct-v1-q4_0,22.7265000849906,31.7525072242053,48.6996430392657,2.96052631578947,12.611043103768,25.0707236842105,23.97015724203826,26.1558784676354,34.7159841479524,50.1453104359313,4.12147505422994,11.1460215172048,23.6637744034707,24.991407337737424,11.6531165311653,18.3265582655827,45.0033875338753,3.33333333333333,10.2843989247715,20.5039215686274,18.184119359559254,22.38189464644498
|
18 |
+
wavecut/vikhr:7b-instruct_0.4-Q4_1,16.9471358150603,20.9246982831888,31.2850586435492,9.70394736842105,18.138570599769,32.7384868421053,21.62298292534894,18.6261558784676,23.7252311756935,34.1479524438573,9.76138828633406,20.2412450317039,35.527114967462,23.671514630586397,9.72222222222222,12.8048780487805,32.6388888888889,10.5882352941176,19.2141238256413,35.043137254902,20.001914255758752,21.76547060389803
|
19 |
+
yi:6b,5.9323474417814,25.922148563658,19.8368179500255,0.164473684210526,1.66768509191218,8.45559210526316,10.329844139475128,7.84676354029062,27.2655217965654,21.664464993395,0.433839479392625,2.71018597592006,11.409978308026,11.888459015598285,3.89566395663957,18.360433604336,20.0542005420054,0.0,2.66425497076314,11.3882352941176,9.393798061310283,10.5373670721279
|
20 |
+
yi:9b,15.4682984871664,26.6020737718851,33.2313445520993,0.986842105263158,3.19675483677559,14.8717105263158,15.726170713250893,14.7952443857332,29.4848084544254,34.6235138705416,0.433839479392625,3.52530523130186,14.0694143167028,16.155354289682915,10.1964769647696,23.0691056910569,32.8590785907859,0.392156862745098,3.71036337636581,16.4176470588235,14.440804757424466,15.440776586786093
|
21 |
+
GigaChat_Lite,60.2159468438538,62.4584717607973,65.0332225913621,33.0357142857143,40.9633292643836,60.4375,53.69069745768518,54.6666666666667,56.6666666666667,60.1333333333333,34.5238095238095,42.2388149962259,62.6666666666667,51.815992975561464,41.8487394957983,45.2100840336134,52.0168067226891,30.5263157894737,38.0252714022117,53.821052631579,43.57471167922753,49.693800704158065
|
22 |
+
GigaChat_Plus,60.2159468438538,62.4584717607973,65.0332225913621,32.1428571428571,40.6589114935278,58.9017857142857,53.235199257780636,54.6666666666667,56.6666666666667,60.1333333333333,34.5238095238095,41.8542728148981,62.8809523809524,51.787616897721115,41.8487394957983,45.2100840336134,52.0168067226891,32.6315789473684,38.1823841355402,55.3263157894737,44.202651520747175,49.74182255874965
|
23 |
+
GigaChat_Pro,69.2691029900332,70.1827242524917,72.7159468438538,43.75,51.2258923889299,67.6160714285714,62.45995631731333,65.0666666666667,67.0666666666667,69.8,41.6666666666667,48.8871336719648,67.2976190476191,59.964125453263996,51.4285714285714,54.453781512605,58.9075630252101,36.8421052631579,42.3231189954232,58.3684210526316,50.3872602129332,57.60378066117019
|
24 |
+
yandexgpt_lite,44.9335548172757,45.8471760797342,60.6727574750831,30.3571428571429,54.3516030514378,67.375,50.58953904677895,45.8666666666667,46.2666666666667,60.4,33.3333333333333,50.0530790747819,66.6309523809524,50.4251163537335,23.5294117647059,24.7058823529412,50.2521008403361,32.6315789473684,44.307314479042,60.5578947368421,39.33069718687261,46.78178419579502
|
25 |
+
yandexgpt_pro,69.3521594684385,70.0996677740864,76.7441860465116,8.92857142857143,54.2402693529171,66.5,57.64414234508751,60.2666666666667,60.6666666666667,70.1333333333333,27.3809523809524,53.1044518132574,63.5595238095238,55.851932445066716,44.2016806722689,45.7142857142857,60.8403361344538,12.6315789473684,44.5947129451386,60.5157894736842,44.749730647866606,52.748601812673606
|
26 |
+
random,15.4513003569607,16.8111507734149,25.5141934387217,,,,19.258881523032432,14.3196829590489,15.4821664464993,23.3157199471598,,,,17.705856450902665,10.0271002710027,11.4837398373984,25.0338753387534,,,,15.5149051490515,17.49321437432887
|
27 |
+
Среднее значение,31.34491803908213,42.219864884630134,49.256112126669386,10.547854010025063,23.883922639062664,38.28603735902256,32.490177046821906,29.992271246147073,41.522395420519615,48.51594187582562,11.020297489928728,24.8811270575865,39.48359221671315,32.420821235043306,20.774177313201694,30.260816196397265,44.191019334563094,9.40230478156174,21.178941320910287,36.05854833161335,26.843667353128367,30.584888544997867
|
setup.cfg
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
[flake8]
|
2 |
+
max-line-length = 88
|
streamlit_app.py
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
|
3 |
+
# Entry point of the multipage app: declare the pages, register them with
# the navigation, set the page title, then run the page object returned by
# st.navigation.

# Each st.Page wraps one script file and is given a display title.
# (An optional `icon=` argument was left disabled for both pages.)
main_page = st.Page("main.py", title="Главная")
Leaderboard_page = st.Page("Leaderboard.py", title="Лидерборд")

# Pages grouped under a single "Main" section; further sections
# (e.g. "Reports") can be added as extra keys.
nav_sections = {
    "Main": [main_page, Leaderboard_page],
}
pg = st.navigation(nav_sections)

# Only the title is configured; a `page_icon=` argument was left disabled.
st.set_page_config(page_title="Leaderboard")

pg.run()
|
20 |
+
|
21 |
+
# logo = '/Users/y1ov/Work/streamlits/senej/files/beta-1.png'
|
22 |
+
|
23 |
+
# st.logo(logo, icon_image=logo)
|
vidvopr.csv
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,NUM_Q_multich_EM,NUM_Q_multich_CC,NUM_Q_multich_PM,NUM_Q_onech_EM,NUM_Q_onech_CC,NUM_Q_onech_PM,NUM_Q_seq_EM,NUM_Q_seq_CC,NUM_Q_seq_PM,NUM_Q_map_EM,NUM_Q_map_CC,NUM_Q_map_PM,OPEN_Q_EM,OPEN_Q_F1,OPEN_Q_LR,Q_TYPE_avg
|
2 |
+
gemma2:27b-instruct-q4_0,1.44711640774633,6.89508406043839,36.7950627793147,22.1528861154446,63.0265210608424,46.628531807939,2.53968253968254,8.57142857142857,8.57142857142857,0.0,0.0828500414250207,0.787075393537697,1.64661177960735,8.23779148465489,15.1519949335022,14.835604369799482
|
3 |
+
gemma2:9b-instruct-q4_0,40.6469461587572,52.6282187699511,63.8540114918068,76.5817299358641,79.0084936730803,76.5990639625585,29.3121693121693,30.4761904761905,35.026455026455,4.30820215410108,4.88815244407622,9.48632974316487,24.4458518049398,32.0378038500143,49.7612412919569,40.604057339672394
|
4 |
+
gemma:7b-instruct-v1.1-q4_0,2.40476697169611,12.1302404766972,30.9640349010428,27.2837580169873,53.5794765123938,53.4408042988386,4.97354497354497,11.7460317460317,5.60846560846561,0.828500414250207,0.828500414250207,2.52692626346313,0.0633312222925902,6.03356941864105,23.6782773907536,15.739348575289926
|
5 |
+
ilyagusev/saiga_llama3,2.08555011704618,10.0021281123643,47.3824217918706,63.0438550875368,64.3092390362281,64.2052348760617,6.87830687830688,6.87830687830688,18.0952380952381,0.0,0.0,2.8169014084507,6.90310322989234,21.9392109592531,40.68397720076,23.681564911421045
|
6 |
+
llama2:13b,0.0,0.0425622472866567,28.2400510746967,0.0,41.0469752123418,1.73340266944011,0.0,0.105820105820106,2.75132275132275,0.0,0.0,0.124275062137531,0.0633312222925902,5.16796089780207,13.0633312222926,6.155935497695527
|
7 |
+
llama3.1:70b-instruct-q4_0,26.4311555650138,50.3298574164716,61.5450095765056,24.284971398856,82.6833073322933,81.1232449297972,8.35978835978836,15.6613756613757,26.5608465608466,1.49130074565037,2.23695111847556,10.149130074565,14.3128562381254,48.89922596918,63.82900569981,34.52653510978363
|
8 |
+
llama3.1:8b-instruct-q4_0,0.0638433709299851,0.617152585656523,46.7439880825708,8.40700294678454,59.4037094817126,48.6566129311839,0.0,1.90476190476191,6.34920634920635,0.0,0.0,4.63960231980116,0.443318556048132,19.2284319751355,39.6263457884737,15.738931752817674
|
9 |
+
llama3:70b-instruct-q4_0,33.4964886145989,59.8638008086827,59.1828048520962,78.1417923383602,79.9791991679667,78.1591263650546,16.6137566137566,17.8835978835979,28.4126984126984,4.14250207125104,5.2195526097763,7.24937862468931,10.3229892336922,44.8346410536552,60.4901836605447,38.93283415402807
|
10 |
+
llama3:8b-instruct-q4_0,0.319216854649925,1.00021281123643,47.4037029155139,26.4863927890449,64.1012307158953,63.5812099150633,0.529100529100529,1.48148148148148,4.07407407407407,0.0,0.0828500414250207,2.27837613918807,0.0633312222925902,17.1091465760483,35.2742241925269,17.58563668383605
|
11 |
+
mistral:7b-instruct-v0.3-q4_0,0.0,0.0212811236433284,44.5520323473079,46.6458658346334,58.0689894262437,56.612931183914,0.0,0.0,26.984126984127,0.0,0.0,3.93537696768848,2.46991766941102,11.6013570437946,26.0025332488917,18.45962745531034
|
12 |
+
mixtral:8x7b-instruct-v0.1-q4_0,0.0,11.8110236220472,51.0534156203448,8.66701334720055,77.8124458311666,67.2733576009707,0.105820105820106,16.9312169312169,19.4708994708995,0.0,1.65700082850041,10.4805302402651,0.633312222925902,5.19230683158126,9.84547181760608,18.72892096470301
|
13 |
+
phi3:14b-medium-4k-instruct-q4_0,0.0425622472866567,0.148967865503299,57.1398169823367,56.6302652106084,77.2404229502513,72.2828913156526,2.75132275132275,4.86772486772487,36.2962962962963,0.0,0.331400165700083,9.07207953603977,5.25649145028499,16.8528380209838,31.6485117162761,24.704106091751175
|
14 |
+
qwen2:72b-instruct-q4_0,55.5862949563737,69.9723345392637,71.8769951053416,85.6647599237303,89.3222395562489,85.9594383775351,62.010582010582,62.1164021164021,62.1164021164021,31.4001657000828,34.3827671913836,36.8682684341342,8.92970234325522,43.0269331550944,57.1019632678911,57.08901658624805
|
15 |
+
qwen2:7b-instruct-q4_0,10.3851883379442,13.024047669717,54.7137688869972,66.2159819726122,67.8973825619691,67.0653492806379,19.5767195767196,20.5291005291005,21.2698412698413,1.90555095277548,2.31980115990058,7.49792874896438,3.73654211526282,11.4397586578317,32.7891070297657,26.691071250002647
|
16 |
+
qwen:7b,0.0,0.0212811236433284,30.0915088316663,37.0948171260184,37.857514300572,37.0948171260184,8.99470899470899,10.0529100529101,11.957671957672,0.0,0.0,1.32560066280033,0.0,2.44002256870951,17.0981633945535,12.935267742618189
|
17 |
+
solar:10.7b-instruct-v1-q4_0,0.0,0.25537348371994,48.669929772292,46.2991853007454,64.2745709828393,62.2811579129832,0.0,0.211640211640212,18.2539682539683,0.0,0.0828500414250207,3.23115161557581,3.41988600379987,11.4318388715363,23.1849271690944,18.773098641307982
|
18 |
+
wavecut/vikhr:7b-instruct_0.4-Q4_1,0.0,0.0212811236433284,36.1247073845499,34.3213728549142,42.9883862021148,40.4402842780378,0.952380952380952,2.75132275132275,6.24338624338624,0.0,0.0,0.579950289975145,10.0063331222293,19.0998530939532,34.2970234325522,15.188418781937322
|
19 |
+
yi:6b,0.595871462013194,8.93807193019791,21.5896999361566,12.619171433524,44.6697867914717,26.7464031894609,0.317460317460317,9.73544973544974,1.74603174603175,0.165700082850041,0.828500414250207,0.414250207125104,0.189993666877771,2.29393179599174,10.265357821406,9.40771203535113
|
20 |
+
yi:9b,4.08597573951905,11.5130878910406,34.8691210895935,25.8970358814353,47.1658866354654,42.6417056682267,8.99470899470899,10.2645502645503,12.6984126984127,0.0,0.248550124275062,1.40845070422535,0.633312222925902,3.4585674314986,15.136795440152,14.601077385735296
|
21 |
+
GigaChat_Lite,51.19825708061,55.5555555555556,68.0283224400871,69.2691029900332,71.0963455149502,69.2691029900332,34.020618556701,34.020618556701,34.020618556701,6.00858369098712,6.00858369098712,8.15450643776824,32.6460481099656,40.3723509295672,58.9209621993127,42.57263848666402
|
22 |
+
GigaChat_Plus,51.19825708061,55.5555555555556,68.0283224400871,69.2691029900332,71.0963455149502,69.2691029900332,34.020618556701,34.020618556701,34.020618556701,6.00858369098712,6.00858369098712,8.15450643776824,32.9896907216495,40.1954759333432,58.8831615120275,42.58123628187567
|
23 |
+
GigaChat_Pro,63.2897603485839,66.2309368191721,75.4357298474945,76.3289036544851,77.7408637873754,76.3289036544851,52.0618556701031,52.0618556701031,52.0618556701031,11.587982832618,11.587982832618,15.8798283261803,40.893470790378,47.6443830947436,64.5051546391753,52.24263117584123
|
24 |
+
yandexgpt_lite,7.29847494553377,8.06100217864924,54.0849673202614,76.1627906976744,77.3255813953489,76.6611295681063,19.5876288659794,19.5876288659794,24.4845360824742,1.28755364806867,1.28755364806867,6.65236051502146,31.9587628865979,49.8317287269818,64.9347079037801,34.61376048323505
|
25 |
+
yandexgpt_pro,47.2766884531591,49.3464052287582,75.0544662309368,84.8837209302326,85.0498338870432,84.8837209302326,41.7525773195876,41.7525773195876,43.298969072165,5.5793991416309,5.5793991416309,7.29613733905579,15.4639175257732,50.7635115107509,63.6975945017182,46.77859456881752
|
26 |
+
random,4.04341349223239,7.59736114066823,32.698446477974,24.5103137458832,24.5103137458832,24.5103137458832,14.0740740740741,14.0740740740741,14.0740740740741,0.828500414250207,0.828500414250207,3.23115161557581,,,,13.748378084568563
|
27 |
+
Среднее значение,16.075833128172178,22.063312965582536,49.84489352715383,45.87447170090569,64.05020245106593,58.937913662725904,14.737097038127963,17.107467408498337,22.177897779959643,3.0217010215801214,3.379613200536212,6.569602924286439,10.312171056688333,23.297193327114417,37.91125068645098,26.276640176412442
|