陈俊杰 committed · Commit 193e99f · Parent(s): fe1125a
dataset
app.py CHANGED
@@ -2,102 +2,6 @@ import streamlit as st
 from streamlit_option_menu import option_menu
 import pandas as pd
 
-# CSS styles
-# st.markdown("""
-# <style>
-# h1 {
-#     font-size: 2.5em; /* title font size */
-# }
-# .stDataFrame {
-#     font-family: Helvetica;
-# }
-# .dataframe th, .dataframe td {
-#     width: auto;
-#     min-width: 500px;
-# }
-# </style>
-# """, unsafe_allow_html=True)
-
-# # Title
-# st.title('🏆AEOLLM Leaderboard')
-
-# # Description
-# st.markdown("""
-# This leaderboard is used to show the performance of the **automatic evaluation methods of LLMs** submitted by the **AEOLLM team** on four tasks:
-# - Dialogue Generation (DG)
-# - Text Expansion (TE)
-# - Summary Generation (SG)
-# - Non-Factoid QA (NFQA)
-
-# Details of AEOLLM can be found at the link: [https://aeollm.github.io/](https://aeollm.github.io/)
-# """, unsafe_allow_html=True)
-# # Create sample data
-
-# # teamId: unique identifier
-# DG = {
-#     "teamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
-#     "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
-#     "accuracy": [0.5806, 0.5483, 0.6001, 0.6472],
-#     "kendall's tau": [0.3243, 0.1739, 0.3042, 0.4167],
-#     "spearman": [0.3505, 0.1857, 0.3264, 0.4512]
-# }
-
-# df1 = pd.DataFrame(DG)
-# for col in df1.select_dtypes(include=['float64', 'int64']).columns:
-#     df1[col] = df1[col].apply(lambda x: f"{x:.4f}")
-
-# TE = {
-#     "teamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
-#     "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
-#     "accuracy": [0.5107, 0.5050, 0.5461, 0.5581],
-#     "kendall's tau": [0.1281, 0.0635, 0.2716, 0.3864],
-#     "spearman": [0.1352, 0.0667, 0.2867, 0.4157]
-# }
-# df2 = pd.DataFrame(TE)
-# for col in df2.select_dtypes(include=['float64', 'int64']).columns:
-#     df2[col] = df2[col].apply(lambda x: f"{x:.4f}")
-
-# SG = {
-#     "teamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
-#     "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
-#     "accuracy": [0.6504, 0.6014, 0.7162, 0.7441],
-#     "kendall's tau": [0.3957, 0.2688, 0.5092, 0.5001],
-#     "spearman": [0.4188, 0.2817, 0.5403, 0.5405],
-# }
-# df3 = pd.DataFrame(SG)
-# for col in df3.select_dtypes(include=['float64', 'int64']).columns:
-#     df3[col] = df3[col].apply(lambda x: f"{x:.4f}")
-
-# NFQA = {
-#     "teamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
-#     "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
-#     "accuracy": [0.5935, 0.5817, 0.7000, 0.7203],
-#     "kendall's tau": [0.2332, 0.2389, 0.4440, 0.4235],
-#     "spearman": [0.2443, 0.2492, 0.4630, 0.4511]
-# }
-# df4 = pd.DataFrame(NFQA)
-# for col in df4.select_dtypes(include=['float64', 'int64']).columns:
-#     df4[col] = df4[col].apply(lambda x: f"{x:.4f}")
-
-# # Create tabs
-# tab1, tab2, tab3, tab4 = st.tabs(["DG", "TE", "SG", "NFQA"])
-
-# with tab1:
-#     st.markdown("""Task: Dialogue Generation; Dataset: DailyDialog""", unsafe_allow_html=True)
-#     st.dataframe(df1, use_container_width=True)
-
-# with tab2:
-#     st.markdown("""Task: Text Expansion; Dataset: WritingPrompts""", unsafe_allow_html=True)
-#     st.dataframe(df2, use_container_width=True)
-
-# with tab3:
-#     st.markdown("""Task: Summary Generation; Dataset: Xsum""", unsafe_allow_html=True)
-#     st.dataframe(df3, use_container_width=True)
-
-# with tab4:
-#     st.markdown("""Task: Non-Factoid QA; Dataset: NF_CATS""", unsafe_allow_html=True)
-#     st.dataframe(df4, use_container_width=True)
-
 # Set the page title and main heading
 st.set_page_config(page_title="AEOLLM", page_icon="👋")
 st.title("NTCIR-18 Automatic Evaluation of LLMs (AEOLLM) Task")
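The deleted tables report three metrics per baseline: accuracy, Kendall's tau, and Spearman correlation. For orientation, a minimal sketch of how such numbers could be computed with scipy; the rankings are made up, and reading "accuracy" as pairwise agreement with the human ranking is an assumption here, not the task's official definition:

```python
# Sketch: computing the three leaderboard metrics for one method.
# `human` and `predicted` are illustrative per-question rankings of the
# 7 answers, not data from this repo.
from itertools import combinations

from scipy.stats import kendalltau, spearmanr

human = [1, 2, 3, 4, 5, 6, 7]      # human-annotated ranks
predicted = [2, 1, 3, 4, 6, 5, 7]  # ranks induced by an automatic method

tau, _ = kendalltau(human, predicted)
rho, _ = spearmanr(human, predicted)

# Assumed reading of "accuracy": the fraction of answer pairs that the
# method orders the same way as the human ranking.
pairs = list(combinations(range(len(human)), 2))
agree = sum((human[i] < human[j]) == (predicted[i] < predicted[j])
            for i, j in pairs)
print(f"accuracy={agree / len(pairs):.4f}, tau={tau:.4f}, spearman={rho:.4f}")
```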
@@ -172,7 +76,7 @@ elif page == "Methodology":
 <tr>
     <td style="text-align: left">Text Expansion (TE)</td>
     <td style="text-align: left">given a theme, participants need to generate stories related to the theme</td>
-    <td style="text-align: left">WritingPrompts: 303k story
+    <td style="text-align: left">WritingPrompts: 303k story themes</td>
 </tr>
 <tr>
     <td style="text-align: left">Dialogue Generation (DG)</td>
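Both hunks here branch on a `page` value (`elif page == "Methodology":`, `elif page == "Datasets":`). That value is presumably produced by the `option_menu` import kept at the top of app.py; a minimal sketch of the usual pattern, where the menu title, the first entry, and the sidebar placement are assumptions rather than this repo's actual code:

```python
# Sketch of the streamlit_option_menu navigation that presumably sets
# `page`. The entries mirror the branches visible in this diff; the
# "Introduction" entry, the title, and the sidebar are illustrative.
import streamlit as st
from streamlit_option_menu import option_menu

with st.sidebar:
    page = option_menu(
        "AEOLLM",  # hypothetical menu title
        ["Introduction", "Methodology", "Datasets", "Important Dates"],
        default_index=0,
    )
```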
@@ -187,15 +91,29 @@ elif page == "Methodology":
 """,unsafe_allow_html=True)
 
 elif page == "Datasets":
-    st.header("Datasets")
+    st.header("Introduction to Task Datasets")
     st.markdown("""
 <p class='main-text'>A brief description of each dataset we used, along with the original download link, is provided below:</p>
 <p class='main-text'>1. <strong>Summary Generation (SG): <a href="https://huggingface.co/datasets/EdinburghNLP/xsum">Xsum</a></strong>: A real-world single-document news summarization dataset collected from online articles by the British Broadcasting Corporation (BBC); it contains over 220 thousand news documents.</p>
 <p class='main-text'>2. <strong>Non-Factoid QA (NFQA): <a href="https://github.com/Lurunchik/NF-CATS">NF_CATS</a></strong>: A dataset containing 12k natural questions divided into eight categories.</p>
 <p class='main-text'>3. <strong>Text Expansion (TE): <a href="https://huggingface.co/datasets/euclaise/writingprompts">WritingPrompts</a></strong>: A large dataset of 300K human-written stories paired with writing prompts from an online forum.</p>
 <p class='main-text'>4. <strong>Dialogue Generation (DG): <a href="https://huggingface.co/datasets/daily_dialog">DailyDialog</a></strong>: A high-quality dataset of 13k multi-turn dialogues whose language is human-written and less noisy.</p>
-<p class='main-text'>For your convenience, we have released <strong>the training set</strong> (with human-annotated results) and <strong>the test set</strong> (without human-annotated results) on <a href="https://huggingface.co/datasets/THUIR/AEOLLM">https://huggingface.co/datasets/THUIR/AEOLLM</a>, which you can easily download.</p>
 """,unsafe_allow_html=True)
+    st.header("Answer Generation and Human Annotation")
+    st.markdown("""
+We randomly sampled **100 instances** from **each** dataset as the question set and selected **7 different LLMs** to generate answers, forming the answer set. As a result, each dataset produced 700 instances, totaling **2,800 instances across the four datasets**.
+
+For each instance (question-answer pair), we employed human annotators to assign a score from 1 to 5 and took the median of these scores as the final score. Based on this score, we ranked the 7 answers for each question. If scores were identical, the answers were assigned the same rank, with the lowest rank in the tied group being used.
+""")
+    st.header("Data Acquisition and Usage")
+    st.markdown("""
+We divided the 2,800 instances into three parts:
+
+- 20% of the data (covering all four datasets) was designated as the training set (including human annotations) for participants to reference when designing their methods.
+- Another 20% was set aside as the test set (excluding human annotations), used to evaluate the performance of participants' methods and to generate the **leaderboard**.
+- The remaining 60% was reserved for **the final evaluation**.
+
+Both the training set and the test set can be downloaded from [https://huggingface.co/datasets/THUIR/AEOLLM](https://huggingface.co/datasets/THUIR/AEOLLM).
+""")
 
 elif page == "Important Dates":
     st.header("Important Dates")
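The tie rule added in this hunk (identical scores share a rank, and the lowest rank in the tied group is used) corresponds to what pandas calls the "min" ranking method. A small self-contained sketch with made-up annotator scores, assuming a higher median score means a better answer:

```python
# Sketch: median aggregation and tie-aware ranking as described above.
# The scores are illustrative, not taken from the AEOLLM data.
import pandas as pd

# Three annotators score each of the 7 answers on a 1-5 scale.
annotations = pd.DataFrame({
    "answer_1": [5, 4, 5], "answer_2": [4, 4, 3], "answer_3": [4, 4, 4],
    "answer_4": [3, 2, 3], "answer_5": [2, 2, 1], "answer_6": [2, 3, 2],
    "answer_7": [1, 1, 2],
})
median_scores = annotations.median()  # per-answer median score

# Ties share the same rank and take the lowest rank in the tied group,
# i.e. pandas' "min" method; rank 1 goes to the highest median.
ranks = median_scores.rank(method="min", ascending=False).astype(int)
print(ranks)
```

With these scores, answer_2 and answer_3 (median 4.0) both get rank 2 and the next answer gets rank 4, matching the described scheme.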
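Since the training and test sets are hosted as a standard Hugging Face dataset repo, they can presumably be fetched with the `datasets` library. A minimal sketch, with split names and features left to inspection because this commit does not specify them:

```python
# Sketch: downloading the released AEOLLM data. Whether the repo loads
# with the default configuration, and what its splits are called, are
# assumptions; check https://huggingface.co/datasets/THUIR/AEOLLM.
from datasets import load_dataset

aeollm = load_dataset("THUIR/AEOLLM")  # downloads all available splits
print(aeollm)                          # inspect split names and features
```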