陈俊杰 committed on
Commit 193e99f · 1 Parent(s): fe1125a
Files changed (1):
1. app.py +17 -99
app.py CHANGED
@@ -2,102 +2,6 @@ import streamlit as st
from streamlit_option_menu import option_menu
import pandas as pd

- # CSS styles
- # st.markdown("""
- # <style>
- # h1 {
- # font-size: 2.5em; /* heading font size */
- # }
- # .stDataFrame {
- # font-family: Helvetica;
- # }
- # .dataframe th, .dataframe td {
- # width: auto;
- # min-width: 500px;
- # }
- # </style>
- # """, unsafe_allow_html=True)
-
- # # Title
- # st.title('🏆AEOLLM Leaderboard')
-
- # # Description
- # st.markdown("""
- # This leaderboard is used to show the performance of the **automatic evaluation methods of LLMs** submitted by the **AEOLLM team** on four tasks:
- # - Dialogue Generation (DG)
- # - Text Expansion (TE)
- # - Summary Generation (SG)
- # - Non-Factoid QA (NFQA)
-
- # Details of AEOLLM can be found at the link: [https://aeollm.github.io/](https://aeollm.github.io/)
- # """, unsafe_allow_html=True)
- # # Create sample data
-
- # # teamId: unique identifier
- # DG = {
- # "teamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
- # "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
- # "accuracy": [0.5806, 0.5483, 0.6001, 0.6472],
- # "kendall's tau": [0.3243, 0.1739, 0.3042, 0.4167],
- # "spearman": [0.3505, 0.1857, 0.3264, 0.4512]
- # }
-
- # df1 = pd.DataFrame(DG)
- # for col in df1.select_dtypes(include=['float64', 'int64']).columns:
- # df1[col] = df1[col].apply(lambda x: f"{x:.4f}")
-
- # TE = {
- # "teamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
- # "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
- # "accuracy": [0.5107, 0.5050, 0.5461, 0.5581],
- # "kendall's tau": [0.1281, 0.0635, 0.2716, 0.3864],
- # "spearman": [0.1352, 0.0667, 0.2867, 0.4157]
- # }
- # df2 = pd.DataFrame(TE)
- # for col in df2.select_dtypes(include=['float64', 'int64']).columns:
- # df2[col] = df2[col].apply(lambda x: f"{x:.4f}")
-
- # SG = {
- # "teamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
- # "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
- # "accuracy": [0.6504, 0.6014, 0.7162, 0.7441],
- # "kendall's tau": [0.3957, 0.2688, 0.5092, 0.5001],
- # "spearman": [0.4188, 0.2817, 0.5403, 0.5405],
- # }
- # df3 = pd.DataFrame(SG)
- # for col in df3.select_dtypes(include=['float64', 'int64']).columns:
- # df3[col] = df3[col].apply(lambda x: f"{x:.4f}")
-
- # NFQA = {
- # "teamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
- # "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
- # "accuracy": [0.5935, 0.5817, 0.7000, 0.7203],
- # "kendall's tau": [0.2332, 0.2389, 0.4440, 0.4235],
- # "spearman": [0.2443, 0.2492, 0.4630, 0.4511]
- # }
- # df4 = pd.DataFrame(NFQA)
- # for col in df4.select_dtypes(include=['float64', 'int64']).columns:
- # df4[col] = df4[col].apply(lambda x: f"{x:.4f}")
-
- # # Create tabs
- # tab1, tab2, tab3, tab4 = st.tabs(["DG", "TE", "SG", "NFQA"])
-
- # with tab1:
- # st.markdown("""Task: Dialogue Generation; Dataset: DailyDialog""", unsafe_allow_html=True)
- # st.dataframe(df1, use_container_width=True)
-
- # with tab2:
- # st.markdown("""Task: Text Expansion; Dataset: WritingPrompts""", unsafe_allow_html=True)
- # st.dataframe(df2, use_container_width=True)
-
- # with tab3:
- # st.markdown("""Task: Summary Generation; Dataset: Xsum""", unsafe_allow_html=True)
- # st.dataframe(df3, use_container_width=True)
-
- # with tab4:
- # st.markdown("""Task: Non-Factoid QA; Dataset: NF_CATS""", unsafe_allow_html=True)
- # st.dataframe(df4, use_container_width=True)
-
# Set the page title and main heading
st.set_page_config(page_title="AEOLLM", page_icon="👋")
st.title("NTCIR-18 Automatic Evaluation of LLMs (AEOLLM) Task")
@@ -172,7 +76,7 @@ elif page == "Methodology":
<tr>
<td style="text-align: left">Text Expansion (TE)</td>
<td style="text-align: left">given a theme, participants need to generate stories related to the theme</td>
- <td style="text-align: left">WritingPrompts: 303k story themes2</td>
+ <td style="text-align: left">WritingPrompts: 303k story themes</td>
</tr>
<tr>
<td style="text-align: left">Dialogue Generation (DG)</td>
@@ -187,15 +91,29 @@ elif page == "Methodology":
""",unsafe_allow_html=True)

elif page == "Datasets":
- st.header("Datasets")
+ st.header("Introduction to Task Datasets")
st.markdown("""
<p class='main-text'>A brief description of each dataset we used, along with the original download link, is provided below:</p>
<p class='main-text'>1. <strong>Summary Generation (SG): <a href="https://huggingface.co/datasets/EdinburghNLP/xsum">Xsum</a></strong>: A real-world single-document news summary dataset collected from online articles by the British Broadcasting Corporation (BBC), containing over 220 thousand news documents.</p>
<p class='main-text'>2. <strong>Non-Factoid QA (NFQA): <a href="https://github.com/Lurunchik/NF-CATS">NF_CATS</a></strong>: A dataset containing 12k natural questions divided into eight categories.</p>
<p class='main-text'>3. <strong>Text Expansion (TE): <a href="https://huggingface.co/datasets/euclaise/writingprompts">WritingPrompts</a></strong>: A large dataset of 300K human-written stories paired with writing prompts from an online forum.</p>
<p class='main-text'>4. <strong>Dialogue Generation (DG): <a href="https://huggingface.co/datasets/daily_dialog">DailyDialog</a></strong>: A high-quality dataset of 13k multi-turn dialogues. The language is human-written and less noisy.</p>
- <p class='main-text'>For your convenience, we have released <strong>the training set</strong> (with human-annotated results) and <strong>the test set</strong> (without human-annotated results) on <a href="https://huggingface.co/datasets/THUIR/AEOLLM">https://huggingface.co/datasets/THUIR/AEOLLM</a>, which you can easily download.</p>
""",unsafe_allow_html=True)
+ st.header("Answer Generation and Human Annotation")
+ st.markdown("""
+ We randomly sampled **100 instances** from **each** dataset as the question set and selected **7 different LLMs** to generate answers, forming the answer set. As a result, each dataset produced 700 instances, totaling **2,800 instances across the four datasets**.
+
+ For each instance (question-answer pair), we employed human annotators to provide a score ranging from 1 to 5 and took the median of these scores as the final score. Based on this score, we calculated the rankings of the 7 answers for each question. If scores were identical, the answers were assigned the same rank, with the lowest rank being used.
+ """)
+ st.header("Data Acquisition and Usage")
+ st.markdown("""
+ We divided the 2,800 instances into three parts:
+
+ - 20% of the data (covering all four datasets) was designated as the training set (including human annotations) for participants to reference when designing their methods.
+ - Another 20% of the data was set aside as the test set (excluding human annotations), used to evaluate the performance of participants' methods and to generate the **leaderboard**.
+ - The remaining 60% of the data was reserved for **the final evaluation**.
+
+ Both the training set and the test set can be downloaded from the provided link: [https://huggingface.co/datasets/THUIR/AEOLLM](https://huggingface.co/datasets/THUIR/AEOLLM)
+ """)

elif page == "Important Dates":
st.header("Important Dates")
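A note on the annotation scheme added above: taking the median of the 1-to-5 human scores and giving tied answers the same, lowest rank is competition-style ("min") ranking. Below is a minimal pandas sketch of that rule, assuming "lowest rank" means pandas' `method="min"` tie-handling; the table layout and column names are illustrative, not the task's official schema.

```python
import pandas as pd

# Hypothetical annotation table: one row per (question, answer, annotator) score.
raw = pd.DataFrame({
    "question": ["q1"] * 6,
    "answer":   ["a1", "a1", "a2", "a2", "a3", "a3"],
    "score":    [4, 5, 3, 3, 5, 4],  # human scores on a 1-5 scale
})

# Final score per answer = median of its annotator scores.
final = raw.groupby(["question", "answer"])["score"].median().reset_index()

# Rank answers within each question; method="min" gives tied answers the
# same (lowest) rank, matching the tie rule described in the diff.
final["rank"] = (
    final.groupby("question")["score"]
         .rank(method="min", ascending=False)
         .astype(int)
)
print(final)
```

With these sample scores, a1 and a3 tie on a median of 4.5 and both take rank 1, while a2 drops to rank 3.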
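The training and test sets referenced in the new "Data Acquisition and Usage" section live in the linked Hugging Face repo. A hedged sketch of fetching them with the `datasets` library; whether THUIR/AEOLLM needs a configuration name or ships per-task files instead is an assumption to verify against the repo.

```python
from datasets import load_dataset

# Assumption: the THUIR/AEOLLM repo exposes ready-made splits; it may
# instead require a config name or separate files per task.
ds = load_dataset("THUIR/AEOLLM")
print(ds)  # inspect the available splits and columns before use
```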
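The commented-out leaderboard removed by this commit reported accuracy, Kendall's tau, and Spearman for each task, i.e., agreement between a method's ranking of the 7 answers and the human ranking. A sketch of how such metrics are commonly computed with scipy; the pairwise definition of "accuracy" below is an assumption, not necessarily the task's official metric.

```python
from scipy.stats import kendalltau, spearmanr

# Hypothetical rankings of the 7 answers to one question:
# human annotation vs. an automatic evaluation method.
human  = [1, 2, 3, 4, 5, 6, 7]
method = [2, 1, 3, 4, 6, 5, 7]

tau, _ = kendalltau(human, method)  # rank correlation; tau-b handles ties
rho, _ = spearmanr(human, method)

# One common reading of "accuracy" for ranking agreement: the fraction of
# answer pairs that both rankings order the same way.
pairs = [(i, j) for i in range(7) for j in range(i + 1, 7)]
agree = sum((human[i] - human[j]) * (method[i] - method[j]) > 0 for i, j in pairs)
accuracy = agree / len(pairs)

print(f"accuracy={accuracy:.4f}, kendall's tau={tau:.4f}, spearman={rho:.4f}")
```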