"""Streamlit page that renders the AEOLLM leaderboard.

The page shows one tab per task (DG, TE, SG, NFQA). Each tab contains a table
of baseline methods with their accuracy, Kendall's tau and Spearman scores,
all displayed with four decimal places.
"""
import streamlit as st
import pandas as pd

# CSS styles (the style payload is currently blank, so nothing is injected).
st.markdown(""" """, unsafe_allow_html=True)

# Title
st.title('🏆AEOLLM Leaderboard')

# Description
st.markdown("""
This leaderboard is used to show the performance of the **automatic evaluation methods of LLMs** submitted by the **AEOLLM team** on four tasks:
- Dialogue Generation (DG)
- Text Expansion (TE)
- Summary Generation (SG)
- Non-Factoid QA (NFQA)

Details of AEOLLM can be found at the link: [https://aeollm.github.io/](https://aeollm.github.io/)
""", unsafe_allow_html=True)

# Example data.
# teamId is the unique identifier of each entry; the same four baselines
# (and their underlying methods) appear in every task table.
_TEAM_IDS = ["baseline1", "baseline2", "baseline3", "baseline4"]
_METHODS = ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"]


def _task_scores(accuracy, kendall_tau, spearman):
    """Return one task's column dict: shared id/method columns plus its metrics.

    Args:
        accuracy: Accuracy values, one per baseline, in ``_TEAM_IDS`` order.
        kendall_tau: Kendall's tau values in the same order.
        spearman: Spearman values in the same order.

    Returns:
        A dict mapping column name to a list of values, with columns ordered
        teamId, methods, accuracy, kendall's tau, spearman.
    """
    return {
        # Copy the shared lists so each task dict owns its own columns.
        "teamId": list(_TEAM_IDS),
        "methods": list(_METHODS),
        "accuracy": accuracy,
        "kendall's tau": kendall_tau,
        "spearman": spearman,
    }


def _build_leaderboard(scores):
    """Build a display table from a column dict.

    Every float64/int64 column is converted to strings with exactly four
    decimal places so trailing zeros (e.g. ``0.5050``) stay visible.

    Args:
        scores: Dict mapping column name to a list of values.

    Returns:
        A ``pandas.DataFrame`` whose numeric columns have been replaced by
        their four-decimal string representation.
    """
    table = pd.DataFrame(scores)
    numeric_columns = table.select_dtypes(include=['float64', 'int64']).columns
    for column in numeric_columns:
        table[column] = table[column].apply(lambda value: f"{value:.4f}")
    return table


DG = _task_scores(
    accuracy=[0.5806, 0.5483, 0.6001, 0.6472],
    kendall_tau=[0.3243, 0.1739, 0.3042, 0.4167],
    spearman=[0.3505, 0.1857, 0.3264, 0.4512],
)
df1 = _build_leaderboard(DG)

TE = _task_scores(
    accuracy=[0.5107, 0.5050, 0.5461, 0.5581],
    kendall_tau=[0.1281, 0.0635, 0.2716, 0.3864],
    spearman=[0.1352, 0.0667, 0.2867, 0.4157],
)
df2 = _build_leaderboard(TE)

SG = _task_scores(
    accuracy=[0.6504, 0.6014, 0.7162, 0.7441],
    kendall_tau=[0.3957, 0.2688, 0.5092, 0.5001],
    spearman=[0.4188, 0.2817, 0.5403, 0.5405],
)
df3 = _build_leaderboard(SG)

NFQA = _task_scores(
    accuracy=[0.5935, 0.5817, 0.7000, 0.7203],
    kendall_tau=[0.2332, 0.2389, 0.4440, 0.4235],
    spearman=[0.2443, 0.2492, 0.4630, 0.4511],
)
df4 = _build_leaderboard(NFQA)

# Create the tabs, one per task, and fill each with its caption and table.
tab1, tab2, tab3, tab4 = st.tabs(["DG", "TE", "SG", "NFQA"])

_TAB_CONTENT = [
    (tab1, """Task: Dialogue Generation; Dataset: DailyDialog""", df1),
    (tab2, """Task: Text Expansion; Dataset: WritingPrompts""", df2),
    (tab3, """Task: Summary Generation; Dataset: Xsum""", df3),
    (tab4, """Task: Non-Factoid QA; Dataset: NF_CATS""", df4),
]

for _tab, _caption, _table in _TAB_CONTENT:
    with _tab:
        st.markdown(_caption, unsafe_allow_html=True)
        st.dataframe(_table, use_container_width=True)