import gradio as gr
import pandas as pd
# Leaderboard CSV exports; the relative names are resolved against the
# current working directory when the module is imported.
file1 = '综合排名.csv'                        # overall ranking
file2 = 'basic-leaderboard.csv'
file3 = '学科.csv'                            # subject
file4 = '安全.csv'                            # safety
file5 = '大学试题.csv'                        # college exam questions
file6 = '中学试题.csv'                        # middle-school exam questions
file7 = '通用语言能力排行榜-大模型裁判.csv'   # general language ability, LLM judge

# Read every table once, in the same order as the paths above; a missing or
# malformed file therefore fails at import time rather than on first display.
df1, df2, df3, df4, df5, df6, df7 = (
    pd.read_csv(csv_path)
    for csv_path in (file1, file2, file3, file4, file5, file6, file7)
)
def show_general_leaderboard() -> pd.DataFrame:
    """Return the module-level ``df1`` table (loaded from ``file1``) as-is.

    The frame is handed back without copying.
    """
    return df1
def show_basic_leaderboard() -> pd.DataFrame:
    """Return the module-level ``df2`` table (loaded from ``file2``) as-is.

    The frame is handed back without copying.
    """
    return df2
def show_subject_leaderboard() -> pd.DataFrame:
    """Return the module-level ``df3`` table (loaded from ``file3``) as-is.

    The frame is handed back without copying.
    """
    return df3
def show_safety_leaderboard() -> pd.DataFrame:
    """Return the module-level ``df4`` table (loaded from ``file4``) as-is.

    The frame is handed back without copying.
    """
    return df4
def show_college_leaderboard() -> pd.DataFrame:
    """Return the module-level ``df5`` table (loaded from ``file5``) as-is.

    The frame is handed back without copying.
    """
    return df5
def show_midschool_leaderboard() -> pd.DataFrame:
    """Return the module-level ``df6`` table (loaded from ``file6``) as-is.

    The frame is handed back without copying.
    """
    return df6
def show_LLM_judge_leaderboard() -> pd.DataFrame:
    """Return the module-level ``df7`` table (loaded from ``file7``) as-is.

    The frame is handed back without copying.
    """
    return df7
# Static page copy. Kept as column-0 module constants so the literal text is
# independent of the indentation of the layout code below.
_PAGE_HEADER_MD = """
# Large Language Model Assessment in the Chinese Context / 中文语境下的人工智能大语言模型评测
by Zhenhui(Jack) Jiang, Jiaxin Li, Xiaoyu Miao / 蒋镇辉,李佳欣,苗霄宇
HKU Business School Shenzhen Research Institute
"""

# The original used a single-quoted literal that spanned several lines, which
# is a SyntaxError; a triple-quoted literal keeps the same text and line breaks.
# NOTE(review): if this snippet originally carried HTML tags or a link to
# "the report", they are not present in the source and must be restored by hand.
_REPORT_NOTE_HTML = """
Please refer to the report for details on metrics, tasks and models.
Updated 01/2024.
"""

_LLM_JUDGE_NOTE_MD = """
We also employed a fine-tuned GPT-3.5 Turbo as a judge to evaluate large language models through pairwise comparisons.
The findings are presented below.
"""


def _leaderboard_table(frame):
    """Create the read-only, text-wrapping table shown inside each tab.

    Args:
        frame: The table to display (one of the ``show_*_leaderboard()``
            results).

    Returns:
        The ``gr.DataFrame`` component that was created. Call this inside the
        ``with gr.Tab(...)`` body that should contain the table.
    """
    return gr.DataFrame(value=frame,
                        label='Leaderboard',
                        interactive=False,
                        wrap=True)


# Build the page: header text, then one tab per leaderboard.
# NOTE(review): the source had lost its indentation; the nesting below (three
# accuracy sub-tabs inside the "专业学科能力榜单" tab, LLM-judge note and tab at
# the top level) is reconstructed from the syntax and tab titles -- confirm.
with gr.Blocks() as demo:
    gr.Markdown(_PAGE_HEADER_MD)
    gr.HTML(value=_REPORT_NOTE_HTML)

    # Leaderboards scored by human judges or by accuracy.
    with gr.Tab("🏅 综合榜单(人类裁判)"):
        _leaderboard_table(show_general_leaderboard())

    with gr.Tab("💬 通用语言能力榜单(人类裁判)"):
        _leaderboard_table(show_basic_leaderboard())

    with gr.Tab("📚 专业学科能力榜单(正确率)"):
        with gr.Tab("学科能力榜单(正确率)"):
            _leaderboard_table(show_subject_leaderboard())

        with gr.Tab("中学难度学科能力榜单(正确率)"):
            _leaderboard_table(show_midschool_leaderboard())

        with gr.Tab("大学难度学科能力榜单(正确率)"):
            _leaderboard_table(show_college_leaderboard())

    with gr.Tab("⭕️ 安全与责任榜单(人类裁判)"):
        _leaderboard_table(show_safety_leaderboard())

    # Explanatory note separating the tabs above from the LLM-judged one.
    gr.Markdown(_LLM_JUDGE_NOTE_MD)

    with gr.Tab("🤖 通用语言能力榜单(大模型裁判)"):
        _leaderboard_table(show_LLM_judge_leaderboard())

# Start the app as soon as the module runs; share=True is passed through to
# Gradio unchanged from the original script.
demo.launch(share=True)