# Gradio app that shows several large-language-model leaderboards.
#
# Each leaderboard is read once, at start-up, from a CSV file whose path is
# resolved relative to the current working directory, and is rendered as a
# read-only table inside its own tab.

import gradio as gr
import pandas as pd

# --- Data ------------------------------------------------------------------
# One (path, DataFrame) pair per leaderboard. The numbered names are kept so
# that anything referring to them keeps working.
file1 = '综合排名.csv'
df1 = pd.read_csv(file1)
file2 = 'basic-leaderboard.csv'
df2 = pd.read_csv(file2)
file3 = '学科.csv'
df3 = pd.read_csv(file3)
file4 = '安全.csv'
df4 = pd.read_csv(file4)
file5 = '大学试题.csv'
df5 = pd.read_csv(file5)
file6 = '中学试题.csv'
df6 = pd.read_csv(file6)
file7 = '通用语言能力排行榜-大模型裁判.csv'
df7 = pd.read_csv(file7)


# --- Accessors -------------------------------------------------------------
# Each accessor returns the DataFrame that was loaded at import time (the same
# object on every call, not a copy).

def show_general_leaderboard() -> pd.DataFrame:
    """Return the table loaded from ``file1`` (shown in the overall tab)."""
    return df1


def show_basic_leaderboard() -> pd.DataFrame:
    """Return the table loaded from ``file2`` (shown in the general-language tab)."""
    return df2


def show_subject_leaderboard() -> pd.DataFrame:
    """Return the table loaded from ``file3`` (shown in the subject sub-tab)."""
    return df3


def show_safety_leaderboard() -> pd.DataFrame:
    """Return the table loaded from ``file4`` (shown in the safety tab)."""
    return df4


def show_college_leaderboard() -> pd.DataFrame:
    """Return the table loaded from ``file5`` (shown in the college sub-tab)."""
    return df5


def show_midschool_leaderboard() -> pd.DataFrame:
    """Return the table loaded from ``file6`` (shown in the middle-school sub-tab)."""
    return df6


def show_LLM_judge_leaderboard() -> pd.DataFrame:
    """Return the table loaded from ``file7`` (shown in the LLM-judge tab)."""
    return df7


def _leaderboard_table(frame: pd.DataFrame):
    """Create the table component used by every tab.

    Must be called inside an active ``gr.Blocks``/``gr.Tab`` context so the
    component is attached to the layout being built.

    Args:
        frame: The leaderboard data to display.

    Returns:
        The created ``gr.DataFrame`` component (non-interactive, wrapped
        cells, labelled ``'Leaderboard'``).
    """
    return gr.DataFrame(value=frame, label='Leaderboard', interactive=False, wrap=True)


# --- Layout ----------------------------------------------------------------
with gr.Blocks() as demo:
    # NOTE(review): the two Markdown texts below are kept on a single line,
    # exactly as found; if they were meant to span several lines (heading,
    # authors, affiliation), restore the line breaks here.
    gr.Markdown(
        """ # Large Language Model Assessment in the Chinese Context / 中文语境下的人工智能大语言模型评测 by Zhenhui(Jack) Jiang, Jiaxin Li, Xiaoyu Miao / 蒋镇辉,李佳欣,苗霄宇 HKU Business School Shenzhen Research Institute """)
    # Written with explicit "\n" escapes: a single-quoted literal cannot
    # contain raw line breaks.
    gr.HTML(
        value=(
            "\n\n"
            "Please refer to the report for details on metrics, tasks and models.\n"
            "Updated 01/2024.\n"
            "\n"
        )
    )

    # Leaderboards scored by human judges / by accuracy, one tab each.
    with gr.Tab("🏅 综合榜单(人类裁判)"):
        _leaderboard_table(show_general_leaderboard())
    with gr.Tab("💬 通用语言能力榜单(人类裁判)"):
        _leaderboard_table(show_basic_leaderboard())
    with gr.Tab("📚 专业学科能力榜单(正确率)"):
        # Sub-tabs: overall subject results, then split by difficulty level.
        with gr.Tab("学科能力榜单(正确率)"):
            _leaderboard_table(show_subject_leaderboard())
        with gr.Tab("中学难度学科能力榜单(正确率)"):
            _leaderboard_table(show_midschool_leaderboard())
        with gr.Tab("大学难度学科能力榜单(正确率)"):
            _leaderboard_table(show_college_leaderboard())
    with gr.Tab("⭕️ 安全与责任榜单(人类裁判)"):
        _leaderboard_table(show_safety_leaderboard())

    gr.Markdown(
        """ We also employed a fine-tuned GPT-3.5 Turbo as a judge to evaluate large language models through pairwise comparisons. The findings are presented below. """)
    with gr.Tab("🤖 通用语言能力榜单(大模型裁判)"):
        _leaderboard_table(show_LLM_judge_leaderboard())


# Start the server only when executed as a script, so that importing this
# module (e.g. to reuse `demo` or the data frames) does not launch it.
if __name__ == "__main__":
    demo.launch(share=True)