__all__ = ['block', 'make_clickable_model', 'make_clickable_user', 'get_submissions'] import gradio as gr import pandas as pd BENCHMARK_RESULTS = '''[gpt4](https://platform.openai.com/docs/models/gpt-4) & 93.0 & 96.0 & 97.0 & 96.7 & 62.9 & 23.0 / 23.5 & 0.0 & 0.0 & 81.0 \\ [text-davinci-003](https://platform.openai.com/docs/models/gpt-3) & 99.0 & 98.0 & 97.0 & 89.2 & 62.9 & 31.0 / 25.1 & 0.0 & 0.0 & 66.7 \\ [gpt-3.5-turbo](https://platform.openai.com/docs/models/gpt-3-5) & 90.0 & 92.0 & 80.0 & 85.8 & 51.4 & 20.0 / 18.9 & 0.0 & 1.8 & 33.3 \\ [text-curie-001](https://platform.openai.com/docs/models/gpt-3) & 8.0 & 58.0 & 6.0 & 6.7 & 1.4 & 12.0 / 4.1 & 0.0 & 0.0 & 1.0 \\ [llama-65b](https://huggingface.co/huggyllama/llama-65b) & 90.0 & 80.0 & 84.0 & 65.8 & 32.9 & 32.0 / 20.3 & 0.0 & 41.2 & 30.5 \\ [llama-30b](https://huggingface.co/huggyllama/llama-30b) & 78.0 & 84.0 & 66.0 & 45.0 & 37.1 & 27.0 / 21.7 & 0.0 & 30.6 & 34.3 \\ [llama-13b](https://huggingface.co/huggyllama/llama-13b) & 70.0 & 74.0 & 45.0 & 35.8 & 5.7 & 28.0 / 18.9 & 0.0 & 27.6 & 17.1 \\ [llama-13b-alpaca](https://huggingface.co/chavinlo/gpt4-x-alpaca) & 62.0 & 43.0 & 44.0 & 40.8 & 11.4 & 1.0 / 1.6 & 0.0 & 2.7 & 9.5 \\ [starcoder](https://huggingface.co/bigcode/starcoder) & 91.0 & 84.0 & 82.0 & 51.7 & 48.0 & 23.0 / 19.4 & 2.6 & 0.0 & 21.9 \\ [starcoderbase](https://huggingface.co/bigcode/starcoderbase) & 90.0 & 86.0 & 79.0 & 63.3 & 42.9 & 24.0 / 16.3 & 5.8 & 23.1 & 17.1 \\ [codegen-16B-nl](https://huggingface.co/Salesforce/codegen-16B-nl) & 51.0 & 75.0 & 37.0 & 21.7 & 7.1 & 43.0 / 18.0 & 0.0 & 0.0 & 16.2 \\ [codegen-16B-multi](https://huggingface.co/Salesforce/codegen-16B-multi) & 56.0 & 75.0 & 47.0 & 7.5 & 21.4 & 31.0 / 14.1 & 0.0 & 0.5 & 8.6 \\ [codegen-16B-mono](https://huggingface.co/Salesforce/codegen-16B-mono) & 63.7 & 72.0 & 52.0 & 28.3 & 31.5 & 28.0 / 15.7 & 1.5 & 6.6 & 15.2 \\ [bloomz](https://huggingface.co/bigscience/bloomz) & 58.0 & 85.0 & 36.0 & 22.5 & 14.3 & 9.0 / 4.9 & 0.0 & 1.0 & 1.0 \\ [opt-iml-30b](https://huggingface.co/facebook/opt-iml-30b) & 44.0 & 48.0 & 5.0 & 3.3 & 2.9 & 13.0 / 8.3 & 0.0 & 0.0 & 1.0 \\ [opt-30b](https://huggingface.co/facebook/opt-30b) & 46.0 & 35.0 & 2.0 & 3.3 & 8.6 & 24.0 / 11.7 & 0.0 & 0.0 & 1.0 \\ [opt-iml-1.3b](https://huggingface.co/facebook/opt-iml-1.3b) & 20.0 & 28.0 & 0.0 & 0.0 & 4.3 & 13.0 / 3.1 & 0.0 & 0.0 & 1.0 \\ [opt-1.3b](https://huggingface.co/facebook/opt-1.3b) & 18.0 & 30.0 & 0.0 & 0.0 & 1.4 & 31.0 / 9.7 & 0.0 & 0.0 & 1.0 \\ [neox-20b](https://huggingface.co/EleutherAI/gpt-neox-20b) & 55.0 & 69.0 & 27.0 & 10.8 & 18.6 & 28.0 / 15.3 & 0.0 & 8.8 & 6.7 \\ [GPT-NeoXT-Chat-Base-20B](https://huggingface.co/togethercomputer/GPT-NeoXT-Chat-Base-20B) & 43.0 & 73.0 & 28.0 & 10.8 & 4.3 & 26.0 / 13.1 & 0.0 & 0.7 & 7.6 \\ [pythia-12b](https://huggingface.co/EleutherAI/pythia-12b) & 53.0 & 65.0 & 12.0 & 0.8 & 11.4 & 17.0 / 12.1 & 0.0 & 0.0 & 1.9 \\ [dolly-v2-12b]() & 0.0 & 1.0 & 10.0 & 5.0 & 7.1 & 11.0 / 8.9 & 0.0 & 0.0 & 7.6 \\ [pythia-6.9b](https://huggingface.co/EleutherAI/pythia-6.9b) & 41.0 & 72.0 & 8.0 & 7.5 & 4.3 & 29.0 / 14.0 & 0.0 & 0.0 & 8.6 \\ [pythia-2.8b](https://huggingface.co/EleutherAI/pythia-2.8b) & 49.0 & 54.0 & 7.0 & 3.3 & 12.9 & 24.0 / 14.8 & 0.0 & 0.0 & 7.6 \\ [pythia-1.4b](https://huggingface.co/EleutherAI/pythia-1.4b) & 37.0 & 48.0 & 4.0 & 5.0 & 10.0 & 22.0 / 10.7 & 0.0 & 5.2 & 7.6 \\ [stablelm-base-alpha-7b](https://huggingface.co/stabilityai/stablelm-base-alpha-7b) & 22.0 & 47.0 & 0.0 & 0.0 & 4.3 & 28.0 / 10.3 & 0.0 & 0.0 & 2.9 \\ [stablelm-tuned-alpha-7b](https://huggingface.co/stabilityai/stablelm-tuned-alpha-7b) & 23.0 & 38.0 & 0.0 & 0.0 & 1.4 & 26.0 / 7.3 & 0.0 & 0.0 & 3.8 \\ [stablelm-base-alpha-3b](https://huggingface.co/stabilityai/stablelm-base-alpha-3b) & 6.0 & 28.0 & 0.0 & 0.0 & 1.4 & 29.0 / 5.3 & 0.0 & 0.0 & 1.0 \\ [stablelm-tuned-alpha-3b](https://huggingface.co/stabilityai/stablelm-tuned-alpha-3b) & 14.0 & 31.0 & 0.0 & 0.8 & 0.0 & 8.0 / 5.6 & 0.0 & 0.0 & 1.0 \\ [llama-30b-toolbench](https://huggingface.co/sambanovasystems/LLaMA-30b-toolbench) & 100.0 & 94.0 & 87.0 & 85.8 & 2.9 & 16.0/ 24.3& 0.0 & 0.0 & 7.5 \\ [starcoder-toolbench](https://huggingface.co/sambanovasystems/starcoder-toolbench) & 99.0 & 97.0 & 83.0 & 80.8 & 21.2 & 31.0/ 18.4& 0.0 & 0.0 & 13.9 \\ [codegen-16B-mono-toolbench](https://huggingface.co/sambanovasystems/codegen-16B-mono-toolbench) & 97.7 & 99.0 & 82.0 & 77.5 & 19.8 & 29.0/ 17.2& 0.0 & 3.5 & 16.2 \\''' def get_baseline_df(): lines = BENCHMARK_RESULTS.split("\n") df_data = [] for line in lines: model_results = line.replace(" ", "").strip("\\").split("&") assert len(model_results) == 10 df_data.append(model_results) print(len(df_data)) df = pd.DataFrame(df_data, columns=column_names) return df CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results" CITATION_BUTTON_TEXT = r"""@misc{xu2023tool, title={On the Tool Manipulation Capability of Open-source Large Language Models}, author={Qiantong Xu and Fenglu Hong and Bo Li and Changran Hu and Zhengyu Chen and Jian Zhang}, year={2023}, eprint={2305.16504}, archivePrefix={arXiv}, primaryClass={cs.CL} } }""" block = gr.Blocks() with block: gr.Markdown( """# Toolbench Leaderboard Welcome to the leaderboard of the ToolBench! 🏆 This is a community where participants create language models and action generation algorithms to generate API function calls based goals described in natural lanugage! Please refer to [our paper](https://arxiv.org/abs/2305.16504) for more details and join our [Discord](https://discord.com/invite/JehFG5HXKb) for further discussion. The [evaluation suite](https://github.com/sambanova/toolbench/) is now alive on Github. """ ) with gr.Row(): with gr.Accordion("Citation", open=False): citation_button = gr.Textbox( value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, elem_id="citation-button", ).style(show_copy_button=True) with gr.Row(): data = gr.components.Dataframe( type="pandas", datatype=["markdown", "number", "number", "number", "number", "number", "number", "number", "number", "number"] ) data_run = gr.Button("Refresh") data_run.click( get_baseline_df, outputs=data ) block.load(get_baseline_df, outputs=data) block.launch()