import start | |
import gradio as gr | |
import pandas as pd | |
from glob import glob | |
from pathlib import Path | |
from tabs.dashboard import df | |
from tabs.faq import ( | |
about_olas_predict_benchmark, | |
about_olas_predict, | |
about_the_dataset, | |
about_the_tools, | |
) | |
from tabs.howto_benchmark import how_to_run | |
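# Temporarily disabled. run_benchmark_gradio() below calls run_benchmark_main,
# so re-enable this import before wiring the "Run the Benchmark" tab back in.
# from tabs.run_benchmark import run_benchmark_main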
# Root Blocks container; the page layout is declared further down inside
# `with demo:` and the app is started via `demo.queue(...).launch()`.
demo = gr.Blocks()
def run_benchmark_gradio(
    tool_name,
    model_name,
    num_questions,
    openai_api_key,
    anthropic_api_key,
    openrouter_api_key,
):
    """Validate the inputs, run the benchmark and package its output.

    Args:
        tool_name: Name of the tool to benchmark; must be non-empty.
        model_name: Model identifier, forwarded to ``run_benchmark_main``.
        num_questions: Number of questions, forwarded to
            ``run_benchmark_main``.
        openai_api_key: OpenAI API key (may be empty or None).
        anthropic_api_key: Anthropic API key (may be empty or None).
        openrouter_api_key: OpenRouter API key (may be empty or None).

    Returns:
        * A plain message string when the tool name is missing or when none
          of the three API keys is provided.
        * A ``(gr.Dataframe, gr.Dataframe)`` pair (results, summary) when the
          benchmark reports ``"completed"`` and both CSV files are found.
        * A ``(gr.Textbox, gr.Textbox)`` pair otherwise, the first one
          carrying the benchmark status or an explanatory message.

    NOTE(review): the validation branches return one string, whereas the
    (currently commented-out) click handler in this file declares two
    outputs -- confirm how that should be surfaced before re-enabling it.

    NOTE: ``run_benchmark_main`` comes from ``tabs.run_benchmark``, whose
    import is commented out in this file; until it is restored, getting
    past validation raises ``NameError``.
    """
    # Truthiness rather than `is None`, so that an empty string is rejected
    # in the same way as a missing value.
    if not tool_name:
        return "Please enter the name of your tool."
    if not (openai_api_key or anthropic_api_key or openrouter_api_key):
        return "Please enter either OpenAI or Anthropic or OpenRouter API key."

    result = run_benchmark_main(
        tool_name,
        model_name,
        num_questions,
        openai_api_key,
        anthropic_api_key,
        openrouter_api_key,
    )

    if result == "completed":
        # Collect every CSV the benchmark left in the results directory.
        fns = glob("results/*.csv")
        print(f"Number of files in results directory: {len(fns)}")

        files = [Path(file) for file in fns]

        # Files are told apart purely by a substring of their base name.
        results_files = [file for file in files if "results" in file.name]
        summary_files = [file for file in files if "summary" in file.name]
        print(results_files, summary_files)

        # Without this guard, indexing [0] below raises IndexError when the
        # benchmark reports success but one of the files is missing.
        if not results_files or not summary_files:
            return (
                gr.Textbox(
                    label="Benchmark Result",
                    value=(
                        "Benchmark completed, but the results and/or summary "
                        "CSV file was not found in the results directory."
                    ),
                    interactive=False,
                ),
                gr.Textbox(label="Summary", value=""),
            )

        # glob() gives no ordering guarantee, so if several files match,
        # which one ends up first is unspecified.
        # Round numeric columns to 4 decimal places for display.
        results_df = pd.read_csv(results_files[0]).round(4)
        summary_df = pd.read_csv(summary_files[0]).round(4)

        return gr.Dataframe(value=results_df), gr.Dataframe(value=summary_df)

    # Any status other than "completed" is shown verbatim to the user.
    return (
        gr.Textbox(label="Benchmark Result", value=result, interactive=False),
        gr.Textbox(label="Summary", value=""),
    )
# Page layout: a title, a short description and a set of tabs.
with demo:
    gr.HTML("<h1>Olas Predict Benchmark</h1>")
    gr.Markdown(
        "Leaderboard showing the performance of Olas Predict tools on the Autocast dataset and overview of the project."
    )

    # NOTE(review): the emoji in the tab labels were mis-encoded. The "About"
    # and "Run the Benchmark" icons were recovered from the surviving bytes;
    # the leaderboard and contribute icons were unrecoverable and are best
    # guesses -- confirm against the intended design.
    with gr.Tabs() as tabs:
        # first tab - leaderboard
        with gr.TabItem("🏅 Benchmark Leaderboard", id=0):
            gr.components.Dataframe(
                value=df,
            )

        # second tab - about
        with gr.TabItem("ℹ️ About"):
            with gr.Row():
                with gr.Accordion("About the Benchmark", open=False):
                    gr.Markdown(about_olas_predict_benchmark)
            with gr.Row():
                with gr.Accordion("About the Tools", open=False):
                    gr.Markdown(about_the_tools)
            with gr.Row():
                with gr.Accordion("About the Autocast Dataset", open=False):
                    gr.Markdown(about_the_dataset)
            with gr.Row():
                with gr.Accordion("About Olas", open=False):
                    gr.Markdown(about_olas_predict)

        # third tab - how to run the benchmark
        with gr.TabItem("🚀 Contribute"):
            gr.Markdown(how_to_run)

        # fourth tab - run the benchmark
        # Disabled together with the run_benchmark_main import at the top of
        # this file; kept here so it can be restored as-is.
        # with gr.TabItem("🔥 Run the Benchmark"):
        #     with gr.Row():
        #         tool_name = gr.Dropdown(
        #             [
        #                 "prediction-offline",
        #                 "prediction-online",
        #                 # "prediction-online-summarized-info",
        #                 # "prediction-offline-sme",
        #                 # "prediction-online-sme",
        #                 "prediction-request-rag",
        #                 "prediction-request-reasoning",
        #                 # "prediction-url-cot-claude",
        #                 # "prediction-request-rag-cohere",
        #                 # "prediction-with-research-conservative",
        #                 # "prediction-with-research-bold",
        #             ],
        #             label="Tool Name",
        #             info="Choose the tool to run",
        #         )
        #         model_name = gr.Dropdown(
        #             [
        #                 "gpt-3.5-turbo-0125",
        #                 "gpt-4-0125-preview",
        #                 "claude-3-haiku-20240307",
        #                 "claude-3-sonnet-20240229",
        #                 "claude-3-opus-20240229",
        #                 "databricks/dbrx-instruct:nitro",
        #                 "nousresearch/nous-hermes-2-mixtral-8x7b-sft",
        #                 # "cohere/command-r-plus",
        #             ],
        #             label="Model Name",
        #             info="Choose the model to use",
        #         )
        #     with gr.Row():
        #         openai_api_key = gr.Textbox(
        #             label="OpenAI API Key",
        #             placeholder="Enter your OpenAI API key here",
        #             type="password",
        #         )
        #         anthropic_api_key = gr.Textbox(
        #             label="Anthropic API Key",
        #             placeholder="Enter your Anthropic API key here",
        #             type="password",
        #         )
        #         openrouter_api_key = gr.Textbox(
        #             label="OpenRouter API Key",
        #             placeholder="Enter your OpenRouter API key here",
        #             type="password",
        #         )
        #     with gr.Row():
        #         num_questions = gr.Slider(
        #             minimum=1,
        #             maximum=340,
        #             value=10,
        #             label="Number of questions to run the benchmark on",
        #         )
        #     with gr.Row():
        #         run_button = gr.Button("Run Benchmark")
        #     with gr.Row():
        #         with gr.Accordion("Results", open=True):
        #             result = gr.Dataframe()
        #     with gr.Row():
        #         with gr.Accordion("Summary", open=False):
        #             summary = gr.Dataframe()
        #     run_button.click(
        #         run_benchmark_gradio,
        #         inputs=[
        #             tool_name,
        #             model_name,
        #             num_questions,
        #             openai_api_key,
        #             anthropic_api_key,
        #             openrouter_api_key,
        #         ],
        #         outputs=[result, summary],
        #     )

# Enable request queuing with the configured default concurrency limit, then
# start the app. Kept at module level so running this file still launches it.
demo.queue(default_concurrency_limit=40).launch()