import gradio as gr
import pandas as pd
from glob import glob
from pathlib import Path
from tabs.dashboard import df
from tabs.faq import (
about_olas_predict_benchmark,
about_olas_predict,
about_the_dataset,
about_the_tools,
)
from tabs.howto_benchmark import how_to_run
# Temporarily disabled: the "Run the Benchmark" tab below is commented out.
# from tabs.run_benchmark import run_benchmark_main
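# While the import above is disabled, run_benchmark_gradio below still calls
# run_benchmark_main and would raise a NameError if triggered. This fallback
# stub is an assumption (it is not part of the original tabs.run_benchmark
# module) so the call fails gracefully instead; remove it when the import is
# re-enabled.
def run_benchmark_main(*args, **kwargs):
    """Placeholder for the temporarily disabled benchmark runner."""
    return "The benchmark runner is temporarily disabled."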
demo = gr.Blocks()
def run_benchmark_gradio(
tool_name,
model_name,
num_questions,
openai_api_key,
anthropic_api_key,
openrouter_api_key,
):
"""Run the benchmark using inputs."""
if tool_name is None:
return "Please enter the name of your tool."
if (
openai_api_key is None
and anthropic_api_key is None
and openrouter_api_key is None
):
return "Please enter either OpenAI or Anthropic or OpenRouter API key."
result = run_benchmark_main(
tool_name,
model_name,
num_questions,
openai_api_key,
anthropic_api_key,
openrouter_api_key,
)
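    # run_benchmark_main returns "completed" on success; any other string is
    # treated as an error message and surfaced to the user below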
    if result == "completed":
        # collect the CSV files written to the results directory
        fns = glob("results/*.csv")
        print(f"Number of files in results directory: {len(fns)}")
        # convert to Path objects for easier filename matching
        files = [Path(file) for file in fns]
        # split into per-question results files and summary files by filename
        results_files = [file for file in files if "results" in file.name]
        summary_files = [file for file in files if "summary" in file.name]
        print(results_files, summary_files)
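        # glob() returns files in arbitrary order, so [0] below may not be the
        # latest run if several have accumulated. Sorting by modification time
        # (an optional refinement, not in the original code) would fix that:
        # results_files.sort(key=lambda p: p.stat().st_mtime, reverse=True)
        # summary_files.sort(key=lambda p: p.stat().st_mtime, reverse=True)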
        # load the first matching results and summary files
        results_df = pd.read_csv(results_files[0])
        summary_df = pd.read_csv(summary_files[0])
        # round all float values to 4 decimal places
        results_df = results_df.round(4)
        summary_df = summary_df.round(4)
return gr.Dataframe(value=results_df), gr.Dataframe(value=summary_df)
return gr.Textbox(
label="Benchmark Result", value=result, interactive=False
), gr.Textbox(label="Summary", value="")
with demo:
gr.HTML("<h1>Olas Predict Benchmark</hjson>")
    gr.Markdown(
        "Leaderboard showing the performance of Olas Predict tools on the Autocast dataset, plus an overview of the project."
    )
with gr.Tabs() as tabs:
# first tab - leaderboard
with gr.TabItem("πŸ… Benchmark Leaderboard", id=0):
gr.components.Dataframe(
value=df,
)
# second tab - about
with gr.TabItem("ℹ️ About"):
with gr.Row():
with gr.Accordion("About the Benchmark", open=False):
gr.Markdown(about_olas_predict_benchmark)
with gr.Row():
with gr.Accordion("About the Tools", open=False):
gr.Markdown(about_the_tools)
with gr.Row():
with gr.Accordion("About the Autocast Dataset", open=False):
gr.Markdown(about_the_dataset)
with gr.Row():
with gr.Accordion("About Olas", open=False):
gr.Markdown(about_olas_predict)
# third tab - how to run the benchmark
with gr.TabItem("πŸš€ Contribute"):
gr.Markdown(how_to_run)
# fourth tab - run the benchmark
# with gr.TabItem("πŸ”₯ Run the Benchmark"):
# with gr.Row():
# tool_name = gr.Dropdown(
# [
# "prediction-offline",
# "prediction-online",
# # "prediction-online-summarized-info",
# # "prediction-offline-sme",
# # "prediction-online-sme",
# "prediction-request-rag",
# "prediction-request-reasoning",
# # "prediction-url-cot-claude",
# # "prediction-request-rag-cohere",
# # "prediction-with-research-conservative",
# # "prediction-with-research-bold",
# ],
# label="Tool Name",
# info="Choose the tool to run",
# )
# model_name = gr.Dropdown(
# [
# "gpt-3.5-turbo-0125",
# "gpt-4-0125-preview",
# "claude-3-haiku-20240307",
# "claude-3-sonnet-20240229",
# "claude-3-opus-20240229",
# "databricks/dbrx-instruct:nitro",
# "nousresearch/nous-hermes-2-mixtral-8x7b-sft",
# # "cohere/command-r-plus",
# ],
# label="Model Name",
# info="Choose the model to use",
# )
# with gr.Row():
# openai_api_key = gr.Textbox(
# label="OpenAI API Key",
# placeholder="Enter your OpenAI API key here",
# type="password",
# )
# anthropic_api_key = gr.Textbox(
# label="Anthropic API Key",
# placeholder="Enter your Anthropic API key here",
# type="password",
# )
# openrouter_api_key = gr.Textbox(
# label="OpenRouter API Key",
# placeholder="Enter your OpenRouter API key here",
# type="password",
# )
# with gr.Row():
# num_questions = gr.Slider(
# minimum=1,
# maximum=340,
# value=10,
# label="Number of questions to run the benchmark on",
# )
# with gr.Row():
# run_button = gr.Button("Run Benchmark")
# with gr.Row():
# with gr.Accordion("Results", open=True):
# result = gr.Dataframe()
# with gr.Row():
# with gr.Accordion("Summary", open=False):
# summary = gr.Dataframe()
# run_button.click(
# run_benchmark_gradio,
# inputs=[
# tool_name,
# model_name,
# num_questions,
# openai_api_key,
# anthropic_api_key,
# openrouter_api_key,
# ],
# outputs=[result, summary],
# )
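# queue with a default concurrency limit of 40 per event listener
# (Gradio 4.x queue API)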
demo.queue(default_concurrency_limit=40).launch()