Spaces:
Running
Running
piotr-szleg-bards-ai
committed on
Commit
•
15822f7
1
Parent(s):
8980b16
2024-03-15 14:12:25 Publish script update
Browse files- app.py +11 -5
- app_constants.py +2 -2
- data/combined_plots.csv +0 -0
app.py
CHANGED
@@ -10,6 +10,7 @@ from pandas.api.types import is_numeric_dtype
|
|
10 |
|
11 |
from pipeline.config import LLMBoardConfig, QueriesConfig
|
12 |
from app_constants import README, JS, TIME_PERIODS_EXPLANATION_DF
|
|
|
13 |
|
14 |
queries_config = QueriesConfig()
|
15 |
|
@@ -134,9 +135,11 @@ def display_plot(plot_df_row):
|
|
134 |
def display_filtered_plot(plot_df_row):
|
135 |
row = dict(plot_df_row)
|
136 |
plot_element, plot = display_plot(plot_df_row)
|
137 |
-
plots.append((plot_element, plot, row))
|
138 |
if "description" in row and pd.notna(row["description"]):
|
139 |
-
gr.Markdown(str(row["description"]))
|
|
|
|
|
|
|
140 |
|
141 |
def filter_plots(searched_query: str):
|
142 |
searched_model_names = searched_query.split("|")
|
@@ -144,7 +147,7 @@ def filter_plots(searched_query: str):
|
|
144 |
searched_model_names = [n for n in searched_model_names if n]
|
145 |
|
146 |
results = []
|
147 |
-
for plot_display, plot, row in plots:
|
148 |
visible = True
|
149 |
if "df" in row and pd.notna(row["df"]):
|
150 |
buffer = io.StringIO(row["df"])
|
@@ -162,6 +165,9 @@ def filter_plots(searched_query: str):
|
|
162 |
visible = False
|
163 |
|
164 |
results.append(gr.Plot(plot, visible=visible))
|
|
|
|
|
|
|
165 |
|
166 |
return results
|
167 |
|
@@ -296,13 +302,13 @@ To compare the parameters more thoroughly use the filtering box on top of this p
|
|
296 |
filter_button.click(
|
297 |
fn=filter_plots,
|
298 |
inputs=filter_textbox,
|
299 |
-
outputs=[v[0] for v in plots],
|
300 |
api_name="filter_plots",
|
301 |
)
|
302 |
filter_textbox.submit(
|
303 |
fn=filter_plots,
|
304 |
inputs=filter_textbox,
|
305 |
-
outputs=[v[0] for v in plots],
|
306 |
api_name="filter_plots",
|
307 |
)
|
308 |
collapse_languages_button.click(
|
|
|
10 |
|
11 |
from pipeline.config import LLMBoardConfig, QueriesConfig
|
12 |
from app_constants import README, JS, TIME_PERIODS_EXPLANATION_DF
|
13 |
+
from itertools import chain
|
14 |
|
15 |
queries_config = QueriesConfig()
|
16 |
|
|
|
135 |
def display_filtered_plot(plot_df_row):
|
136 |
row = dict(plot_df_row)
|
137 |
plot_element, plot = display_plot(plot_df_row)
|
|
|
138 |
if "description" in row and pd.notna(row["description"]):
|
139 |
+
description_element = gr.Markdown(str(row["description"]))
|
140 |
+
else:
|
141 |
+
description_element = gr.Markdown(value="", visible=False)
|
142 |
+
plots.append((plot_element, description_element, plot, row))
|
143 |
|
144 |
def filter_plots(searched_query: str):
|
145 |
searched_model_names = searched_query.split("|")
|
|
|
147 |
searched_model_names = [n for n in searched_model_names if n]
|
148 |
|
149 |
results = []
|
150 |
+
for plot_display, description_element, plot, row in plots:
|
151 |
visible = True
|
152 |
if "df" in row and pd.notna(row["df"]):
|
153 |
buffer = io.StringIO(row["df"])
|
|
|
165 |
visible = False
|
166 |
|
167 |
results.append(gr.Plot(plot, visible=visible))
|
168 |
+
if not description_element.value:
|
169 |
+
visible = False
|
170 |
+
results.append(gr.Markdown(visible=visible))
|
171 |
|
172 |
return results
|
173 |
|
|
|
302 |
filter_button.click(
|
303 |
fn=filter_plots,
|
304 |
inputs=filter_textbox,
|
305 |
+
outputs=list(chain.from_iterable([v[0:2] for v in plots])),
|
306 |
api_name="filter_plots",
|
307 |
)
|
308 |
filter_textbox.submit(
|
309 |
fn=filter_plots,
|
310 |
inputs=filter_textbox,
|
311 |
+
outputs=list(chain.from_iterable([v[0:2] for v in plots])),
|
312 |
api_name="filter_plots",
|
313 |
)
|
314 |
collapse_languages_button.click(
|
app_constants.py
CHANGED
@@ -2,10 +2,10 @@ import pandas as pd
|
|
2 |
|
3 |
README = """
|
4 |
This project compares different large language models and their providers for real time applications and mass data processing.
|
5 |
-
While other benchmarks compare LLMs on different human intelligence tasks this benchmark
|
6 |
|
7 |
To perform evaluation we chose a task of newspaper articles summarization from [GEM/xlsum](https://huggingface.co/datasets/GEM/xlsum) dataset as it represents a very standard type of task where model has to understand unstructured natural language text, process it and output text in a specified format.
|
8 |
-
For this version we chose English, Ukrainian and Japanese languages, with Japanese representing languages using logographic alphabets. This
|
9 |
|
10 |
Each of the models was asked to summarize the text using the following prompt:
|
11 |
|
|
|
2 |
|
3 |
README = """
|
4 |
This project compares different large language models and their providers for real time applications and mass data processing.
|
5 |
+
While other benchmarks compare LLMs on different human intelligence tasks this benchmark focuses on features related to business and engineering aspects such as response times, pricing and data streaming capabilities.
|
6 |
|
7 |
To perform evaluation we chose a task of newspaper articles summarization from [GEM/xlsum](https://huggingface.co/datasets/GEM/xlsum) dataset as it represents a very standard type of task where model has to understand unstructured natural language text, process it and output text in a specified format.
|
8 |
+
For this version we chose English, Ukrainian and Japanese languages, with Japanese representing languages using logographic alphabets. This enables us to also validate the effectiveness of the LLM for different language groups.
|
9 |
|
10 |
Each of the models was asked to summarize the text using the following prompt:
|
11 |
|
data/combined_plots.csv
CHANGED
The diff for this file is too large to render.
See raw diff
|
|