nits
Browse files- src/logic/data_fetching.py +5 -7
- src/view/help_tab.py +4 -4
- src/view/metric_view_tab.py +14 -27
- src/view/reverse_search_tab.py +1 -1
- src/view/view.py +2 -3
src/logic/data_fetching.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
from functools import partial
|
2 |
import os
|
3 |
import json
|
4 |
import re
|
@@ -17,19 +17,17 @@ def find_folders(base_folder: str, path: str) -> List[str]:
|
|
17 |
base_folder_df = get_datafolder(base_folder)
|
18 |
if not base_folder_df.exists(path):
|
19 |
return []
|
20 |
-
return
|
21 |
-
[
|
22 |
folder
|
23 |
for folder,info in base_folder_df.find(path, maxdepth=1, withdirs=True, detail=True).items()
|
24 |
if info["type"] == "directory" and not (folder.rstrip("/") == path.rstrip("/"))
|
25 |
]
|
26 |
-
)
|
27 |
|
28 |
-
def fetch_datasets(base_folder: str):
|
29 |
-
datasets = sorted(find_folders(base_folder, ""))
|
30 |
if len(datasets) == 0:
|
31 |
raise ValueError("No datasets found")
|
32 |
-
return datasets
|
33 |
|
34 |
def fetch_groups(base_folder: str, datasets: List[str], old_groups: str, type: str = "intersection"):
|
35 |
if not datasets:
|
|
|
1 |
+
from functools import lru_cache, partial
|
2 |
import os
|
3 |
import json
|
4 |
import re
|
|
|
17 |
base_folder_df = get_datafolder(base_folder)
|
18 |
if not base_folder_df.exists(path):
|
19 |
return []
|
20 |
+
return [
|
|
|
21 |
folder
|
22 |
for folder,info in base_folder_df.find(path, maxdepth=1, withdirs=True, detail=True).items()
|
23 |
if info["type"] == "directory" and not (folder.rstrip("/") == path.rstrip("/"))
|
24 |
]
|
|
|
25 |
|
26 |
+
def fetch_datasets(base_folder: str, progress=gr.Progress()):
|
27 |
+
datasets = sorted(progress.tqdm(find_folders(base_folder, "")))
|
28 |
if len(datasets) == 0:
|
29 |
raise ValueError("No datasets found")
|
30 |
+
return datasets, None
|
31 |
|
32 |
def fetch_groups(base_folder: str, datasets: List[str], old_groups: str, type: str = "intersection"):
|
33 |
if not datasets:
|
src/view/help_tab.py
CHANGED
@@ -7,10 +7,10 @@ def create_help_tab():
|
|
7 |
|
8 |
# Dataset Metrics Explorer
|
9 |
## Features:
|
10 |
-
-
|
11 |
-
- Search for
|
12 |
|
13 |
-
## View
|
14 |
1) Specify Metrics location (Stats block `output_folder`) and click "Fetch Datasets"
|
15 |
2) Select datasets you are interested in using the dropdown or regex filter
|
16 |
3) Specify Grouping (histogram/summary/fqdn/suffix) and Metric name
|
@@ -27,7 +27,7 @@ def create_help_tab():
|
|
27 |
- **summary**: Shows the average value of given metric for every dataset
|
28 |
* show_stds: Show the standard deviation from mean for every datasets
|
29 |
|
30 |
-
## Reverse
|
31 |
To search for datasets containing a grouping and certain metric, use the Reverse search section.
|
32 |
Specify the search parameters and click "Search". This will show you found datasets in the "Found datasets" textbox. You can modify the selection after search by removing unwanted lines and clicking "Add to selection".
|
33 |
|
|
|
7 |
|
8 |
# Dataset Metrics Explorer
|
9 |
## Features:
|
10 |
+
- Inspect datasets through various metrics computed using datatrove
|
11 |
+
- Search for datasets containing certain metrics
|
12 |
|
13 |
+
## Metrics View Usage:
|
14 |
1) Specify Metrics location (Stats block `output_folder`) and click "Fetch Datasets"
|
15 |
2) Select datasets you are interested in using the dropdown or regex filter
|
16 |
3) Specify Grouping (histogram/summary/fqdn/suffix) and Metric name
|
|
|
27 |
- **summary**: Shows the average value of given metric for every dataset
|
28 |
* show_stds: Show the standard deviation from mean for every datasets
|
29 |
|
30 |
+
## Reverse Metrics Search Usage:
|
31 |
To search for datasets containing a grouping and certain metric, use the Reverse search section.
|
32 |
Specify the search parameters and click "Search". This will show you found datasets in the "Found datasets" textbox. You can modify the selection after search by removing unwanted lines and clicking "Add to selection".
|
33 |
|
src/view/metric_view_tab.py
CHANGED
@@ -6,12 +6,12 @@ from functools import partial
|
|
6 |
import re
|
7 |
import json
|
8 |
|
9 |
-
from src.logic.data_fetching import fetch_datasets, fetch_graph_data, fetch_groups, fetch_metrics, update_datasets_with_regex
|
10 |
from src.logic.data_processing import export_data
|
11 |
from src.logic.graph_settings import update_graph_options
|
12 |
from src.logic.plotting import plot_data
|
13 |
|
14 |
-
def create_metric_view_tab(METRICS_LOCATION_DEFAULT: str, available_datasets: gr.State
|
15 |
metric_data = gr.State([])
|
16 |
|
17 |
with gr.Row():
|
@@ -120,53 +120,40 @@ def create_metric_view_tab(METRICS_LOCATION_DEFAULT: str, available_datasets: gr
|
|
120 |
|
121 |
|
122 |
|
123 |
-
def update_selected_datasets_dropdown(available_datasets,
|
124 |
-
|
|
|
|
|
125 |
|
126 |
|
127 |
datasets_fetch.click(
|
128 |
fn=fetch_datasets,
|
129 |
inputs=[base_folder],
|
130 |
-
outputs=[available_datasets],
|
131 |
)
|
132 |
|
133 |
available_datasets.change(
|
134 |
fn=update_selected_datasets_dropdown,
|
135 |
-
inputs=[available_datasets,
|
136 |
outputs=selected_datasets_dropdown,
|
137 |
)
|
138 |
|
139 |
regex_button.click(
|
140 |
fn=update_datasets_with_regex,
|
141 |
-
inputs=[regex_select,
|
142 |
-
outputs=
|
143 |
)
|
144 |
|
145 |
-
def update_selected_datasets(selected_datasets_dropdown):
|
146 |
-
return selected_datasets_dropdown
|
147 |
|
148 |
selected_datasets_dropdown.change(
|
149 |
-
fn=update_selected_datasets,
|
150 |
-
inputs=[selected_datasets_dropdown],
|
151 |
-
outputs=selected_datasets,
|
152 |
-
)
|
153 |
-
|
154 |
-
selected_datasets.change(
|
155 |
-
fn=update_selected_datasets_dropdown,
|
156 |
-
inputs=[available_datasets, selected_datasets],
|
157 |
-
outputs=selected_datasets_dropdown,
|
158 |
-
)
|
159 |
-
|
160 |
-
|
161 |
-
selected_datasets.change(
|
162 |
fn=fetch_groups,
|
163 |
-
inputs=[base_folder,
|
164 |
outputs=grouping_dropdown,
|
165 |
)
|
166 |
|
167 |
grouping_dropdown.change(
|
168 |
fn=fetch_metrics,
|
169 |
-
inputs=[base_folder,
|
170 |
outputs=metric_name_dropdown,
|
171 |
)
|
172 |
|
@@ -174,7 +161,7 @@ def create_metric_view_tab(METRICS_LOCATION_DEFAULT: str, available_datasets: gr
|
|
174 |
fn=fetch_graph_data,
|
175 |
inputs=[
|
176 |
base_folder,
|
177 |
-
|
178 |
metric_name_dropdown,
|
179 |
grouping_dropdown,
|
180 |
],
|
@@ -219,4 +206,4 @@ def create_metric_view_tab(METRICS_LOCATION_DEFAULT: str, available_datasets: gr
|
|
219 |
outputs=[export_data_json],
|
220 |
)
|
221 |
|
222 |
-
return base_folder
|
|
|
6 |
import re
|
7 |
import json
|
8 |
|
9 |
+
from src.logic.data_fetching import fetch_datasets, fetch_graph_data, fetch_groups, fetch_metrics, update_datasets_with_regex, update_datasets_with_regex
|
10 |
from src.logic.data_processing import export_data
|
11 |
from src.logic.graph_settings import update_graph_options
|
12 |
from src.logic.plotting import plot_data
|
13 |
|
14 |
+
def create_metric_view_tab(METRICS_LOCATION_DEFAULT: str, available_datasets: gr.State):
|
15 |
metric_data = gr.State([])
|
16 |
|
17 |
with gr.Row():
|
|
|
120 |
|
121 |
|
122 |
|
123 |
+
def update_selected_datasets_dropdown(available_datasets, selected_datasets_dropdown):
|
124 |
+
selected_datasets = selected_datasets_dropdown or []
|
125 |
+
selected_datasets = set(selected_datasets) & set(available_datasets)
|
126 |
+
return gr.Dropdown(choices=available_datasets, value=sorted(list(selected_datasets)))
|
127 |
|
128 |
|
129 |
datasets_fetch.click(
|
130 |
fn=fetch_datasets,
|
131 |
inputs=[base_folder],
|
132 |
+
outputs=[available_datasets, selected_datasets_dropdown],
|
133 |
)
|
134 |
|
135 |
available_datasets.change(
|
136 |
fn=update_selected_datasets_dropdown,
|
137 |
+
inputs=[available_datasets, selected_datasets_dropdown],
|
138 |
outputs=selected_datasets_dropdown,
|
139 |
)
|
140 |
|
141 |
regex_button.click(
|
142 |
fn=update_datasets_with_regex,
|
143 |
+
inputs=[regex_select, selected_datasets_dropdown, available_datasets],
|
144 |
+
outputs=selected_datasets_dropdown,
|
145 |
)
|
146 |
|
|
|
|
|
147 |
|
148 |
selected_datasets_dropdown.change(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
149 |
fn=fetch_groups,
|
150 |
+
inputs=[base_folder, selected_datasets_dropdown, grouping_dropdown],
|
151 |
outputs=grouping_dropdown,
|
152 |
)
|
153 |
|
154 |
grouping_dropdown.change(
|
155 |
fn=fetch_metrics,
|
156 |
+
inputs=[base_folder, selected_datasets_dropdown, grouping_dropdown, metric_name_dropdown],
|
157 |
outputs=metric_name_dropdown,
|
158 |
)
|
159 |
|
|
|
161 |
fn=fetch_graph_data,
|
162 |
inputs=[
|
163 |
base_folder,
|
164 |
+
selected_datasets_dropdown,
|
165 |
metric_name_dropdown,
|
166 |
grouping_dropdown,
|
167 |
],
|
|
|
206 |
outputs=[export_data_json],
|
207 |
)
|
208 |
|
209 |
+
return base_folder, selected_datasets_dropdown
|
src/view/reverse_search_tab.py
CHANGED
@@ -3,7 +3,7 @@ import gradio as gr
|
|
3 |
|
4 |
from src.logic.data_fetching import fetch_groups, fetch_metrics, reverse_search, reverse_search_add
|
5 |
|
6 |
-
def create_reverse_search_tab(base_folder: gr.Textbox, datasets_available: gr.State, datasets_selected: gr.
|
7 |
reverse_search_headline = gr.Markdown(value="# Reverse Metrics Search")
|
8 |
|
9 |
with gr.Row():
|
|
|
3 |
|
4 |
from src.logic.data_fetching import fetch_groups, fetch_metrics, reverse_search, reverse_search_add
|
5 |
|
6 |
+
def create_reverse_search_tab(base_folder: gr.Textbox, datasets_available: gr.State, datasets_selected: gr.Dropdown):
|
7 |
reverse_search_headline = gr.Markdown(value="# Reverse Metrics Search")
|
8 |
|
9 |
with gr.Row():
|
src/view/view.py
CHANGED
@@ -11,16 +11,15 @@ METRICS_LOCATION_DEFAULT = os.getenv("METRICS_LOCATION_DEFAULT", "hf://datasets/
|
|
11 |
|
12 |
def create_interface():
|
13 |
with gr.Blocks() as demo:
|
14 |
-
metrics_headline = gr.Markdown(value="# Metrics
|
15 |
available_datasets = gr.State([])
|
16 |
-
selected_datasets = gr.State([])
|
17 |
|
18 |
with gr.Tabs():
|
19 |
with gr.Tab("Help"):
|
20 |
create_help_tab()
|
21 |
|
22 |
with gr.TabItem("Metric View"):
|
23 |
-
base_folder = create_metric_view_tab(METRICS_LOCATION_DEFAULT, available_datasets
|
24 |
|
25 |
with gr.TabItem("Reverse Metrics Search"):
|
26 |
create_reverse_search_tab(base_folder, available_datasets, selected_datasets)
|
|
|
11 |
|
12 |
def create_interface():
|
13 |
with gr.Blocks() as demo:
|
14 |
+
metrics_headline = gr.Markdown(value="# Datasets Metrics Explorer")
|
15 |
available_datasets = gr.State([])
|
|
|
16 |
|
17 |
with gr.Tabs():
|
18 |
with gr.Tab("Help"):
|
19 |
create_help_tab()
|
20 |
|
21 |
with gr.TabItem("Metric View"):
|
22 |
+
base_folder, selected_datasets = create_metric_view_tab(METRICS_LOCATION_DEFAULT, available_datasets)
|
23 |
|
24 |
with gr.TabItem("Reverse Metrics Search"):
|
25 |
create_reverse_search_tab(base_folder, available_datasets, selected_datasets)
|