Spaces:
Runtime error
Runtime error
datapoints explorer app
Browse files- README.md +4 -5
- app.py +94 -0
- assets/data/datapoints_embeddings.parquet +3 -0
- requirements.txt +12 -0
README.md
CHANGED
@@ -1,13 +1,12 @@
|
|
1 |
---
|
2 |
title: Datapoints Explorer
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: streamlit
|
7 |
-
sdk_version: 1.
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: apache-2.0
|
11 |
---
|
12 |
|
13 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
title: Datapoints Explorer
|
3 |
+
emoji: π
|
4 |
+
colorFrom: red
|
5 |
+
colorTo: purple
|
6 |
sdk: streamlit
|
7 |
+
sdk_version: 1.9.0
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: apache-2.0
|
11 |
---
|
12 |
|
|
app.py
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## LIBRARIES ###
|
2 |
+
## Data
|
3 |
+
import pandas as pd
|
4 |
+
pd.options.display.float_format = '${:,.2f}'.format
|
5 |
+
|
6 |
+
# Analysis
|
7 |
+
|
8 |
+
# App & Visualization
|
9 |
+
import streamlit as st
|
10 |
+
from bokeh.models import CustomJS, ColumnDataSource, TextInput, DataTable, TableColumn
|
11 |
+
from bokeh.plotting import figure
|
12 |
+
from bokeh.transform import factor_cmap
|
13 |
+
from bokeh.palettes import Category20c_20
|
14 |
+
from bokeh.layouts import column, row
|
15 |
+
|
16 |
+
# utils
|
17 |
+
|
18 |
+
def datasets_explorer_viz(df):
    """Render the interactive datapoints explorer into the Streamlit page.

    Builds a Bokeh layout made of a search box, a scatter plot and a data
    table, wires them together with browser-side ``CustomJS`` callbacks and
    passes the layout to ``st.bokeh_chart``.

    * Box-selecting points in the plot copies their ``dataset_id`` and
      ``text`` values into the table.
    * Submitting a search term overlays, in red, every point whose
      ``dataset_id`` or ``text`` contains the term (case-sensitive substring
      match). An empty term clears the overlay.

    Args:
        df: DataFrame with one row per datapoint. This function reads the
            columns ``x``, ``y``, ``dataset_id`` and ``text``.

    Returns:
        None. The chart is emitted as a Streamlit element.
    """
    s = ColumnDataSource(df)
    TOOLTIPS = [("dataset_id", "@dataset_id"), ("text", "@text")]
    color = factor_cmap('dataset_id', palette=Category20c_20, factors=df['dataset_id'].unique())
    p = figure(plot_width=1000, plot_height=800, tools="hover,wheel_zoom,pan,box_select", tooltips=TOOLTIPS, toolbar_location="above")
    p.scatter('x', 'y', size=5, source=s, alpha=0.8, marker='circle', fill_color=color, line_color=color, legend_field='dataset_id')
    p.legend.location = "bottom_right"
    p.legend.click_policy = "mute"
    p.legend.label_text_font_size = "8pt"

    # Both auxiliary sources start empty but declare every column that the
    # table and the highlight glyph reference, so the referenced field names
    # exist before the first callback runs.
    table_source = ColumnDataSource(data=dict(dataset_id=[], text=[]))
    selection_source = ColumnDataSource(data=dict(x=[], y=[], dataset_id=[], text=[]))

    columns = [
        TableColumn(field="dataset_id", title="Dataset ID"),
        TableColumn(field="text", title="Text"),
    ]
    data_table = DataTable(source=table_source, columns=columns, width=800)

    # Red overlay for search hits, drawn on top of the main scatter.
    p.circle('x', 'y', source=selection_source, size=5, color='red')

    # Plot selection -> table contents.
    s.selected.js_on_change('indices', CustomJS(args=dict(umap_source=s, table_source=table_source), code="""
        const inds = cb_obj.indices;
        const umapData = umap_source.data;

        const text = [];
        const dataset_id = [];
        for (let i = 0; i < inds.length; i++) {
            text.push(umapData['text'][inds[i]]);
            dataset_id.push(umapData['dataset_id'][inds[i]]);
        }

        table_source.data = {dataset_id: dataset_id, text: text};
        table_source.change.emit();
        """
    ))

    text_input = TextInput(value="", title="Search")

    # Search term -> red overlay contents.
    text_input.js_on_change('value', CustomJS(args=dict(plot_source=s, selection_source=selection_source), code="""
        const plot_data = plot_source.data;
        const value = cb_obj.value;

        // Missing cells become empty strings so .includes() cannot throw.
        const as_text = (v) => (v == null ? '' : String(v));

        const matches = {x: [], y: [], dataset_id: [], text: []};

        // The empty string is a substring of every string; treat an empty
        // query as "no search" so clearing the box removes the highlight
        // instead of painting every point red.
        if (value.length > 0) {
            const n = plot_data['dataset_id'].length;
            for (let i = 0; i < n; i++) {
                const dataset_id = as_text(plot_data['dataset_id'][i]);
                const text = as_text(plot_data['text'][i]);
                if (dataset_id.includes(value) || text.includes(value)) {
                    matches['x'].push(plot_data['x'][i]);
                    matches['y'].push(plot_data['y'][i]);
                    matches['dataset_id'].push(plot_data['dataset_id'][i]);
                    matches['text'].push(plot_data['text'][i]);
                }
            }
        }

        selection_source.data = matches;
        selection_source.change.emit();
        """))

    st.bokeh_chart(row(column(text_input, p), data_table))
|
77 |
+
|
78 |
+
|
79 |
+
if __name__ == "__main__":
    ### STREAMLIT APP CONFIG ###
    # Page configuration is the first Streamlit call made by this script.
    st.set_page_config(layout="wide", page_title="Datapoints Explorer")
    st.title('Interactive Datapoints Explorer for Text Classification')

    ### USAGE NOTES ###
    with st.expander("How to interact with the plot:"):
        st.markdown("* Each point in the plot represents an example from the HF hub text classification datasets.")
        st.markdown("* The datapoints are embedded using sentence embeddings of their `text` field.")
        st.markdown("* You can either search for a datapoint or drag and select to peek into the cluster content.")
        st.markdown("* If the term you are searching for matches `dataset_id` or `text` it will be highlighted in *red*. The selected points will be summarized as a dataframe on the right.")

    ### LOAD DATA AND RENDER ###
    # The parquet path is relative to the working directory the app is
    # started from.
    datasets_df = pd.read_parquet('./assets/data/datapoints_embeddings.parquet')
    st.warning("Hugging Face 🤗 Datapoints Explorer for Text Classification")
    datasets_explorer_viz(datasets_df)
|
assets/data/datapoints_embeddings.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7285649e64341de3b1bc7261f093be42ea7119fe1e079d7ae849115f834b9fd9
|
3 |
+
size 1667528
|
requirements.txt
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
bokeh==2.4.1
|
2 |
+
Jinja2==3.1.2
|
3 |
+
PyYAML==6.0
|
4 |
+
numpy==1.22.4
|
5 |
+
packaging==21.3
|
6 |
+
Pillow==9.1.1
|
7 |
+
tornado==6.1
|
8 |
+
typing_extensions==4.2.0
|
9 |
+
MarkupSafe==2.1.1
|
10 |
+
pyparsing==3.0.9
|
11 |
+
pandas==1.4.2
|
12 |
+
streamlit==1.2.0
|