Spaces:
Build error
Build error
meg-huggingface
committed on
Commit
•
85cf91c
1
Parent(s):
6a9c993
Modularization and caching of text length widget
Browse files- app.py +3 -12
- data_measurements/dataset_statistics.py +59 -14
- data_measurements/streamlit_utils.py +8 -17
app.py
CHANGED
@@ -177,15 +177,7 @@ def show_column(dstats, ds_name_to_dict, show_embeddings, column_id, use_cache=T
|
|
177 |
logs.info("showing general stats")
|
178 |
st_utils.expander_general_stats(dstats, column_id)
|
179 |
st_utils.expander_label_distribution(dstats.fig_labels, column_id)
|
180 |
-
st_utils.expander_text_lengths(
|
181 |
-
dstats.tokenized_df,
|
182 |
-
dstats.fig_tok_length,
|
183 |
-
dstats.avg_length,
|
184 |
-
dstats.std_length,
|
185 |
-
OUR_TEXT_FIELD,
|
186 |
-
LENGTH_FIELD,
|
187 |
-
column_id,
|
188 |
-
)
|
189 |
st_utils.expander_text_duplicates(dstats, column_id)
|
190 |
|
191 |
# We do the loading of these after the others in order to have some time
|
@@ -197,8 +189,7 @@ def show_column(dstats, ds_name_to_dict, show_embeddings, column_id, use_cache=T
|
|
197 |
)
|
198 |
available_terms = npmi_stats.get_available_terms()
|
199 |
st_utils.npmi_widget(
|
200 |
-
column_id, available_terms, npmi_stats, _MIN_VOCAB_COUNT
|
201 |
-
)
|
202 |
logs.info("showing zipf")
|
203 |
st_utils.expander_zipf(dstats.z, dstats.zipf_fig, column_id)
|
204 |
if show_embeddings:
|
@@ -222,7 +213,7 @@ def main():
|
|
222 |
compare_mode = st.sidebar.checkbox("Comparison mode")
|
223 |
|
224 |
# When not doing new development, use the cache.
|
225 |
-
use_cache =
|
226 |
show_embeddings = st.sidebar.checkbox("Show embeddings")
|
227 |
# List of datasets for which embeddings are hard to compute:
|
228 |
|
|
|
177 |
logs.info("showing general stats")
|
178 |
st_utils.expander_general_stats(dstats, column_id)
|
179 |
st_utils.expander_label_distribution(dstats.fig_labels, column_id)
|
180 |
+
st_utils.expander_text_lengths(dstats, column_id)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
181 |
st_utils.expander_text_duplicates(dstats, column_id)
|
182 |
|
183 |
# We do the loading of these after the others in order to have some time
|
|
|
189 |
)
|
190 |
available_terms = npmi_stats.get_available_terms()
|
191 |
st_utils.npmi_widget(
|
192 |
+
column_id, available_terms, npmi_stats, _MIN_VOCAB_COUNT)
|
|
|
193 |
logs.info("showing zipf")
|
194 |
st_utils.expander_zipf(dstats.z, dstats.zipf_fig, column_id)
|
195 |
if show_embeddings:
|
|
|
213 |
compare_mode = st.sidebar.checkbox("Comparison mode")
|
214 |
|
215 |
# When not doing new development, use the cache.
|
216 |
+
use_cache = True
|
217 |
show_embeddings = st.sidebar.checkbox("Show embeddings")
|
218 |
# List of datasets for which embeddings are hard to compute:
|
219 |
|
data_measurements/dataset_statistics.py
CHANGED
@@ -197,6 +197,7 @@ class DatasetStatisticsCacheClass:
|
|
197 |
# Tokenized text
|
198 |
self.tokenized_df = None
|
199 |
# save sentence length histogram in the class so it doesn't get re-computed
|
|
|
200 |
self.fig_tok_length = None
|
201 |
# Data Frame version of self.label_dset
|
202 |
self.label_df = None
|
@@ -262,6 +263,8 @@ class DatasetStatisticsCacheClass:
|
|
262 |
self.text_dset_fid = pjoin(self.cache_path, "text_dset")
|
263 |
self.tokenized_df_fid = pjoin(self.cache_path, "tokenized_df.feather")
|
264 |
self.label_dset_fid = pjoin(self.cache_path, "label_dset")
|
|
|
|
|
265 |
self.vocab_counts_df_fid = pjoin(self.cache_path, "vocab_counts.feather")
|
266 |
self.general_stats_fid = pjoin(self.cache_path, "general_stats_dict.json")
|
267 |
self.dup_counts_df_fid = pjoin(
|
@@ -317,24 +320,66 @@ class DatasetStatisticsCacheClass:
|
|
317 |
|
318 |
|
319 |
def load_or_prepare_text_lengths(self, save=True):
|
320 |
-
|
321 |
-
|
322 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
323 |
self.fig_tok_length = read_plotly(self.fig_tok_length_fid)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
324 |
if self.tokenized_df is None:
|
325 |
self.tokenized_df = self.do_tokenization()
|
326 |
-
self.tokenized_df[LENGTH_FIELD] = self.tokenized_df[
|
327 |
-
|
328 |
-
|
329 |
-
|
330 |
-
|
331 |
-
)
|
332 |
-
self.std_length = round(
|
333 |
-
statistics.stdev(self.tokenized_df[self.our_length_field]), 1
|
334 |
)
|
335 |
-
|
336 |
-
|
337 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
338 |
|
339 |
def load_or_prepare_embeddings(self, save=True):
|
340 |
if self.use_cache and exists(self.node_list_fid) and exists(self.fig_tree_fid):
|
|
|
197 |
# Tokenized text
|
198 |
self.tokenized_df = None
|
199 |
# save sentence length histogram in the class so it doesn't get re-computed
|
200 |
+
self.length_df = None
|
201 |
self.fig_tok_length = None
|
202 |
# Data Frame version of self.label_dset
|
203 |
self.label_df = None
|
|
|
263 |
self.text_dset_fid = pjoin(self.cache_path, "text_dset")
|
264 |
self.tokenized_df_fid = pjoin(self.cache_path, "tokenized_df.feather")
|
265 |
self.label_dset_fid = pjoin(self.cache_path, "label_dset")
|
266 |
+
self.length_df_fid = pjoin(self.cache_path, "length_df.feather")
|
267 |
+
self.length_stats_fid = pjoin(self.cache_path, "length_stats.json")
|
268 |
self.vocab_counts_df_fid = pjoin(self.cache_path, "vocab_counts.feather")
|
269 |
self.general_stats_fid = pjoin(self.cache_path, "general_stats_dict.json")
|
270 |
self.dup_counts_df_fid = pjoin(
|
|
|
320 |
|
321 |
|
322 |
def load_or_prepare_text_lengths(self, save=True):
|
323 |
+
"""
|
324 |
+
The text length widget relies on this function, which provides
|
325 |
+
a figure of the text lengths, some text length statistics, and
|
326 |
+
a text length dataframe to peruse.
|
327 |
+
Args:
|
328 |
+
save:
|
329 |
+
Returns:
|
330 |
+
|
331 |
+
"""
|
332 |
+
# Text length figure
|
333 |
+
if (self.use_cache and exists(self.fig_tok_length_fid)):
|
334 |
self.fig_tok_length = read_plotly(self.fig_tok_length_fid)
|
335 |
+
else:
|
336 |
+
self.prepare_fig_text_lengths()
|
337 |
+
if save:
|
338 |
+
write_plotly(self.fig_tok_length, self.fig_tok_length_fid)
|
339 |
+
|
340 |
+
# Text length dataframe
|
341 |
+
if self.use_cache and exists(self.length_df_fid):
|
342 |
+
self.length_df = feather.read_feather(self.length_df_fid)
|
343 |
+
else:
|
344 |
+
self.prepare_length_df()
|
345 |
+
if save:
|
346 |
+
write_df(self.length_df, self.length_df_fid)
|
347 |
+
|
348 |
+
# Text length stats.
|
349 |
+
if self.use_cache and exists(self.length_stats_fid):
|
350 |
+
with open(self.length_stats_fid, "r") as f:
|
351 |
+
self.length_stats_dict = json.load(f)
|
352 |
+
self.avg_length = self.length_stats_dict["avg length"]
|
353 |
+
self.std_length = self.length_stats_dict["std length"]
|
354 |
+
else:
|
355 |
+
self.prepare_text_length_stats()
|
356 |
+
if save:
|
357 |
+
write_json(self.length_stats_dict, self.length_stats_fid)
|
358 |
+
|
359 |
+
def prepare_length_df(self):
|
360 |
if self.tokenized_df is None:
|
361 |
self.tokenized_df = self.do_tokenization()
|
362 |
+
self.tokenized_df[LENGTH_FIELD] = self.tokenized_df[
|
363 |
+
TOKENIZED_FIELD].apply(len)
|
364 |
+
self.length_df = self.tokenized_df[
|
365 |
+
[LENGTH_FIELD, OUR_TEXT_FIELD]].sort_values(
|
366 |
+
by=[LENGTH_FIELD], ascending=True
|
|
|
|
|
|
|
367 |
)
|
368 |
+
|
369 |
+
def prepare_text_length_stats(self):
|
370 |
+
if self.tokenized_df is None or LENGTH_FIELD not in self.tokenized_df.columns:
|
371 |
+
self.prepare_length_df()
|
372 |
+
avg_length = sum(self.tokenized_df[LENGTH_FIELD])/len(self.tokenized_df[LENGTH_FIELD])
|
373 |
+
self.avg_length = round(avg_length, 1)
|
374 |
+
std_length = statistics.stdev(self.tokenized_df[LENGTH_FIELD])
|
375 |
+
self.std_length = round(std_length, 1)
|
376 |
+
self.length_stats_dict = {"avg length": self.avg_length,
|
377 |
+
"std length": self.std_length}
|
378 |
+
|
379 |
+
def prepare_fig_text_lengths(self):
|
380 |
+
if self.tokenized_df is None or LENGTH_FIELD not in self.tokenized_df.columns:
|
381 |
+
self.prepare_length_df()
|
382 |
+
self.fig_tok_length = make_fig_lengths(self.tokenized_df, LENGTH_FIELD)
|
383 |
|
384 |
def load_or_prepare_embeddings(self, save=True):
|
385 |
if self.use_cache and exists(self.node_list_fid) and exists(self.fig_tree_fid):
|
data_measurements/streamlit_utils.py
CHANGED
@@ -147,13 +147,7 @@ def expander_label_distribution(fig_labels, column_id):
|
|
147 |
st.markdown("No labels were found in the dataset")
|
148 |
|
149 |
|
150 |
-
def expander_text_lengths(
|
151 |
-
tokenized_df,
|
152 |
-
fig_tok_length,
|
153 |
-
avg_length,
|
154 |
-
std_length,
|
155 |
-
text_field_name,
|
156 |
-
length_field_name,
|
157 |
column_id,
|
158 |
):
|
159 |
_TEXT_LENGTH_CAPTION = (
|
@@ -165,31 +159,28 @@ def expander_text_lengths(
|
|
165 |
"Below, you can see how the lengths of the text instances in your dataset are distributed."
|
166 |
)
|
167 |
st.markdown(
|
168 |
-
"Any unexpected peaks or valleys in the distribution may help to identify
|
169 |
)
|
170 |
st.markdown(
|
171 |
"### Here is the relative frequency of different text lengths in your dataset:"
|
172 |
)
|
173 |
-
st.plotly_chart(fig_tok_length, use_container_width=True)
|
174 |
-
data = tokenized_df[[length_field_name, text_field_name]].sort_values(
|
175 |
-
by=["length"], ascending=True
|
176 |
-
)
|
177 |
st.markdown(
|
178 |
"The average length of text instances is **"
|
179 |
-
+ str(avg_length)
|
180 |
+ " words**, with a standard deviation of **"
|
181 |
-
+ str(std_length)
|
182 |
+ "**."
|
183 |
)
|
184 |
|
185 |
start_id_show_lengths = st.slider(
|
186 |
f"Show the shortest sentences{column_id} starting at:",
|
187 |
0,
|
188 |
-
len(
|
189 |
value=0,
|
190 |
step=1,
|
191 |
)
|
192 |
-
st.dataframe(
|
193 |
|
194 |
|
195 |
### Third, use a sentence embedding model
|
@@ -404,7 +395,7 @@ with an ideal α value of 1."""
|
|
404 |
|
405 |
|
406 |
### Finally finally finally, show nPMI stuff.
|
407 |
-
def npmi_widget(column_id, available_terms, npmi_stats, min_vocab
|
408 |
"""
|
409 |
Part of the main app, but uses a user interaction so pulled out as its own f'n.
|
410 |
:param use_cache:
|
|
|
147 |
st.markdown("No labels were found in the dataset")
|
148 |
|
149 |
|
150 |
+
def expander_text_lengths(dstats,
|
|
|
|
|
|
|
|
|
|
|
|
|
151 |
column_id,
|
152 |
):
|
153 |
_TEXT_LENGTH_CAPTION = (
|
|
|
159 |
"Below, you can see how the lengths of the text instances in your dataset are distributed."
|
160 |
)
|
161 |
st.markdown(
|
162 |
+
"Any unexpected peaks or valleys in the distribution may help to identify instances you want to remove or augment."
|
163 |
)
|
164 |
st.markdown(
|
165 |
"### Here is the relative frequency of different text lengths in your dataset:"
|
166 |
)
|
167 |
+
st.plotly_chart(dstats.fig_tok_length, use_container_width=True)
|
|
|
|
|
|
|
168 |
st.markdown(
|
169 |
"The average length of text instances is **"
|
170 |
+
+ str(dstats.avg_length)
|
171 |
+ " words**, with a standard deviation of **"
|
172 |
+
+ str(dstats.std_length)
|
173 |
+ "**."
|
174 |
)
|
175 |
|
176 |
start_id_show_lengths = st.slider(
|
177 |
f"Show the shortest sentences{column_id} starting at:",
|
178 |
0,
|
179 |
+
len(dstats.length_df["length"].unique()),
|
180 |
value=0,
|
181 |
step=1,
|
182 |
)
|
183 |
+
st.dataframe(dstats.length_df[dstats.length_df["length"] == start_id_show_lengths].set_index("length"))
|
184 |
|
185 |
|
186 |
### Third, use a sentence embedding model
|
|
|
395 |
|
396 |
|
397 |
### Finally finally finally, show nPMI stuff.
|
398 |
+
def npmi_widget(column_id, available_terms, npmi_stats, min_vocab):
|
399 |
"""
|
400 |
Part of the main app, but uses a user interaction so pulled out as its own f'n.
|
401 |
:param use_cache:
|