mj-new committed on
Commit
4d749f0
1 Parent(s): a958ea3

Added taxonomy tab for data catalog

Browse files
Files changed (3) hide show
  1. __pycache__/contants.cpython-310.pyc +0 -0
  2. app.py +10 -1
  3. contants.py +4 -0
__pycache__/contants.cpython-310.pyc CHANGED
Binary files a/__pycache__/contants.cpython-310.pyc and b/__pycache__/contants.cpython-310.pyc differ
 
app.py CHANGED
@@ -2,7 +2,7 @@ import pandas as pd
2
  import streamlit as st
3
 
4
  from app_utils import filter_dataframe, calculate_height_to_display
5
- from contants import INFO_CATALOG, CITATION_CATALOG, HOWTO_CATALOG,INFO_BENCHMARK, CITATION_BENCHMARK, HOWTO_BENCHMARK, INFO_MAIN, CITATION_MAIN
6
  from utils import BASE_SUMMARY_METRICS
7
  from utils import load_data_catalog, load_data_taxonomy, load_bench_catalog, load_bench_taxonomy
8
  from utils import datasets_count_and_size, datasets_count_and_size_standard, metadata_coverage, catalog_summary_statistics
@@ -126,6 +126,15 @@ with data_survey:
126
  df_datasets_per_device = datasets_count_and_size(df_data_cat_available_paid, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
127
  st.dataframe(df_datasets_per_device, use_container_width=False)
128
 
 
 
 
 
 
 
 
 
 
129
  with bench_cat:
130
  st.write("Benchmarks catalog")
131
  # TODO - load and display benchmarks catalog
 
2
  import streamlit as st
3
 
4
  from app_utils import filter_dataframe, calculate_height_to_display
5
+ from contants import INFO_CATALOG, CITATION_CATALOG, HOWTO_CATALOG,INFO_BENCHMARK, CITATION_BENCHMARK, HOWTO_BENCHMARK, INFO_MAIN, CITATION_MAIN, HOWTO_TAXONOMY_CAT
6
  from utils import BASE_SUMMARY_METRICS
7
  from utils import load_data_catalog, load_data_taxonomy, load_bench_catalog, load_bench_taxonomy
8
  from utils import datasets_count_and_size, datasets_count_and_size_standard, metadata_coverage, catalog_summary_statistics
 
126
  df_datasets_per_device = datasets_count_and_size(df_data_cat_available_paid, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
127
  st.dataframe(df_datasets_per_device, use_container_width=False)
128
 
129
+ with data_taxonomy:
130
+ st.title("Polish ASR Speech Data Taxonomy")
131
+ st.header("How to use?")
132
+ st.markdown(HOWTO_TAXONOMY_CAT, unsafe_allow_html=True)
133
+ st.dataframe(df_data_tax, hide_index=True, use_container_width=True)
134
+ st.header("How to cite?")
135
+ st.markdown(CITATION_CATALOG, unsafe_allow_html=True)
136
+
137
+
138
  with bench_cat:
139
  st.write("Benchmarks catalog")
140
  # TODO - load and display benchmarks catalog
contants.py CHANGED
@@ -52,6 +52,10 @@ Depending on the column type, you can use the search box to filter the content.
52
  Please refer to the **ASR Speech Data Taxonomy** tab for the explanation of the columns. <br> \
53
  If you looking for insights derived from the collected in the catalog, please go to **Polish ASR Speech Data Survey** tab. <br>"
54
 
 
 
 
 
55
  ############################################################################################################
56
  INFO_BENCHMARK = "TODO"
57
 
 
52
  Please refer to the **ASR Speech Data Taxonomy** tab for the explanation of the columns. <br> \
53
  If you looking for insights derived from the collected in the catalog, please go to **Polish ASR Speech Data Survey** tab. <br>"
54
 
55
+
56
+ HOWTO_TAXONOMY_CAT = "This table presents descriptors (columns) used in the *Polish ASR Speech Datasets Catalog* <br> \
57
+ Taxonomy is also provided on [GitHub as TSV file](https://github.com/goodmike31/pl-asr-speech-data-survey/blob/main/snapshots/pl-asr-speech-datasets-catalog-latest.tsv) and [Google Sheets](https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=2015613057)"
58
+
59
  ############################################################################################################
60
  INFO_BENCHMARK = "TODO"
61