Spaces:

amu-cai
/

pl-asr-survey

Sleeping

mj-new committed on Mar 11

Commit

4d749f0

•

1 Parent(s): a958ea3

Added taxonomy tab for data catalog

Files changed (3) hide show

__pycache__/contants.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/contants.cpython-310.pyc and b/__pycache__/contants.cpython-310.pyc differ

app.py CHANGED Viewed

@@ -2,7 +2,7 @@ import pandas as pd
 import streamlit as st
 from app_utils import filter_dataframe, calculate_height_to_display
-from contants import INFO_CATALOG, CITATION_CATALOG, HOWTO_CATALOG,INFO_BENCHMARK, CITATION_BENCHMARK, HOWTO_BENCHMARK, INFO_MAIN, CITATION_MAIN
 from utils import BASE_SUMMARY_METRICS
 from utils import  load_data_catalog, load_data_taxonomy, load_bench_catalog, load_bench_taxonomy
 from utils import datasets_count_and_size, datasets_count_and_size_standard, metadata_coverage, catalog_summary_statistics
@@ -126,6 +126,15 @@ with data_survey:
     df_datasets_per_device = datasets_count_and_size(df_data_cat_available_paid, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
     st.dataframe(df_datasets_per_device, use_container_width=False)
 with bench_cat:
     st.write("Benchmarks catalog")
     # TODO - load and display benchmarks catalog

 import streamlit as st
 from app_utils import filter_dataframe, calculate_height_to_display
+from contants import INFO_CATALOG, CITATION_CATALOG, HOWTO_CATALOG,INFO_BENCHMARK, CITATION_BENCHMARK, HOWTO_BENCHMARK, INFO_MAIN, CITATION_MAIN, HOWTO_TAXONOMY_CAT
 from utils import BASE_SUMMARY_METRICS
 from utils import  load_data_catalog, load_data_taxonomy, load_bench_catalog, load_bench_taxonomy
 from utils import datasets_count_and_size, datasets_count_and_size_standard, metadata_coverage, catalog_summary_statistics
     df_datasets_per_device = datasets_count_and_size(df_data_cat_available_paid, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
     st.dataframe(df_datasets_per_device, use_container_width=False)
+with data_taxonomy:
+    st.title("Polish ASR Speech Data Taxonomy")
+    st.header("How to use?")
+    st.markdown(HOWTO_TAXONOMY_CAT, unsafe_allow_html=True)
+    st.dataframe(df_data_tax, hide_index=True, use_container_width=True)
+    st.header("How to cite?")
+    st.markdown(CITATION_CATALOG, unsafe_allow_html=True)
 with bench_cat:
     st.write("Benchmarks catalog")
     # TODO - load and display benchmarks catalog

contants.py CHANGED Viewed

@@ -52,6 +52,10 @@ Depending on the column type, you can use the search box to filter the content.
 Please refer to the **ASR Speech Data Taxonomy** tab for the explanation of the columns. <br> \
 If you looking for insights derived from the collected in the catalog, please go to **Polish ASR Speech Data Survey** tab. <br>"
 ############################################################################################################
 INFO_BENCHMARK = "TODO"

 Please refer to the **ASR Speech Data Taxonomy** tab for the explanation of the columns. <br> \
 If you looking for insights derived from the collected in the catalog, please go to **Polish ASR Speech Data Survey** tab. <br>"
+HOWTO_TAXONOMY_CAT = "This table presents descriptors (columns) used in the *Polish ASR Speech Datasets Catalog* <br> \
+Taxonomy is also provided on [GitHub as TSV file](https://github.com/goodmike31/pl-asr-speech-data-survey/blob/main/snapshots/pl-asr-speech-datasets-catalog-latest.tsv) and [Google Sheets](https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=2015613057)"
 ############################################################################################################
 INFO_BENCHMARK = "TODO"