Spaces:
Runtime error
Runtime error
import json | |
import streamlit as st | |
# Page-level settings, applied before any other Streamlit call in the script.
_PAGE_CONFIG = {
    "page_title": "BigScience Training Corpus",
    "page_icon": "https://avatars.githubusercontent.com/u/82455566",
    "layout": "wide",
    "initial_sidebar_state": "auto",
}
st.set_page_config(**_PAGE_CONFIG)

# Query parameters of the current URL; the script later reads the "source"
# entry as a list of values and uses its first element.
query_params = st.experimental_get_query_params()
def _normalize_ln_code(ln_code):
    """Fold every language code starting with "zh" into the single code "zh"."""
    return "zh" if ln_code.startswith("zh") else ln_code


def _size_in_language(source, ln):
    """Return the "size" of the first language entry of *source* that maps to *ln*."""
    return next(
        ln_dct["size"]
        for ln_dct in source["languages"]
        if _normalize_ln_code(ln_dct["ln_code"]) == ln
    )


def load_catalogue():
    """Load the source catalogue and index it by language code.

    Reads ``resources/sources_with_info_cards.json``, which is iterated as
    ``(source_name, source)`` pairs, and drops the entry named
    ``"aggregated"``. Each remaining source is filed under the ``ln_code`` of
    every entry in its ``"languages"`` list; codes starting with ``"zh"``
    share one ``"zh"`` bucket.

    Returns:
        A dict ordered by key. ``"all"`` maps to the full catalogue in file
        order. Every other key is a language code mapping to
        ``{source_name: source}``, ordered by the source's ``"size"`` for
        that language, largest first.
    """
    # Context manager so the file handle is closed deterministically.
    with open(
        "resources/sources_with_info_cards.json", encoding="utf-8"
    ) as catalogue_file:
        entries = json.load(catalogue_file)

    full_catalogue = {
        source_name: source
        for source_name, source in entries
        if source_name != "aggregated"
    }

    language_catalogues = {"all": full_catalogue}
    for source_name, source in full_catalogue.items():
        for ln_dct in source["languages"]:
            ln_code = _normalize_ln_code(ln_dct["ln_code"])
            language_catalogues.setdefault(ln_code, {})[source_name] = source

    for ln, sources in language_catalogues.items():
        if ln == "all":
            continue
        # The key must apply the same "zh" folding used for grouping above;
        # comparing the raw code against the bucket name finds no match for a
        # source tagged only with a "zh"-prefixed variant and raises
        # IndexError.
        language_catalogues[ln] = dict(
            sorted(
                sources.items(),
                key=lambda item: _size_in_language(item[1], ln),
                reverse=True,
            )
        )

    return dict(sorted(language_catalogues.items(), key=lambda item: item[0]))
# Catalogue keyed by language code, plus the "all" entry holding every source.
catalogue_by_ln = load_catalogue()

with st.sidebar:
    ln_select = st.selectbox(
        "Show source list for language:",
        catalogue_by_ln,
    )

    # Only the "all" view honours the "source" query parameter; every other
    # language view starts on its first source.
    source_names = list(catalogue_by_ln[ln_select])
    source_index = 0
    if ln_select == "all":
        requested_values = query_params.get("source") or [None]
        requested_source = requested_values[0]
        # The value comes from the URL, so it may name a source that is not
        # in the catalogue; fall back to the first entry instead of letting
        # list.index() raise ValueError.
        if requested_source in source_names:
            source_index = source_names.index(requested_source)

    source_select = st.selectbox(
        "Show information for source:",
        catalogue_by_ln[ln_select],
        index=source_index,
    )

# Write the current selection back to the URL's "source" query parameter.
st.experimental_set_query_params(source=source_select)

# All detail sections read from the full catalogue entry of the selection.
source_info = catalogue_by_ln["all"][source_select]

with st.expander(f"Dataset Card for {source_select}"):
    st.markdown(source_info["data_card"])

# Optional sections: each is rendered only when its key is present.
for info_key, section_title in (
    ("catalogue_info", "Catalogue Information"),
    ("seed_info", "Pseudocrawl Seed Information"),
    ("hf_info", "HF Dataset Information"),
):
    if info_key in source_info:
        with st.expander(f"{section_title} for {source_select}"):
            st.write(source_info[info_key])