Spaces:

bigscience-data
/

bigscience-corpus

Runtime error

App Files Files Community

bigscience-corpus / app.py

cakiki

Update app.py

8874b76 over 1 year ago

raw

history blame

2.72 kB

	import json

	import streamlit as st

	# Browser-tab title, icon, layout and sidebar state for the app page.
	_PAGE_SETTINGS = {
	    "page_title": "BigScience Training Corpus",
	    "page_icon": "https://avatars.githubusercontent.com/u/82455566",
	    "layout": "wide",
	    "initial_sidebar_state": "auto",
	}
	st.set_page_config(**_PAGE_SETTINGS)

	# Snapshot of the URL query parameters; the "source" entry is read later
	# to pre-select a source in the sidebar.
	query_params = st.experimental_get_query_params()


	def _language_group(ln_code):
	    """Return the catalogue key for a language code.

	    Every code starting with "zh" is grouped under the single key "zh";
	    any other code is its own key.
	    """
	    return "zh" if ln_code.startswith("zh") else ln_code


	@st.cache_data
	def load_catalogue():
	    """Load the source catalogue and index it by language.

	    Reads ``resources/sources_with_info_cards.json`` and drops the entry
	    named "aggregated". Assumes the file holds a sequence of
	    ``(source_name, source)`` pairs, since each element is unpacked into
	    two values; each ``source`` must have a "languages" list whose items
	    carry "ln_code" and "size".

	    Returns:
	        A dict sorted by key. The "all" key maps to every source in file
	        order. Each language key maps to the sources listing that language
	        (``{source_name: source}``), ordered by that language's "size" in
	        the source, largest first.
	    """
	    # Use a context manager so the file handle is closed deterministically;
	    # JSON text is UTF-8, so do not rely on the platform default encoding.
	    with open(
	        "resources/sources_with_info_cards.json", encoding="utf-8"
	    ) as catalogue_file:
	        entries = json.load(catalogue_file)

	    full_catalogue = {
	        source_name: source
	        for source_name, source in entries
	        if source_name != "aggregated"
	    }

	    language_catalogues = {"all": full_catalogue}
	    for source_name, source in full_catalogue.items():
	        for ln_dct in source["languages"]:
	            group = _language_group(ln_dct["ln_code"])
	            language_catalogues.setdefault(group, {})[source_name] = source

	    def size_in_language(source, ln):
	        # Match on the grouped code, exactly as the grouping loop above
	        # does. Comparing the raw code would find nothing for a source
	        # tagged only with a "zh*" variant and fail the lookup. The first
	        # matching entry wins.
	        return next(
	            ln_dct["size"]
	            for ln_dct in source["languages"]
	            if _language_group(ln_dct["ln_code"]) == ln
	        )

	    # Only values of existing keys are replaced, so iterating the dict
	    # while assigning is safe.
	    for ln, sources in language_catalogues.items():
	        if ln != "all":
	            language_catalogues[ln] = dict(
	                sorted(
	                    sources.items(),
	                    key=lambda item: size_in_language(item[1], ln),
	                    reverse=True,
	                )
	            )
	    return dict(sorted(language_catalogues.items()))


	catalogue_by_ln = load_catalogue()

	with st.sidebar:
	    ln_select = st.selectbox(
	        "Show source list for language:",
	        catalogue_by_ln,
	    )

	    # Pick the initially selected source. Only the "all" view honours the
	    # "source" URL parameter; every other language starts on its first
	    # source.
	    source_names = list(catalogue_by_ln[ln_select])
	    default_index = 0
	    if ln_select == "all":
	        # The query parameters map each key to a list of values; use the
	        # first one.
	        requested = (query_params.get("source") or [None])[0]
	        # A stale or mistyped ?source=... must not crash the page: fall
	        # back to the first entry instead of letting list.index raise
	        # ValueError.
	        if requested in source_names:
	            default_index = source_names.index(requested)

	    source_select = st.selectbox(
	        "Show information for source:",
	        catalogue_by_ln[ln_select],
	        index=default_index,
	    )
	    # Write the current selection back to the URL query string.
	    st.experimental_set_query_params(**{"source": source_select})

	# The data card is always shown; it is looked up inside the expander so
	# the lookup happens in the same place as before.
	with st.expander(f"Dataset Card for {source_select}"):
	    selected_source = catalogue_by_ln["all"][source_select]
	    st.markdown(selected_source["data_card"])

	# Optional sections: each one is rendered only when the selected source
	# carries the corresponding key.
	optional_sections = (
	    ("catalogue_info", f"Catalogue Information for {source_select}"),
	    ("seed_info", f"Pseudocrawl Seed Information for {source_select}"),
	    ("hf_info", f"HF Dataset Information for {source_select}"),
	)
	for info_key, section_title in optional_sections:
	    if info_key not in catalogue_by_ln["all"][source_select]:
	        continue
	    with st.expander(section_title):
	        st.write(catalogue_by_ln["all"][source_select][info_key])