"""Streamlit app for browsing filtered samples of The Stack inspection data.

The sidebar selects a language/extension pair and a set of thresholds; the
main area shows one file from the samples that satisfy every threshold.
"""
import json

import pandas as pd
import streamlit as st
from datasets import load_dataset

st.set_page_config(page_title="The Stack data Inspection", layout="wide")
st.sidebar.title("The Stack data Inspection")

# Build the language -> [extensions] mapping from the distribution CSV.
# Insertion order follows the CSV rows, so the selectboxes list languages and
# extensions in the same order as the file.
df = pd.read_csv("new_extension_distribution.csv")
all_extensions = df["extension"].tolist()  # flat list; not used by the widgets below
tags = {}
for language, extension in zip(df["language"], df["extension"]):
    tags.setdefault(language, []).append(extension)
all_languages = list(tags)


@st.cache_data()
def load_data(language, ext):
    """Load the ``train`` split of the inspection data for one extension.

    Args:
        language: Language directory name under ``data/`` in the dataset repo.
        ext: Extension directory name under ``data/<language>/``.

    Returns:
        The dataset returned by ``load_dataset`` for
        ``data/{language}/{ext}``. The result is memoized by
        ``st.cache_data`` per ``(language, ext)`` pair.
    """
    return load_dataset(
        "loubnabnl/the-stack-inspection-data",
        data_dir=f"data/{language}/{ext}",
        split="train",
    )


# The selectors live in the sidebar (``st.sidebar.*`` places them there
# explicitly), so no main-area columns are needed around them.
chosen_language = st.sidebar.selectbox(
    label="Select a programming language", options=all_languages, index=0
)
chosen_ext = st.sidebar.selectbox(
    label="Select an extension", options=tags[chosen_language], index=0
)

st.sidebar.header("Filters")
not_lexable = st.sidebar.checkbox("Not lexable")
min_alphanum = st.sidebar.slider("Minimum alphanumeric fraction", 0.0, 1.0, 1.0)
max_line_length = st.sidebar.slider("Maximum line length", 0, 1000, 0)
max_mean_line_length = st.sidebar.slider("Maximum average line length", 0, 500, 0)
st.sidebar.markdown(
    "Printed files have `max_line_length` and `average_line_length` larger than "
    "the selected values. `alphanumeric_fraction` is smaller than the selected value."
)


def keep_sample(sample):
    """Return True when ``sample`` satisfies every active sidebar filter.

    A sample is kept when its ``alphanum_fraction`` is strictly below the
    selected fraction, its ``max_line_length`` and ``avg_line_length`` are
    strictly above the selected values, and, if "Not lexable" is ticked, its
    ``lexable`` field is falsy.
    """
    if not_lexable and sample["lexable"]:
        return False
    return (
        sample["alphanum_fraction"] < min_alphanum
        and sample["max_line_length"] > max_line_length
        and sample["avg_line_length"] > max_mean_line_length
    )


# Load the dataset and apply all filters in a single pass over the samples.
samples = load_data(chosen_language, chosen_ext).filter(keep_sample)

max_docs = len(samples)
if max_docs > 0:
    # Half-width column keeps the index picker from spanning the wide layout.
    col_1, _ = st.columns([3, 3])
    with col_1:
        index_example = st.number_input(
            f"Extension {chosen_ext} has {max_docs} files, choose one to visualize:",
            min_value=0,
            max_value=max_docs - 1,
            value=0,
            step=1,
        )

    example = samples[index_example]
    st.markdown("#### File content:")
    if example["lexable"]:
        st.code(example["content"], language=chosen_language)
    else:
        # Non-lexable files are shown as plain text, without highlighting.
        st.text("File can't be lexed so we remove syntax highlighting.\nContent:\n")
        st.text(str(example["content"]))
else:
    st.text("The dataset is empty after the filtering!")