# parquet-viewer / app.py
# lhoestq's picture
# lhoestq HF staff
# revert
# 940f5c2 verified
from typing import Optional
import gradio as gr
import pandas as pd
import pyarrow.parquet as pq
from gradio_huggingfacehub_search import HuggingfaceHubSearch
from huggingface_hub import HfFileSystem
# Custom CSS passed to gr.Blocks: elements with the "settings" class get a
# transparent background, and the text inside their buttons uses the theme's
# subdued body-text color.
css = """
.settings {
background: transparent;
}
.settings button span {
color: var(--body-text-color-subdued);
}
"""
# Maximum number of characters shown per cell in the preview table.
_MAX_CELL_CHARS = 1000


def _hf_filesystem(oauth_token: Optional[gr.OAuthToken]) -> HfFileSystem:
    """Return a Hub filesystem using the logged-in user's token, or anonymous access."""
    return HfFileSystem(token=oauth_token.token if oauth_token else None)


def _count_label(count: int, noun: str) -> str:
    """Return "<count> <noun>", adding a plural "s" unless count is exactly 1."""
    return f"{count} {noun}{'' if count == 1 else 's'}"


# NOTE(review): the nesting of the layout below was reconstructed from a
# flattened source (indentation was lost) -- confirm against the deployed app.
with gr.Blocks(css=css) as demo:
    with gr.Row():
        with gr.Column(scale=10):
            gr.Markdown("# 👀 Parquet Viewer 📚")
            gr.Markdown("View the content of Parquet files inside a dataset repository or pull request.")
            dataset_search = HuggingfaceHubSearch(
                label="Hub Dataset ID",
                placeholder="Search for dataset id on Huggingface",
                search_type="dataset",
            )
            with gr.Row():
                # `choices` must be a list: a bare string would be split into
                # one choice per character.
                revision_dropdown = gr.Dropdown(choices=["main"], label="Revision", allow_custom_value=True)
                parquet_file_dropdown = gr.Dropdown(label="Parquet file", allow_custom_value=True)
            gr.Markdown("Parquet content:")
            output_dataframe = gr.DataFrame()
        # `min_width` is a pixel count (int), not a CSS length string.
        with gr.Column(scale=4, min_width=200):
            with gr.Accordion("Settings", open=False, elem_classes="settings"):
                gr.Markdown("Access private/gated repos")
                gr.LoginButton()

    @dataset_search.change(inputs=[dataset_search], outputs=[revision_dropdown, parquet_file_dropdown, output_dataframe])
    def dataset_update(dataset, oauth_token: Optional[gr.OAuthToken] = None):
        """Refresh the revision dropdown when the selected dataset changes.

        Args:
            dataset: Dataset id typed or picked in the search box. Only ids
                containing a "/" are looked up.
            oauth_token: Token of the logged-in user, or None when anonymous.

        Returns:
            A dict updating only ``revision_dropdown``: the ``main`` revision
            plus one entry per pull request, or an empty dropdown when the id
            is missing/incomplete or the Hub lookup fails.
        """
        # Check the id before touching the Hub; also tolerates None/"".
        if not dataset or "/" not in dataset:
            return {revision_dropdown: gr.Dropdown(choices=[], value="", info="")}
        fs = _hf_filesystem(oauth_token)
        try:
            discussions = fs._api.get_repo_discussions(dataset, repo_type="dataset", discussion_type="pull_request")
            prs = [f"{dataset}@refs/pr/{pr.num}" for pr in discussions]
        except Exception:
            # Deliberate best-effort: any Hub failure is shown as an empty
            # state in the UI instead of an error.
            return {revision_dropdown: gr.Dropdown(choices=[], value="", info="no revisions available")}
        revision = f"{dataset}@main"
        info = f"{_count_label(len(prs), 'pull request')} available" if prs else None
        return {revision_dropdown: gr.Dropdown(choices=[revision] + prs, value=revision, info=info)}

    @revision_dropdown.change(inputs=[revision_dropdown], outputs=[parquet_file_dropdown, output_dataframe])
    def revision_update(dataset_and_revision, oauth_token: Optional[gr.OAuthToken] = None):
        """Refresh the Parquet file dropdown when the selected revision changes.

        Args:
            dataset_and_revision: Value of the revision dropdown, in the
                ``<dataset>@<revision>`` form produced by ``dataset_update``.
            oauth_token: Token of the logged-in user, or None when anonymous.

        Returns:
            A dict updating only ``parquet_file_dropdown``: every ``*.parquet``
            path found under the revision (first one selected), or an empty
            dropdown when no revision is selected or the listing fails.
        """
        # Nothing selected (e.g. after an invalid dataset id): skip the Hub call.
        if not dataset_and_revision:
            return {parquet_file_dropdown: gr.Dropdown(choices=[], value="", info="")}
        fs = _hf_filesystem(oauth_token)
        try:
            parquet_files = ["hf://" + path for path in fs.glob(f"datasets/{dataset_and_revision}/**/*.parquet")]
        except Exception:
            # Deliberate best-effort: listing failures clear the dropdown.
            return {parquet_file_dropdown: gr.Dropdown(choices=[], value="", info="")}
        parquet_file = parquet_files[0] if parquet_files else None
        info = f"{_count_label(len(parquet_files), 'parquet file')} available"
        return {parquet_file_dropdown: gr.Dropdown(choices=parquet_files, value=parquet_file, info=info)}

    @parquet_file_dropdown.change(inputs=[parquet_file_dropdown], outputs=[output_dataframe])
    def parquet_file_update(parquet_file, oauth_token: Optional[gr.OAuthToken] = None):
        """Show the first row group of the selected Parquet file.

        Args:
            parquet_file: ``hf://`` path selected in the file dropdown; a falsy
                value clears the table.
            oauth_token: Token of the logged-in user, or None when anonymous.

        Returns:
            A dict updating ``output_dataframe`` with the rows of row group 0,
            every cell converted to ``str`` and truncated to
            ``_MAX_CELL_CHARS`` characters. The table is emptied when no file
            is selected or the file cannot be read.
        """
        if not parquet_file:
            return {output_dataframe: pd.DataFrame([])}
        fs = _hf_filesystem(oauth_token)
        try:
            # Context manager closes the remote file handle once rows are read.
            with pq.ParquetFile(parquet_file, filesystem=fs) as pf:
                rows = pf.read_row_group(0).to_pylist()
            preview = [{col: str(val)[:_MAX_CELL_CHARS] for col, val in row.items()} for row in rows]
            return {output_dataframe: pd.DataFrame(preview)}
        except Exception:
            # Deliberate best-effort: unreadable files clear the table.
            return {output_dataframe: []}
# Start serving the Gradio app.
demo.launch()