"""Gradio app that builds SweetViz dataset reports and publishes them as
static Hugging Face Spaces.

Three analyses are offered, one per tab: analysing a whole dataset
(optionally against a target column), comparing a random train/test split,
and comparing one category of a column against the remaining rows.
"""

import os
import subprocess
import tempfile

import gradio as gr
import pandas as pd
import sweetviz as sv
from huggingface_hub.hf_api import create_repo, upload_file, HfApi
from huggingface_hub.repository import Repository


def _load_csv(dataset):
    """Read the uploaded CSV file into a DataFrame.

    Raises:
        ValueError: if no file was uploaded.
    """
    if dataset is None:
        raise ValueError("Please upload a CSV dataset first.")
    return pd.read_csv(dataset.name)


def _publish_report(report, username, dataset_name, token):
    """Render *report* to HTML and upload it, with a README, to a static Space.

    The files are written into a per-call temporary directory so that
    concurrent requests cannot overwrite each other's report.

    Returns:
        The user-facing message containing the URL returned by ``create_repo``.
    """
    repo_id = f"{username}/{dataset_name}"
    readme = f"---\ntitle: {dataset_name}\nemoji: ✨\ncolorFrom: green\ncolorTo: red\nsdk: static\npinned: false\ntags:\n- dataset-report\n---"

    with tempfile.TemporaryDirectory() as workdir:
        html_path = os.path.join(workdir, "index.html")
        readme_path = os.path.join(workdir, "README.md")

        report.show_html(html_path, open_browser=False)
        # The README contains an emoji, so never rely on the locale encoding.
        with open(readme_path, "w", encoding="utf-8") as f:
            f.write(readme)

        repo_url = create_repo(
            repo_id,
            repo_type="space",
            token=token,
            space_sdk="static",
            private=False,
        )
        upload_file(
            path_or_fileobj=html_path,
            path_in_repo="index.html",
            repo_id=repo_id,
            repo_type="space",
            token=token,
        )
        upload_file(
            path_or_fileobj=readme_path,
            path_in_repo="README.md",
            repo_id=repo_id,
            repo_type="space",
            token=token,
        )

    return f"Your dataset report will be ready at {repo_url}"


def analyze_datasets(dataset, dataset_name, token, column=None, pairwise="off"):
    """Analyse a dataset with SweetViz and publish the report as a Space.

    Args:
        dataset: uploaded file object; its ``name`` attribute is the CSV path.
        dataset_name: name of the Space to create under the user's account.
        token: Hugging Face Hub token used to identify the user and upload.
        column: optional target column; ``None`` or a blank string means
            "no target" (an empty textbox sends ``""`` rather than ``None``).
        pairwise: value forwarded as ``pairwise_analysis``; a missing
            selection falls back to ``"off"``.

    Returns:
        A message with the URL of the created Space.
    """
    df = _load_csv(dataset)
    # Resolve the user before the (potentially slow) analysis so a bad token
    # fails fast.
    username = HfApi().whoami(token=token)["name"]

    target = (column or "").strip()
    pairwise = pairwise or "off"

    if target:
        analyze_report = sv.analyze(df, target_feat=target, pairwise_analysis=pairwise)
    else:
        analyze_report = sv.analyze(df, pairwise_analysis=pairwise)

    return _publish_report(analyze_report, username, dataset_name, token)


def compare_column_values(dataset, dataset_name, token, column, category):
    """Compare the rows where ``column == category`` against all other rows.

    Args:
        dataset: uploaded file object; its ``name`` attribute is the CSV path.
        dataset_name: name of the Space to create under the user's account.
        token: Hugging Face Hub token used to identify the user and upload.
        column: name of the column to split on.
        category: value of ``column`` that selects the first subset.

    Returns:
        A message with the URL of the created Space.

    Raises:
        ValueError: if the category selects no rows or every row, since there
            would then be nothing to compare.
    """
    df = _load_csv(dataset)
    username = HfApi().whoami(token=token)["name"]

    # The category comes from a textbox and is always a string, so compare on
    # the string form of the column to also match numeric categories.
    category = str(category)
    values = df[column].astype(str)
    mask = values == category
    if not mask.any() or mask.all():
        raise ValueError(
            f"Category {category!r} must select some, but not all, rows of column {column!r}."
        )

    # compare_intra expects a pair of display names, one per subset. When the
    # column has exactly one other value, use it as the second name.
    others = [v for v in values.unique() if v != category]
    other_label = others[0] if len(others) == 1 else f"not {category}"

    compare_report = sv.compare_intra(df, mask, [category, other_label])

    return _publish_report(compare_report, username, dataset_name, token)


def compare_dataset_splits(dataset, dataset_name, token, splits):
    """Randomly split a dataset and compare the two parts.

    Args:
        dataset: uploaded file object; its ``name`` attribute is the CSV path.
        dataset_name: name of the Space to create under the user's account.
        token: Hugging Face Hub token used to identify the user and upload.
        splits: fraction of rows sampled into the training part; must lie
            strictly between 0 and 1 so that both parts are non-empty.

    Returns:
        A message with the URL of the created Space.

    Raises:
        ValueError: if ``splits`` is missing or outside the open interval (0, 1).
    """
    if splits is None or not 0 < splits < 1:
        raise ValueError("Split ratio must be a fraction between 0 and 1, e.g. 0.8.")

    df = _load_csv(dataset)
    train = df.sample(frac=splits)
    test = df.loc[df.index.difference(train.index)]
    username = HfApi().whoami(token=token)["name"]

    compare_report = sv.compare([train, "Training Data"], [test, "Test Data"])

    return _publish_report(compare_report, username, dataset_name, token)


# NOTE(review): the original indentation of this UI was lost; the nesting
# below is reconstructed from the statement order — confirm the layout.
with gr.Blocks() as demo:
    main_title = gr.Markdown("""# Easy Analysis🪄🌟✨""")
    main_desc = gr.Markdown("""This app enables you to run three type of dataset analysis and pushes the interactive reports to your Hugging Face Hub profile as a Space. It uses SweetViz in the back.""")

    with gr.Tabs():
        with gr.TabItem("Analyze") as analyze:
            with gr.Row():
                with gr.Column():
                    title = gr.Markdown(""" ## Analyze Dataset """)
                    description = gr.Markdown("Analyze a dataset or predictive variables against a target variable in a dataset (enter a column name to column section if you want to compare against target value). You can also do pairwise analysis, but it has quadratic complexity.")
                    dataset = gr.File(label = "Dataset")
                    column = gr.Text(label = "Compare dataset against a target variable (Optional)")
                    pairwise = gr.Radio(["off", "on"], label = "Enable pairwise analysis")
                    token = gr.Textbox(label = "Your Hugging Face Token")
                    dataset_name = gr.Textbox(label = "Dataset Name")
                    pushing_desc = gr.Markdown("This app needs your Hugging Face Hub token and a unique name for your dataset report.")
                    inference_run = gr.Button("Infer")
                    inference_progress = gr.StatusTracker(cover_container=True)
                    outcome = gr.outputs.Textbox()
                    inference_run.click(
                        analyze_datasets,
                        inputs=[dataset, dataset_name, token, column, pairwise],
                        outputs=outcome,
                        status_tracker=inference_progress,
                    )

        with gr.TabItem("Compare Splits") as compare_splits:
            with gr.Row():
                with gr.Column():
                    title = gr.Markdown(""" ## Compare Splits""")
                    description = gr.Markdown("Split a dataset and compare splits. You need to give a fraction, e.g. 0.8.")
                    dataset = gr.File(label = "Dataset")
                    split_ratio = gr.Number(label = "Split Ratios")
                    pushing_desc = gr.Markdown("This app needs your Hugging Face Hub token and a unique name for your dataset report.")
                    token = gr.Textbox(label = "Your Hugging Face Token")
                    dataset_name = gr.Textbox(label = "Dataset Name")
                    inference_run = gr.Button("Infer")
                    inference_progress = gr.StatusTracker(cover_container=True)
                    outcome = gr.outputs.Textbox()
                    inference_run.click(
                        compare_dataset_splits,
                        inputs=[dataset, dataset_name, token, split_ratio],
                        outputs=outcome,
                        status_tracker=inference_progress,
                    )

        with gr.TabItem("Compare Subsets") as compare_subsets:
            with gr.Row():
                with gr.Column():
                    title = gr.Markdown(""" ## Compare Subsets""")
                    description = gr.Markdown("Compare subsets of a dataset, e.g. you can pick Age Group column and compare adult category against young.")
                    dataset = gr.File(label = "Dataset")
                    column = gr.Text(label = "Enter column:")
                    category = gr.Text(label = "Enter category:")
                    pushing_desc = gr.Markdown("This app needs your Hugging Face Hub token and a unique name for your dataset report.")
                    token = gr.Textbox(label = "Your Hugging Face Token")
                    dataset_name = gr.Textbox(label = "Dataset Name")
                    inference_run = gr.Button("Run Analysis")
                    inference_progress = gr.StatusTracker(cover_container=True)
                    outcome = gr.outputs.Textbox()
                    inference_run.click(
                        compare_column_values,
                        inputs=[dataset, dataset_name, token, column, category],
                        outputs=outcome,
                        status_tracker=inference_progress,
                    )

demo.launch(debug=True)