"""Gradio app that generates a starter EDA notebook for a Hugging Face dataset.

TODOs:
- Add more commands to the notebook
- Parametrize the commands (Move to another file)
- Let user choose the framework and get it from /compatible-libraries
- Use an LLM to suggest commands by column types
- Add commands for auto training
- Enable 'generate notebook' button only if dataset is available and supports library
"""

import logging

import gradio as gr
import nbformat as nbf
from gradio_huggingfacehub_search import HuggingfaceHubSearch
from httpx import Client
from huggingface_hub import HfApi

# Configuration
BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
HEADERS = {"Accept": "application/json", "Content-Type": "application/json"}

client = Client(headers=HEADERS)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def get_compatible_libraries(dataset: str):
    """Fetch the datasets-server /compatible-libraries payload for *dataset*.

    Returns the parsed JSON dict, or None if the request fails for any reason
    (network error, non-2xx status, bad JSON) — callers treat None as
    "dataset unavailable".
    """
    try:
        response = client.get(
            f"{BASE_DATASETS_SERVER_URL}/compatible-libraries?dataset={dataset}"
        )
        response.raise_for_status()
        return response.json()
    except Exception as err:
        logger.error(f"Failed to fetch compatible libraries: {err}")
        return None


def create_notebook_file(cell_commands, notebook_name):
    """Write *cell_commands* (one code cell per entry) to *notebook_name* on disk."""
    notebook = nbf.v4.new_notebook()
    notebook["cells"] = [nbf.v4.new_code_cell(command) for command in cell_commands]
    with open(notebook_name, "w", encoding="utf-8") as f:
        nbf.write(notebook, f)
    logger.info(f"Notebook {notebook_name} created successfully")


def push_notebook(file_path, dataset_id, token):
    """Upload the generated notebook to the dataset repo on the Hub.

    Returns a gr.HTML update: a link to the uploaded notebook on success,
    or an error message on failure.
    """
    notebook_name = "dataset_analysis.ipynb"
    api = HfApi(token=token)
    try:
        api.upload_file(
            path_or_fileobj=file_path,
            path_in_repo=notebook_name,
            repo_id=dataset_id,
            repo_type="dataset",
        )
        link = f"https://huggingface.co/datasets/{dataset_id}/blob/main/{notebook_name}"
        # Fix: the original string dropped the anchor markup, so the computed
        # `link` was never rendered; reconstructed as a clickable link.
        return gr.HTML(
            value=f'<a href="{link}" target="_blank">See notebook</a>',
            visible=True,
        )
    except Exception as err:
        logger.error(f"Failed to push notebook: {err}")
        return gr.HTML(value="Failed to push notebook", visible=True)


def generate_notebook(dataset_id):
    """Build an EDA notebook for *dataset_id* and reveal the download/auth UI.

    Returns (gr.File update, gr.Row update). Both are hidden when the dataset
    is unavailable or has no pandas loading code.
    """
    libraries = get_compatible_libraries(dataset_id)
    if not libraries:
        # NOTE: gr.Row.update was removed in Gradio 4.x (required by
        # @gr.render below); returning a component instance is the 4.x idiom.
        return gr.File(visible=False), gr.Row(visible=False)

    pandas_library = next(
        (lib for lib in libraries.get("libraries", []) if lib["library"] == "pandas"),
        None,
    )
    if not pandas_library:
        return gr.File(visible=False), gr.Row(visible=False)
    # The original also had a hardcoded train-00000-of-00001.parquet fallback,
    # but it was unreachable (both non-pandas paths return early) — removed.
    first_code = pandas_library["loading_codes"][0]["code"]

    # Dataset-viewer embed shown inside the notebook (markup was stripped in
    # the original source; reconstructed). Single quotes keep the surrounding
    # HTML("...") double-quoted string literal valid.
    html_code = (
        f"<iframe src='https://huggingface.co/datasets/{dataset_id}/embed/viewer' "
        "width='80%' height='560px'></iframe>"
    )
    commands = [
        "!pip install pandas",
        first_code,
        "df.head()",
        f'from IPython.display import HTML\n\ndisplay(HTML("{html_code}"))',
        "print(df.shape)",
        "df.columns",
        "df.describe()",
        "df.info()",
        # TODO: Generate more commands according to column types for EDA and then for auto training?
    ]
    notebook_name = f"{dataset_id.replace('/', '-')}.ipynb"
    create_notebook_file(commands, notebook_name=notebook_name)
    return gr.File(value=notebook_name, visible=True), gr.Row(visible=True)


with gr.Blocks() as demo:
    gr.Markdown("# 🤖 Dataset notebook creator 🕵️")
    dataset_name = HuggingfaceHubSearch(
        label="Hub Dataset ID",
        placeholder="Search for dataset id on Huggingface",
        search_type="dataset",
        value="",
    )

    @gr.render(inputs=dataset_name)
    def embed(name):
        """Live preview of the selected dataset via the dataset-viewer iframe."""
        if not name:
            return gr.Markdown("### No dataset provided")
        # NOTE(review): the iframe markup was stripped from the original
        # source; reconstructed from the standard dataset-viewer embed URL.
        html_code = f"""
        <iframe
            src="https://huggingface.co/datasets/{name}/embed/viewer"
            frameborder="0"
            width="100%"
            height="560px"
        ></iframe>
        """
        return gr.HTML(value=html_code)

    generate_btn = gr.Button("Generate notebook")
    download_link = gr.File(label="Download notebook", visible=False)

    # Hidden until a notebook has been generated successfully.
    with gr.Row(visible=False) as auth_page:
        with gr.Column():
            gr.Markdown(
                "Want to push to hub? Enter your token "
                "([settings](https://huggingface.co/settings/tokens)):"
            )
            token_box = gr.Textbox(
                "", label="token", placeholder="hf_xxx", type="password"
            )
            auth_error = gr.Markdown("", visible=False)

    push_btn = gr.Button("Push notebook to hub", visible=False)
    output_lbl = gr.HTML(value="", visible=False)

    generate_btn.click(
        generate_notebook,
        inputs=[dataset_name],
        outputs=[download_link, auth_page],
    )

    def auth(token):
        """Show the push button only once a token has been entered."""
        return {
            auth_error: gr.Markdown(value="", visible=False),
            push_btn: gr.Button("Push notebook to hub", visible=bool(token)),
        }

    token_box.change(
        auth,
        inputs=token_box,
        outputs=[auth_error, push_btn],
    )

    push_btn.click(
        push_notebook,
        inputs=[download_link, dataset_name, token_box],
        outputs=output_lbl,
    )

demo.launch()