import gradio as gr from gradio_huggingfacehub_search import HuggingfaceHubSearch import nbformat as nbf from huggingface_hub import HfApi from httpx import Client import logging from huggingface_hub import InferenceClient import json import re import pandas as pd """ TODOs: - Need feedback on the output commands to validate if operations are appropiate to data types - Refactor - Make the notebook generation more dynamic, add loading components to do not freeze the UI - Fix errors: - When generating output - When parsing output - When pushing notebook - Add target tasks to choose for the notebook: - Exploratory data analysis - Auto training - RAG - etc. - Enable 'generate notebook' button only if dataset is available and supports library - First get compatible-libraries and let user choose the library """ # Configuration BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co" HEADERS = {"Accept": "application/json", "Content-Type": "application/json"} client = Client(headers=HEADERS) inference_client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct") logging.basicConfig(level=logging.INFO) def get_compatible_libraries(dataset: str): resp = client.get( f"{BASE_DATASETS_SERVER_URL}/compatible-libraries?dataset={dataset}" ) resp.raise_for_status() return resp.json() def generate_eda_prompt(columns_info, df, first_code): sample_data = df.head(5).to_dict(orient="records") format_instructions = """ The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```": ```json [ { "cell_type": string // This refers either is a markdown or code cell type. "source": list of string separated by comma // This is the list of text or python code. } ] ``` """ prompt = """ You are an expert data analyst tasked with generating an exploratory data analysis (EDA) Jupyter notebook. The data is provided as a pandas DataFrame with the following structure: Columns and Data Types: {columns_info} Sample Data: {sample_data} Please create a pandas EDA notebook that includes the following: 1. Summary statistics for numerical columns. 2. Distribution plots for numerical columns. 3. Bar plots or count plots for categorical columns. 4. Correlation matrix and heatmap for numerical columns. 5. Any additional relevant visualizations or analyses you deem appropriate. Ensure the notebook is well-organized, with explanations for each step. It is mandatory that you use the following code to load the dataset, DO NOT try to load the dataset in any other way: {first_code} {format_instructions} """ return prompt.format( columns_info=columns_info, sample_data=sample_data, first_code=first_code, format_instructions=format_instructions, ) def create_notebook_file(cell_commands, notebook_name): nb = nbf.v4.new_notebook() nb["cells"] = [ nbf.v4.new_code_cell(command["source"]) if command["cell_type"] == "code" else nbf.v4.new_markdown_cell(command["source"]) for command in cell_commands ] with open(notebook_name, "w") as f: nbf.write(nb, f) logging.info(f"Notebook {notebook_name} created successfully") def push_notebook(file_path, dataset_id, token): notebook_name = "dataset_analysis.ipynb" api = HfApi(token=token) try: api.upload_file( path_or_fileobj=file_path, path_in_repo=notebook_name, repo_id=dataset_id, repo_type="dataset", ) link = f"https://huggingface.co/datasets/{dataset_id}/blob/main/{notebook_name}" return gr.HTML( value=f'See notebook', visible=True, ) except Exception as err: logging.error(f"Failed to push notebook: {err}") return gr.HTML(value="Failed to push notebook", visible=True) def get_first_rows_as_df(dataset: str, config: str, split: str, limit: int): resp = client.get( f"{BASE_DATASETS_SERVER_URL}/first-rows?dataset={dataset}&config={config}&split={split}" ) resp.raise_for_status() content = resp.json() rows = content["rows"] rows = [row["row"] for row in rows] first_rows_df = pd.DataFrame.from_dict(rows).sample(frac=1).head(limit) features = content["features"] features_dict = {feature["name"]: feature["type"] for feature in features} return features_dict, first_rows_df def content_from_output(output): pattern = r"`json(.*?)`" logging.info("--------> Getting data from output") match = re.search(pattern, output, re.DOTALL) if not match: pattern = r"```(.*?)```" logging.info("--------> Getting data from output, second try") match = re.search(pattern, output, re.DOTALL) if not match: raise Exception("Unable to generate jupyter notebook.") extracted_text = match.group(1) logging.info(extracted_text) content = json.loads(extracted_text) logging.info(content) return content def get_notebook_cells(prompt): messages = [{"role": "user", "content": prompt}] output = inference_client.chat_completion(messages=messages, max_tokens=2500) output = output.choices[0].message.content return content_from_output(output) def generate_notebook(dataset_id): try: libraries = get_compatible_libraries(dataset_id) except Exception as err: gr.Error("Unable to retrieve dataset info from HF Hub.") logging.error(f"Failed to fetch compatible libraries: {err}") return None if not libraries: gr.Warning("Dataset not compatible with pandas library.") logging.error(f"Dataset not compatible with pandas library") return gr.File(visible=False), gr.Row.update(visible=False) pandas_library = next( (lib for lib in libraries.get("libraries", []) if lib["library"] == "pandas"), None, ) if not pandas_library: gr.Warning("Dataset not compatible with pandas library.") logging.error(f"Dataset not compatible with pandas library") return gr.File(visible=False), gr.Row.update(visible=False) first_config_loading_code = pandas_library["loading_codes"][0] first_code = first_config_loading_code["code"] first_config = first_config_loading_code["config_name"] first_split = list(first_config_loading_code["arguments"]["splits"].keys())[0] logging.info(f"First config: {first_config} - first split: {first_split}") first_file = f"hf://datasets/{dataset_id}/{first_config_loading_code['arguments']['splits'][first_split]}" logging.info(f"First split file: {first_file}") html_code = f"" features, df = get_first_rows_as_df(dataset_id, first_config, first_split, 3) prompt = generate_eda_prompt(features, df, first_code) logging.info(f"Prompt: {prompt}") commands = get_notebook_cells(prompt) # Adding dataset viewer on the first part commands.insert(0, {"cell_type": "code", "source": f'from IPython.display import HTML\n\ndisplay(HTML("{html_code}"))'}) commands.insert(0, {"cell_type": "markdown", "source": "# Dataset Viewer"}) notebook_name = f"{dataset_id.replace('/', '-')}.ipynb" create_notebook_file(commands, notebook_name=notebook_name) return gr.File(value=notebook_name, visible=True), gr.Row.update(visible=True) with gr.Blocks() as demo: gr.Markdown("# 🤖 Dataset notebook creator 🕵️") dataset_name = HuggingfaceHubSearch( label="Hub Dataset ID", placeholder="Search for dataset id on Huggingface", search_type="dataset", value="", ) @gr.render(inputs=dataset_name) def embed(name): if not name: return gr.Markdown("### No dataset provided") html_code = f""" """ return gr.HTML(value=html_code) generate_btn = gr.Button("Generate notebook") download_link = gr.File(label="Download notebook", visible=False) with gr.Row(visible=False) as auth_page: with gr.Column(): gr.Markdown( "Want to push to hub? Enter your token ([settings](https://huggingface.co/settings/tokens)):" ) token_box = gr.Textbox( "", label="token", placeholder="hf_xxx", type="password" ) auth_error = gr.Markdown("", visible=False) push_btn = gr.Button("Push notebook to hub", visible=False) output_lbl = gr.HTML(value="", visible=False) generate_btn.click( generate_notebook, inputs=[dataset_name], outputs=[download_link, auth_page], ) def auth(token): if not token: return { auth_error: gr.Markdown(value="", visible=False), push_btn: gr.Button(visible=False), } return { auth_error: gr.Markdown(value="", visible=False), push_btn: gr.Button("Push notebook to hub", visible=True), } token_box.change( auth, inputs=token_box, outputs=[auth_error, push_btn], ) push_btn.click( push_notebook, inputs=[download_link, dataset_name, token_box], outputs=output_lbl, ) demo.launch()