import gradio as gr
from gradio_huggingfacehub_search import HuggingfaceHubSearch
import nbformat as nbf
from huggingface_hub import HfApi
from httpx import Client
import logging
from huggingface_hub import InferenceClient
import json
import re
import pandas as pd
"""
TODOs:
- Get feedback on the generated commands to validate that the operations are appropriate for the column data types
- Refactor
- Make the notebook generation more dynamic and add loading components so the UI does not freeze
- Fix errors:
    - When generating output
    - When parsing output
    - When pushing the notebook
- Add target tasks to choose from for the notebook:
    - Exploratory data analysis
    - Auto training
    - RAG
    - etc.
- Enable the 'Generate notebook' button only when the dataset is available and supported by a library
- First fetch the compatible libraries and let the user choose among them
"""
# Configuration
BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
HEADERS = {"Accept": "application/json", "Content-Type": "application/json"}
client = Client(headers=HEADERS)
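# LLM used to draft the notebook cells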
inference_client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")
logging.basicConfig(level=logging.INFO)
def get_compatible_libraries(dataset: str):
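    """Return the libraries that datasets-server reports as able to load `dataset`."""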
resp = client.get(
f"{BASE_DATASETS_SERVER_URL}/compatible-libraries?dataset={dataset}"
)
resp.raise_for_status()
return resp.json()
def generate_eda_prompt(columns_info, df, first_code):
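    """Build the LLM prompt for EDA notebook generation from the dataset's
    column types, a few sample rows, and the mandatory loading code."""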
sample_data = df.head(5).to_dict(orient="records")
format_instructions = """
The output should be a markdown code snippet formatted in the
following schema, including the leading and trailing "```json" and "```":
```json
[
    {
        "cell_type": string // Either "markdown" or "code".
        "source": list of string // The markdown text or Python code lines for the cell.
    }
]
```
"""
prompt = """
You are an expert data analyst tasked with generating an exploratory data analysis (EDA) Jupyter notebook. The data is provided as a pandas DataFrame with the following structure:
Columns and Data Types:
{columns_info}
Sample Data:
{sample_data}
Please create a pandas EDA notebook that includes the following:
1. Summary statistics for numerical columns.
2. Distribution plots for numerical columns.
3. Bar plots or count plots for categorical columns.
4. Correlation matrix and heatmap for numerical columns.
5. Any additional relevant visualizations or analyses you deem appropriate.
Ensure the notebook is well-organized, with explanations for each step.
It is mandatory that you use the following code to load the dataset, DO NOT try to load the dataset in any other way:
{first_code}
{format_instructions}
"""
return prompt.format(
columns_info=columns_info,
sample_data=sample_data,
first_code=first_code,
format_instructions=format_instructions,
)
def create_notebook_file(cell_commands, notebook_name):
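    """Write the generated cells to a Jupyter notebook file via nbformat."""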
nb = nbf.v4.new_notebook()
nb["cells"] = [
nbf.v4.new_code_cell(command["source"])
if command["cell_type"] == "code"
else nbf.v4.new_markdown_cell(command["source"])
for command in cell_commands
]
with open(notebook_name, "w") as f:
nbf.write(nb, f)
logging.info(f"Notebook {notebook_name} created successfully")
def push_notebook(file_path, dataset_id, token):
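    """Upload the generated notebook to the dataset repo on the Hub and return a link to it."""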
notebook_name = "dataset_analysis.ipynb"
api = HfApi(token=token)
try:
api.upload_file(
path_or_fileobj=file_path,
path_in_repo=notebook_name,
repo_id=dataset_id,
repo_type="dataset",
)
link = f"https://huggingface.co/datasets/{dataset_id}/blob/main/{notebook_name}"
        return gr.HTML(
            value=f'<a href="{link}" target="_blank">See notebook</a>',
            visible=True,
        )
except Exception as err:
logging.error(f"Failed to push notebook: {err}")
return gr.HTML(value="Failed to push notebook", visible=True)
def get_first_rows_as_df(dataset: str, config: str, split: str, limit: int):
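    """Fetch the dataset's features and first rows from datasets-server and
    return them as a feature-type dict plus a sampled pandas DataFrame."""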
resp = client.get(
f"{BASE_DATASETS_SERVER_URL}/first-rows?dataset={dataset}&config={config}&split={split}"
)
resp.raise_for_status()
content = resp.json()
    rows = [row["row"] for row in content["rows"]]
    # Shuffle the returned rows and keep a sample of at most `limit` of them
    first_rows_df = pd.DataFrame(rows).sample(frac=1).head(limit)
features = content["features"]
features_dict = {feature["name"]: feature["type"] for feature in features}
return features_dict, first_rows_df
def content_from_output(output):
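    """Extract the JSON list of notebook cells from the model's fenced output."""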
    pattern = r"```json(.*?)```"
logging.info("--------> Getting data from output")
match = re.search(pattern, output, re.DOTALL)
if not match:
pattern = r"```(.*?)```"
logging.info("--------> Getting data from output, second try")
match = re.search(pattern, output, re.DOTALL)
if not match:
            raise Exception("Unable to parse the generated notebook cells.")
extracted_text = match.group(1)
logging.info(extracted_text)
content = json.loads(extracted_text)
logging.info(content)
return content
def get_notebook_cells(prompt):
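    """Query the LLM with the EDA prompt and parse the returned cells."""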
messages = [{"role": "user", "content": prompt}]
output = inference_client.chat_completion(messages=messages, max_tokens=2500)
output = output.choices[0].message.content
return content_from_output(output)
def generate_notebook(dataset_id):
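    """End-to-end generation: check pandas compatibility, build the prompt,
    generate the cells, and write the notebook file to disk."""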
try:
libraries = get_compatible_libraries(dataset_id)
    except Exception as err:
        logging.error(f"Failed to fetch compatible libraries: {err}")
        raise gr.Error("Unable to retrieve dataset info from the Hugging Face Hub.") from err
    if not libraries:
        gr.Warning("No compatible libraries found for this dataset.")
        logging.error("No compatible libraries found for this dataset")
        return gr.File(visible=False), gr.Row(visible=False)
pandas_library = next(
(lib for lib in libraries.get("libraries", []) if lib["library"] == "pandas"),
None,
)
    if not pandas_library:
        gr.Warning("Dataset is not compatible with the pandas library.")
        logging.error("Dataset is not compatible with the pandas library")
        return gr.File(visible=False), gr.Row(visible=False)
first_config_loading_code = pandas_library["loading_codes"][0]
first_code = first_config_loading_code["code"]
first_config = first_config_loading_code["config_name"]
first_split = list(first_config_loading_code["arguments"]["splits"].keys())[0]
logging.info(f"First config: {first_config} - first split: {first_split}")
first_file = f"hf://datasets/{dataset_id}/{first_config_loading_code['arguments']['splits'][first_split]}"
logging.info(f"First split file: {first_file}")
    # Dataset viewer embed for the generated notebook; single-quoted HTML
    # attributes so the snippet can sit inside a double-quoted Python string
    html_code = f"<iframe src='https://huggingface.co/datasets/{dataset_id}/embed/viewer/{first_config}/{first_split}' frameborder='0' width='100%' height='560px'></iframe>"
features, df = get_first_rows_as_df(dataset_id, first_config, first_split, 3)
prompt = generate_eda_prompt(features, df, first_code)
logging.info(f"Prompt: {prompt}")
commands = get_notebook_cells(prompt)
# Adding dataset viewer on the first part
commands.insert(0, {"cell_type": "code", "source": f'from IPython.display import HTML\n\ndisplay(HTML("{html_code}"))'})
commands.insert(0, {"cell_type": "markdown", "source": "# Dataset Viewer"})
notebook_name = f"{dataset_id.replace('/', '-')}.ipynb"
create_notebook_file(commands, notebook_name=notebook_name)
    return gr.File(value=notebook_name, visible=True), gr.Row(visible=True)
with gr.Blocks() as demo:
gr.Markdown("# 🤖 Dataset notebook creator 🕵️")
dataset_name = HuggingfaceHubSearch(
label="Hub Dataset ID",
        placeholder="Search for a dataset ID on the Hugging Face Hub",
search_type="dataset",
value="",
)
@gr.render(inputs=dataset_name)
def embed(name):
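        """Show the dataset viewer for the selected dataset, if any."""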
if not name:
return gr.Markdown("### No dataset provided")
        # Standard Hugging Face dataset viewer embed for the selected dataset
        html_code = f"""
        <iframe
            src="https://huggingface.co/datasets/{name}/embed/viewer"
            frameborder="0"
            width="100%"
            height="560px"
        ></iframe>
        """
return gr.HTML(value=html_code)
generate_btn = gr.Button("Generate notebook")
download_link = gr.File(label="Download notebook", visible=False)
with gr.Row(visible=False) as auth_page:
with gr.Column():
gr.Markdown(
"Want to push to hub? Enter your token ([settings](https://huggingface.co/settings/tokens)):"
)
token_box = gr.Textbox(
"", label="token", placeholder="hf_xxx", type="password"
)
auth_error = gr.Markdown("", visible=False)
push_btn = gr.Button("Push notebook to hub", visible=False)
output_lbl = gr.HTML(value="", visible=False)
generate_btn.click(
generate_notebook,
inputs=[dataset_name],
outputs=[download_link, auth_page],
)
def auth(token):
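        """Reveal the push button only once a token has been entered."""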
if not token:
return {
auth_error: gr.Markdown(value="", visible=False),
push_btn: gr.Button(visible=False),
}
return {
auth_error: gr.Markdown(value="", visible=False),
push_btn: gr.Button("Push notebook to hub", visible=True),
}
token_box.change(
auth,
inputs=token_box,
outputs=[auth_error, push_btn],
)
push_btn.click(
push_notebook,
inputs=[download_link, dataset_name, token_box],
outputs=output_lbl,
)
demo.launch()