# Source: Hugging Face Hub file view (author: asoria, HF staff)
# Commit f327376: Integration with compatible-libraries and other commands
# File size: 5.57 kB
import gradio as gr
from gradio_huggingfacehub_search import HuggingfaceHubSearch
import nbformat as nbf
from huggingface_hub import HfApi
from httpx import Client
import logging
"""
TODOs:
- Add more commands to the notebook
- Parametrize the commands (Move to another file)
- Let the user choose the framework and get it from /compatible-libraries
- Use an LLM to suggest commands by column types
- Add commands for auto training
- Enable 'generate notebook' button only if dataset is available and supports library
"""
# Configuration
# Base URL of the datasets-server API; endpoint paths are appended to it.
BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
# Default headers attached to every request sent through `client`.
HEADERS = {"Accept": "application/json", "Content-Type": "application/json"}
# Module-level httpx client, created once at import time and used by
# get_compatible_libraries().
client = Client(headers=HEADERS)
# Configure the root logger so INFO-level messages from this module are emitted.
logging.basicConfig(level=logging.INFO)
def get_compatible_libraries(dataset: str):
    """Fetch the libraries able to load *dataset* from the datasets server.

    Args:
        dataset: Hub dataset id to look up.

    Returns:
        The decoded JSON body of the ``/compatible-libraries`` response, or
        ``None`` when the request fails, the server answers with an error
        status, or the body cannot be decoded (the error is logged).
    """
    try:
        # Send the id through ``params`` so httpx URL-encodes it. Interpolating
        # it into the query string corrupts the request for ids containing
        # characters such as ``&``, ``#`` or spaces.
        resp = client.get(
            f"{BASE_DATASETS_SERVER_URL}/compatible-libraries",
            params={"dataset": dataset},
        )
        resp.raise_for_status()
        return resp.json()
    except Exception as err:
        # Deliberate best-effort boundary: callers treat ``None`` as
        # "no compatible libraries available".
        logging.error("Failed to fetch compatible libraries: %s", err)
        return None
def create_notebook_file(cell_commands, notebook_name):
    """Write a Jupyter notebook containing one code cell per command.

    Args:
        cell_commands: Iterable of source strings; each becomes a code cell,
            in order.
        notebook_name: Path of the ``.ipynb`` file to create or overwrite.
    """
    nb = nbf.v4.new_notebook()
    nb["cells"] = [nbf.v4.new_code_cell(command) for command in cell_commands]
    # Notebook files are UTF-8 JSON. Without an explicit encoding, ``open``
    # falls back to the platform locale and can fail or corrupt non-ASCII
    # cell content.
    with open(notebook_name, "w", encoding="utf-8") as f:
        nbf.write(nb, f)
    logging.info("Notebook %s created successfully", notebook_name)
def push_notebook(file_path, dataset_id, token):
    """Upload a notebook file to a dataset repo and return a status widget.

    Args:
        file_path: Local path (or file object) of the notebook to upload.
        dataset_id: Id of the dataset repository receiving the file.
        token: Access token used to authenticate the upload.

    Returns:
        A visible ``gr.HTML`` holding a link to the uploaded notebook on
        success, or a failure message if anything in the upload step raises.
    """
    # The notebook is always stored under this fixed name in the repo.
    target_filename = "dataset_analysis.ipynb"
    hub_api = HfApi(token=token)
    try:
        hub_api.upload_file(
            path_or_fileobj=file_path,
            path_in_repo=target_filename,
            repo_id=dataset_id,
            repo_type="dataset",
        )
        notebook_url = f"https://huggingface.co/datasets/{dataset_id}/blob/main/{target_filename}"
        anchor_markup = f'<a target="_blank" href="{notebook_url}" style="color: var(--link-text-color); text-decoration: underline; text-decoration-style: dotted;">See notebook</a>'
        return gr.HTML(value=anchor_markup, visible=True)
    except Exception as err:
        logging.error(f"Failed to push notebook: {err}")
        return gr.HTML(value="Failed to push notebook", visible=True)
def generate_notebook(dataset_id):
    """Generate an exploratory pandas notebook for *dataset_id*.

    The loading snippet is taken from the first pandas entry reported by
    ``get_compatible_libraries``; the remaining cells are fixed inspection
    commands.

    Args:
        dataset_id: Hub dataset id selected in the UI.

    Returns:
        A ``(gr.File, gr.Row)`` pair updating the download widget and the
        push-to-hub row. Both are hidden when the compatible-libraries lookup
        fails or offers no pandas loading code; otherwise the file widget
        points at the notebook written to disk and both are shown.
    """
    libraries = get_compatible_libraries(dataset_id)
    if not libraries:
        # Return component instances rather than the removed ``.update()``
        # classmethods, matching the other callbacks in this file.
        return gr.File(visible=False), gr.Row(visible=False)

    pandas_library = next(
        (
            lib
            for lib in libraries.get("libraries", [])
            if lib.get("library") == "pandas"
        ),
        None,
    )
    # Guard against a pandas entry that carries no loading snippets.
    loading_codes = pandas_library.get("loading_codes") if pandas_library else None
    if not loading_codes:
        return gr.File(visible=False), gr.Row(visible=False)
    first_code = loading_codes[0]["code"]

    # Single quotes inside the iframe markup keep it safe to embed in the
    # double-quoted HTML("...") call generated below.
    html_code = f"<iframe src='https://huggingface.co/datasets/{dataset_id}/embed/viewer' width='80%' height='560px'></iframe>"
    commands = [
        "!pip install pandas",
        first_code,
        "df.head()",
        f'from IPython.display import HTML\n\ndisplay(HTML("{html_code}"))',
        "print(df.shape)",
        "df.columns",
        "df.describe()",
        "df.info()",
        # TODO: Generate more commands according to column types for EDA and then for auto training?
    ]
    # Dataset ids contain "/", which is not valid inside a file name.
    notebook_name = f"{dataset_id.replace('/', '-')}.ipynb"
    create_notebook_file(commands, notebook_name=notebook_name)
    return gr.File(value=notebook_name, visible=True), gr.Row(visible=True)
# UI definition: dataset search box, embedded viewer preview, notebook
# generation, and an optional token-gated "push to hub" step.
# NOTE(review): the nesting below was reconstructed from syntax because the
# original indentation was lost; confirm that push_btn/output_lbl belong
# outside the auth row.
with gr.Blocks() as demo:
    gr.Markdown("# 🤖 Dataset notebook creator 🕵️")
    dataset_name = HuggingfaceHubSearch(
        label="Hub Dataset ID",
        placeholder="Search for dataset id on Huggingface",
        search_type="dataset",
        value="",
    )

    @gr.render(inputs=dataset_name)
    def embed(name):
        """Render a viewer preview for the selected dataset, or a placeholder."""
        if not name:
            return gr.Markdown("### No dataset provided")
        html_code = f"""
        <iframe
          src="https://huggingface.co/datasets/{name}/embed/viewer/default/train"
          frameborder="0"
          width="100%"
          height="350px"
        ></iframe>
        """
        return gr.HTML(value=html_code)

    generate_btn = gr.Button("Generate notebook")
    download_link = gr.File(label="Download notebook", visible=False)

    # Hidden until generate_notebook reports success.
    with gr.Row(visible=False) as auth_page:
        with gr.Column():
            gr.Markdown(
                "Want to push to hub? Enter your token ([settings](https://huggingface.co/settings/tokens)):"
            )
            token_box = gr.Textbox(
                "", label="token", placeholder="hf_xxx", type="password"
            )
            auth_error = gr.Markdown("", visible=False)

    push_btn = gr.Button("Push notebook to hub", visible=False)
    output_lbl = gr.HTML(value="", visible=False)

    generate_btn.click(
        generate_notebook,
        inputs=[dataset_name],
        outputs=[download_link, auth_page],
    )

    def auth(token):
        """Show the push button only while the token box is non-empty."""
        return {
            auth_error: gr.Markdown(value="", visible=False),
            push_btn: gr.Button("Push notebook to hub", visible=bool(token)),
        }

    token_box.change(
        auth,
        inputs=token_box,
        outputs=[auth_error, push_btn],
    )

    push_btn.click(
        push_notebook,
        inputs=[download_link, dataset_name, token_box],
        outputs=output_lbl,
    )

demo.launch()