Spaces:
Sleeping
Sleeping
import gradio as gr | |
from gradio_huggingfacehub_search import HuggingfaceHubSearch | |
import nbformat as nbf | |
from huggingface_hub import HfApi | |
from httpx import Client | |
import logging | |
""" | |
TODOs:
- Add more commands to the notebook
- Parametrize the commands (Move to another file)
- Let user choose the framework and get it from /compatible-libraries
- Use an LLM to suggest commands by column types
- Add commands for auto training
- Enable 'generate notebook' button only if dataset is available and supports library
""" | |
# Configuration
# Root URL of the datasets-server API queried by this app.
BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
# Headers attached to every request sent through `client`.
HEADERS = {
    "Accept": "application/json",
    "Content-Type": "application/json",
}
# Single module-level HTTP client, reused across calls.
client = Client(headers=HEADERS)
logging.basicConfig(level=logging.INFO)
def get_compatible_libraries(dataset: str):
    """Fetch the libraries that can load ``dataset`` from the datasets server.

    Queries the ``/compatible-libraries`` endpoint through the module-level
    ``client``.

    Args:
        dataset: Hub dataset id, e.g. ``"user/name"``.

    Returns:
        The decoded JSON response, or ``None`` if the request, the status
        check or the JSON decoding raised (the failure is logged).
    """
    try:
        # Pass the id via ``params`` so it is URL-encoded; interpolating it
        # into the URL lets characters such as ``&`` or ``#`` corrupt the query.
        resp = client.get(
            f"{BASE_DATASETS_SERVER_URL}/compatible-libraries",
            params={"dataset": dataset},
        )
        resp.raise_for_status()
        return resp.json()
    except Exception as err:
        # Best-effort lookup: a failure is reported as ``None`` rather than
        # propagated to the UI callback.
        logging.error(f"Failed to fetch compatible libraries: {err}")
        return None
def create_notebook_file(cell_commands, notebook_name):
    """Write a notebook with one code cell per command to ``notebook_name``.

    Args:
        cell_commands: Iterable of source strings; each becomes one code
            cell, in iteration order.
        notebook_name: Path of the notebook file to create. An existing file
            at that path is overwritten.
    """
    nb = nbf.v4.new_notebook()
    nb["cells"] = [nbf.v4.new_code_cell(command) for command in cell_commands]
    # Open explicitly as UTF-8: without it the platform default encoding is
    # used, which can fail on (or corrupt) non-ASCII cell content.
    with open(notebook_name, "w", encoding="utf-8") as f:
        nbf.write(nb, f)
    logging.info(f"Notebook {notebook_name} created successfully")
def push_notebook(file_path, dataset_id, token):
    """Upload the notebook at ``file_path`` to the ``dataset_id`` dataset repo.

    The file is stored in the repo under the fixed name
    ``dataset_analysis.ipynb``.

    Args:
        file_path: Value forwarded to ``upload_file`` as ``path_or_fileobj``.
        dataset_id: Target repo id on the Hub (uploaded with
            ``repo_type="dataset"``).
        token: Token used to build the ``HfApi`` client.

    Returns:
        A visible ``gr.HTML``: a "See notebook" link on success, or the text
        "Failed to push notebook" if the upload raised (the error is logged).
    """
    notebook_name = "dataset_analysis.ipynb"
    api = HfApi(token=token)
    upload_args = {
        "path_or_fileobj": file_path,
        "path_in_repo": notebook_name,
        "repo_id": dataset_id,
        "repo_type": "dataset",
    }
    try:
        api.upload_file(**upload_args)
        link = f"https://huggingface.co/datasets/{dataset_id}/blob/main/{notebook_name}"
        anchor = f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline; text-decoration-style: dotted;">See notebook</a>'
        return gr.HTML(value=anchor, visible=True)
    except Exception as err:
        logging.error(f"Failed to push notebook: {err}")
        return gr.HTML(value="Failed to push notebook", visible=True)
def _find_pandas_loading_code(libraries):
    """Return the first pandas loading snippet in ``libraries``, or ``None``."""
    for lib in libraries.get("libraries", []):
        if lib.get("library") != "pandas":
            continue
        # Only the first pandas entry is considered; a missing or empty
        # ``loading_codes`` list yields ``None`` instead of raising.
        loading_codes = lib.get("loading_codes") or []
        if loading_codes:
            return loading_codes[0].get("code")
        return None
    return None


def generate_notebook(dataset_id):
    """Build a starter analysis notebook for ``dataset_id``.

    The dataset-loading cell comes from the pandas entry of the
    compatible-libraries response; the remaining cells are fixed exploratory
    commands. The notebook is written to ``<dataset_id with '/' -> '-'>.ipynb``.

    Args:
        dataset_id: Hub dataset id selected in the UI.

    Returns:
        A 2-tuple of updates for the download file component and the auth
        row. Both are hidden when the library lookup fails or no pandas
        loading code is available; otherwise the file component points at
        the generated notebook and both are made visible.
    """
    libraries = get_compatible_libraries(dataset_id)
    if not libraries:
        return gr.File(visible=False), gr.update(visible=False)

    first_code = _find_pandas_loading_code(libraries)
    if not first_code:
        return gr.File(visible=False), gr.update(visible=False)

    # The iframe markup uses single quotes because it is embedded inside a
    # double-quoted Python string in the generated cell below.
    html_code = f"<iframe src='https://huggingface.co/datasets/{dataset_id}/embed/viewer' width='80%' height='560px'></iframe>"
    commands = [
        "!pip install pandas",
        first_code,
        "df.head()",
        f'from IPython.display import HTML\n\ndisplay(HTML("{html_code}"))',
        "print(df.shape)",
        "df.columns",
        "df.describe()",
        "df.info()",
        # TODO: Generate more commands according to column types for EDA and then for auto training?
    ]
    notebook_name = f"{dataset_id.replace('/', '-')}.ipynb"
    create_notebook_file(commands, notebook_name=notebook_name)
    # ``gr.update`` is used for the row instead of the legacy per-component
    # ``gr.Row.update`` classmethod.
    return gr.File(value=notebook_name, visible=True), gr.update(visible=True)
# UI layout and event wiring. Components are created in display order.
with gr.Blocks() as demo:
    gr.Markdown("# 🤗 Dataset notebook creator 🕵️")
    dataset_name = HuggingfaceHubSearch(
        label="Hub Dataset ID",
        placeholder="Search for dataset id on Huggingface",
        search_type="dataset",
        value="",
    )

    # NOTE(review): `embed` is not attached to any event or render hook in the
    # visible code, so this preview is never displayed — confirm whether a
    # decorator or listener is missing.
    def embed(name):
        """Return a dataset-viewer iframe for ``name``, or a placeholder."""
        if not name:
            return gr.Markdown("### No dataset provided")
        html_code = f"""
        <iframe
          src="https://huggingface.co/datasets/{name}/embed/viewer/default/train"
          frameborder="0"
          width="100%"
          height="350px"
        ></iframe>
        """
        return gr.HTML(value=html_code)

    generate_btn = gr.Button("Generate notebook")
    download_link = gr.File(label="Download notebook", visible=False)

    # Token entry area; hidden until `generate_notebook` makes it visible.
    # NOTE(review): the nesting of `push_btn` / `output_lbl` relative to this
    # row was reconstructed — they are assumed to sit outside it; confirm.
    with gr.Row(visible=False) as auth_page:
        with gr.Column():
            gr.Markdown(
                "Want to push to hub? Enter your token ([settings](https://huggingface.co/settings/tokens)):"
            )
            token_box = gr.Textbox(
                "", label="token", placeholder="hf_xxx", type="password"
            )
            auth_error = gr.Markdown("", visible=False)

    push_btn = gr.Button("Push notebook to hub", visible=False)
    output_lbl = gr.HTML(value="", visible=False)

    generate_btn.click(
        generate_notebook,
        inputs=[dataset_name],
        outputs=[download_link, auth_page],
    )

    def auth(token):
        """Show the push button only while the token box is non-empty."""
        if token:
            button = gr.Button("Push notebook to hub", visible=True)
        else:
            button = gr.Button(visible=False)
        return {
            # The error label is cleared and hidden in both cases.
            auth_error: gr.Markdown(value="", visible=False),
            push_btn: button,
        }

    token_box.change(
        auth,
        inputs=token_box,
        outputs=[auth_error, push_btn],
    )
    push_btn.click(
        push_notebook,
        inputs=[download_link, dataset_name, token_box],
        outputs=output_lbl,
    )

demo.launch()