# Hugging Face Space: dataset notebook creator (Gradio app)
import gradio as gr
from gradio_huggingfacehub_search import HuggingfaceHubSearch
import nbformat as nbf
from huggingface_hub import HfApi
from httpx import Client
import logging
"""
TODOs:
- Add more commands to the notebook
- Parametrize the commands (Move to another file)
- Let user choose the framework and get if from /compatible-libraries
- Use an LLM to suggest commands by column types
- Add commands for auto training
- Enable 'generate notebook' button only if dataset is available and supports library
"""

# Configuration
# Base URL of the Hugging Face datasets-server API (dataset metadata service).
BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
# Default headers sent with every request made through the shared HTTP client.
HEADERS = {"Accept": "application/json", "Content-Type": "application/json"}
# Module-level httpx client, reused across requests (connection pooling).
client = Client(headers=HEADERS)
logging.basicConfig(level=logging.INFO)
def get_compatible_libraries(dataset: str):
    """Fetch the libraries compatible with *dataset* from the datasets-server API.

    Args:
        dataset: Hub dataset id, e.g. "user/name".

    Returns:
        The parsed JSON response (dict) on success, or None on any failure
        (network error, non-2xx status, invalid JSON). Callers treat None
        as "dataset unavailable / unsupported".
    """
    try:
        # Pass the dataset id via `params` so httpx URL-encodes special
        # characters instead of interpolating them raw into the query string.
        resp = client.get(
            f"{BASE_DATASETS_SERVER_URL}/compatible-libraries",
            params={"dataset": dataset},
        )
        resp.raise_for_status()
        return resp.json()
    except Exception as err:
        # Best-effort: log and signal failure with None rather than crashing the UI.
        logging.error(f"Failed to fetch compatible libraries: {err}")
        return None
def create_notebook_file(cell_commands, notebook_name):
    """Write a Jupyter notebook with one code cell per command.

    Args:
        cell_commands: Iterable of strings; each becomes one code cell, in order.
        notebook_name: Path of the .ipynb file to create (overwritten if present).
    """
    nb = nbf.v4.new_notebook()
    nb["cells"] = [nbf.v4.new_code_cell(command) for command in cell_commands]
    # Notebooks are JSON and must be UTF-8; don't rely on the platform
    # default encoding (which is not UTF-8 on e.g. Windows).
    with open(notebook_name, "w", encoding="utf-8") as f:
        nbf.write(nb, f)
    logging.info(f"Notebook {notebook_name} created successfully")
def push_notebook(file_path, dataset_id, token):
    """Upload *file_path* to the dataset repo as ``dataset_analysis.ipynb``.

    Args:
        file_path: Local path of the generated notebook.
        dataset_id: Target dataset repo id on the Hub.
        token: User access token used to authenticate the upload.

    Returns:
        A visible gr.HTML component: a link to the pushed notebook on
        success, or an error message on failure.
    """
    notebook_name = "dataset_analysis.ipynb"
    api = HfApi(token=token)
    try:
        api.upload_file(
            path_or_fileobj=file_path,
            path_in_repo=notebook_name,
            repo_id=dataset_id,
            repo_type="dataset",
        )
    except Exception as err:
        # Upload failed (bad token, missing repo, network...): report in the UI.
        logging.error(f"Failed to push notebook: {err}")
        return gr.HTML(value="Failed to push notebook", visible=True)
    # Success path: build a direct link to the file in the dataset repo.
    link = f"https://huggingface.co/datasets/{dataset_id}/blob/main/{notebook_name}"
    return gr.HTML(
        value=f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline; text-decoration-style: dotted;">See notebook</a>',
        visible=True,
    )
def generate_notebook(dataset_id):
    """Build an EDA notebook for *dataset_id* and expose it for download.

    Only datasets with a pandas loading snippet (per /compatible-libraries)
    are supported.

    Args:
        dataset_id: Hub dataset id selected in the UI.

    Returns:
        Tuple of (gr.File, gr.Row.update): the downloadable notebook and the
        visibility update for the auth row — both hidden when the dataset is
        unavailable or has no pandas support.
    """
    hidden = gr.File(visible=False), gr.Row.update(visible=False)
    libraries = get_compatible_libraries(dataset_id)
    if not libraries:
        return hidden
    # Only pandas loading snippets are supported for now.
    pandas_library = next(
        (lib for lib in libraries.get("libraries", []) if lib["library"] == "pandas"),
        None,
    )
    if not pandas_library:
        return hidden
    # Use the first server-provided pandas loading snippet. (The previous
    # hard-coded parquet path default was dead code: always overwritten here
    # or unreachable via the early returns above.)
    first_code = pandas_library["loading_codes"][0]["code"]
    html_code = f"<iframe src='https://huggingface.co/datasets/{dataset_id}/embed/viewer' width='80%' height='560px'></iframe>"
    commands = [
        "!pip install pandas",
        first_code,
        "df.head()",
        f'from IPython.display import HTML\n\ndisplay(HTML("{html_code}"))',
        "print(df.shape)",
        "df.columns",
        "df.describe()",
        "df.info()",
        # TODO: Generate more commands according to column types for EDA and then for auto training?
    ]
    # '/' is not a valid filename character; flatten "user/name" to "user-name".
    notebook_name = f"{dataset_id.replace('/', '-')}.ipynb"
    create_notebook_file(commands, notebook_name=notebook_name)
    return gr.File(value=notebook_name, visible=True), gr.Row.update(visible=True)
# --- Gradio UI: component layout and event wiring ---
with gr.Blocks() as demo:
    gr.Markdown("# 🤖 Dataset notebook creator 🕵️")
    # Dataset picker with Hub-backed autocomplete.
    dataset_name = HuggingfaceHubSearch(
        label="Hub Dataset ID",
        placeholder="Search for dataset id on Huggingface",
        search_type="dataset",
        value="",
    )

    @gr.render(inputs=dataset_name)
    def embed(name):
        # Re-rendered whenever the selected dataset changes: shows a live
        # dataset-viewer iframe, or a placeholder when nothing is selected.
        if not name:
            return gr.Markdown("### No dataset provided")
        html_code = f"""
<iframe
  src="https://huggingface.co/datasets/{name}/embed/viewer/default/train"
  frameborder="0"
  width="100%"
  height="350px"
></iframe>
"""
        return gr.HTML(value=html_code)

    generate_btn = gr.Button("Generate notebook")
    # Hidden until a notebook has been generated (also feeds push_notebook).
    download_link = gr.File(label="Download notebook", visible=False)

    # Auth row: revealed by generate_notebook so the user can push to the Hub.
    with gr.Row(visible=False) as auth_page:
        with gr.Column():
            gr.Markdown(
                "Want to push to hub? Enter your token ([settings](https://huggingface.co/settings/tokens)):"
            )
            token_box = gr.Textbox(
                "", label="token", placeholder="hf_xxx", type="password"
            )
            auth_error = gr.Markdown("", visible=False)
    push_btn = gr.Button("Push notebook to hub", visible=False)
    output_lbl = gr.HTML(value="", visible=False)

    # Generate: produces the notebook file and reveals the auth row.
    generate_btn.click(
        generate_notebook,
        inputs=[dataset_name],
        outputs=[download_link, auth_page],
    )

    def auth(token):
        # Toggle the push button: visible only when the token box is non-empty.
        # NOTE(review): the token is not validated here — any non-empty string
        # reveals the button; real auth errors only surface on upload.
        if not token:
            return {
                auth_error: gr.Markdown(value="", visible=False),
                push_btn: gr.Button(visible=False),
            }
        return {
            auth_error: gr.Markdown(value="", visible=False),
            push_btn: gr.Button("Push notebook to hub", visible=True),
        }

    token_box.change(
        auth,
        inputs=token_box,
        outputs=[auth_error, push_btn],
    )
    # Push: uploads the generated file and shows the result link/message.
    push_btn.click(
        push_notebook,
        inputs=[download_link, dataset_name, token_box],
        outputs=output_lbl,
    )

demo.launch()