File size: 5,573 Bytes
421b068
 
 
 
ca1279e
f327376
 
7d529e0
734af25
 
 
7af3e0d
f327376
 
ca1279e
 
734af25
 
f327376
ca1279e
f327376
 
 
 
ca1279e
 
 
f327376
 
 
 
 
 
 
 
 
734af25
ca1279e
 
421b068
7d529e0
421b068
7d529e0
421b068
f327376
7d529e0
 
6d0709a
2d53b10
6d0709a
f327376
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6d0709a
7d529e0
 
ca1279e
f327376
 
 
ca1279e
f327376
 
 
 
 
 
ca1279e
 
 
 
f327376
421b068
7d529e0
ca1279e
7d529e0
f327376
 
 
 
 
 
421b068
 
 
2d53b10
421b068
7d529e0
421b068
7af3e0d
421b068
7d529e0
 
 
 
 
421b068
 
 
 
 
 
 
 
 
 
2d53b10
421b068
2d53b10
421b068
 
2d53b10
 
 
6d0709a
f327376
2d53b10
6d0709a
7d529e0
 
6d0709a
 
 
2d53b10
 
 
 
 
 
 
 
 
6d0709a
 
 
7d529e0
2d53b10
7d529e0
6d0709a
 
2d53b10
6d0709a
 
 
 
 
 
 
 
7d529e0
 
 
2d53b10
7d529e0
2d53b10
7d529e0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
import gradio as gr
from gradio_huggingfacehub_search import HuggingfaceHubSearch
import nbformat as nbf
from huggingface_hub import HfApi
from httpx import Client
import logging


"""
TODOs:
- Add more commands to the notebook
- Parametrize the commands (Move to another file)
- Let user choose the framework and get it from /compatible-libraries
- Use an LLM to suggest commands by column types
- Add commands for auto training
- Enable 'generate notebook' button only if dataset is available and supports library
"""

# Configuration
# Public Hugging Face datasets-server API (used for /compatible-libraries).
BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
HEADERS = {"Accept": "application/json", "Content-Type": "application/json"}
# Single shared HTTP client, reused for all requests (connection pooling).
client = Client(headers=HEADERS)

logging.basicConfig(level=logging.INFO)


def get_compatible_libraries(dataset: str):
    """Fetch the libraries compatible with *dataset* from the datasets server.

    Args:
        dataset: Hub dataset id (e.g. "user/dataset-name").

    Returns:
        The parsed JSON response (a dict with a "libraries" key) on success,
        or None if the request fails for any reason.
    """
    try:
        # Pass the dataset id via `params` so httpx URL-encodes it
        # (ids contain "/" and may contain other reserved characters).
        resp = client.get(
            f"{BASE_DATASETS_SERVER_URL}/compatible-libraries",
            params={"dataset": dataset},
        )
        resp.raise_for_status()
        return resp.json()
    except Exception as err:
        # Best-effort: callers treat None as "dataset unsupported".
        # Lazy %-args avoid building the message when logging is disabled.
        logging.error("Failed to fetch compatible libraries: %s", err)
        return None


def create_notebook_file(cell_commands, notebook_name):
    """Write a Jupyter notebook with one code cell per command.

    Args:
        cell_commands: Iterable of source strings, one per code cell, in order.
        notebook_name: Path of the .ipynb file to create (overwritten if it exists).
    """
    nb = nbf.v4.new_notebook()
    nb["cells"] = [nbf.v4.new_code_cell(command) for command in cell_commands]

    # Notebooks are JSON: always write UTF-8, not the platform default encoding.
    with open(notebook_name, "w", encoding="utf-8") as f:
        nbf.write(nb, f)
    logging.info("Notebook %s created successfully", notebook_name)


def push_notebook(file_path, dataset_id, token):
    """Upload a notebook file to the dataset repo on the Hugging Face Hub.

    Args:
        file_path: Local path of the .ipynb file to upload.
        dataset_id: Target dataset repo id (the file is uploaded as
            "dataset_analysis.ipynb" regardless of the local name).
        token: HF access token with write permission on the repo.

    Returns:
        A visible gr.HTML component: a link to the uploaded notebook on
        success, or a plain failure message on error.
    """
    notebook_name = "dataset_analysis.ipynb"
    api = HfApi(token=token)
    try:
        api.upload_file(
            path_or_fileobj=file_path,
            path_in_repo=notebook_name,
            repo_id=dataset_id,
            repo_type="dataset",
        )
        link = f"https://huggingface.co/datasets/{dataset_id}/blob/main/{notebook_name}"
        return gr.HTML(
            value=f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline; text-decoration-style: dotted;">See notebook</a>',
            visible=True,
        )
    except Exception as err:
        # Surface the failure in the UI instead of raising; lazy %-args for logging.
        logging.error("Failed to push notebook: %s", err)
        return gr.HTML(value="Failed to push notebook", visible=True)


def generate_notebook(dataset_id):
    """Generate an EDA notebook for *dataset_id* and expose it for download.

    Only datasets loadable with pandas (per /compatible-libraries) are
    supported; otherwise the download link and auth row stay hidden.

    Args:
        dataset_id: Hub dataset id selected in the UI.

    Returns:
        (gr.File, gr.Row) updates: the downloadable notebook file and the
        visibility of the token/auth row.
    """
    # NOTE: gr.Row(visible=...) is the Gradio 4.x update style; the
    # Component.update() classmethods were removed in 4.0 (this app already
    # requires 4.x — it uses @gr.render below).
    hidden = (gr.File(visible=False), gr.Row(visible=False))

    libraries = get_compatible_libraries(dataset_id)
    if not libraries:
        return hidden

    pandas_library = next(
        (lib for lib in libraries.get("libraries", []) if lib["library"] == "pandas"),
        None,
    )
    if not pandas_library:
        return hidden
    # Use the server-provided pandas loading snippet as the first code cell.
    first_code = pandas_library["loading_codes"][0]["code"]

    html_code = f"<iframe src='https://huggingface.co/datasets/{dataset_id}/embed/viewer' width='80%' height='560px'></iframe>"
    commands = [
        "!pip install pandas",
        first_code,
        "df.head()",
        f'from IPython.display import HTML\n\ndisplay(HTML("{html_code}"))',
        "print(df.shape)",
        "df.columns",
        "df.describe()",
        "df.info()",
        # TODO: Generate more commands according to column types for EDA and then for auto training?
    ]
    # "/" is not a valid filename character; flatten the repo id.
    notebook_name = f"{dataset_id.replace('/', '-')}.ipynb"
    create_notebook_file(commands, notebook_name=notebook_name)
    return gr.File(value=notebook_name, visible=True), gr.Row(visible=True)


# UI layout and event wiring. Component construction order inside this
# context manager defines the on-page layout.
with gr.Blocks() as demo:
    gr.Markdown("# 🤖 Dataset notebook creator 🕵️")
    dataset_name = HuggingfaceHubSearch(
        label="Hub Dataset ID",
        placeholder="Search for dataset id on Huggingface",
        search_type="dataset",
        value="",
    )

    # Re-rendered whenever the dataset selection changes: shows an embedded
    # dataset-viewer iframe for the chosen dataset.
    @gr.render(inputs=dataset_name)
    def embed(name):
        if not name:
            return gr.Markdown("### No dataset provided")
        html_code = f"""
        <iframe
          src="https://huggingface.co/datasets/{name}/embed/viewer/default/train"
          frameborder="0"
          width="100%"
          height="350px"
        ></iframe>
        """
        return gr.HTML(value=html_code)

    generate_btn = gr.Button("Generate notebook")
    # Hidden until a notebook has been generated successfully.
    download_link = gr.File(label="Download notebook", visible=False)
    with gr.Row(visible=False) as auth_page:
        with gr.Column():
            gr.Markdown(
                "Want to push to hub? Enter your token ([settings](https://huggingface.co/settings/tokens)):"
            )
            token_box = gr.Textbox(
                "", label="token", placeholder="hf_xxx", type="password"
            )
            # NOTE(review): auth_error is never given a message anywhere —
            # auth() always returns it empty/hidden; presumably reserved for
            # future token validation.
            auth_error = gr.Markdown("", visible=False)

    push_btn = gr.Button("Push notebook to hub", visible=False)
    output_lbl = gr.HTML(value="", visible=False)

    # Generates the notebook and reveals the download link + auth row.
    generate_btn.click(
        generate_notebook,
        inputs=[dataset_name],
        outputs=[download_link, auth_page],
    )

    # Show the push button only when a (non-empty) token has been entered.
    def auth(token):
        if not token:
            return {
                auth_error: gr.Markdown(value="", visible=False),
                push_btn: gr.Button(visible=False),
            }
        return {
            auth_error: gr.Markdown(value="", visible=False),
            push_btn: gr.Button("Push notebook to hub", visible=True),
        }

    token_box.change(
        auth,
        inputs=token_box,
        outputs=[auth_error, push_btn],
    )

    # download_link (a gr.File) passes the generated file's path as the
    # first argument to push_notebook.
    push_btn.click(
        push_notebook,
        inputs=[download_link, dataset_name, token_box],
        outputs=output_lbl,
    )

demo.launch()