asoria (HF staff) committed
Commit 45f97ba
1 Parent(s): f5da21f
Files changed (1)
  1. app.py +80 -64
app.py CHANGED
@@ -44,20 +44,24 @@ logging.basicConfig(level=logging.INFO)
 
 
 def get_compatible_libraries(dataset: str):
-    resp = client.get(
-        f"{BASE_DATASETS_SERVER_URL}/compatible-libraries?dataset={dataset}"
-    )
-    resp.raise_for_status()
-    return resp.json()
+    try:
+        response = client.get(
+            f"{BASE_DATASETS_SERVER_URL}/compatible-libraries?dataset={dataset}"
+        )
+        response.raise_for_status()
+        return response.json()
+    except Exception as e:
+        logging.error(f"Error fetching compatible libraries: {e}")
+        raise
 
 
 def create_notebook_file(cell_commands, notebook_name):
     nb = nbf.v4.new_notebook()
     nb["cells"] = [
-        nbf.v4.new_code_cell(command["source"])
-        if command["cell_type"] == "code"
-        else nbf.v4.new_markdown_cell(command["source"])
-        for command in cell_commands
+        nbf.v4.new_code_cell(cmd["source"])
+        if cmd["cell_type"] == "code"
+        else nbf.v4.new_markdown_cell(cmd["source"])
+        for cmd in cell_commands
     ]
 
     with open(notebook_name, "w") as f:
@@ -65,45 +69,51 @@ def create_notebook_file(cell_commands, notebook_name):
     logging.info(f"Notebook {notebook_name} created successfully")
 
 
-def push_notebook(file_path, dataset_id, token):
-    notebook_name = "dataset_analysis.ipynb"
-    api = HfApi(token=token)
+def get_first_rows_as_df(dataset: str, config: str, split: str, limit: int):
     try:
-        api.upload_file(
-            path_or_fileobj=file_path,
-            path_in_repo=notebook_name,
-            repo_id=dataset_id,
-            repo_type="dataset",
-        )
-        link = f"https://huggingface.co/datasets/{dataset_id}/blob/main/{notebook_name}"
-        return gr.HTML(
-            value=f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline; text-decoration-style: dotted;">See notebook</a>',
-            visible=True,
+        resp = client.get(
+            f"{BASE_DATASETS_SERVER_URL}/first-rows?dataset={dataset}&config={config}&split={split}"
        )
-    except Exception as err:
-        logging.error(f"Failed to push notebook: {err}")
-        return gr.HTML(value="Failed to push notebook", visible=True)
+        resp.raise_for_status()
+        content = resp.json()
+        rows = content["rows"]
+        rows = [row["row"] for row in rows]
+        first_rows_df = pd.DataFrame.from_dict(rows).sample(frac=1).head(limit)
+        features = content["features"]
+        features_dict = {feature["name"]: feature["type"] for feature in features}
+        return features_dict, first_rows_df
+    except Exception as e:
+        logging.error(f"Error fetching first rows: {e}")
+        raise
 
 
-def get_first_rows_as_df(dataset: str, config: str, split: str, limit: int):
-    resp = client.get(
-        f"{BASE_DATASETS_SERVER_URL}/first-rows?dataset={dataset}&config={config}&split={split}"
-    )
-    resp.raise_for_status()
-    content = resp.json()
-    rows = content["rows"]
-    rows = [row["row"] for row in rows]
-    first_rows_df = pd.DataFrame.from_dict(rows).sample(frac=1).head(limit)
-    features = content["features"]
-    features_dict = {feature["name"]: feature["type"] for feature in features}
-    return features_dict, first_rows_df
+def get_txt_from_output(output):
+    try:
+        extracted_text = extract_content_from_output(output)
+        content = json.loads(extracted_text)
+        logging.info(content)
+        return content
+    except Exception as e:
+        gr.Error("Error when parsing notebook, try again.")
+        logging.error(f"Failed to fetch compatible libraries: {e}")
+        raise
 
 
-def get_txt_from_output(output):
-    extracted_text = content_from_output(output)
-    content = json.loads(extracted_text)
-    logging.info(content)
-    return content
+def extract_content_from_output(output):
+    patterns = [r"`json(.*?)`", r"```(.*?)```"]
+
+    for pattern in patterns:
+        match = re.search(pattern, output, re.DOTALL)
+        if match:
+            return match.group(1)
+
+    try:
+        index = output.index("```json")
+        logging.info(f"Index: {index}")
+        return output[index + 7 :]
+    except ValueError:
+        logging.error("Unable to generate Jupyter notebook.")
+        raise
 
 
 def content_from_output(output):
@@ -123,18 +133,26 @@ def content_from_output(output):
     return match.group(1)
 
 
-def generate_eda_cells(dataset_id):
-    for messages in generate_cells(dataset_id, generate_eda_prompt):
+def generate_eda_cells(dataset_id, profile: gr.OAuthProfile | None):
+    for messages in generate_cells(dataset_id, generate_eda_prompt, "eda"):
         yield messages, gr.update(visible=False), None  # Keep button hidden
 
-    yield messages, gr.update(visible=True), f"{dataset_id.replace('/', '-')}.ipynb"
+    yield (
+        messages,
+        gr.update(visible=profile and dataset_id.split("/")[0] == profile.username),
+        f"{dataset_id.replace('/', '-')}-eda.ipynb",
+    )
 
 
-def generate_embedding_cells(dataset_id):
-    for messages in generate_cells(dataset_id, generate_embedding_prompt):
+def generate_embedding_cells(dataset_id, profile: gr.OAuthProfile | None):
+    for messages in generate_cells(dataset_id, generate_embedding_prompt, "embedding"):
         yield messages, gr.update(visible=False), None  # Keep button hidden
 
-    yield messages, gr.update(visible=True), f"{dataset_id.replace('/', '-')}.ipynb"
+    yield (
+        messages,
+        gr.update(visible=profile and dataset_id.split("/")[0] == profile.username),
+        f"{dataset_id.replace('/', '-')}-embedding.ipynb",
+    )
 
 
 def push_to_hub(
@@ -149,6 +167,7 @@ def push_to_hub(
         yield history + [
            gr.ChatMessage(role="assistant", content="⏳ _Login to push to hub..._")
         ]
+        return
     logging.info(f"Profile: {profile}, token: {oauth_token.token}")
 
     notebook_name = "dataset_analysis.ipynb"
@@ -165,15 +184,16 @@ def push_to_hub(
         logging.info(f"Notebook pushed to hub: {link}")
         yield history + [
             gr.ChatMessage(
-                role="assistant", content=f"[Here is the generated notebook]({link})"
+                role="user",
+                content=f"[See the notebook on the Hub]({link})",
             )
         ]
-    except Exception as err:
-        logging.info("Failed to push notebook", err)
-        yield history + [gr.ChatMessage(role="assistant", content=err)]
+    except Exception as e:
+        logging.info("Failed to push notebook", e)
+        yield history + [gr.ChatMessage(role="assistant", content=e)]
 
 
-def generate_cells(dataset_id, prompt_fn):
+def generate_cells(dataset_id, prompt_fn, notebook_type="eda"):
     try:
         libraries = get_compatible_libraries(dataset_id)
     except Exception as err:
@@ -198,12 +218,8 @@ def generate_cells(dataset_id, prompt_fn):
     first_code = first_config_loading_code["code"]
     first_config = first_config_loading_code["config_name"]
     first_split = list(first_config_loading_code["arguments"]["splits"].keys())[0]
-    logging.info(f"First config: {first_config} - first split: {first_split}")
-    first_file = f"hf://datasets/{dataset_id}/{first_config_loading_code['arguments']['splits'][first_split]}"
-    logging.info(f"First split file: {first_file}")
     features, df = get_first_rows_as_df(dataset_id, first_config, first_split, 3)
-    sample_data = df.head(5).to_dict(orient="records")
-    prompt = prompt_fn(features, sample_data, first_code)
+    prompt = prompt_fn(features, df.head(5).to_dict(orient="records"), first_code)
     messages = [gr.ChatMessage(role="user", content=prompt)]
     yield messages + [gr.ChatMessage(role="assistant", content="⏳ _Starting task..._")]
 
@@ -240,7 +256,7 @@ def generate_cells(dataset_id, prompt_fn):
 
     commands = get_txt_from_output(cells_txt)
     html_code = f"<iframe src='https://huggingface.co/datasets/{dataset_id}/embed/viewer' width='80%' height='560px'></iframe>"
-    # Adding dataset viewer on the first part
+
     commands.insert(
         0,
         {
@@ -249,10 +265,10 @@
         },
     )
     commands.insert(0, {"cell_type": "markdown", "source": "# Dataset Viewer"})
-    notebook_name = f"{dataset_id.replace('/', '-')}.ipynb"
+    notebook_name = f"{dataset_id.replace('/', '-')}-{notebook_type}.ipynb"
     create_notebook_file(commands, notebook_name=notebook_name)
     messages.append(
-        gr.ChatMessage(role="user", content="Here is the generated notebook")
+        gr.ChatMessage(role="user", content="Here is the generated notebook file")
    )
     yield messages
     messages.append(
@@ -264,8 +280,8 @@
     yield messages
 
 
-def comming_soon_message():
-    gr.Info("Comming soon")
+def coming_soon_message():
+    return gr.Info("Coming soon")
 
 
 with gr.Blocks(fill_height=True) as demo:
@@ -322,7 +338,7 @@ with gr.Blocks(fill_height=True) as demo:
         outputs=[chatbot, push_btn, notebook_file],
     )
 
-    generate_training_btn.click(comming_soon_message, inputs=[], outputs=[])
+    generate_training_btn.click(coming_soon_message, inputs=[], outputs=[])
     push_btn.click(
         push_to_hub,
         inputs=[
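
Below the diff, a few standalone sketches for context; none of this is part of the commit. First, a minimal sketch of the two datasets-server endpoints that get_compatible_libraries() and get_first_rows_as_df() wrap. It assumes client is an httpx.Client and that BASE_DATASETS_SERVER_URL points at the public server; the dataset id, config, and split names are hypothetical examples, and the parsing mirrors only the fields the app actually reads.

import httpx
import pandas as pd

BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
client = httpx.Client()
dataset = "scikit-learn/iris"  # hypothetical example dataset

# /compatible-libraries lists loaders (pandas, datasets, ...) with loading code
libraries = client.get(
    f"{BASE_DATASETS_SERVER_URL}/compatible-libraries?dataset={dataset}"
).json()

# /first-rows nests each record under a "row" key and describes column types;
# config/split names vary per dataset
content = client.get(
    f"{BASE_DATASETS_SERVER_URL}/first-rows?dataset={dataset}&config=default&split=train"
).json()
df = pd.DataFrame.from_dict([row["row"] for row in content["rows"]])
features = {f["name"]: f["type"] for f in content["features"]}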
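The body of the `with open(notebook_name, "w")` block in create_notebook_file is cut off by the hunk context above. A sketch of the cell_commands payload it expects and the standard nbformat write call; the nbf.write line is an assumption about the truncated body.

import nbformat as nbf

# cell_commands: the list-of-dicts shape generate_cells() builds
cell_commands = [
    {"cell_type": "markdown", "source": "# Dataset Viewer"},
    {"cell_type": "code", "source": "import pandas as pd"},
]

nb = nbf.v4.new_notebook()
nb["cells"] = [
    nbf.v4.new_code_cell(cmd["source"])
    if cmd["cell_type"] == "code"
    else nbf.v4.new_markdown_cell(cmd["source"])
    for cmd in cell_commands
]

with open("example-eda.ipynb", "w") as f:
    nbf.write(nb, f)  # assumed content of the truncated `with open` block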
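A sketch of the extraction step: the first pattern in the new extract_content_from_output() already captures the body of a ```json fence (a backtick followed by "json" matches inside the fence itself), and get_txt_from_output() then json.loads the capture into the cell list that create_notebook_file() consumes. The sample model output is invented for illustration.

import json
import re

output = """Here is the notebook:
```json
[{"cell_type": "markdown", "source": "# EDA"},
 {"cell_type": "code", "source": "df.describe()"}]
```"""

cells = None
for pattern in [r"`json(.*?)`", r"```(.*?)```"]:
    match = re.search(pattern, output, re.DOTALL)
    if match:
        cells = json.loads(match.group(1))  # capture runs up to the closing fence
        break

print(cells[0]["cell_type"])  # -> markdown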
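Finally, the new gr.update(visible=...) rule in the generators shows the download/push controls only when the logged-in user owns the dataset namespace. A minimal sketch of that check, using SimpleNamespace as a stand-in for gr.OAuthProfile:

from types import SimpleNamespace

def push_button_visible(dataset_id: str, profile) -> bool:
    # dataset ids are "<namespace>/<name>"; compare the namespace to the username
    return bool(profile and dataset_id.split("/")[0] == profile.username)

profile = SimpleNamespace(username="asoria")
assert push_button_visible("asoria/some-dataset", profile)
assert not push_button_visible("other-user/some-dataset", profile)
assert not push_button_visible("asoria/some-dataset", None)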