Spaces:
Sleeping
Sleeping
Adding TODOs
Browse files
app.py
CHANGED
@@ -5,19 +5,19 @@ from huggingface_hub import HfApi
|
|
5 |
from httpx import Client
|
6 |
import logging
|
7 |
from huggingface_hub import InferenceClient
|
8 |
-
import json
|
9 |
import re
|
|
|
10 |
|
11 |
"""
|
12 |
TODOs:
|
|
|
13 |
- Refactor
|
14 |
- Make the notebook generation more dynamic, add loading components to do not freeze the UI
|
15 |
- Fix errors:
|
16 |
- When generating output
|
17 |
- When parsing output
|
18 |
- When pushing notebook
|
19 |
-
- Parametrize the commands (Move to another file)
|
20 |
-
- Use an LLM to suggest commands by column types
|
21 |
- Add target tasks to choose for the notebook:
|
22 |
- Exploratory data analysis
|
23 |
- Auto training
|
@@ -37,42 +37,15 @@ logging.basicConfig(level=logging.INFO)
|
|
37 |
|
38 |
|
39 |
def get_compatible_libraries(dataset: str):
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
|
46 |
-
import pandas as pd
|
47 |
|
48 |
def generate_eda_prompt(columns_info, df, first_code):
|
49 |
-
|
50 |
-
sample_data = df.head(5).to_dict(orient='records')
|
51 |
-
# prompt = (
|
52 |
-
# "You are an expert data analyst tasked with generating an exploratory data analysis (EDA) jupyter notebook. "
|
53 |
-
# "The data is provided as a pandas DataFrame with the following structure:\n\n"
|
54 |
-
# f"Columns and Data Types:\n{columns_info}\n\n"
|
55 |
-
# f"Sample Data:\n{sample_data}\n\n"
|
56 |
-
# "Please create a pandas EDA notebook that includes the following:\n"
|
57 |
-
# "1. Summary statistics for numerical columns.\n"
|
58 |
-
# "2. Distribution plots for numerical columns.\n"
|
59 |
-
# "3. Bar plots or count plots for categorical columns.\n"
|
60 |
-
# "4. Correlation matrix and heatmap for numerical columns.\n"
|
61 |
-
# "5. Any other relevant visualizations or analyses you deem appropriate.\n\n"
|
62 |
-
# "Ensure the notebook is well-organized, with explanations for each step."
|
63 |
-
# f"You can use the following code to load the dataset:\n\n{first_code}\n"
|
64 |
-
# """The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":\n"
|
65 |
-
# ```json
|
66 |
-
# [
|
67 |
-
# {
|
68 |
-
# "cell_type": string // This refers either is a markdown or code cell type.
|
69 |
-
# "source": list of string // This is the list of text or python code.
|
70 |
-
# }
|
71 |
-
# ]
|
72 |
-
# ```
|
73 |
-
# Do not include more information than necessary, as this will be used to generate the notebook.
|
74 |
-
# """
|
75 |
-
# )
|
76 |
format_instructions = """
|
77 |
The output should be a markdown code snippet formatted in the
|
78 |
following schema, including the leading and trailing "```json" and "```":
|
@@ -81,11 +54,11 @@ following schema, including the leading and trailing "```json" and "```":
|
|
81 |
[
|
82 |
{
|
83 |
"cell_type": string // This refers either is a markdown or code cell type.
|
84 |
-
"source": list of string
|
85 |
}
|
86 |
]
|
87 |
```
|
88 |
-
"""
|
89 |
|
90 |
prompt = """
|
91 |
You are an expert data analyst tasked with generating an exploratory data analysis (EDA) Jupyter notebook. The data is provided as a pandas DataFrame with the following structure:
|
@@ -112,12 +85,22 @@ It is mandatory that you use the following code to load the dataset, DO NOT try
|
|
112 |
|
113 |
{format_instructions}
|
114 |
"""
|
115 |
-
return prompt.format(
|
|
|
|
|
|
|
|
|
|
|
|
|
116 |
|
117 |
def create_notebook_file(cell_commands, notebook_name):
|
118 |
nb = nbf.v4.new_notebook()
|
119 |
-
nb["cells"] = [
|
120 |
-
|
|
|
|
|
|
|
|
|
121 |
|
122 |
with open(notebook_name, "w") as f:
|
123 |
nbf.write(nb, f)
|
@@ -143,62 +126,55 @@ def push_notebook(file_path, dataset_id, token):
|
|
143 |
logging.error(f"Failed to push notebook: {err}")
|
144 |
return gr.HTML(value="Failed to push notebook", visible=True)
|
145 |
|
146 |
-
|
147 |
-
|
|
|
|
|
|
|
148 |
resp.raise_for_status()
|
149 |
content = resp.json()
|
150 |
rows = content["rows"]
|
151 |
-
rows = [row[
|
152 |
-
first_rows_df = pd.DataFrame.from_dict(rows).sample(frac
|
153 |
-
features = content[
|
154 |
-
features_dict = {feature[
|
155 |
return features_dict, first_rows_df
|
156 |
|
157 |
|
158 |
def content_from_output(output):
|
159 |
-
pattern = r
|
160 |
logging.info("--------> Getting data from output")
|
161 |
match = re.search(pattern, output, re.DOTALL)
|
162 |
if not match:
|
163 |
-
pattern = r
|
164 |
logging.info("--------> Getting data from output, second try")
|
165 |
match = re.search(pattern, output, re.DOTALL)
|
166 |
-
if
|
167 |
raise Exception("Unable to generate jupyter notebook.")
|
168 |
extracted_text = match.group(1)
|
169 |
logging.info(extracted_text)
|
|
|
|
|
|
|
170 |
|
171 |
|
172 |
def get_notebook_cells(prompt):
|
173 |
messages = [{"role": "user", "content": prompt}]
|
174 |
output = inference_client.chat_completion(messages=messages, max_tokens=2500)
|
175 |
-
output =
|
176 |
-
|
177 |
-
pattern = r'`json(.*?)`'
|
178 |
-
logging.info("--------> Getting data from output")
|
179 |
-
match = re.search(pattern, output, re.DOTALL)
|
180 |
-
if not match:
|
181 |
-
raise Exception("Unable to generate jupyter notebook.")
|
182 |
-
extracted_text = match.group(1)
|
183 |
-
logging.info(extracted_text)
|
184 |
-
content = json.loads(extracted_text)
|
185 |
-
logging.info(content)
|
186 |
-
return content
|
187 |
|
188 |
-
def generate_notebook(dataset_id):
|
189 |
-
|
190 |
-
#TODO: Load dataframe from notebook here
|
191 |
-
# generate_eda_prompt
|
192 |
|
|
|
193 |
try:
|
194 |
libraries = get_compatible_libraries(dataset_id)
|
195 |
except Exception as err:
|
196 |
-
gr.Error(
|
197 |
logging.error(f"Failed to fetch compatible libraries: {err}")
|
198 |
return None
|
199 |
|
200 |
if not libraries:
|
201 |
-
gr.Warning(
|
202 |
logging.error(f"Dataset not compatible with pandas library")
|
203 |
return gr.File(visible=False), gr.Row.update(visible=False)
|
204 |
|
@@ -207,15 +183,15 @@ def generate_notebook(dataset_id):
|
|
207 |
None,
|
208 |
)
|
209 |
if not pandas_library:
|
210 |
-
gr.Warning(
|
211 |
logging.error(f"Dataset not compatible with pandas library")
|
212 |
return gr.File(visible=False), gr.Row.update(visible=False)
|
213 |
|
214 |
-
first_config_loading_code = pandas_library[
|
215 |
-
first_code = first_config_loading_code[
|
216 |
|
217 |
-
first_config = first_config_loading_code[
|
218 |
-
first_split = list(first_config_loading_code[
|
219 |
logging.info(f"First config: {first_config} - first split: {first_split}")
|
220 |
first_file = f"hf://datasets/{dataset_id}/{first_config_loading_code['arguments']['splits'][first_split]}"
|
221 |
logging.info(f"First split file: {first_file}")
|
@@ -224,19 +200,9 @@ def generate_notebook(dataset_id):
|
|
224 |
prompt = generate_eda_prompt(features, df, first_code)
|
225 |
logging.info(f"Prompt: {prompt}")
|
226 |
commands = get_notebook_cells(prompt)
|
227 |
-
#
|
228 |
-
|
229 |
-
|
230 |
-
# "import pandas as pd"
|
231 |
-
# f"df = pd.read_parquet('{first_file}')",
|
232 |
-
# "df.head()",
|
233 |
-
# f'from IPython.display import HTML\n\ndisplay(HTML("{html_code}"))',
|
234 |
-
# "print(df.shape)",
|
235 |
-
# "df.columns",
|
236 |
-
# "df.describe()",
|
237 |
-
# "df.info()",
|
238 |
-
# # TODO: Generate more commands according to column types for EDA and then for auto training?
|
239 |
-
# ]
|
240 |
notebook_name = f"{dataset_id.replace('/', '-')}.ipynb"
|
241 |
create_notebook_file(commands, notebook_name=notebook_name)
|
242 |
return gr.File(value=notebook_name, visible=True), gr.Row.update(visible=True)
|
|
|
5 |
from httpx import Client
|
6 |
import logging
|
7 |
from huggingface_hub import InferenceClient
|
8 |
+
import json
|
9 |
import re
|
10 |
+
import pandas as pd
|
11 |
|
12 |
"""
|
13 |
TODOs:
|
14 |
+
- Need feedback on the output commands to validate if operations are appropriate to the data types
|
15 |
- Refactor
|
16 |
- Make the notebook generation more dynamic, and add loading components so the UI does not freeze
|
17 |
- Fix errors:
|
18 |
- When generating output
|
19 |
- When parsing output
|
20 |
- When pushing notebook
|
|
|
|
|
21 |
- Add target tasks to choose for the notebook:
|
22 |
- Exploratory data analysis
|
23 |
- Auto training
|
|
|
37 |
|
38 |
|
39 |
def get_compatible_libraries(dataset: str):
|
40 |
+
resp = client.get(
|
41 |
+
f"{BASE_DATASETS_SERVER_URL}/compatible-libraries?dataset={dataset}"
|
42 |
+
)
|
43 |
+
resp.raise_for_status()
|
44 |
+
return resp.json()
|
45 |
|
|
|
46 |
|
47 |
def generate_eda_prompt(columns_info, df, first_code):
|
48 |
+
sample_data = df.head(5).to_dict(orient="records")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
format_instructions = """
|
50 |
The output should be a markdown code snippet formatted in the
|
51 |
following schema, including the leading and trailing "```json" and "```":
|
|
|
54 |
[
|
55 |
{
|
56 |
"cell_type": string // This refers either is a markdown or code cell type.
|
57 |
+
"source": list of string separated by comma // This is the list of text or python code.
|
58 |
}
|
59 |
]
|
60 |
```
|
61 |
+
"""
|
62 |
|
63 |
prompt = """
|
64 |
You are an expert data analyst tasked with generating an exploratory data analysis (EDA) Jupyter notebook. The data is provided as a pandas DataFrame with the following structure:
|
|
|
85 |
|
86 |
{format_instructions}
|
87 |
"""
|
88 |
+
return prompt.format(
|
89 |
+
columns_info=columns_info,
|
90 |
+
sample_data=sample_data,
|
91 |
+
first_code=first_code,
|
92 |
+
format_instructions=format_instructions,
|
93 |
+
)
|
94 |
+
|
95 |
|
96 |
def create_notebook_file(cell_commands, notebook_name):
|
97 |
nb = nbf.v4.new_notebook()
|
98 |
+
nb["cells"] = [
|
99 |
+
nbf.v4.new_code_cell(command["source"])
|
100 |
+
if command["cell_type"] == "code"
|
101 |
+
else nbf.v4.new_markdown_cell(command["source"])
|
102 |
+
for command in cell_commands
|
103 |
+
]
|
104 |
|
105 |
with open(notebook_name, "w") as f:
|
106 |
nbf.write(nb, f)
|
|
|
126 |
logging.error(f"Failed to push notebook: {err}")
|
127 |
return gr.HTML(value="Failed to push notebook", visible=True)
|
128 |
|
129 |
+
|
130 |
+
def get_first_rows_as_df(dataset: str, config: str, split: str, limit: int):
|
131 |
+
resp = client.get(
|
132 |
+
f"{BASE_DATASETS_SERVER_URL}/first-rows?dataset={dataset}&config={config}&split={split}"
|
133 |
+
)
|
134 |
resp.raise_for_status()
|
135 |
content = resp.json()
|
136 |
rows = content["rows"]
|
137 |
+
rows = [row["row"] for row in rows]
|
138 |
+
first_rows_df = pd.DataFrame.from_dict(rows).sample(frac=1).head(limit)
|
139 |
+
features = content["features"]
|
140 |
+
features_dict = {feature["name"]: feature["type"] for feature in features}
|
141 |
return features_dict, first_rows_df
|
142 |
|
143 |
|
144 |
def content_from_output(output):
|
145 |
+
pattern = r"`json(.*?)`"
|
146 |
logging.info("--------> Getting data from output")
|
147 |
match = re.search(pattern, output, re.DOTALL)
|
148 |
if not match:
|
149 |
+
pattern = r"```(.*?)```"
|
150 |
logging.info("--------> Getting data from output, second try")
|
151 |
match = re.search(pattern, output, re.DOTALL)
|
152 |
+
if not match:
|
153 |
raise Exception("Unable to generate jupyter notebook.")
|
154 |
extracted_text = match.group(1)
|
155 |
logging.info(extracted_text)
|
156 |
+
content = json.loads(extracted_text)
|
157 |
+
logging.info(content)
|
158 |
+
return content
|
159 |
|
160 |
|
161 |
def get_notebook_cells(prompt):
|
162 |
messages = [{"role": "user", "content": prompt}]
|
163 |
output = inference_client.chat_completion(messages=messages, max_tokens=2500)
|
164 |
+
output = output.choices[0].message.content
|
165 |
+
return content_from_output(output)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
166 |
|
|
|
|
|
|
|
|
|
167 |
|
168 |
+
def generate_notebook(dataset_id):
|
169 |
try:
|
170 |
libraries = get_compatible_libraries(dataset_id)
|
171 |
except Exception as err:
|
172 |
+
gr.Error("Unable to retrieve dataset info from HF Hub.")
|
173 |
logging.error(f"Failed to fetch compatible libraries: {err}")
|
174 |
return None
|
175 |
|
176 |
if not libraries:
|
177 |
+
gr.Warning("Dataset not compatible with pandas library.")
|
178 |
logging.error(f"Dataset not compatible with pandas library")
|
179 |
return gr.File(visible=False), gr.Row.update(visible=False)
|
180 |
|
|
|
183 |
None,
|
184 |
)
|
185 |
if not pandas_library:
|
186 |
+
gr.Warning("Dataset not compatible with pandas library.")
|
187 |
logging.error(f"Dataset not compatible with pandas library")
|
188 |
return gr.File(visible=False), gr.Row.update(visible=False)
|
189 |
|
190 |
+
first_config_loading_code = pandas_library["loading_codes"][0]
|
191 |
+
first_code = first_config_loading_code["code"]
|
192 |
|
193 |
+
first_config = first_config_loading_code["config_name"]
|
194 |
+
first_split = list(first_config_loading_code["arguments"]["splits"].keys())[0]
|
195 |
logging.info(f"First config: {first_config} - first split: {first_split}")
|
196 |
first_file = f"hf://datasets/{dataset_id}/{first_config_loading_code['arguments']['splits'][first_split]}"
|
197 |
logging.info(f"First split file: {first_file}")
|
|
|
200 |
prompt = generate_eda_prompt(features, df, first_code)
|
201 |
logging.info(f"Prompt: {prompt}")
|
202 |
commands = get_notebook_cells(prompt)
|
203 |
+
# Adding dataset viewer on the first part
|
204 |
+
commands.insert(0, {"cell_type": "code", "source": f'from IPython.display import HTML\n\ndisplay(HTML("{html_code}"))'})
|
205 |
+
commands.insert(0, {"cell_type": "markdown", "source": "# Dataset Viewer"})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
206 |
notebook_name = f"{dataset_id.replace('/', '-')}.ipynb"
|
207 |
create_notebook_file(commands, notebook_name=notebook_name)
|
208 |
return gr.File(value=notebook_name, visible=True), gr.Row.update(visible=True)
|