Spaces:
Sleeping
Sleeping
Adding TODOs
Browse files
app.py
CHANGED
@@ -5,19 +5,19 @@ from huggingface_hub import HfApi
|
|
5 |
from httpx import Client
|
6 |
import logging
|
7 |
from huggingface_hub import InferenceClient
|
8 |
-
import json
|
9 |
import re
|
|
|
10 |
|
11 |
"""
|
12 |
TODOs:
|
|
|
13 |
- Refactor
|
14 |
- Make the notebook generation more dynamic, add loading components to do not freeze the UI
|
15 |
- Fix errors:
|
16 |
- When generating output
|
17 |
- When parsing output
|
18 |
- When pushing notebook
|
19 |
-
- Parametrize the commands (Move to another file)
|
20 |
-
- Use an LLM to suggest commands by column types
|
21 |
- Add target tasks to choose for the notebook:
|
22 |
- Exploratory data analysis
|
23 |
- Auto training
|
@@ -37,42 +37,15 @@ logging.basicConfig(level=logging.INFO)
|
|
37 |
|
38 |
|
39 |
def get_compatible_libraries(dataset: str):
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
|
46 |
-
import pandas as pd
|
47 |
|
48 |
def generate_eda_prompt(columns_info, df, first_code):
|
49 |
-
|
50 |
-
sample_data = df.head(5).to_dict(orient='records')
|
51 |
-
# prompt = (
|
52 |
-
# "You are an expert data analyst tasked with generating an exploratory data analysis (EDA) jupyter notebook. "
|
53 |
-
# "The data is provided as a pandas DataFrame with the following structure:\n\n"
|
54 |
-
# f"Columns and Data Types:\n{columns_info}\n\n"
|
55 |
-
# f"Sample Data:\n{sample_data}\n\n"
|
56 |
-
# "Please create a pandas EDA notebook that includes the following:\n"
|
57 |
-
# "1. Summary statistics for numerical columns.\n"
|
58 |
-
# "2. Distribution plots for numerical columns.\n"
|
59 |
-
# "3. Bar plots or count plots for categorical columns.\n"
|
60 |
-
# "4. Correlation matrix and heatmap for numerical columns.\n"
|
61 |
-
# "5. Any other relevant visualizations or analyses you deem appropriate.\n\n"
|
62 |
-
# "Ensure the notebook is well-organized, with explanations for each step."
|
63 |
-
# f"You can use the following code to load the dataset:\n\n{first_code}\n"
|
64 |
-
# """The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":\n"
|
65 |
-
# ```json
|
66 |
-
# [
|
67 |
-
# {
|
68 |
-
# "cell_type": string // This refers either is a markdown or code cell type.
|
69 |
-
# "source": list of string // This is the list of text or python code.
|
70 |
-
# }
|
71 |
-
# ]
|
72 |
-
# ```
|
73 |
-
# Do not include more information than necessary, as this will be used to generate the notebook.
|
74 |
-
# """
|
75 |
-
# )
|
76 |
format_instructions = """
|
77 |
The output should be a markdown code snippet formatted in the
|
78 |
following schema, including the leading and trailing "```json" and "```":
|
@@ -81,11 +54,11 @@ following schema, including the leading and trailing "```json" and "```":
|
|
81 |
[
|
82 |
{
|
83 |
"cell_type": string // This refers either is a markdown or code cell type.
|
84 |
-
"source": list of string
|
85 |
}
|
86 |
]
|
87 |
```
|
88 |
-
"""
|
89 |
|
90 |
prompt = """
|
91 |
You are an expert data analyst tasked with generating an exploratory data analysis (EDA) Jupyter notebook. The data is provided as a pandas DataFrame with the following structure:
|
@@ -112,12 +85,22 @@ It is mandatory that you use the following code to load the dataset, DO NOT try
|
|
112 |
|
113 |
{format_instructions}
|
114 |
"""
|
115 |
-
return prompt.format(
|
|
|
|
|
|
|
|
|
|
|
|
|
116 |
|
117 |
def create_notebook_file(cell_commands, notebook_name):
|
118 |
nb = nbf.v4.new_notebook()
|
119 |
-
nb["cells"] = [
|
120 |
-
|
|
|
|
|
|
|
|
|
121 |
|
122 |
with open(notebook_name, "w") as f:
|
123 |
nbf.write(nb, f)
|
@@ -143,62 +126,55 @@ def push_notebook(file_path, dataset_id, token):
|
|
143 |
logging.error(f"Failed to push notebook: {err}")
|
144 |
return gr.HTML(value="Failed to push notebook", visible=True)
|
145 |
|
146 |
-
|
147 |
-
|
|
|
|
|
|
|
148 |
resp.raise_for_status()
|
149 |
content = resp.json()
|
150 |
rows = content["rows"]
|
151 |
-
rows = [row[
|
152 |
-
first_rows_df = pd.DataFrame.from_dict(rows).sample(frac
|
153 |
-
features = content[
|
154 |
-
features_dict = {feature[
|
155 |
return features_dict, first_rows_df
|
156 |
|
157 |
|
158 |
def content_from_output(output):
|
159 |
-
pattern = r
|
160 |
logging.info("--------> Getting data from output")
|
161 |
match = re.search(pattern, output, re.DOTALL)
|
162 |
if not match:
|
163 |
-
pattern = r
|
164 |
logging.info("--------> Getting data from output, second try")
|
165 |
match = re.search(pattern, output, re.DOTALL)
|
166 |
-
if
|
167 |
raise Exception("Unable to generate jupyter notebook.")
|
168 |
extracted_text = match.group(1)
|
169 |
logging.info(extracted_text)
|
|
|
|
|
|
|
170 |
|
171 |
|
172 |
def get_notebook_cells(prompt):
|
173 |
messages = [{"role": "user", "content": prompt}]
|
174 |
output = inference_client.chat_completion(messages=messages, max_tokens=2500)
|
175 |
-
output =
|
176 |
-
|
177 |
-
pattern = r'`json(.*?)`'
|
178 |
-
logging.info("--------> Getting data from output")
|
179 |
-
match = re.search(pattern, output, re.DOTALL)
|
180 |
-
if not match:
|
181 |
-
raise Exception("Unable to generate jupyter notebook.")
|
182 |
-
extracted_text = match.group(1)
|
183 |
-
logging.info(extracted_text)
|
184 |
-
content = json.loads(extracted_text)
|
185 |
-
logging.info(content)
|
186 |
-
return content
|
187 |
|
188 |
-
def generate_notebook(dataset_id):
|
189 |
-
|
190 |
-
#TODO: Load dataframe from notebook here
|
191 |
-
# generate_eda_prompt
|
192 |
|
|
|
193 |
try:
|
194 |
libraries = get_compatible_libraries(dataset_id)
|
195 |
except Exception as err:
|
196 |
-
gr.Error(
|
197 |
logging.error(f"Failed to fetch compatible libraries: {err}")
|
198 |
return None
|
199 |
|
200 |
if not libraries:
|
201 |
-
gr.Warning(
|
202 |
logging.error(f"Dataset not compatible with pandas library")
|
203 |
return gr.File(visible=False), gr.Row.update(visible=False)
|
204 |
|
@@ -207,15 +183,15 @@ def generate_notebook(dataset_id):
|
|
207 |
None,
|
208 |
)
|
209 |
if not pandas_library:
|
210 |
-
gr.Warning(
|
211 |
logging.error(f"Dataset not compatible with pandas library")
|
212 |
return gr.File(visible=False), gr.Row.update(visible=False)
|
213 |
|
214 |
-
first_config_loading_code = pandas_library[
|
215 |
-
first_code = first_config_loading_code[
|
216 |
|
217 |
-
first_config = first_config_loading_code[
|
218 |
-
first_split = list(first_config_loading_code[
|
219 |
logging.info(f"First config: {first_config} - first split: {first_split}")
|
220 |
first_file = f"hf://datasets/{dataset_id}/{first_config_loading_code['arguments']['splits'][first_split]}"
|
221 |
logging.info(f"First split file: {first_file}")
|
@@ -224,19 +200,9 @@ def generate_notebook(dataset_id):
|
|
224 |
prompt = generate_eda_prompt(features, df, first_code)
|
225 |
logging.info(f"Prompt: {prompt}")
|
226 |
commands = get_notebook_cells(prompt)
|
227 |
-
#
|
228 |
-
|
229 |
-
|
230 |
-
# "import pandas as pd"
|
231 |
-
# f"df = pd.read_parquet('{first_file}')",
|
232 |
-
# "df.head()",
|
233 |
-
# f'from IPython.display import HTML\n\ndisplay(HTML("{html_code}"))',
|
234 |
-
# "print(df.shape)",
|
235 |
-
# "df.columns",
|
236 |
-
# "df.describe()",
|
237 |
-
# "df.info()",
|
238 |
-
# # TODO: Generate more commands according to column types for EDA and then for auto training?
|
239 |
-
# ]
|
240 |
notebook_name = f"{dataset_id.replace('/', '-')}.ipynb"
|
241 |
create_notebook_file(commands, notebook_name=notebook_name)
|
242 |
return gr.File(value=notebook_name, visible=True), gr.Row.update(visible=True)
|
|
|
5 |
from httpx import Client
|
6 |
import logging
|
7 |
from huggingface_hub import InferenceClient
|
8 |
+
import json
|
9 |
import re
|
10 |
+
import pandas as pd
|
11 |
|
12 |
"""
|
13 |
TODOs:
|
14 |
+
- Need feedback on the output commands to validate if operations are appropriate to the data types
|
15 |
- Refactor
|
16 |
- Make the notebook generation more dynamic, and add loading components so the UI does not freeze
|
17 |
- Fix errors:
|
18 |
- When generating output
|
19 |
- When parsing output
|
20 |
- When pushing notebook
|
|
|
|
|
21 |
- Add target tasks to choose for the notebook:
|
22 |
- Exploratory data analysis
|
23 |
- Auto training
|
|
|
37 |
|
38 |
|
39 |
def get_compatible_libraries(dataset: str):
|
40 |
+
resp = client.get(
|
41 |
+
f"{BASE_DATASETS_SERVER_URL}/compatible-libraries?dataset={dataset}"
|
42 |
+
)
|
43 |
+
resp.raise_for_status()
|
44 |
+
return resp.json()
|
45 |
|
|
|
46 |
|
47 |
def generate_eda_prompt(columns_info, df, first_code):
|
48 |
+
sample_data = df.head(5).to_dict(orient="records")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
format_instructions = """
|
50 |
The output should be a markdown code snippet formatted in the
|
51 |
following schema, including the leading and trailing "```json" and "```":
|
|
|
54 |
[
|
55 |
{
|
56 |
"cell_type": string // This refers either is a markdown or code cell type.
|
57 |
+
"source": list of string separated by comma // This is the list of text or python code.
|
58 |
}
|
59 |
]
|
60 |
```
|
61 |
+
"""
|
62 |
|
63 |
prompt = """
|
64 |
You are an expert data analyst tasked with generating an exploratory data analysis (EDA) Jupyter notebook. The data is provided as a pandas DataFrame with the following structure:
|
|
|
85 |
|
86 |
{format_instructions}
|
87 |
"""
|
88 |
+
return prompt.format(
|
89 |
+
columns_info=columns_info,
|
90 |
+
sample_data=sample_data,
|
91 |
+
first_code=first_code,
|
92 |
+
format_instructions=format_instructions,
|
93 |
+
)
|
94 |
+
|
95 |
|
96 |
def create_notebook_file(cell_commands, notebook_name):
|
97 |
nb = nbf.v4.new_notebook()
|
98 |
+
nb["cells"] = [
|
99 |
+
nbf.v4.new_code_cell(command["source"])
|
100 |
+
if command["cell_type"] == "code"
|
101 |
+
else nbf.v4.new_markdown_cell(command["source"])
|
102 |
+
for command in cell_commands
|
103 |
+
]
|
104 |
|
105 |
with open(notebook_name, "w") as f:
|
106 |
nbf.write(nb, f)
|
|
|
126 |
logging.error(f"Failed to push notebook: {err}")
|
127 |
return gr.HTML(value="Failed to push notebook", visible=True)
|
128 |
|
129 |
+
|
130 |
+
def get_first_rows_as_df(dataset: str, config: str, split: str, limit: int):
|
131 |
+
resp = client.get(
|
132 |
+
f"{BASE_DATASETS_SERVER_URL}/first-rows?dataset={dataset}&config={config}&split={split}"
|
133 |
+
)
|
134 |
resp.raise_for_status()
|
135 |
content = resp.json()
|
136 |
rows = content["rows"]
|
137 |
+
rows = [row["row"] for row in rows]
|
138 |
+
first_rows_df = pd.DataFrame.from_dict(rows).sample(frac=1).head(limit)
|
139 |
+
features = content["features"]
|
140 |
+
features_dict = {feature["name"]: feature["type"] for feature in features}
|
141 |
return features_dict, first_rows_df
|
142 |
|
143 |
|
144 |
def content_from_output(output):
|
145 |
+
pattern = r"`json(.*?)`"
|
146 |
logging.info("--------> Getting data from output")
|
147 |
match = re.search(pattern, output, re.DOTALL)
|
148 |
if not match:
|
149 |
+
pattern = r"```(.*?)```"
|
150 |
logging.info("--------> Getting data from output, second try")
|
151 |
match = re.search(pattern, output, re.DOTALL)
|
152 |
+
if not match:
|
153 |
raise Exception("Unable to generate jupyter notebook.")
|
154 |
extracted_text = match.group(1)
|
155 |
logging.info(extracted_text)
|
156 |
+
content = json.loads(extracted_text)
|
157 |
+
logging.info(content)
|
158 |
+
return content
|
159 |
|
160 |
|
161 |
def get_notebook_cells(prompt):
|
162 |
messages = [{"role": "user", "content": prompt}]
|
163 |
output = inference_client.chat_completion(messages=messages, max_tokens=2500)
|
164 |
+
output = output.choices[0].message.content
|
165 |
+
return content_from_output(output)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
166 |
|
|
|
|
|
|
|
|
|
167 |
|
168 |
+
def generate_notebook(dataset_id):
|
169 |
try:
|
170 |
libraries = get_compatible_libraries(dataset_id)
|
171 |
except Exception as err:
|
172 |
+
gr.Error("Unable to retrieve dataset info from HF Hub.")
|
173 |
logging.error(f"Failed to fetch compatible libraries: {err}")
|
174 |
return None
|
175 |
|
176 |
if not libraries:
|
177 |
+
gr.Warning("Dataset not compatible with pandas library.")
|
178 |
logging.error(f"Dataset not compatible with pandas library")
|
179 |
return gr.File(visible=False), gr.Row.update(visible=False)
|
180 |
|
|
|
183 |
None,
|
184 |
)
|
185 |
if not pandas_library:
|
186 |
+
gr.Warning("Dataset not compatible with pandas library.")
|
187 |
logging.error(f"Dataset not compatible with pandas library")
|
188 |
return gr.File(visible=False), gr.Row.update(visible=False)
|
189 |
|
190 |
+
first_config_loading_code = pandas_library["loading_codes"][0]
|
191 |
+
first_code = first_config_loading_code["code"]
|
192 |
|
193 |
+
first_config = first_config_loading_code["config_name"]
|
194 |
+
first_split = list(first_config_loading_code["arguments"]["splits"].keys())[0]
|
195 |
logging.info(f"First config: {first_config} - first split: {first_split}")
|
196 |
first_file = f"hf://datasets/{dataset_id}/{first_config_loading_code['arguments']['splits'][first_split]}"
|
197 |
logging.info(f"First split file: {first_file}")
|
|
|
200 |
prompt = generate_eda_prompt(features, df, first_code)
|
201 |
logging.info(f"Prompt: {prompt}")
|
202 |
commands = get_notebook_cells(prompt)
|
203 |
+
# Adding dataset viewer on the first part
|
204 |
+
commands.insert(0, {"cell_type": "code", "source": f'from IPython.display import HTML\n\ndisplay(HTML("{html_code}"))'})
|
205 |
+
commands.insert(0, {"cell_type": "markdown", "source": "# Dataset Viewer"})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
206 |
notebook_name = f"{dataset_id.replace('/', '-')}.ipynb"
|
207 |
create_notebook_file(commands, notebook_name=notebook_name)
|
208 |
return gr.File(value=notebook_name, visible=True), gr.Row.update(visible=True)
|