asoria HF staff committed on
Commit
44cdaf2
1 Parent(s): 810f00f

Adding TODOs

Browse files
Files changed (1) hide show
  1. app.py +52 -86
app.py CHANGED
@@ -5,19 +5,19 @@ from huggingface_hub import HfApi
5
  from httpx import Client
6
  import logging
7
  from huggingface_hub import InferenceClient
8
- import json
9
  import re
 
10
 
11
  """
12
  TODOs:
 
13
  - Refactor
14
  - Make the notebook generation more dynamic, add loading components so the UI does not freeze
15
  - Fix errors:
16
  - When generating output
17
  - When parsing output
18
  - When pushing notebook
19
- - Parametrize the commands (Move to another file)
20
- - Use an LLM to suggest commands by column types
21
  - Add target tasks to choose for the notebook:
22
  - Exploratory data analysis
23
  - Auto training
@@ -37,42 +37,15 @@ logging.basicConfig(level=logging.INFO)
37
 
38
 
39
  def get_compatible_libraries(dataset: str):
40
- resp = client.get(
41
- f"{BASE_DATASETS_SERVER_URL}/compatible-libraries?dataset={dataset}"
42
- )
43
- resp.raise_for_status()
44
- return resp.json()
45
 
46
- import pandas as pd
47
 
48
  def generate_eda_prompt(columns_info, df, first_code):
49
- # columns_info = df.dtypes.to_dict()
50
- sample_data = df.head(5).to_dict(orient='records')
51
- # prompt = (
52
- # "You are an expert data analyst tasked with generating an exploratory data analysis (EDA) jupyter notebook. "
53
- # "The data is provided as a pandas DataFrame with the following structure:\n\n"
54
- # f"Columns and Data Types:\n{columns_info}\n\n"
55
- # f"Sample Data:\n{sample_data}\n\n"
56
- # "Please create a pandas EDA notebook that includes the following:\n"
57
- # "1. Summary statistics for numerical columns.\n"
58
- # "2. Distribution plots for numerical columns.\n"
59
- # "3. Bar plots or count plots for categorical columns.\n"
60
- # "4. Correlation matrix and heatmap for numerical columns.\n"
61
- # "5. Any other relevant visualizations or analyses you deem appropriate.\n\n"
62
- # "Ensure the notebook is well-organized, with explanations for each step."
63
- # f"You can use the following code to load the dataset:\n\n{first_code}\n"
64
- # """The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":\n"
65
- # ```json
66
- # [
67
- # {
68
- # "cell_type": string // This refers either is a markdown or code cell type.
69
- # "source": list of string // This is the list of text or python code.
70
- # }
71
- # ]
72
- # ```
73
- # Do not include more information than necessary, as this will be used to generate the notebook.
74
- # """
75
- # )
76
  format_instructions = """
77
  The output should be a markdown code snippet formatted in the
78
  following schema, including the leading and trailing "```json" and "```":
@@ -81,11 +54,11 @@ following schema, including the leading and trailing "```json" and "```":
81
  [
82
  {
83
  "cell_type": string // This refers either is a markdown or code cell type.
84
- "source": list of string // This is the list of text or python code.
85
  }
86
  ]
87
  ```
88
- """
89
 
90
  prompt = """
91
  You are an expert data analyst tasked with generating an exploratory data analysis (EDA) Jupyter notebook. The data is provided as a pandas DataFrame with the following structure:
@@ -112,12 +85,22 @@ It is mandatory that you use the following code to load the dataset, DO NOT try
112
 
113
  {format_instructions}
114
  """
115
- return prompt.format(columns_info=columns_info, sample_data=sample_data, first_code=first_code, format_instructions=format_instructions)
 
 
 
 
 
 
116
 
117
  def create_notebook_file(cell_commands, notebook_name):
118
  nb = nbf.v4.new_notebook()
119
- nb["cells"] = [nbf.v4.new_code_cell(command['source']) if command['cell_type'] == 'code' else nbf.v4.new_markdown_cell(command['source']) for command in cell_commands]
120
-
 
 
 
 
121
 
122
  with open(notebook_name, "w") as f:
123
  nbf.write(nb, f)
@@ -143,62 +126,55 @@ def push_notebook(file_path, dataset_id, token):
143
  logging.error(f"Failed to push notebook: {err}")
144
  return gr.HTML(value="Failed to push notebook", visible=True)
145
 
146
- def get_first_rows_as_df(dataset: str, config: str, split: str, limit:int):
147
- resp = client.get(f"{BASE_DATASETS_SERVER_URL}/first-rows?dataset={dataset}&config={config}&split={split}")
 
 
 
148
  resp.raise_for_status()
149
  content = resp.json()
150
  rows = content["rows"]
151
- rows = [row['row'] for row in rows]
152
- first_rows_df = pd.DataFrame.from_dict(rows).sample(frac = 1).head(limit)
153
- features = content['features']
154
- features_dict = {feature['name']: feature['type'] for feature in features}
155
  return features_dict, first_rows_df
156
 
157
 
158
  def content_from_output(output):
159
- pattern = r'`json(.*?)`'
160
  logging.info("--------> Getting data from output")
161
  match = re.search(pattern, output, re.DOTALL)
162
  if not match:
163
- pattern = r'```(.*?)```'
164
  logging.info("--------> Getting data from output, second try")
165
  match = re.search(pattern, output, re.DOTALL)
166
- if not match:
167
  raise Exception("Unable to generate jupyter notebook.")
168
  extracted_text = match.group(1)
169
  logging.info(extracted_text)
 
 
 
170
 
171
 
172
  def get_notebook_cells(prompt):
173
  messages = [{"role": "user", "content": prompt}]
174
  output = inference_client.chat_completion(messages=messages, max_tokens=2500)
175
- output = (output.choices[0].message.content)
176
- logging.info(output)
177
- pattern = r'`json(.*?)`'
178
- logging.info("--------> Getting data from output")
179
- match = re.search(pattern, output, re.DOTALL)
180
- if not match:
181
- raise Exception("Unable to generate jupyter notebook.")
182
- extracted_text = match.group(1)
183
- logging.info(extracted_text)
184
- content = json.loads(extracted_text)
185
- logging.info(content)
186
- return content
187
 
188
- def generate_notebook(dataset_id):
189
-
190
- #TODO: Load dataframe from notebook here
191
- # generate_eda_prompt
192
 
 
193
  try:
194
  libraries = get_compatible_libraries(dataset_id)
195
  except Exception as err:
196
- gr.Error('Unable to retrieve dataset info from HF Hub.')
197
  logging.error(f"Failed to fetch compatible libraries: {err}")
198
  return None
199
 
200
  if not libraries:
201
- gr.Warning('Dataset not compatible with pandas library.')
202
  logging.error(f"Dataset not compatible with pandas library")
203
  return gr.File(visible=False), gr.Row.update(visible=False)
204
 
@@ -207,15 +183,15 @@ def generate_notebook(dataset_id):
207
  None,
208
  )
209
  if not pandas_library:
210
- gr.Warning('Dataset not compatible with pandas library.')
211
  logging.error(f"Dataset not compatible with pandas library")
212
  return gr.File(visible=False), gr.Row.update(visible=False)
213
 
214
- first_config_loading_code = pandas_library['loading_codes'][0]
215
- first_code = first_config_loading_code['code']
216
 
217
- first_config = first_config_loading_code['config_name']
218
- first_split = list(first_config_loading_code['arguments']['splits'].keys())[0]
219
  logging.info(f"First config: {first_config} - first split: {first_split}")
220
  first_file = f"hf://datasets/{dataset_id}/{first_config_loading_code['arguments']['splits'][first_split]}"
221
  logging.info(f"First split file: {first_file}")
@@ -224,19 +200,9 @@ def generate_notebook(dataset_id):
224
  prompt = generate_eda_prompt(features, df, first_code)
225
  logging.info(f"Prompt: {prompt}")
226
  commands = get_notebook_cells(prompt)
227
- # TODO: Generate this commands using InferenceClient
228
- # commands = [
229
- # "!pip install pandas",
230
- # "import pandas as pd"
231
- # f"df = pd.read_parquet('{first_file}')",
232
- # "df.head()",
233
- # f'from IPython.display import HTML\n\ndisplay(HTML("{html_code}"))',
234
- # "print(df.shape)",
235
- # "df.columns",
236
- # "df.describe()",
237
- # "df.info()",
238
- # # TODO: Generate more commands according to column types for EDA and then for auto training?
239
- # ]
240
  notebook_name = f"{dataset_id.replace('/', '-')}.ipynb"
241
  create_notebook_file(commands, notebook_name=notebook_name)
242
  return gr.File(value=notebook_name, visible=True), gr.Row.update(visible=True)
 
5
  from httpx import Client
6
  import logging
7
  from huggingface_hub import InferenceClient
8
+ import json
9
  import re
10
+ import pandas as pd
11
 
12
  """
13
  TODOs:
14
+ - Need feedback on the output commands to validate if operations are appropriate to data types
15
  - Refactor
16
  - Make the notebook generation more dynamic, add loading components so the UI does not freeze
17
  - Fix errors:
18
  - When generating output
19
  - When parsing output
20
  - When pushing notebook
 
 
21
  - Add target tasks to choose for the notebook:
22
  - Exploratory data analysis
23
  - Auto training
 
37
 
38
 
39
def get_compatible_libraries(dataset: str):
    """Query the datasets server for the libraries compatible with `dataset`.

    Returns the parsed JSON payload from the /compatible-libraries endpoint.
    Raises an HTTP status error when the server responds with an error code.
    """
    url = f"{BASE_DATASETS_SERVER_URL}/compatible-libraries?dataset={dataset}"
    response = client.get(url)
    response.raise_for_status()
    return response.json()
45
 
 
46
 
47
  def generate_eda_prompt(columns_info, df, first_code):
48
+ sample_data = df.head(5).to_dict(orient="records")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  format_instructions = """
50
  The output should be a markdown code snippet formatted in the
51
  following schema, including the leading and trailing "```json" and "```":
 
54
  [
55
  {
56
  "cell_type": string // This refers either is a markdown or code cell type.
57
+ "source": list of string separated by comma // This is the list of text or python code.
58
  }
59
  ]
60
  ```
61
+ """
62
 
63
  prompt = """
64
  You are an expert data analyst tasked with generating an exploratory data analysis (EDA) Jupyter notebook. The data is provided as a pandas DataFrame with the following structure:
 
85
 
86
  {format_instructions}
87
  """
88
+ return prompt.format(
89
+ columns_info=columns_info,
90
+ sample_data=sample_data,
91
+ first_code=first_code,
92
+ format_instructions=format_instructions,
93
+ )
94
+
95
 
96
def create_notebook_file(cell_commands, notebook_name):
    """Build a Jupyter notebook from a list of cell specs and write it to disk.

    Each entry in `cell_commands` is a dict with a "cell_type" ("code" or
    anything else, treated as markdown) and a "source" payload.
    """
    notebook = nbf.v4.new_notebook()
    cells = []
    for command in cell_commands:
        if command["cell_type"] == "code":
            cells.append(nbf.v4.new_code_cell(command["source"]))
        else:
            cells.append(nbf.v4.new_markdown_cell(command["source"]))
    notebook["cells"] = cells

    with open(notebook_name, "w") as f:
        nbf.write(notebook, f)
 
126
  logging.error(f"Failed to push notebook: {err}")
127
  return gr.HTML(value="Failed to push notebook", visible=True)
128
 
129
+
130
def get_first_rows_as_df(dataset: str, config: str, split: str, limit: int):
    """Fetch sample rows for a dataset split from the datasets server.

    Returns a tuple (features_dict, first_rows_df): a mapping of feature name
    to its reported type, and a shuffled DataFrame truncated to `limit` rows.
    """
    url = (
        f"{BASE_DATASETS_SERVER_URL}/first-rows"
        f"?dataset={dataset}&config={config}&split={split}"
    )
    resp = client.get(url)
    resp.raise_for_status()
    payload = resp.json()
    records = [entry["row"] for entry in payload["rows"]]
    # Shuffle before truncating so the sample is not just the leading rows.
    first_rows_df = pd.DataFrame.from_dict(records).sample(frac=1).head(limit)
    features_dict = {feat["name"]: feat["type"] for feat in payload["features"]}
    return features_dict, first_rows_df
142
 
143
 
144
def content_from_output(output):
    """Pull the fenced JSON payload out of a model response and parse it.

    Looks for a ```json fenced block first; falls back to any fenced block.
    Raises Exception when no fenced payload can be located.
    """
    logging.info("--------> Getting data from output")
    match = re.search(r"`json(.*?)`", output, re.DOTALL)
    if match is None:
        logging.info("--------> Getting data from output, second try")
        match = re.search(r"```(.*?)```", output, re.DOTALL)
        if match is None:
            raise Exception("Unable to generate jupyter notebook.")
    payload = match.group(1)
    logging.info(payload)
    parsed = json.loads(payload)
    logging.info(parsed)
    return parsed
159
 
160
 
161
def get_notebook_cells(prompt):
    """Ask the inference model for notebook cells and parse its reply.

    Sends `prompt` as a single user message and returns the list of cell
    dicts extracted from the completion via content_from_output.
    """
    chat = [{"role": "user", "content": prompt}]
    completion = inference_client.chat_completion(messages=chat, max_tokens=2500)
    reply = completion.choices[0].message.content
    return content_from_output(reply)
 
 
 
 
 
 
 
 
 
 
166
 
 
 
 
 
167
 
168
+ def generate_notebook(dataset_id):
169
  try:
170
  libraries = get_compatible_libraries(dataset_id)
171
  except Exception as err:
172
+ gr.Error("Unable to retrieve dataset info from HF Hub.")
173
  logging.error(f"Failed to fetch compatible libraries: {err}")
174
  return None
175
 
176
  if not libraries:
177
+ gr.Warning("Dataset not compatible with pandas library.")
178
  logging.error(f"Dataset not compatible with pandas library")
179
  return gr.File(visible=False), gr.Row.update(visible=False)
180
 
 
183
  None,
184
  )
185
  if not pandas_library:
186
+ gr.Warning("Dataset not compatible with pandas library.")
187
  logging.error(f"Dataset not compatible with pandas library")
188
  return gr.File(visible=False), gr.Row.update(visible=False)
189
 
190
+ first_config_loading_code = pandas_library["loading_codes"][0]
191
+ first_code = first_config_loading_code["code"]
192
 
193
+ first_config = first_config_loading_code["config_name"]
194
+ first_split = list(first_config_loading_code["arguments"]["splits"].keys())[0]
195
  logging.info(f"First config: {first_config} - first split: {first_split}")
196
  first_file = f"hf://datasets/{dataset_id}/{first_config_loading_code['arguments']['splits'][first_split]}"
197
  logging.info(f"First split file: {first_file}")
 
200
  prompt = generate_eda_prompt(features, df, first_code)
201
  logging.info(f"Prompt: {prompt}")
202
  commands = get_notebook_cells(prompt)
203
+ # Adding dataset viewer on the first part
204
+ commands.insert(0, {"cell_type": "code", "source": f'from IPython.display import HTML\n\ndisplay(HTML("{html_code}"))'})
205
+ commands.insert(0, {"cell_type": "markdown", "source": "# Dataset Viewer"})
 
 
 
 
 
 
 
 
 
 
206
  notebook_name = f"{dataset_id.replace('/', '-')}.ipynb"
207
  create_notebook_file(commands, notebook_name=notebook_name)
208
  return gr.File(value=notebook_name, visible=True), gr.Row.update(visible=True)