asoria HF staff committed on
Commit
777edd0
1 Parent(s): 44cdaf2

Adding second layer to parse code to cells

Browse files
Files changed (1) hide show
  1. app.py +137 -40
app.py CHANGED
@@ -8,6 +8,8 @@ from huggingface_hub import InferenceClient
8
  import json
9
  import re
10
  import pandas as pd
 
 
11
 
12
  """
13
  TODOs:
@@ -30,6 +32,8 @@ TODOs:
30
  # Configuration
31
  BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
32
  HEADERS = {"Accept": "application/json", "Content-Type": "application/json"}
 
 
33
  client = Client(headers=HEADERS)
34
  inference_client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")
35
 
@@ -44,9 +48,12 @@ def get_compatible_libraries(dataset: str):
44
  return resp.json()
45
 
46
 
47
- def generate_eda_prompt(columns_info, df, first_code):
48
- sample_data = df.head(5).to_dict(orient="records")
49
- format_instructions = """
 
 
 
50
  The output should be a markdown code snippet formatted in the
51
  following schema, including the leading and trailing "```json" and "```":
52
 
@@ -58,7 +65,13 @@ following schema, including the leading and trailing "```json" and "```":
58
  }
59
  ]
60
  ```
61
- """
 
 
 
 
 
 
62
 
63
  prompt = """
64
  You are an expert data analyst tasked with generating an exploratory data analysis (EDA) Jupyter notebook. The data is provided as a pandas DataFrame with the following structure:
@@ -83,13 +96,11 @@ It is mandatory that you use the following code to load the dataset, DO NOT try
83
 
84
  {first_code}
85
 
86
- {format_instructions}
87
  """
88
  return prompt.format(
89
  columns_info=columns_info,
90
  sample_data=sample_data,
91
  first_code=first_code,
92
- format_instructions=format_instructions,
93
  )
94
 
95
 
@@ -141,40 +152,40 @@ def get_first_rows_as_df(dataset: str, config: str, split: str, limit: int):
141
  return features_dict, first_rows_df
142
 
143
 
 
 
 
 
 
 
 
144
  def content_from_output(output):
145
  pattern = r"`json(.*?)`"
146
- logging.info("--------> Getting data from output")
147
  match = re.search(pattern, output, re.DOTALL)
148
  if not match:
149
  pattern = r"```(.*?)```"
150
- logging.info("--------> Getting data from output, second try")
151
  match = re.search(pattern, output, re.DOTALL)
152
  if not match:
 
 
 
 
 
 
153
  raise Exception("Unable to generate jupyter notebook.")
154
- extracted_text = match.group(1)
155
- logging.info(extracted_text)
156
- content = json.loads(extracted_text)
157
- logging.info(content)
158
- return content
159
-
160
 
161
- def get_notebook_cells(prompt):
162
- messages = [{"role": "user", "content": prompt}]
163
- output = inference_client.chat_completion(messages=messages, max_tokens=2500)
164
- output = output.choices[0].message.content
165
- return content_from_output(output)
166
 
167
-
168
- def generate_notebook(dataset_id):
169
  try:
170
  libraries = get_compatible_libraries(dataset_id)
171
  except Exception as err:
172
  gr.Error("Unable to retrieve dataset info from HF Hub.")
173
  logging.error(f"Failed to fetch compatible libraries: {err}")
174
- return None
175
 
176
  if not libraries:
177
- gr.Warning("Dataset not compatible with pandas library.")
178
  logging.error(f"Dataset not compatible with pandas library")
179
  return gr.File(visible=False), gr.Row.update(visible=False)
180
 
@@ -183,29 +194,103 @@ def generate_notebook(dataset_id):
183
  None,
184
  )
185
  if not pandas_library:
186
- gr.Warning("Dataset not compatible with pandas library.")
187
- logging.error(f"Dataset not compatible with pandas library")
188
- return gr.File(visible=False), gr.Row.update(visible=False)
189
 
190
  first_config_loading_code = pandas_library["loading_codes"][0]
191
  first_code = first_config_loading_code["code"]
192
-
193
  first_config = first_config_loading_code["config_name"]
194
  first_split = list(first_config_loading_code["arguments"]["splits"].keys())[0]
195
  logging.info(f"First config: {first_config} - first split: {first_split}")
196
  first_file = f"hf://datasets/{dataset_id}/{first_config_loading_code['arguments']['splits'][first_split]}"
197
  logging.info(f"First split file: {first_file}")
198
- html_code = f"<iframe src='https://huggingface.co/datasets/{dataset_id}/embed/viewer' width='80%' height='560px'></iframe>"
199
  features, df = get_first_rows_as_df(dataset_id, first_config, first_split, 3)
200
  prompt = generate_eda_prompt(features, df, first_code)
201
- logging.info(f"Prompt: {prompt}")
202
- commands = get_notebook_cells(prompt)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
  # Adding dataset viewer on the first part
204
- commands.insert(0, {"cell_type": "code", "source": f'from IPython.display import HTML\n\ndisplay(HTML("{html_code}"))'})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
  commands.insert(0, {"cell_type": "markdown", "source": "# Dataset Viewer"})
206
  notebook_name = f"{dataset_id.replace('/', '-')}.ipynb"
207
  create_notebook_file(commands, notebook_name=notebook_name)
208
- return gr.File(value=notebook_name, visible=True), gr.Row.update(visible=True)
 
 
 
 
 
 
 
 
 
209
 
210
 
211
  with gr.Blocks() as demo:
@@ -231,8 +316,24 @@ with gr.Blocks() as demo:
231
  """
232
  return gr.HTML(value=html_code)
233
 
234
- generate_btn = gr.Button("Generate notebook")
235
- download_link = gr.File(label="Download notebook", visible=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
  with gr.Row(visible=False) as auth_page:
237
  with gr.Column():
238
  gr.Markdown(
@@ -246,11 +347,7 @@ with gr.Blocks() as demo:
246
  push_btn = gr.Button("Push notebook to hub", visible=False)
247
  output_lbl = gr.HTML(value="", visible=False)
248
 
249
- generate_btn.click(
250
- generate_notebook,
251
- inputs=[dataset_name],
252
- outputs=[download_link, auth_page],
253
- )
254
 
255
  def auth(token):
256
  if not token:
@@ -271,7 +368,7 @@ with gr.Blocks() as demo:
271
 
272
  push_btn.click(
273
  push_notebook,
274
- inputs=[download_link, dataset_name, token_box],
275
  outputs=output_lbl,
276
  )
277
 
 
8
  import json
9
  import re
10
  import pandas as pd
11
+ from gradio.data_classes import FileData
12
+
13
 
14
  """
15
  TODOs:
 
32
  # Configuration
33
  BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
34
  HEADERS = {"Accept": "application/json", "Content-Type": "application/json"}
35
+ GENERATED_TEXT = ""
36
+
37
  client = Client(headers=HEADERS)
38
  inference_client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")
39
 
 
48
  return resp.json()
49
 
50
 
51
+ def generate_mapping_prompt(code):
52
+ logging.info("Generating mapping prompt")
53
+ logging.info(code)
54
+ format_instructions = "Format the following python code to a list of cells to be used in a jupyter notebook:\n"
55
+ format_instructions += code
56
+ format_instructions += """
57
  The output should be a markdown code snippet formatted in the
58
  following schema, including the leading and trailing "```json" and "```":
59
 
 
65
  }
66
  ]
67
  ```
68
+ """
69
+
70
+ return format_instructions
71
+
72
+
73
+ def generate_eda_prompt(columns_info, df, first_code):
74
+ sample_data = df.head(5).to_dict(orient="records")
75
 
76
  prompt = """
77
  You are an expert data analyst tasked with generating an exploratory data analysis (EDA) Jupyter notebook. The data is provided as a pandas DataFrame with the following structure:
 
96
 
97
  {first_code}
98
 
 
99
  """
100
  return prompt.format(
101
  columns_info=columns_info,
102
  sample_data=sample_data,
103
  first_code=first_code,
 
104
  )
105
 
106
 
 
152
  return features_dict, first_rows_df
153
 
154
 
155
def get_txt_from_output(output):
    """Decode the JSON payload embedded in a model response.

    The fenced text is pulled out of ``output`` by ``content_from_output``
    and decoded with ``json.loads``. The decoded value is logged at INFO
    level and returned to the caller.
    """
    parsed = json.loads(content_from_output(output))
    logging.info(parsed)
    return parsed
160
+
161
+
162
  def content_from_output(output):
163
  pattern = r"`json(.*?)`"
 
164
  match = re.search(pattern, output, re.DOTALL)
165
  if not match:
166
  pattern = r"```(.*?)```"
 
167
  match = re.search(pattern, output, re.DOTALL)
168
  if not match:
169
+ try:
170
+ index = output.index("```json")
171
+ logging.info(f"Index: {index}")
172
+ return output[index + 7 :]
173
+ except:
174
+ pass
175
  raise Exception("Unable to generate jupyter notebook.")
176
+ return match.group(1)
 
 
 
 
 
177
 
 
 
 
 
 
178
 
179
+ def generate_cells(dataset_id):
 
180
  try:
181
  libraries = get_compatible_libraries(dataset_id)
182
  except Exception as err:
183
  gr.Error("Unable to retrieve dataset info from HF Hub.")
184
  logging.error(f"Failed to fetch compatible libraries: {err}")
185
+ return []
186
 
187
  if not libraries:
188
+ gr.Error("Dataset not compatible with pandas library.")
189
  logging.error(f"Dataset not compatible with pandas library")
190
  return gr.File(visible=False), gr.Row.update(visible=False)
191
 
 
194
  None,
195
  )
196
  if not pandas_library:
197
+ gr.Error("Dataset not compatible with pandas library.")
198
+ return []
 
199
 
200
  first_config_loading_code = pandas_library["loading_codes"][0]
201
  first_code = first_config_loading_code["code"]
 
202
  first_config = first_config_loading_code["config_name"]
203
  first_split = list(first_config_loading_code["arguments"]["splits"].keys())[0]
204
  logging.info(f"First config: {first_config} - first split: {first_split}")
205
  first_file = f"hf://datasets/{dataset_id}/{first_config_loading_code['arguments']['splits'][first_split]}"
206
  logging.info(f"First split file: {first_file}")
 
207
  features, df = get_first_rows_as_df(dataset_id, first_config, first_split, 3)
208
  prompt = generate_eda_prompt(features, df, first_code)
209
+ messages = [gr.ChatMessage(role="user", content=prompt)]
210
+ yield messages + [gr.ChatMessage(role="assistant", content="⏳ _Starting task..._")]
211
+
212
+ prompt_messages = [{"role": "user", "content": prompt}]
213
+ output = inference_client.chat_completion(
214
+ messages=prompt_messages, stream=True, max_tokens=2500
215
+ )
216
+
217
+ global GENERATED_TEXT
218
+ GENERATED_TEXT = ""
219
+ current_line = ""
220
+ for chunk in output:
221
+ current_line += chunk.choices[0].delta.content
222
+ if current_line.endswith("\n"):
223
+ GENERATED_TEXT += current_line
224
+ messages.append(gr.ChatMessage(role="assistant", content=current_line))
225
+ current_line = ""
226
+ yield messages
227
+ yield messages
228
+
229
+ logging.info("---> FOrmated prompt")
230
+ formatted_prompt = generate_mapping_prompt(GENERATED_TEXT)
231
+ logging.info(formatted_prompt)
232
+ prompt_messages = [{"role": "user", "content": formatted_prompt}]
233
+ yield messages + [gr.ChatMessage(role="assistant", content="⏳ _Generating notebook..._")]
234
+
235
+ output = inference_client.chat_completion(
236
+ messages=prompt_messages, stream=False, max_tokens=2500
237
+ )
238
+ cells_txt = output.choices[0].message.content
239
+ logging.info("---> Model output")
240
+ logging.info(cells_txt)
241
+
242
+
243
+ commands = get_txt_from_output(cells_txt)
244
+ html_code = f"<iframe src='https://huggingface.co/datasets/{dataset_id}/embed/viewer' width='80%' height='560px'></iframe>"
245
  # Adding dataset viewer on the first part
246
+ commands.insert(
247
+ 0,
248
+ {
249
+ "cell_type": "code",
250
+ "source": f'from IPython.display import HTML\n\ndisplay(HTML("{html_code}"))',
251
+ },
252
+ )
253
+ commands.insert(0, {"cell_type": "markdown", "source": "# Dataset Viewer"})
254
+ notebook_name = f"{dataset_id.replace('/', '-')}.ipynb"
255
+ create_notebook_file(commands, notebook_name=notebook_name)
256
+ messages.append(
257
+ gr.ChatMessage(role="user", content="Here is the generated notebook")
258
+ )
259
+ yield messages
260
+ messages.append(
261
+ gr.ChatMessage(
262
+ role="user",
263
+ content=FileData(path=notebook_name, mime_type="application/x-ipynb+json"),
264
+ )
265
+ )
266
+ yield messages
267
+
268
def write_notebook_file(dataset_id, history):
    """Write the notebook for ``dataset_id`` and attach it to the chat history.

    The cells are decoded from the module-level ``GENERATED_TEXT``. A
    "# Dataset Viewer" markdown cell and a code cell embedding the dataset
    viewer iframe are placed in front of them, and the result is handed to
    ``create_notebook_file``. Two user messages are then appended to
    ``history`` (an announcement and the notebook file), and the same
    ``history`` object is returned.

    Raises:
        Exception: if ``GENERATED_TEXT`` is empty.
    """
    if not GENERATED_TEXT:
        raise Exception("No generated notebook")

    commands = get_txt_from_output(GENERATED_TEXT)

    html_code = f"<iframe src='https://huggingface.co/datasets/{dataset_id}/embed/viewer' width='80%' height='560px'></iframe>"
    viewer_cell = {
        "cell_type": "code",
        "source": f'from IPython.display import HTML\n\ndisplay(HTML("{html_code}"))',
    }
    title_cell = {"cell_type": "markdown", "source": "# Dataset Viewer"}
    # Inserting the viewer first and the title second leaves the title on top.
    commands.insert(0, viewer_cell)
    commands.insert(0, title_cell)

    notebook_name = f"{dataset_id.replace('/', '-')}.ipynb"
    create_notebook_file(commands, notebook_name=notebook_name)

    announcement = gr.ChatMessage(role="user", content="Here is the generated notebook")
    history.append(announcement)
    attachment = gr.ChatMessage(
        role="user",
        content=FileData(path=notebook_name, mime_type="application/x-ipynb+json"),
    )
    history.append(attachment)
    return history
294
 
295
 
296
  with gr.Blocks() as demo:
 
316
  """
317
  return gr.HTML(value=html_code)
318
 
319
+ generate_cells_btn = gr.Button("Generate notebook")
320
+
321
+ chatbot = gr.Chatbot(
322
+ label="Results",
323
+ type="messages",
324
+ avatar_images=(
325
+ None,
326
+ None,
327
+ ),
328
+ )
329
+
330
+ generate_cells_btn.click(
331
+ generate_cells,
332
+ inputs=[dataset_name],
333
+ outputs=[chatbot],
334
+ )
335
+
336
+
337
  with gr.Row(visible=False) as auth_page:
338
  with gr.Column():
339
  gr.Markdown(
 
347
  push_btn = gr.Button("Push notebook to hub", visible=False)
348
  output_lbl = gr.HTML(value="", visible=False)
349
 
350
+
 
 
 
 
351
 
352
  def auth(token):
353
  if not token:
 
368
 
369
  push_btn.click(
370
  push_notebook,
371
+ inputs=[dataset_name, token_box],
372
  outputs=output_lbl,
373
  )
374