Aziz Alto committed on
Commit
32a1d6d
•
1 Parent(s): d56ba03

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +477 -0
app.py ADDED
@@ -0,0 +1,477 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ from functools import lru_cache
3
+
4
+ import pandas as pd
5
+ import streamlit as st
6
+ import streamlit_ace as stace
7
+ import duckdb
8
+ import numpy as np # for user session
9
+ import scipy # for user session
10
+ import plotly_express
11
+ import plotly.express as px # for user session
12
+ import plotly.figure_factory as ff # for user session
13
+ import matplotlib.pyplot as plt # for user session
14
+ import sklearn
15
+ from ydata_profiling import ProfileReport
16
+ from streamlit_pandas_profiling import st_profile_report
17
+
18
# Page-level configuration and heading.
# The icon was garbled into mojibake by a bad decode; it is the magnifying-glass emoji.
st.set_page_config(page_title="PySQLify", page_icon="🔎", layout="wide")
st.title("PySQLify")
st.write("_Data Analysis_ Tool")

# Notebook-style aliases: code typed into the Python cells is executed with this
# module's globals, so `p(...)` and `print(...)` render through Streamlit.
# NOTE: `display` is rebound further down by `def display(df)`.
p = st.write
print = st.write
display = st.write
25
+
26
@st.cache_data
def _read_csv(f, **kwargs):
    """Read a CSV source into a DataFrame and trim whitespace from column names.

    Args:
        f: Whatever ``pandas.read_csv`` accepts as its first argument (the
            callers in this file pass an uploaded file object or a URL string).
        **kwargs: Forwarded unchanged to ``pandas.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame``. Malformed rows are skipped
        (``on_bad_lines="skip"``).
    """
    df = pd.read_csv(f, on_bad_lines="skip", **kwargs)
    # clean: only string labels can be stripped; non-string labels (for example
    # the integer labels produced when a caller passes ``header=None``) are
    # kept as they are instead of raising AttributeError.
    df.columns = [c.strip() if isinstance(c, str) else c for c in df.columns]
    return df
32
+
33
+
34
def timer(func):
    """Decorate ``func`` so that its wall-clock duration is written to the page.

    After every call the elapsed time is rendered with ``st.write`` as
    ``<seconds>s.`` (two decimals, in backticks).

    Args:
        func: The callable to time.

    Returns:
        A wrapper that forwards all arguments to ``func`` and returns its
        result. If ``func`` raises, the exception propagates and no timing is
        written.
    """
    from functools import wraps  # local import: the file only imports lru_cache

    @wraps(func)  # keep the wrapped function's name/docstring for debugging
    def wrapper_function(*args, **kwargs):
        # perf_counter is monotonic, so the measurement cannot go negative
        # when the system clock is adjusted.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        st.write(f"`{(time.perf_counter() - start_time):.2f}s.`")
        return result

    return wrapper_function
40
+
41
+
42
# Sample datasets offered in the "Select a sample dataset" dropdown:
# display name -> URL of a CSV file.
_DR5HN_CSV_ROOT = "https://raw.githubusercontent.com/dr5hn/countries-states-cities-database/master/csv"

SAMPLE_DATA = {
    "Churn dataset": "https://raw.githubusercontent.com/AtashfarazNavid/MachineLearing-ChurnModeling/main/Streamlit-WebApp-1/Churn.csv",
    "Periodic Table": "https://gist.githubusercontent.com/GoodmanSciences/c2dd862cd38f21b0ad36b8f96b4bf1ee/raw/1d92663004489a5b6926e944c1b3d9ec5c40900e/Periodic%2520Table%2520of%2520Elements.csv",
    "Movies": "https://raw.githubusercontent.com/reisanar/datasets/master/HollywoodMovies.csv",
    "Iris Flower": "https://gist.githubusercontent.com/netj/8836201/raw/6f9306ad21398ea43cba4f7d537619d0e07d5ae3/iris.csv",
    "World Population": "https://gist.githubusercontent.com/curran/13d30e855d48cdd6f22acdf0afe27286/raw/0635f14817ec634833bb904a47594cc2f5f9dbf8/worldcities_clean.csv",
    "Country Table": "https://raw.githubusercontent.com/datasciencedojo/datasets/master/WorldDBTables/CountryTable.csv",
    "World Cities": f"{_DR5HN_CSV_ROOT}/cities.csv",
    "World States": f"{_DR5HN_CSV_ROOT}/states.csv",
    "World Countries": f"{_DR5HN_CSV_ROOT}/countries.csv",
}
53
+
54
+
55
def read_data():
    """Ask the user for a dataset and return it as a DataFrame.

    Three inputs are rendered side by side: a file uploader, a URL text box
    and a dropdown of ``SAMPLE_DATA``. When more than one is filled in, the
    later one wins (sample dataset over URL over uploaded file). The script
    run is stopped with ``st.stop()`` until a source has been provided, and
    also when the source cannot be parsed.

    Returns:
        The DataFrame produced by ``_read_csv`` for the chosen source.
    """
    txt = "Upload a data file (supported files: .csv)"
    placeholder = st.empty()
    with placeholder:
        col1, col2, col3 = st.columns([3, 2, 1])
        with col1:
            file_ = st.file_uploader(txt, help="TODO: .tsv, .xls, .xlsx")
        with col2:
            url = st.text_input(
                "Read from a URL",
                placeholder="Enter URL (supported types: .csv and .tsv)",
            )
            # Pasted URLs often carry stray whitespace that would break the fetch.
            url = url.strip() if url else url
            if url:
                file_ = url
        with col3:
            selected = st.selectbox("Select a sample dataset", options=[""] + list(SAMPLE_DATA))
            if selected:
                file_ = SAMPLE_DATA[selected]

    if not file_:
        st.stop()

    # A source was chosen: remove the three input widgets from the page.
    placeholder.empty()
    # kwargs = {"skiprows": st.number_input("skip header", value=0, max_value=10)}
    kwargs = {"skiprows": 0}
    try:
        return _read_csv(file_, **kwargs)
    except Exception as e:
        st.warning("Unsupported file type!")
        # Network errors, bad URLs and parse errors all end up here; show the
        # real cause instead of discarding it.
        st.caption(f"`{type(e).__name__}: {e}`")
        st.stop()
85
+
86
+
87
def display(df):
    """Render ``df`` with its shape and, on request, its column types.

    The table and a ``shape`` caption go to the main area. A sidebar checkbox
    ("view data types") toggles a column/dtype table in the sidebar.
    """
    show_dtypes = st.sidebar.checkbox("view data types")
    st.dataframe(df, use_container_width=True)

    # info
    st.markdown(f"> <sup>shape `{df.shape}`</sup>", unsafe_allow_html=True)

    if not show_dtypes:
        return

    # One row per column: its name and its pandas dtype.
    dtype_rows = [
        {"Column": column, "Type": dtype}
        for column, dtype in df.dtypes.to_dict().items()
    ]
    dtype_table = pd.DataFrame(dtype_rows)
    st.sidebar.subheader("TABLE DETAILS")
    st.sidebar.write(dtype_table)
100
+
101
+
102
def code_editor(language, hint, show_panel, key=None, content=None):
    """Render an Ace editor cell and return the text it currently holds.

    Args:
        language: Editor language mode; ``"sql"`` selects the
            ``solarized_dark`` default theme, anything else ``chrome``.
        hint: Placeholder text shown in the empty editor.
        show_panel: When falsy, the "CELL CONFIG" expander is removed again
            after its widgets have been created, so their current values
            still configure the editor.
        key: Widget key of the editor; also used as the suffix of the
            config-widget keys.
        content: Initial editor text; a falsy value means an empty editor.

    Returns:
        Whatever ``stace.st_ace`` returns for the editor.
    """
    # Spawn a new Ace editor
    panel = st.empty()

    initial_theme = "chrome" if language != "sql" else "solarized_dark"

    # NOTE(review): indentation was lost in this copy of the file; the height
    # slider is assumed to sit directly in the expander, below both columns.
    with panel.expander("CELL CONFIG"):
        # configs
        theme_choices = [initial_theme] + stace.THEMES
        keybinding_choices = [stace.KEYBINDINGS[-2]] + stace.KEYBINDINGS
        left, right = st.columns(2)
        with left:
            theme = st.selectbox("Theme", options=theme_choices, key=f"{language}1{key}")
            tab_size = st.slider("Tab size", min_value=1, max_value=8, value=4, key=f"{language}2{key}")
        with right:
            keybinding = st.selectbox("Keybinding", options=keybinding_choices, key=f"{language}3{key}")
            font_size = st.slider("Font size", min_value=5, max_value=24, value=14, key=f"{language}4{key}")
        height = st.slider("Editor height", value=130, max_value=777, key=f"{language}5{key}")

    if not show_panel:
        panel.empty()

    return stace.st_ace(
        value=content or "",
        language=language,
        height=height,
        show_gutter=False,
        placeholder=hint,
        keybinding=keybinding,
        theme=theme,
        font_size=font_size,
        tab_size=tab_size,
        key=key,
    )
141
+
142
+
143
@st.cache_data
def query_data(sql, df):
    """Run ``sql`` through DuckDB and return the result as a DataFrame.

    ``df`` is never referenced by name in this body.
    NOTE(review): DuckDB presumably resolves the table name ``df`` used in
    the SQL text to this argument — confirm before renaming or removing it.

    Returns:
        The query result as a ``pandas.DataFrame``, or ``None`` after showing
        an "Invalid Query!" warning when anything raises.
    """
    try:
        result = duckdb.query(sql).df()
    except Exception:
        st.warning("Invalid Query!")
        # st.stop()
        return None
    return result
150
+
151
+
152
def download(df, key, save_as="results.csv"):
    """Render a "Download" button that serves ``df`` as a UTF-8 CSV file.

    Args:
        df: DataFrame to export (written with ``DataFrame.to_csv()`` defaults).
        key: Streamlit widget key for the button.
        save_as: File name offered to the browser.
    """
    # -- to download
    csv_bytes = df.to_csv().encode("utf-8")
    st.download_button("Download", csv_bytes, save_as, "text/csv", key=key)
166
+
167
+
168
def display_results(query: str, result: pd.DataFrame, key: str):
    """Show a query result table, its shape and a download button.

    Args:
        query: The query text that produced ``result`` (not used here).
        result: The DataFrame to show. ``None`` is tolerated because
            ``query_data`` returns ``None`` after warning about an invalid
            query; in that case nothing is rendered.
        key: Widget key handed to the download button.
    """
    if result is None:
        # Nothing to show; without this guard ``result.shape`` below raises
        # AttributeError and the page fails on every invalid query.
        return
    st.dataframe(result, use_container_width=True)
    st.markdown(f"> `{result.shape}`")
    download(result, key=key)
172
+
173
+
174
@timer
def run_python_script(user_script, key):
    """Execute the text of a Python cell and render its output.

    How the text is interpreted:
      * starts with ``st.`` or contains ``;`` -> executed as written, split on
        ``;`` into statements that run one after another;
      * ends with ``?`` -> every ``?`` is removed and the rest is passed to
        ``st.help`` (like ``obj?`` in Jupyter);
      * anything else -> wrapped in ``st.write(...)``.

    Known limitation (unchanged): the split on ``;`` is purely textual, so a
    semicolon inside a string literal also splits the statement.

    Args:
        user_script: The cell text.
        key: Widget key for the "Show error" button shown on failure.
    """
    if user_script.startswith("st.") or ";" in user_script:
        py = user_script
    elif user_script.endswith("?"):  # -- same as ? in Jupyter Notebook
        in_ = user_script.replace("?", "")
        py = f"st.help({in_})"
    else:
        py = f"st.write({user_script})"

    # SECURITY: this runs arbitrary user-supplied code with the full privileges
    # of the server process. That is the purpose of the tool, but it must not
    # be exposed to untrusted users without sandboxing.
    #
    # One explicit namespace is shared by all statements of the cell. Relying
    # on exec()'s implicit locals stops carrying variables from one statement
    # to the next on Python 3.13+ (PEP 667).
    cell_locals = {}
    try:
        for cmd in py.split(";"):
            # "a = 1; b = 2" yields " b = 2"; the leading blank would be an
            # IndentationError, so trim each statement and skip empty ones.
            cmd = cmd.strip()
            if cmd:
                exec(cmd, globals(), cell_locals)
    except Exception as e:
        c1, c2 = st.columns(2)
        c1.warning("Wrong Python command.")
        if c2.button("Show error", key=key):
            st.exception(e)
192
+
193
+
194
@st.cache_resource
def data_profiler(df):
    """Build the ydata-profiling report for ``df`` (titled "Profiling Report")."""
    report = ProfileReport(df, title="Profiling Report")
    return report
197
+
198
+
199
def docs():
    """Render the collapsible README and the "more examples" checkbox.

    Returns:
        The state of the "Show more code examples" checkbox.
    """
    content = """

# What

Upload a dataset to process (manipulate/analyze) it using SQL and Python, similar to running Jupyter Notebooks.
To get started, drag and drop the dataset file, read from a URL, or select a sample dataset. To load a new dataset, refresh the webpage.
> <sub>[_src code_ here](https://github.com/iamaziz/sqlify)</sub>

More public datasets available [here](https://github.com/fivethirtyeight/data).

# Usage

Example usage

> After loading the sample Iris dataset from sklearn (or select it from the dropdown list), the lines below can be executed inside a Python cell:

```python

from sklearn.datasets import load_iris;
from sklearn import tree;
iris = load_iris();
X, y = iris.data, iris.target;
clf = tree.DecisionTreeClassifier(max_depth=4);
clf = clf.fit(X, y);
plt.figure(figsize=(7,3));
fig, ax = plt.subplots()
tree.plot_tree(clf, filled=True, fontsize=4);
st.pyplot(fig)
```

Which outputs the tree below:

> <img width="1000" alt="image" src="https://user-images.githubusercontent.com/3298308/222992623-1dba9bad-4858-43b6-84bf-9d7cf78d61f7.png">

# SCREENSHOTS

## _EXAMPLE 1_
![image](https://user-images.githubusercontent.com/3298308/222946054-a92ea42c-ffe6-4958-900b-2b72056216f8.png)

## _EXAMPLE 2_
![image](https://user-images.githubusercontent.com/3298308/222947315-f2c06063-dd18-4215-bbab-c1b2f3f00888.png)
![image](https://user-images.githubusercontent.com/3298308/222947321-c7e38d9d-7274-4368-91c1-1548b0da14dc.png)

## _EXAMPLE 3_
![image](https://user-images.githubusercontent.com/3298308/222949287-2024a75f-04db-4861-93b5-c43d206e2dc6.png)

## _EXAMPLE 4_
![image](https://user-images.githubusercontent.com/3298308/222984104-0bfd806f-ecd9-455e-b368-181f9aa0225b.png)

"""

    # The label previously read "READE", a typo for "README".
    with st.expander("README"):
        st.markdown(content, unsafe_allow_html=True)

    return st.checkbox("Show more code examples")
255
+
256
+
257
def display_example_snippets():
    """Let the user pick a file from ``./examples`` and show its contents as code."""
    from glob import glob

    snippet_paths = glob("./examples/*")
    # NOTE(review): indentation was lost in this copy of the file; the
    # selected snippet is assumed to be rendered inside the expander.
    with st.expander("EXAMPLES"):
        chosen = st.selectbox("", options=[""] + snippet_paths)
        if not chosen:
            return
        with open(chosen, "r") as handle:
            snippet = handle.read()
        st.code(snippet)
267
+
268
+
269
class GPTWrapper:
    """Helpers that turn a question about the dataframe into SQL or Python via an LLM.

    The ``ask_*`` and ``build_*_prompt`` members are static and cached with
    ``st.cache_data``. They take the schema text explicitly and do not need
    an instance.
    """

    def __init__(self, df_info=None):
        """Create the Anthropic client and remember the dataframe schema text.

        Args:
            df_info: Schema description of the dataframe. When omitted, the
                module-level ``df_info`` set by the script section is used if
                it exists (the previous behaviour), otherwise ``None``. The
                old code read an undefined local name and raised ``NameError``
                whenever that global was missing.
        """
        # "AnthropicSerivce" is the (misspelled) name exported by the project's
        # gpt module; it has to be imported under exactly that name.
        from gpt import AnthropicSerivce

        self.anthropic_model = AnthropicSerivce()
        self.df_info = df_info if df_info is not None else globals().get("df_info")

    @staticmethod
    @st.cache_data
    def ask_sql(df_info, question):
        """Ask the OpenAI service for a SQL query answering ``question``.

        Returns:
            ``(response, prompt)``: the service response and the prompt sent.
        """
        from gpt import OpenAIService

        openai_model = OpenAIService()
        # Call the static builder on the class: instantiating GPTWrapper here
        # would build an Anthropic client that is never used.
        prompt = GPTWrapper.build_sql_prompt(df_info, question)
        res = openai_model.prompt(prompt)
        return res, prompt

    @staticmethod
    @st.cache_data
    def ask_python(df_info, question):
        """Ask the OpenAI service for Python code answering ``question``.

        Returns:
            ``(response, prompt)``: the service response and the prompt sent.
        """
        from gpt import OpenAIService

        openai_model = OpenAIService()
        prompt = GPTWrapper.build_python_prompt(df_info, question)
        res = openai_model.prompt(prompt)
        return res, prompt

    @staticmethod
    @st.cache_data
    def build_sql_prompt(df_info, question):
        """Return the prompt asking for a SQL-only answer over table ``df``."""
        # NOTE(review): leading whitespace of the continuation lines was lost
        # in this copy of the file; they are reproduced flush-left.
        prompt = f"""I have data in a pandas dataframe, here is the data schema: {df_info}
Next, I will ask you a question. Assume the table name is `df`.
And you will answer in writing a SQL query only. {question}
"""
        return prompt

    @staticmethod
    @st.cache_data
    def build_python_prompt(df_info, question):
        """Return the prompt asking for a Python-only answer over variable ``df``."""
        # NOTE(review): leading whitespace of the continuation lines was lost
        # in this copy of the file; they are reproduced flush-left.
        prompt = f"""I have data in a pandas dataframe, here is the dataframe schema: {df_info}
Next, I will ask you a question. And you will answer in writing a Python code only.
Assume the data is stored in a variable named `df`.
Here are some instructions for the generated Python code:

- You should always use the variable `df` to refer to the dataframe.
- You should not include any markdown syntax or any other syntax that is not Python in the answer.
- Import any required libraries in the first line of the generated code.
- Just show the Python code only, don't include any Python comments or English explanation in the answer text.
- If the generarted code has multiple Python lines, every Python line must end with a semicolon (;).
- If the answer is not a plot or a figure, always use print to print the answer using print().
- If the answer requires plotting, generate a plot using plotly_express and show it using st.plotly_chart(fig).

Here is the question: {question}
"""
        return prompt
324
+
325
+
326
def ask_gpt_sql(df_info, key):
    """Prompt for a natural-language question and show the SQL the model suggests.

    Args:
        df_info: Schema text of the dataframe, passed on to the prompt.
        key: Streamlit widget key for the question input.

    Returns:
        The generated SQL text, or ``None`` while no question has been entered.
    """
    # -- GPT AI
    question = st.text_input(
        "Ask a question about the dataset to get a SQL query that answers the question",
        placeholder="How many rows are there in the dataset?",
        key=key,
    )
    if not question:
        return None

    response, _prompt = GPTWrapper().ask_sql(df_info, question)
    sql_query = response.choices[0].message.content
    st.code(sql_query, language="sql")
    return sql_query
340
+
341
def ask_gpt_python(df_info, key):
    """Prompt for a natural-language question and show the Python the model suggests.

    Args:
        df_info: Schema text of the dataframe, passed on to the prompt.
        key: Streamlit widget key for the question input.

    Returns:
        The generated Python text, or ``None`` while no question has been entered.
    """
    # -- GPT AI
    question = st.text_input(
        "Ask a question about the dataset to get a Python code that answers the question",
        placeholder="How many rows and columns are there in the dataset?",
        key=key,
    )
    if not question:
        return None

    response, _prompt = GPTWrapper().ask_python(df_info, question)
    python_code = response.choices[0].message.content
    st.code(python_code, language="python")
    return python_code
356
+
357
+
358
if __name__ == "__main__":
    # Script section: README, dataset loading, then the SQL / Python cells and
    # the optional profiling report.
    # NOTE(review): indentation was lost in this copy of the file. The nesting
    # below is reconstructed from context, and multi-line string literals are
    # reproduced flush-left.
    show_examples = docs()
    if show_examples:
        display_example_snippets()

    df = read_data()
    display(df)

    # -- data schema
    # ``DataFrame.info`` writes to a buffer; capture it as text so it can be
    # embedded in the LLM prompts.
    import io

    sio = io.StringIO()
    df.info(buf=sio)
    df_info = sio.getvalue()

    # run and execute SQL script
    def sql_cells(df):
        """Render the SQL section: one question box, editor and result per cell."""
        st.write("---")
        st.header("SQL")
        hint = """Type SQL to query the loaded dataset, data is stored in a table named 'df'.
For example, to select 10 rows:
SELECT * FROM df LIMIT 10
Describe the table:
DESCRIBE TABLE df
"""
        # min_value=1 mirrors the Python-cell counter; zero or negative counts
        # were accepted before and just rendered nothing.
        number_cells = st.sidebar.number_input(
            "Number of SQL cells to use", value=1, max_value=40, min_value=1
        )
        for i in range(number_cells):
            key = f"sql{i}"
            # Only the left column is used; the right one keeps the layout.
            col1, _ = st.columns([2, 1])
            st.markdown("<br>", unsafe_allow_html=True)
            show_panel = False

            col1.write(f"> `IN[{i+1}]`")

            # -- GPT AI
            query = ask_gpt_sql(df_info, key=f"{key}-gpt")
            content = None
            if query and st.button("Use SQL", key=f"{key}-use-sql"):
                content = query
            sql = code_editor("sql", hint, show_panel=show_panel, key=key, content=content)
            if sql:
                st.code(sql, language="sql")
                st.write(f"`OUT[{i+1}]`")
                res = query_data(sql, df)
                # query_data has already warned and returned None for an
                # invalid query; rendering None would crash the page.
                if res is not None:
                    display_results(sql, res, f"{key}{sql}")

    # run and execute python script
    def python_cells():
        """Render the Python section: one question box, editor and output per cell."""
        st.write("---")
        st.markdown("### Python")
        # A longer hint used to be assigned here and was immediately
        # overwritten, so it never reached the editor. Its examples, for
        # reference:
        #   st.line_chart(df.groupby("Genre")[["RottenTomatoes", "AudienceScore"]].mean())
        #   st.map(df.sort_values(by='population', ascending=False)[:10])
        hint = """Type Python code here (use semicolons to end each line)"""
        help_text = """
For multiple lines, use semicolons e.g.

```python

fig, ax = plt.subplots();
ax.hist(df[[col1, col2]]);
st.pyplot(fig);
```
or

```python
groups = [group for _, group in df.groupby('class')];
for i in range(3):
    st.write(groups[i]['name'].iloc[0])
    st.bar_chart(groups[i].mean())
```
"""
        number_cells = st.sidebar.number_input(
            "Number of Python cells to use", value=1, max_value=40, min_value=1, help=help_text
        )
        for i in range(number_cells):
            # Neither column is written to; the call is kept so the page
            # layout stays as it was.
            st.columns([2, 1])
            show_panel = False

            # -- GPT AI
            query = ask_gpt_python(df_info, key=f"{i}-gpt")
            content = None
            if query and st.checkbox("Use generated code", key=f"{i}-use-python"):
                content = query
            user_script = code_editor("python", hint, show_panel=show_panel, key=i, content=content)
            if user_script:
                df.rename(columns={"lng": "lon"}, inplace=True)  # hot-fix for "World Population" dataset
                st.write(f"> `IN[{i+1}]`")
                st.code(user_script, language="python")
                st.write(f"> `OUT[{i+1}]`")
                run_python_script(user_script, key=f"{user_script}{i}")

    if st.sidebar.checkbox("Show SQL cells", value=True):
        sql_cells(df)
    if st.sidebar.checkbox("Show Python cells", value=True):
        python_cells()

    st.sidebar.write("---")

    if st.sidebar.checkbox("Generate Data Profile Report", help="pandas profiling, generated by [ydata-profiling](https://github.com/ydataai/ydata-profiling)"):
        st.write("---")
        st.header("Data Profiling")
        profile = data_profiler(df)
        st_profile_report(profile)

    st.write("---")