Spaces:

dkondic
/

data-analyst

Sleeping

App Files Files Community

Dacho688 commited on Jul 29, 2024

Commit

d6042ff

1 Parent(s): 0f55eee

new app

Browse files

git commit

Files changed (4) hide show

app.py +29 -23
app_original.py +133 -0
requirements.txt +6 -2
test_app.py +29 -23

app.py CHANGED Viewed

@@ -9,32 +9,37 @@ from streaming import stream_to_gradio
 from huggingface_hub import login
 from gradio.data_classes import FileData
-login(os.getenv("HUGGINGFACEHUB_API_TOKEN"))
 llm_engine = HfEngine("meta-llama/Meta-Llama-3.1-70B-Instruct")
 agent = ReactCodeAgent(
     tools=[],
     llm_engine=llm_engine,
-    additional_authorized_imports=["numpy", "pandas", "matplotlib.pyplot", "seaborn", "scipy.stats"],
     max_iterations=10,
 )
 base_prompt = """You are an expert data analyst.
-According to the features you have and the data structure given below, determine which feature should be the target.
-Then list 3 interesting questions that could be asked on this data, for instance about specific correlations with target variable.
-Then answer these questions one by one, by finding the relevant numbers.
-Meanwhile, plot some figures using matplotlib/seaborn and save them to the (already existing) folder './figures/': take care to clear each figure with plt.clf() before doing another plot.
-In your final answer: summarize these correlations and trends
 After each number derive real worlds insights, for instance: "Correlation between is_december and boredness is 1.3453, which suggest people are more bored in winter".
-Your final answer should be a long string with at least 3 numbered and detailed parts.
 Structure of the data:
 {structure_notes}
-The data file is passed to you as the variable data_file, it is a pandas dataframe, you can use it directly.
-DO NOT try to load data_file, it is already a dataframe pre-loaded in your python interpreter!
 """
 example_notes="""This data is about the Titanic wreck in 1912.
@@ -75,9 +80,9 @@ def interact_with_agent(file_input, additional_notes):
     prompt = base_prompt.format(structure_notes=data_structure_notes)
     if additional_notes and len(additional_notes) > 0:
-        prompt += "\nAdditional notes on the data:\n" + additional_notes
-    messages = [gr.ChatMessage(role="user", content=prompt)]
     yield messages + [
         gr.ChatMessage(role="assistant", content="⏳ _Starting task..._")
     ]
@@ -101,18 +106,19 @@ def interact_with_agent(file_input, additional_notes):
 with gr.Blocks(
     theme=gr.themes.Soft(
-        primary_hue=gr.themes.colors.yellow,
-        secondary_hue=gr.themes.colors.blue,
     )
 ) as demo:
     gr.Markdown("""# Llama-3.1 Data analyst 📊🤔
-Drop a `.csv` file below, add notes to describe this data if needed, and **Llama-3.1-70B will analyze the file content and draw figures for you!**""")
     file_input = gr.File(label="Your file to analyze")
     text_input = gr.Textbox(
-        label="Additional notes to support the analysis"
     )
-    submit = gr.Button("Run analysis!", variant="primary")
     chatbot = gr.Chatbot(
         label="Data Analyst Agent",
         type="messages",
@@ -121,13 +127,13 @@ Drop a `.csv` file below, add notes to describe this data if needed, and **Llama
             "https://em-content.zobj.net/source/twitter/53/robot-face_1f916.png",
         ),
     )
-    gr.Examples(
-        examples=[["./example/titanic.csv", example_notes]],
-        inputs=[file_input, text_input],
-        cache_examples=False
-    )
     submit.click(interact_with_agent, [file_input, text_input], [chatbot])
 if __name__ == "__main__":
-    demo.launch()

 from huggingface_hub import login
 from gradio.data_classes import FileData
+#login(os.getenv("HUGGINGFACEHUB_API_TOKEN"))
 llm_engine = HfEngine("meta-llama/Meta-Llama-3.1-70B-Instruct")
 agent = ReactCodeAgent(
     tools=[],
     llm_engine=llm_engine,
+    additional_authorized_imports=["numpy", "pandas", "matplotlib", "seaborn","scipy"],
     max_iterations=10,
 )
 base_prompt = """You are an expert data analyst.
+You are given a data file and the data structure below.
+The data file is passed to you as the variable data_file, it is a pandas dataframe, you can use it directly.
+DO NOT try to load data_file, it is already a dataframe pre-loaded in your python interpreter!
+When importing packages use this format: from package import module
+For example: from matplotlib import pyplot as plt
+Not: import matplotlib.pyplot as plt
+As you work, check for NoneType values and convert to NAN.
+Use the data file to answer the question or solve a problem given below.
+In your final answer: summarize your findings
 After each number derive real worlds insights, for instance: "Correlation between is_december and boredness is 1.3453, which suggest people are more bored in winter".
 Structure of the data:
 {structure_notes}
+Question/Problem:
 """
 example_notes="""This data is about the Titanic wreck in 1912.
     prompt = base_prompt.format(structure_notes=data_structure_notes)
     if additional_notes and len(additional_notes) > 0:
+        prompt += additional_notes
+    messages = [gr.ChatMessage(role="user", content=additional_notes)]
     yield messages + [
         gr.ChatMessage(role="assistant", content="⏳ _Starting task..._")
     ]
 with gr.Blocks(
     theme=gr.themes.Soft(
+        primary_hue=gr.themes.colors.blue,
+        secondary_hue=gr.themes.colors.yellow,
     )
 ) as demo:
     gr.Markdown("""# Llama-3.1 Data analyst 📊🤔
+Drop a `.csv` file below and ask a question about your data.
+**Llama-3.1-70B will analyze and answer.**""")
     file_input = gr.File(label="Your file to analyze")
     text_input = gr.Textbox(
+        label="Ask a question about your data?"
     )
+    submit = gr.Button("Run", variant="primary")
     chatbot = gr.Chatbot(
         label="Data Analyst Agent",
         type="messages",
             "https://em-content.zobj.net/source/twitter/53/robot-face_1f916.png",
         ),
     )
+    # gr.Examples(
+    #     examples=[["./example/titanic.csv", example_notes]],
+    #     inputs=[file_input, text_input],
+    #     cache_examples=False
+    # )
     submit.click(interact_with_agent, [file_input, text_input], [chatbot])
 if __name__ == "__main__":
+    demo.launch(server_port=7861)

app_original.py ADDED Viewed

	@@ -0,0 +1,133 @@

+import os
+import shutil
+import gradio as gr
+from transformers import ReactCodeAgent, HfEngine, Tool
+import pandas as pd
+from gradio import Chatbot
+from streaming import stream_to_gradio
+from huggingface_hub import login
+from gradio.data_classes import FileData
+login(os.getenv("HUGGINGFACEHUB_API_TOKEN"))
+llm_engine = HfEngine("meta-llama/Meta-Llama-3.1-70B-Instruct")
+agent = ReactCodeAgent(
+    tools=[],
+    llm_engine=llm_engine,
+    additional_authorized_imports=["numpy", "pandas", "matplotlib.pyplot", "seaborn", "scipy.stats"],
+    max_iterations=10,
+)
+base_prompt = """You are an expert data analyst.
+According to the features you have and the data structure given below, determine which feature should be the target.
+Then list 3 interesting questions that could be asked on this data, for instance about specific correlations with target variable.
+Then answer these questions one by one, by finding the relevant numbers.
+Meanwhile, plot some figures using matplotlib/seaborn and save them to the (already existing) folder './figures/': take care to clear each figure with plt.clf() before doing another plot.
+In your final answer: summarize these correlations and trends
+After each number derive real worlds insights, for instance: "Correlation between is_december and boredness is 1.3453, which suggest people are more bored in winter".
+Your final answer should be a long string with at least 3 numbered and detailed parts.
+Structure of the data:
+{structure_notes}
+The data file is passed to you as the variable data_file, it is a pandas dataframe, you can use it directly.
+DO NOT try to load data_file, it is already a dataframe pre-loaded in your python interpreter!
+"""
+example_notes="""This data is about the Titanic wreck in 1912.
+The target figure is the survival of passengers, notes by 'Survived'
+pclass: A proxy for socio-economic status (SES)
+1st = Upper
+2nd = Middle
+3rd = Lower
+age: Age is fractional if less than 1. If the age is estimated, is it in the form of xx.5
+sibsp: The dataset defines family relations in this way...
+Sibling = brother, sister, stepbrother, stepsister
+Spouse = husband, wife (mistresses and fiancés were ignored)
+parch: The dataset defines family relations in this way...
+Parent = mother, father
+Child = daughter, son, stepdaughter, stepson
+Some children travelled only with a nanny, therefore parch=0 for them."""
+def get_images_in_directory(directory):
+    image_extensions = {'.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff'}
+    image_files = []
+    for root, dirs, files in os.walk(directory):
+        for file in files:
+            if os.path.splitext(file)[1].lower() in image_extensions:
+                image_files.append(os.path.join(root, file))
+    return image_files
+def interact_with_agent(file_input, additional_notes):
+    shutil.rmtree("./figures")
+    os.makedirs("./figures")
+    data_file = pd.read_csv(file_input)
+    data_structure_notes = f"""- Description (output of .describe()):
+    {data_file.describe()}
+    - Columns with dtypes:
+    {data_file.dtypes}"""
+    prompt = base_prompt.format(structure_notes=data_structure_notes)
+    if additional_notes and len(additional_notes) > 0:
+        prompt += "\nAdditional notes on the data:\n" + additional_notes
+    messages = [gr.ChatMessage(role="user", content=prompt)]
+    yield messages + [
+        gr.ChatMessage(role="assistant", content="⏳ _Starting task..._")
+    ]
+    plot_image_paths = {}
+    for msg in stream_to_gradio(agent, prompt, data_file=data_file):
+        messages.append(msg)
+        for image_path in get_images_in_directory("./figures"):
+            if image_path not in plot_image_paths:
+                image_message = gr.ChatMessage(
+                    role="assistant",
+                    content=FileData(path=image_path, mime_type="image/png"),
+                )
+                plot_image_paths[image_path] = True
+                messages.append(image_message)
+        yield messages + [
+            gr.ChatMessage(role="assistant", content="⏳ _Still processing..._")
+        ]
+    yield messages
+with gr.Blocks(
+    theme=gr.themes.Soft(
+        primary_hue=gr.themes.colors.yellow,
+        secondary_hue=gr.themes.colors.blue,
+    )
+) as demo:
+    gr.Markdown("""# Llama-3.1 Data analyst 📊🤔
+Drop a `.csv` file below, add notes to describe this data if needed, and **Llama-3.1-70B will analyze the file content and draw figures for you!**""")
+    file_input = gr.File(label="Your file to analyze")
+    text_input = gr.Textbox(
+        label="Additional notes to support the analysis"
+    )
+    submit = gr.Button("Run analysis!", variant="primary")
+    chatbot = gr.Chatbot(
+        label="Data Analyst Agent",
+        type="messages",
+        avatar_images=(
+            None,
+            "https://em-content.zobj.net/source/twitter/53/robot-face_1f916.png",
+        ),
+    )
+    gr.Examples(
+        examples=[["./example/titanic.csv", example_notes]],
+        inputs=[file_input, text_input],
+        cache_examples=False
+    )
+    submit.click(interact_with_agent, [file_input, text_input], [chatbot])
+if __name__ == "__main__":
+    demo.launch()

requirements.txt CHANGED Viewed

@@ -1,5 +1,9 @@
-git+https://github.com/huggingface/transformers.git#egg=transformers[agents]
 matplotlib
 seaborn
 scikit-learn
-scipy

+#git+https://github.com/huggingface/transformers.git#egg=transformers[agents]
 matplotlib
 seaborn
 scikit-learn
+scipy
+transformers
+pandas==2.2.2
+huggingface_hub
+transformers

test_app.py CHANGED Viewed

@@ -9,32 +9,37 @@ from test_streaming import stream_to_gradio
 from huggingface_hub import login
 from gradio.data_classes import FileData
-login(os.getenv("HUGGINGFACEHUB_API_TOKEN"))
 llm_engine = HfEngine("meta-llama/Meta-Llama-3.1-70B-Instruct")
 agent = ReactCodeAgent(
     tools=[],
     llm_engine=llm_engine,
-    additional_authorized_imports=["numpy", "pandas", "matplotlib.pyplot", "seaborn", "scipy.stats"],
     max_iterations=10,
 )
 base_prompt = """You are an expert data analyst.
-According to the features you have and the data structure given below, determine which feature should be the target.
-Then list 3 interesting questions that could be asked on this data, for instance about specific correlations with target variable.
-Then answer these questions one by one, by finding the relevant numbers.
-Meanwhile, plot some figures using matplotlib/seaborn and save them to the (already existing) folder './figures/': take care to clear each figure with plt.clf() before doing another plot.
-In your final answer: summarize these correlations and trends
 After each number derive real worlds insights, for instance: "Correlation between is_december and boredness is 1.3453, which suggest people are more bored in winter".
-Your final answer should be a long string with at least 3 numbered and detailed parts.
 Structure of the data:
 {structure_notes}
-The data file is passed to you as the variable data_file, it is a pandas dataframe, you can use it directly.
-DO NOT try to load data_file, it is already a dataframe pre-loaded in your python interpreter!
 """
 example_notes="""This data is about the Titanic wreck in 1912.
@@ -75,9 +80,9 @@ def interact_with_agent(file_input, additional_notes):
     prompt = base_prompt.format(structure_notes=data_structure_notes)
     if additional_notes and len(additional_notes) > 0:
-        prompt += "\nAdditional notes on the data:\n" + additional_notes
-    messages = [gr.ChatMessage(role="user", content=prompt)]
     yield messages + [
         gr.ChatMessage(role="assistant", content="⏳ _Starting task..._")
     ]
@@ -101,18 +106,19 @@ def interact_with_agent(file_input, additional_notes):
 with gr.Blocks(
     theme=gr.themes.Soft(
-        primary_hue=gr.themes.colors.yellow,
-        secondary_hue=gr.themes.colors.blue,
     )
 ) as demo:
     gr.Markdown("""# Llama-3.1 Data analyst 📊🤔
-Drop a `.csv` file below, add notes to describe this data if needed, and **Llama-3.1-70B will analyze the file content and draw figures for you!**""")
     file_input = gr.File(label="Your file to analyze")
     text_input = gr.Textbox(
-        label="Additional notes to support the analysis"
     )
-    submit = gr.Button("Run analysis!", variant="primary")
     chatbot = gr.Chatbot(
         label="Data Analyst Agent",
         type="messages",
@@ -121,13 +127,13 @@ Drop a `.csv` file below, add notes to describe this data if needed, and **Llama
             "https://em-content.zobj.net/source/twitter/53/robot-face_1f916.png",
         ),
     )
-    gr.Examples(
-        examples=[["./example/titanic.csv", example_notes]],
-        inputs=[file_input, text_input],
-        cache_examples=False
-    )
     submit.click(interact_with_agent, [file_input, text_input], [chatbot])
 if __name__ == "__main__":
-    demo.launch()

 from huggingface_hub import login
 from gradio.data_classes import FileData
+#login(os.getenv("HUGGINGFACEHUB_API_TOKEN"))
 llm_engine = HfEngine("meta-llama/Meta-Llama-3.1-70B-Instruct")
 agent = ReactCodeAgent(
     tools=[],
     llm_engine=llm_engine,
+    additional_authorized_imports=["numpy", "pandas", "matplotlib", "seaborn","scipy"],
     max_iterations=10,
 )
 base_prompt = """You are an expert data analyst.
+You are given a data file and the data structure below.
+The data file is passed to you as the variable data_file, it is a pandas dataframe, you can use it directly.
+DO NOT try to load data_file, it is already a dataframe pre-loaded in your python interpreter!
+When importing packages use this format: from package import module
+For example: from matplotlib import pyplot as plt
+Not: import matplotlib.pyplot as plt
+As you work, check for NoneType values and convert to NAN.
+Use the data file to answer the question or solve a problem given below.
+In your final answer: summarize your findings
 After each number derive real worlds insights, for instance: "Correlation between is_december and boredness is 1.3453, which suggest people are more bored in winter".
 Structure of the data:
 {structure_notes}
+Question/Problem:
 """
 example_notes="""This data is about the Titanic wreck in 1912.
     prompt = base_prompt.format(structure_notes=data_structure_notes)
     if additional_notes and len(additional_notes) > 0:
+        prompt += additional_notes
+    messages = [gr.ChatMessage(role="user", content=additional_notes)]
     yield messages + [
         gr.ChatMessage(role="assistant", content="⏳ _Starting task..._")
     ]
 with gr.Blocks(
     theme=gr.themes.Soft(
+        primary_hue=gr.themes.colors.blue,
+        secondary_hue=gr.themes.colors.yellow,
     )
 ) as demo:
     gr.Markdown("""# Llama-3.1 Data analyst 📊🤔
+Drop a `.csv` file below and ask a question about your data.
+**Llama-3.1-70B will analyze and answer.**""")
     file_input = gr.File(label="Your file to analyze")
     text_input = gr.Textbox(
+        label="Ask a question about your data?"
     )
+    submit = gr.Button("Run", variant="primary")
     chatbot = gr.Chatbot(
         label="Data Analyst Agent",
         type="messages",
             "https://em-content.zobj.net/source/twitter/53/robot-face_1f916.png",
         ),
     )
+    # gr.Examples(
+    #     examples=[["./example/titanic.csv", example_notes]],
+    #     inputs=[file_input, text_input],
+    #     cache_examples=False
+    # )
     submit.click(interact_with_agent, [file_input, text_input], [chatbot])
 if __name__ == "__main__":
+    demo.launch(server_port=7861)