Spaces:

vectara
/

cfpb-assistant

Running

App Files Files Community

david-oplatka committed on Oct 18, 2024

Commit

b6fadc7

1 Parent(s): 4afa25e

Add Assistant Files

Browse files

Files changed (8) hide show

.gitignore +143 -0
Dockerfile +25 -0
Vectara-logo.png +0 -0
agent.py +89 -0
app.py +204 -0
create_table.sql +15 -0
requirements.txt +10 -0
utils.py +74 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,143 @@

+.DS_Store
+# Byte-compiled / optimized / DLL files
+__pycache__/
+crawlers/__pycache__/
+core/__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+.python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.env*
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# secrets file in TOML format
+secrets.toml
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# project file
+project.yaml
+.idea/
+cfpb_database.db

Dockerfile ADDED Viewed

	@@ -0,0 +1,25 @@

+FROM python:3.10
+WORKDIR /app
+COPY ./requirements.txt /app/requirements.txt
+RUN pip3 install --no-cache-dir -r /app/requirements.txt
+# Run the application as a non-root user (UID 1000).
+RUN useradd -m -u 1000 user
+USER user
+# key=value form; the space-separated ENV form is legacy syntax.
+ENV HOME=/home/user
+ENV PATH=$HOME/.local/bin:$PATH
+WORKDIR $HOME
+RUN mkdir app
+WORKDIR $HOME/app
+# Copy the sources owned by the runtime user; app.py creates
+# cfpb_database.db next to them at startup.
+COPY --chown=user . $HOME/app
+EXPOSE 8501
+# Exec form so streamlit is PID 1 and receives stop signals directly.
+CMD ["streamlit", "run", "app.py", \
+     "--server.headless", "true", \
+     "--server.enableCORS", "false", \
+     "--server.enableXsrfProtection", "false", \
+     "--server.fileWatcherType", "none"]

Vectara-logo.png ADDED Viewed

agent.py ADDED Viewed

	@@ -0,0 +1,89 @@

+import os
+from typing import Optional
+from pydantic import Field, BaseModel
+from omegaconf import OmegaConf
+from llama_index.core.utilities.sql_wrapper import SQLDatabase
+from sqlalchemy import create_engine, text
+from dotenv import load_dotenv
+load_dotenv(override=True)
+from vectara_agentic.agent import Agent
+from vectara_agentic.tools import ToolsFactory, VectaraToolFactory
+def create_assistant_tools(cfg):
+    class QueryCFPBComplaints(BaseModel):
+        query: str = Field(description="The user query.")
+    vec_factory = VectaraToolFactory(vectara_api_key=cfg.api_keys,
+                                        vectara_customer_id=cfg.customer_id,
+                                        vectara_corpus_id=cfg.corpus_ids)
+    summarizer = 'vectara-experimental-summary-ext-2023-12-11-med-omni'
+    ask_complaints = vec_factory.create_rag_tool(
+        tool_name = "ask_complaints",
+        tool_description = """
+        Given a user query,
+        returns a response to a user question about customer complaints about bank services.
+        """,
+        tool_args_schema = QueryCFPBComplaints,
+        reranker = "multilingual_reranker_v1", rerank_k = 100,
+        n_sentences_before = 2, n_sentences_after = 2, lambda_val = 0.005,
+        summary_num_results = 5,
+        vectara_summarizer = summarizer,
+        include_citations = False,
+    )
+    tools_factory = ToolsFactory()
+    db_tools = tools_factory.database_tools(
+                tool_name_prefix = "cfpb",
+                content_description = 'Customer complaints about five banks (Bank of America, Wells Fargo, Capital One, Chase, and CITI Bank)',
+                sql_database = SQLDatabase(create_engine('sqlite:///cfpb_database.db')),
+            )
+    return (tools_factory.standard_tools() +
+            tools_factory.guardrail_tools() +
+            db_tools +
+            [ask_complaints]
+    )
+def initialize_agent(_cfg, update_func=None):
+    cfpb_complaints_bot_instructions = """
+    - You are a helpful research assistant, with expertise in complaints from the Consumer Financial Protection Bureau, in conversation with a user.
+    - Before answering any user query, use cfpb_describe_tables to understand schema of each table, and use get_sample_data
+      to get sample data from each table in the database, so that you can understand NULL and unique values for each column.
+    - For a query with multiple sub-questions, break down the query into the sub-questions,
+      and make separate calls to the ask_complaints tool to answer each sub-question,
+      then combine the answers to provide a complete response.
+    - Use the database tools (cfpb_load_data, cfpb_describe_tables and cfpb_list_tables) to answer analytical queries.
+    - IMPORTANT: When using database_tools, always call the ev_load_sample_data tool with the table you want to query
+      to understand the table structure, column naming, and values in the table. Never call the cfpb_load_data tool for a query until you have called cfpb_load_sample_data.
+    - When providing links, try to put the name of the website or source of information for the displayed text. Don't just say 'Source'.
+    - Never discuss politics, and always respond politely.
+    """
+    agent = Agent(
+        tools=create_assistant_tools(_cfg),
+        topic="Customer complaints from the Consumer Financial Protection Bureau (CFPB)",
+        custom_instructions=cfpb_complaints_bot_instructions,
+        update_func=update_func
+    )
+    agent.report()
+    return agent
+def get_agent_config() -> OmegaConf:
+    cfg = OmegaConf.create({
+        'customer_id': str(os.environ['VECTARA_CUSTOMER_ID']),
+        'corpus_ids': str(os.environ['VECTARA_CORPUS_IDS']),
+        'api_keys': str(os.environ['VECTARA_API_KEYS']),
+        'examples': os.environ.get('QUERY_EXAMPLES', None),
+        'demo_name': "cfpb-assistant",
+        'demo_welcome': "Welcome to the CFPB Customer Complaints demo.",
+        'demo_description': "This assistant can help you gain insights into customer complaints to banks recorded by the Consumer Financial Protection Bureau.",
+    })
+    return cfg

app.py ADDED Viewed

	@@ -0,0 +1,204 @@

+from PIL import Image
+import sys
+import os
+import uuid
+import streamlit as st
+from streamlit_pills import pills
+from streamlit_feedback import streamlit_feedback
+import nest_asyncio
+import asyncio
+from utils import thumbs_feedback, escape_dollars_outside_latex, send_amplitude_data
+import sqlite3
+from datasets import load_dataset
+from vectara_agentic.agent import AgentStatusType
+from agent import initialize_agent, get_agent_config
+initial_prompt = "How can I help you today?"
+# Setup for HTTP API Calls to Amplitude Analytics
+if 'device_id' not in st.session_state:
+    st.session_state.device_id = str(uuid.uuid4())
+if "feedback_key" not in st.session_state:
+        st.session_state.feedback_key = 0
+def toggle_logs():
+    st.session_state.show_logs = not st.session_state.show_logs
+def show_example_questions():
+    if len(st.session_state.example_messages) > 0 and st.session_state.first_turn:
+        selected_example = pills("Queries to Try:", st.session_state.example_messages, index=None)
+        if selected_example:
+            st.session_state.ex_prompt = selected_example
+            st.session_state.first_turn = False
+            return True
+    return False
+def update_func(status_type: AgentStatusType, msg: str):
+    if status_type != AgentStatusType.AGENT_UPDATE:
+        output = f"{status_type.value} - {msg}"
+        st.session_state.log_messages.append(output)
+async def launch_bot():
+    def reset():
+        st.session_state.messages = [{"role": "assistant", "content": initial_prompt, "avatar": "🦖"}]
+        st.session_state.thinking_message = "Agent at work..."
+        st.session_state.log_messages = []
+        st.session_state.prompt = None
+        st.session_state.ex_prompt = None
+        st.session_state.first_turn = True
+        st.session_state.show_logs = False
+        if 'agent' not in st.session_state:
+            st.session_state.agent = initialize_agent(cfg, update_func=update_func)
+    if 'cfg' not in st.session_state:
+        cfg = get_agent_config()
+        st.session_state.cfg = cfg
+        st.session_state.ex_prompt = None
+        example_messages = [example.strip() for example in cfg.examples.split(";")] if cfg.examples else []
+        st.session_state.example_messages = [em for em in example_messages if len(em)>0]
+        reset()
+    cfg = st.session_state.cfg
+    # left side content
+    with st.sidebar:
+        image = Image.open('Vectara-logo.png')
+        st.image(image, width=175)
+        st.markdown(f"## {cfg['demo_welcome']}")
+        st.markdown(f"{cfg['demo_description']}")
+        st.markdown("\n\n")
+        bc1, _ = st.columns([1, 1])
+        with bc1:
+            if st.button('Start Over'):
+                reset()
+                st.rerun()
+        st.divider()
+        st.markdown(
+            "## How this works?\n"
+            "This app was built with [Vectara](https://vectara.com).\n\n"
+            "It demonstrates the use of Agentic RAG functionality with Vectara"
+        )
+    if "messages" not in st.session_state.keys():
+        reset()
+    # Display chat messages
+    for message in st.session_state.messages:
+        with st.chat_message(message["role"], avatar=message["avatar"]):
+            st.write(message["content"])
+    example_container = st.empty()
+    with example_container:
+        if show_example_questions():
+            example_container.empty()
+            st.session_state.first_turn = False
+            st.rerun()
+    # User-provided prompt
+    if st.session_state.ex_prompt:
+        prompt = st.session_state.ex_prompt
+    else:
+        prompt = st.chat_input()
+    if prompt:
+        st.session_state.messages.append({"role": "user", "content": prompt, "avatar": '🧑‍💻'})
+        st.session_state.prompt = prompt  # Save the prompt in session state
+        st.session_state.log_messages = []
+        st.session_state.show_logs = False
+        with st.chat_message("user", avatar='🧑‍💻'):
+            print(f"Starting new question: {prompt}\n")
+            st.write(prompt)
+        st.session_state.ex_prompt = None
+    # Generate a new response if last message is not from assistant
+    if st.session_state.prompt:
+        with st.chat_message("assistant", avatar='🤖'):
+            with st.spinner(st.session_state.thinking_message):
+                res = st.session_state.agent.chat(st.session_state.prompt)
+                res = escape_dollars_outside_latex(res)
+            message = {"role": "assistant", "content": res, "avatar": '🤖'}
+            st.session_state.messages.append(message)
+            st.markdown(res)
+        send_amplitude_data(
+            user_query=st.session_state.messages[-2]["content"],
+            bot_response=st.session_state.messages[-1]["content"],
+            demo_name=cfg['demo_name']
+        )
+        st.session_state.ex_prompt = None
+        st.session_state.prompt = None
+        st.session_state.first_turn = False
+        st.rerun()
+    # Record user feedback
+    if (st.session_state.messages[-1]["role"] == "assistant") & (st.session_state.messages[-1]["content"] != initial_prompt):
+        streamlit_feedback(
+            feedback_type="thumbs", on_submit = thumbs_feedback, key = st.session_state.feedback_key,
+            kwargs = {"user_query": st.session_state.messages[-2]["content"],
+                      "bot_response": st.session_state.messages[-1]["content"],
+                      "demo_name": cfg["demo_name"]}
+        )
+    log_placeholder = st.empty()
+    with log_placeholder.container():
+        if st.session_state.show_logs:
+            st.button("Hide Logs", on_click=toggle_logs)
+            for msg in st.session_state.log_messages:
+                st.text(msg)
+        else:
+            if len(st.session_state.log_messages) > 0:
+                st.button("Show Logs", on_click=toggle_logs)
+    sys.stdout.flush()
+def setup_db():
+    db_path = 'cfpb_database.db'
+    conn = sqlite3.connect(db_path)
+    cursor = conn.cursor()
+    with st.spinner("Loading data... Please wait..."):
+        def table_populated() -> bool:
+            cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='cfpb_complaints'")
+            result = cursor.fetchone()
+            if not result:
+                    return False
+            return True
+        if table_populated():
+            print("Database table already populated, skipping setup")
+            conn.close()
+            return
+        else:
+            print("Populating database table")
+        # Execute the SQL commands to create the database table
+        with open('create_table.sql', 'r') as sql_file:
+            sql_script = sql_file.read()
+            cursor.executescript(sql_script)
+        hf_token = os.getenv('HF_TOKEN')
+        # Load data into cfpb_complaints table
+        df = load_dataset("vectara/cfpb-complaints", data_files="cfpb_complaints.csv", token=hf_token)['train'].to_pandas()
+        df.to_sql('cfpb_complaints', conn, if_exists='replace', index=False)
+        # Commit changes and close connection
+        conn.commit()
+        conn.close()
+# Script entry point: configure the page, make sure the SQLite database is
+# ready, then run the async chat UI.
+if __name__ == "__main__":
+    st.set_page_config(page_title="CFPB Complaints Assistant", layout="wide")
+    setup_db()
+    # nest_asyncio is applied before asyncio.run drives launch_bot
+    # (presumably to allow nested event loops — TODO confirm).
+    nest_asyncio.apply()
+    asyncio.run(launch_bot())

create_table.sql ADDED Viewed

	@@ -0,0 +1,15 @@

+-- Schema for the complaints table that app.py checks for and loads
+-- (table name must be cfpb_complaints). IF NOT EXISTS keeps the script
+-- safe to re-run.
+CREATE TABLE IF NOT EXISTS cfpb_complaints (
+    complaint_id INTEGER PRIMARY KEY,
+    company VARCHAR(37),
+    state VARCHAR(2),
+    zip_code INTEGER,
+    product VARCHAR(76),
+    sub_product VARCHAR(48),
+    issue VARCHAR(80),
+    sub_issue VARCHAR(145),
+    date_submitted TEXT,
+    date_received TEXT,
+    report_method VARCHAR(12),
+    complaint_status VARCHAR(31),
+    timely_response INTEGER
+);

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+omegaconf==2.3.0
+python-dotenv==1.0.1
+streamlit==1.32.2
+streamlit_pills==0.3.0
+streamlit-feedback==0.1.3
+langdetect==1.0.9
+langcodes==3.4.0
+datasets==2.19.2
+uuid==1.30
+vectara-agentic==0.1.15

utils.py ADDED Viewed

	@@ -0,0 +1,74 @@

+import os
+import requests
+import json
+import re
+import streamlit as st
+from langdetect import detect_langs
+from langcodes import Language
+# HTTP headers sent with the Amplitude request in send_amplitude_data
+# (the body is posted as JSON).
+headers = {
+    'Content-Type': 'application/json',
+    'Accept': '*/*'
+}
+def identify_language(response):
+    lang_code = detect_langs(response)[0].lang
+    return Language.make(language=lang_code).display_name()
+def thumbs_feedback(feedback, **kwargs):
+    """
+    Sends feedback to Amplitude Analytics
+    """
+    send_amplitude_data(
+        user_query=kwargs.get("user_query", "No user input"),
+        bot_response=kwargs.get("bot_response", "No bot response"),
+        demo_name=kwargs.get("demo_name", "Unknown"),
+        feedback=feedback['score'],
+    )
+    st.session_state.feedback_key += 1
+def send_amplitude_data(user_query, bot_response, demo_name, feedback=None):
+    # Send query and response to Amplitude Analytics
+    data = {
+        "api_key": os.getenv('AMPLITUDE_TOKEN'),
+        "events": [{
+            "device_id": st.session_state.device_id,
+            "event_type": "submitted_query",
+            "event_properties": {
+                "Space Name": demo_name,
+                "Demo Type": "Agent",
+                "query": user_query,
+                "response": bot_response,
+                "Response Language": identify_language(bot_response)
+            }
+        }]
+    }
+    if feedback:
+        data["events"][0]["event_properties"]["feedback"] = feedback
+    response = requests.post('https://api2.amplitude.com/2/httpapi', headers=headers, data=json.dumps(data))
+    if response.status_code != 200:
+        print(f"Amplitude request failed with status code {response.status_code}. Response Text: {response.text}")
+def escape_dollars_outside_latex(text):
+    # Define a regex pattern to find LaTeX equations (double $$ only)
+    pattern = r'\$\$.*?\$\$'
+    latex_matches = re.findall(pattern, text, re.DOTALL)
+    # Placeholder to temporarily store LaTeX equations
+    placeholders = {}
+    for i, match in enumerate(latex_matches):
+        placeholder = f'__LATEX_PLACEHOLDER_{i}__'
+        placeholders[placeholder] = match
+        text = text.replace(match, placeholder)
+    # Escape dollar signs in the rest of the text
+    text = text.replace('$', '\\$')
+    # Replace placeholders with the original LaTeX equations
+    for placeholder, original in placeholders.items():
+        text = text.replace(placeholder, original)
+    return text