Sourabh Zanwar sourabhzanwar committed on
Commit
3ccc981
Β·
unverified Β·
1 Parent(s): 60fc52c

added login, upload floater options(#8)

Browse files

Co-authored-by: Sourabh Zanwar <s.zanwar@reply.de>

.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
.gitignore CHANGED
@@ -1,4 +1,5 @@
1
  .env
2
  .vscode
3
  .idea
4
- *.pyc
 
 
1
  .env
2
  .vscode
3
  .idea
4
+ *.pyc
5
+ **/.DS_Store
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: Haystack Search Pipeline with Streamlit
3
  emoji: πŸ‘‘
4
  colorFrom: indigo
5
  colorTo: indigo
 
1
  ---
2
+ title: Document Insights - Extractive & Generative Methods
3
  emoji: πŸ‘‘
4
  colorFrom: indigo
5
  colorTo: indigo
app.py CHANGED
@@ -1,3 +1,7 @@
 
 
 
 
1
  from operator import index
2
  import streamlit as st
3
  import logging
@@ -12,17 +16,45 @@ from utils.ui import reset_results, set_initial_state
12
  import pandas as pd
13
  import haystack
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  # Whether the file upload should be enabled or not
16
  DISABLE_FILE_UPLOAD = bool(os.getenv("DISABLE_FILE_UPLOAD"))
 
 
 
 
 
 
 
 
 
 
 
17
  # Define a function to handle file uploads
18
  def upload_files():
19
- uploaded_files = st.sidebar.file_uploader(
20
- "upload", type=["pdf", "txt", "docx"], accept_multiple_files=True, label_visibility="hidden"
21
  )
22
  return uploaded_files
23
 
24
- # Define a function to process a single file
25
 
 
26
  def process_file(data_file, preprocesor, document_store):
27
  # read file and add content
28
  file_contents = data_file.read().decode("utf-8")
@@ -47,10 +79,34 @@ def process_file(data_file, preprocesor, document_store):
47
  except Exception as e:
48
  print(e)
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  try:
51
  args = parser.parse_args()
52
  preprocesor = start_preprocessor_node()
53
  document_store = start_document_store(type=args.store)
 
54
  retriever = start_retriever(document_store)
55
  reader = start_reader()
56
  st.set_page_config(
@@ -65,151 +121,164 @@ try:
65
  )
66
  st.sidebar.image("ml_logo.png", use_column_width=True)
67
 
68
- # Sidebar for Task Selection
69
- st.sidebar.header('Options:')
70
 
71
- # OpenAI Key Input
72
- openai_key = st.sidebar.text_input("Enter OpenAI Key:", type="password")
73
 
74
- if openai_key:
75
- task_options = ['Extractive', 'Generative']
76
- else:
77
- task_options = ['Extractive']
78
 
79
- task_selection = st.sidebar.radio('Select the task:', task_options)
 
80
 
81
- # Check the task and initialize pipeline accordingly
82
- if task_selection == 'Extractive':
83
- pipeline_extractive = initialize_pipeline("extractive", document_store, retriever, reader)
84
- elif task_selection == 'Generative' and openai_key: # Check for openai_key to ensure user has entered it
85
- pipeline_rag = initialize_pipeline("rag", document_store, retriever, reader, openai_key=openai_key)
86
 
 
 
87
 
88
- set_initial_state()
 
89
 
90
- st.write('# ' + args.name)
 
 
 
91
 
 
92
 
93
- # File upload block
94
- if not DISABLE_FILE_UPLOAD:
95
- st.sidebar.write("## File Upload:")
96
- #data_files = st.sidebar.file_uploader(
97
- # "upload", type=["pdf", "txt", "docx"], accept_multiple_files=True, label_visibility="hidden"
98
- #)
99
- data_files = upload_files()
100
- if data_files is not None:
101
- for data_file in data_files:
102
- # Upload file
103
- if data_file:
104
- try:
105
- #raw_json = upload_doc(data_file)
106
- # Call the process_file function for each uploaded file
107
- if args.store == 'inmemory':
108
- processed_data = process_file(data_file, preprocesor, document_store)
109
- st.sidebar.write(str(data_file.name) + " &nbsp;&nbsp; βœ… ")
110
- except Exception as e:
111
- st.sidebar.write(str(data_file.name) + " &nbsp;&nbsp; ❌ ")
112
- st.sidebar.write("_This file could not be parsed, see the logs for more information._")
113
 
114
- if "question" not in st.session_state:
115
- st.session_state.question = ""
116
- # Search bar
117
- question = st.text_input("", value=st.session_state.question, max_chars=100, on_change=reset_results)
118
 
119
- run_pressed = st.button("Run")
120
 
121
- run_query = (
122
- run_pressed or question != st.session_state.question #or task_selection != st.session_state.task
123
- )
 
124
 
125
- # Get results for query
126
- if run_query and question:
127
- if task_selection == 'Extractive':
128
- reset_results()
129
- st.session_state.question = question
130
- with st.spinner("πŸ”Ž &nbsp;&nbsp; Running your pipeline"):
131
- try:
132
- st.session_state.results_extractive = query(pipeline_extractive, question)
133
- st.session_state.task = task_selection
134
- except JSONDecodeError as je:
135
- st.error(
136
- "πŸ‘“ &nbsp;&nbsp; An error occurred reading the results. Is the document store working?"
137
- )
138
- except Exception as e:
139
- logging.exception(e)
140
- st.error("🐞 &nbsp;&nbsp; An error occurred during the request.")
 
141
 
142
- elif task_selection == 'Generative':
143
- reset_results()
144
- st.session_state.question = question
145
- with st.spinner("πŸ”Ž &nbsp;&nbsp; Running your pipeline"):
146
- try:
147
- st.session_state.results_generative = query(pipeline_rag, question)
148
- st.session_state.task = task_selection
149
- except JSONDecodeError as je:
150
- st.error(
151
- "πŸ‘“ &nbsp;&nbsp; An error occurred reading the results. Is the document store working?"
152
- )
153
- except Exception as e:
154
- if "API key is invalid" in str(e):
155
- logging.exception(e)
156
- st.error("🐞 &nbsp;&nbsp; incorrect API key provided. You can find your API key at https://platform.openai.com/account/api-keys.")
157
- else:
 
 
 
 
 
 
 
 
 
 
 
 
158
  logging.exception(e)
159
  st.error("🐞 &nbsp;&nbsp; An error occurred during the request.")
160
- # Display results
161
- if (st.session_state.results_extractive or st.session_state.results_generative) and run_query:
162
 
163
- # Handle Extractive Answers
164
- if task_selection == 'Extractive':
165
- results = st.session_state.results_extractive
166
-
167
- st.subheader("Extracted Answers:")
168
-
169
- if 'answers' in results:
170
- answers = results['answers']
171
- treshold = 0.2
172
- higher_then_treshold = any(ans.score > treshold for ans in answers)
173
- if not higher_then_treshold:
174
- st.markdown(f"<span style='color:red'>Please note none of the answers achieved a score higher then {int(treshold) * 100}%. Which probably means that the desired answer is not in the searched documents.</span>", unsafe_allow_html=True)
175
- for count, answer in enumerate(answers):
176
- if answer.answer:
177
- text, context = answer.answer, answer.context
178
- start_idx = context.find(text)
179
- end_idx = start_idx + len(text)
180
- score = round(answer.score, 3)
181
- st.markdown(f"**Answer {count + 1}:**")
182
- st.markdown(
183
- context[:start_idx] + str(annotation(body=text, label=f'SCORE {score}', background='#964448', color='#ffffff')) + context[end_idx:],
184
- unsafe_allow_html=True,
185
- )
186
- else:
187
- st.info(
188
- "πŸ€” &nbsp;&nbsp; Haystack is unsure whether any of the documents contain an answer to your question. Try to reformulate it!"
189
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
 
191
- # Handle Generative Answers
192
- elif task_selection == 'Generative':
193
- results = st.session_state.results_generative
194
- st.subheader("Generated Answer:")
195
- if 'results' in results:
196
- st.markdown("**Answer:**")
197
- st.write(results['results'][0])
198
-
199
- # Handle Retrieved Documents
200
- if 'documents' in results:
201
- retrieved_documents = results['documents']
202
- st.subheader("Retriever Results:")
203
-
204
- data = []
205
- for i, document in enumerate(retrieved_documents):
206
- # Truncate the content
207
- truncated_content = (document.content[:150] + '...') if len(document.content) > 150 else document.content
208
- data.append([i + 1, document.meta['name'], truncated_content])
209
-
210
- # Convert data to DataFrame and display using Streamlit
211
- df = pd.DataFrame(data, columns=['Ranked Context', 'Document Name', 'Content'])
212
- st.table(df)
213
 
 
 
 
214
  except SystemExit as e:
215
- os._exit(e.code)
 
1
+ from utils.check_pydantic_version import use_pydantic_v1
2
+ use_pydantic_v1() #This function has to be run before importing haystack. as haystack requires pydantic v1 to run
3
+
4
+
5
  from operator import index
6
  import streamlit as st
7
  import logging
 
16
  import pandas as pd
17
  import haystack
18
 
19
+ from datetime import datetime
20
+ import streamlit.components.v1 as components
21
+ import streamlit_authenticator as stauth
22
+ import pickle
23
+
24
+ from streamlit_modal import Modal
25
+ import numpy as np
26
+
27
+
28
+
29
+ names = ['mlreply']
30
+ usernames = ['mlreply']
31
+ with open('hashed_password.pkl','rb') as f:
32
+ hashed_passwords = pickle.load(f)
33
+
34
+
35
+
36
  # Whether the file upload should be enabled or not
37
  DISABLE_FILE_UPLOAD = bool(os.getenv("DISABLE_FILE_UPLOAD"))
38
+
39
+
40
def show_documents_list(retrieved_documents):
    """Build a display table of the unique uploaded document names.

    Parameters
    ----------
    retrieved_documents : iterable
        Haystack Document objects; only ``meta['name']`` is read.

    Returns
    -------
    pd.DataFrame
        One column, 'Uploaded Document Name', with duplicates dropped
        and a 1-based index for human-friendly display in st.table.
    """
    # The original looped with enumerate() but never used the index;
    # a comprehension expresses the collection step directly.
    data = [[document.meta['name']] for document in retrieved_documents]
    df = pd.DataFrame(data, columns=['Uploaded Document Name'])
    df.drop_duplicates(subset=['Uploaded Document Name'], inplace=True)
    # Re-number rows 1..n after the de-duplication gaps.
    df.index = np.arange(1, len(df) + 1)
    return df
48
+
49
  # Define a function to handle file uploads
50
def upload_files():
    """Render the file-upload widget inside the shared upload container.

    Returns whatever ``st.file_uploader`` yields: a list of uploaded
    files, or None while the user has not selected anything.
    NOTE(review): relies on the module-level ``upload_container``
    being created before this is called.
    """
    return upload_container.file_uploader(
        "upload",
        type=["pdf", "txt", "docx"],
        accept_multiple_files=True,
        label_visibility="hidden",
        key=1,
    )
55
 
 
56
 
57
+ # Define a function to process a single file
58
  def process_file(data_file, preprocesor, document_store):
59
  # read file and add content
60
  file_contents = data_file.read().decode("utf-8")
 
79
  except Exception as e:
80
  print(e)
81
 
82
+
83
+ # Define a function to upload the documents to haystack document store
84
def upload_document():
    """Feed every currently selected upload into the document store.

    Reads the module-level ``data_files`` produced by ``upload_files``.
    Only the 'inmemory' store path is wired up; parse failures are
    reported in the upload container rather than raised.
    """
    if data_files is None:
        return
    for data_file in data_files:
        # Skip empty slots the uploader may hand back.
        if not data_file:
            continue
        try:
            #raw_json = upload_doc(data_file)
            # Process each uploaded file through the preprocessing node.
            if args.store == 'inmemory':
                processed_data = process_file(data_file, preprocesor, document_store)
                #upload_container.write(str(data_file.name) + " &nbsp;&nbsp; ✅ ")
        except Exception as e:
            upload_container.write(str(data_file.name) + " &nbsp;&nbsp; ❌ ")
            upload_container.write("_This file could not be parsed, see the logs for more information._")
98
+
99
+ # Define a function to reset the documents in haystack document store
100
def reset_documents():
    """Clear the session's upload state and delete every stored document.

    Registered as the on_click handler for the sidebar 'Reset
    documents' button.
    """
    # Fix: log line previously read "Reseting" (typo).
    print('\nResetting documents list at ' + str(datetime.now()) + '\n')
    st.session_state.data_files = None
    document_store.delete_documents()
104
+
105
  try:
106
  args = parser.parse_args()
107
  preprocesor = start_preprocessor_node()
108
  document_store = start_document_store(type=args.store)
109
+ document_store.get_all_documents()
110
  retriever = start_retriever(document_store)
111
  reader = start_reader()
112
  st.set_page_config(
 
121
  )
122
  st.sidebar.image("ml_logo.png", use_column_width=True)
123
 
124
+ authenticator = stauth.Authenticate(names, usernames, hashed_passwords, "document_search", "random_text", cookie_expiry_days=1)
 
125
 
126
+ name, authentication_status, username = authenticator.login("Login", "main")
 
127
 
128
+ if authentication_status == False:
129
+ st.error("Username/Password is incorrect")
 
 
130
 
131
+ if authentication_status == None:
132
+ st.warning("Please enter your username and password")
133
 
134
+ if authentication_status:
 
 
 
 
135
 
136
+ # Sidebar for Task Selection
137
+ st.sidebar.header('Options:')
138
 
139
+ # OpenAI Key Input
140
+ openai_key = st.sidebar.text_input("Enter LLM-authorization Key:", type="password")
141
 
142
+ if openai_key:
143
+ task_options = ['Extractive', 'Generative']
144
+ else:
145
+ task_options = ['Extractive']
146
 
147
+ task_selection = st.sidebar.radio('Select the task:', task_options)
148
 
149
+ # Check the task and initialize pipeline accordingly
150
+ if task_selection == 'Extractive':
151
+ pipeline_extractive = initialize_pipeline("extractive", document_store, retriever, reader)
152
+ elif task_selection == 'Generative' and openai_key: # Check for openai_key to ensure user has entered it
153
+ pipeline_rag = initialize_pipeline("rag", document_store, retriever, reader, openai_key=openai_key)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
 
 
 
 
 
155
 
156
+ set_initial_state()
157
 
158
+ modal = Modal("Manage Files", key="demo-modal")
159
+ open_modal = st.sidebar.button("Manage Files", use_container_width=True)
160
+ if open_modal:
161
+ modal.open()
162
 
163
+ st.write('# ' + args.name)
164
+ if modal.is_open():
165
+ with modal.container():
166
+ if not DISABLE_FILE_UPLOAD:
167
+ upload_container = st.container()
168
+ data_files = upload_files()
169
+ upload_document()
170
+ st.session_state.sidebar_state = 'collapsed'
171
+ st.table(show_documents_list(document_store.get_all_documents()))
172
+
173
+ # File upload block
174
+ # if not DISABLE_FILE_UPLOAD:
175
+ # upload_container = st.sidebar.container()
176
+ # upload_container.write("## File Upload:")
177
+ # data_files = upload_files()
178
+ # Button to update files in the documentStore
179
+ # upload_container.button('Upload Files', on_click=upload_document, args=())
180
 
181
+ # Button to reset the documents in DocumentStore
182
+ st.sidebar.button("Reset documents", on_click=reset_documents, args=(), use_container_width=True)
183
+
184
+ if "question" not in st.session_state:
185
+ st.session_state.question = ""
186
+ # Search bar
187
+ question = st.text_input("Question", value=st.session_state.question, max_chars=100, on_change=reset_results, label_visibility="hidden")
188
+
189
+ run_pressed = st.button("Run")
190
+
191
+ run_query = (
192
+ run_pressed or question != st.session_state.question #or task_selection != st.session_state.task
193
+ )
194
+
195
+ # Get results for query
196
+ if run_query and question:
197
+ if task_selection == 'Extractive':
198
+ reset_results()
199
+ st.session_state.question = question
200
+ with st.spinner("πŸ”Ž &nbsp;&nbsp; Running your pipeline"):
201
+ try:
202
+ st.session_state.results_extractive = query(pipeline_extractive, question)
203
+ st.session_state.task = task_selection
204
+ except JSONDecodeError as je:
205
+ st.error(
206
+ "πŸ‘“ &nbsp;&nbsp; An error occurred reading the results. Is the document store working?"
207
+ )
208
+ except Exception as e:
209
  logging.exception(e)
210
  st.error("🐞 &nbsp;&nbsp; An error occurred during the request.")
 
 
211
 
212
+ elif task_selection == 'Generative':
213
+ reset_results()
214
+ st.session_state.question = question
215
+ with st.spinner("πŸ”Ž &nbsp;&nbsp; Running your pipeline"):
216
+ try:
217
+ st.session_state.results_generative = query(pipeline_rag, question)
218
+ st.session_state.task = task_selection
219
+ except JSONDecodeError as je:
220
+ st.error(
221
+ "πŸ‘“ &nbsp;&nbsp; An error occurred reading the results. Is the document store working?"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
  )
223
+ except Exception as e:
224
+ if "API key is invalid" in str(e):
225
+ logging.exception(e)
226
+ st.error("🐞 &nbsp;&nbsp; incorrect API key provided. You can find your API key at https://platform.openai.com/account/api-keys.")
227
+ else:
228
+ logging.exception(e)
229
+ st.error("🐞 &nbsp;&nbsp; An error occurred during the request.")
230
+ # Display results
231
+ if (st.session_state.results_extractive or st.session_state.results_generative) and run_query:
232
+
233
+ # Handle Extractive Answers
234
+ if task_selection == 'Extractive':
235
+ results = st.session_state.results_extractive
236
+
237
+ st.subheader("Extracted Answers:")
238
+
239
+ if 'answers' in results:
240
+ answers = results['answers']
241
+ treshold = 0.2
242
+ higher_then_treshold = any(ans.score > treshold for ans in answers)
243
+ if not higher_then_treshold:
244
+ st.markdown(f"<span style='color:red'>Please note none of the answers achieved a score higher then {int(treshold) * 100}%. Which probably means that the desired answer is not in the searched documents.</span>", unsafe_allow_html=True)
245
+ for count, answer in enumerate(answers):
246
+ if answer.answer:
247
+ text, context = answer.answer, answer.context
248
+ start_idx = context.find(text)
249
+ end_idx = start_idx + len(text)
250
+ score = round(answer.score, 3)
251
+ st.markdown(f"**Answer {count + 1}:**")
252
+ st.markdown(
253
+ context[:start_idx] + str(annotation(body=text, label=f'SCORE {score}', background='#964448', color='#ffffff')) + context[end_idx:],
254
+ unsafe_allow_html=True,
255
+ )
256
+ else:
257
+ st.info(
258
+ "πŸ€” &nbsp;&nbsp; Haystack is unsure whether any of the documents contain an answer to your question. Try to reformulate it!"
259
+ )
260
+
261
+ # Handle Generative Answers
262
+ elif task_selection == 'Generative':
263
+ results = st.session_state.results_generative
264
+ st.subheader("Generated Answer:")
265
+ if 'results' in results:
266
+ st.markdown("**Answer:**")
267
+ st.write(results['results'][0])
268
+
269
+ # Handle Retrieved Documents
270
+ if 'documents' in results:
271
+ retrieved_documents = results['documents']
272
+ st.subheader("Retriever Results:")
273
 
274
+ data = []
275
+ for i, document in enumerate(retrieved_documents):
276
+ # Truncate the content
277
+ truncated_content = (document.content[:150] + '...') if len(document.content) > 150 else document.content
278
+ data.append([i + 1, document.meta['name'], truncated_content])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
279
 
280
+ # Convert data to DataFrame and display using Streamlit
281
+ df = pd.DataFrame(data, columns=['Ranked Context', 'Document Name', 'Content'])
282
+ st.table(df)
283
  except SystemExit as e:
284
+ os._exit(e.code)
generate_keys.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# -*- coding: utf-8 -*-

"""One-off helper: hash the demo credentials and persist them for app.py."""

import pickle
from pathlib import Path

import streamlit_authenticator as stauth

# Single demo account; extend all three lists in lockstep to add users.
names = ['mlreply']
usernames = ['mlreply']
passwords = ['mlreply1']

hashed_passwords = stauth.Hasher(passwords).generate()

# Persist only the hashes so the app never touches the plaintext list.
with Path('hashed_password.pkl').open('wb') as f:
    pickle.dump(hashed_passwords, f)
hashed_password.pkl ADDED
Binary file (78 Bytes). View file
 
requirements.txt CHANGED
@@ -1,7 +1,10 @@
 
1
  safetensors==0.3.3.post1
2
- farm-haystack[inference,weaviate,opensearch]==1.20.0
3
  milvus-haystack
4
  streamlit==1.23.0
 
 
5
  markdown
6
  st-annotated-text
7
- datasets
 
1
+ scikit-learn==1.3.2
2
  safetensors==0.3.3.post1
3
+ farm-haystack[inference,weaviate,opensearch,file-conversion,pdf]==1.20.0
4
  milvus-haystack
5
  streamlit==1.23.0
6
+ streamlit-authenticator==0.1.5
7
+ streamlit_modal
8
  markdown
9
  st-annotated-text
10
+ datasets
utils/check_pydantic_version.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pydantic
2
+ import os
3
+ import fileinput
4
+
5
def replace_string_in_files(folder_path, old_str, new_str):
    """Replace ``old_str`` with ``new_str`` in every .txt/.py file under a tree.

    Parameters
    ----------
    folder_path : str
        Root directory; walked recursively.
    old_str, new_str : str
        Literal (non-regex) search and replacement strings.

    Files are rewritten in place line by line.
    """
    for subdir, dirs, files in os.walk(folder_path):
        for file in files:
            # Only touch text-like sources; rewriting binaries would corrupt them.
            # endswith accepts a tuple, replacing the original 'or' chain.
            if file.endswith((".txt", ".py")):
                file_path = os.path.join(subdir, file)
                # fileinput with inplace=True redirects stdout into the file,
                # so print() writes the (possibly replaced) line back.
                with fileinput.FileInput(file_path, inplace=True) as f:
                    for line in f:
                        print(line.replace(old_str, new_str), end='')
17
+
18
+
19
def use_pydantic_v1():
    """Patch the installed haystack package to import pydantic's v1 shim.

    Must run before ``import haystack``: rewrites every
    ``from pydantic`` import inside the haystack install to
    ``from pydantic.v1`` (the compatibility layer shipped with
    pydantic 2.x). Idempotent — skips the rewrite when schema.py
    already references pydantic.v1.
    """
    # Locate haystack as a sibling package of pydantic inside
    # site-packages. Two dirname() calls are robust even when the
    # install path itself contains the substring 'pydantic'
    # (the original split('pydantic')[0] was not).
    site_packages = os.path.dirname(os.path.dirname(pydantic.__file__))
    haystack_path = os.path.join(site_packages, 'haystack')
    with open(os.path.join(haystack_path, 'schema.py'), 'r') as f:
        haystack_schema_file = f.read()

    if 'from pydantic.v1' not in haystack_schema_file:
        replace_string_in_files(haystack_path, 'from pydantic', 'from pydantic.v1')
utils/config.py CHANGED
@@ -8,12 +8,14 @@ parser = argparse.ArgumentParser(description='This app lists animals')
8
 
9
  document_store_choices = ('inmemory', 'weaviate', 'milvus', 'opensearch')
10
  parser.add_argument('--store', choices=document_store_choices, default='inmemory', help='DocumentStore selection (default: %(default)s)')
11
- parser.add_argument('--name', default="My Search App")
12
 
13
  model_configs = {
14
  'EMBEDDING_MODEL': os.getenv("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L12-v2"),
15
  'GENERATIVE_MODEL': os.getenv("GENERATIVE_MODEL", "gpt-4"),
16
- 'EXTRACTIVE_MODEL': os.getenv("EXTRACTIVE_MODEL", "deepset/roberta-base-squad2"),
 
 
17
  'OPENAI_KEY': os.getenv("OPENAI_KEY"),
18
  'COHERE_KEY': os.getenv("COHERE_KEY"),
19
  }
 
8
 
9
  document_store_choices = ('inmemory', 'weaviate', 'milvus', 'opensearch')
10
  parser.add_argument('--store', choices=document_store_choices, default='inmemory', help='DocumentStore selection (default: %(default)s)')
11
+ parser.add_argument('--name', default="Document Insights: Extractive & Generative Methods")
12
 
13
  model_configs = {
14
  'EMBEDDING_MODEL': os.getenv("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L12-v2"),
15
  'GENERATIVE_MODEL': os.getenv("GENERATIVE_MODEL", "gpt-4"),
16
+ #'EXTRACTIVE_MODEL': os.getenv("EXTRACTIVE_MODEL", "deepset/roberta-base-squad2"),
17
+ 'EXTRACTIVE_MODEL': os.getenv("EXTRACTIVE_MODEL", "deepset/gelectra-large-germanquad"),
18
+ #'EXTRACTIVE_MODEL': os.getenv("EXTRACTIVE_MODEL", "MachineLearningReply/bert-base-german-legal-qa"),
19
  'OPENAI_KEY': os.getenv("OPENAI_KEY"),
20
  'COHERE_KEY': os.getenv("COHERE_KEY"),
21
  }
utils/haystack.py CHANGED
@@ -6,6 +6,7 @@ from haystack.schema import Answer
6
  from haystack.document_stores import BaseDocumentStore
7
  from haystack.document_stores import InMemoryDocumentStore, OpenSearchDocumentStore, WeaviateDocumentStore
8
  from haystack.nodes import EmbeddingRetriever, FARMReader, PromptNode, PreProcessor
 
9
  from milvus_haystack import MilvusDocumentStore
10
  #Use this file to set up your Haystack pipeline and querying
11
 
@@ -99,7 +100,8 @@ def start_haystack_extractive(_document_store: BaseDocumentStore, _retriever: Em
99
  def start_haystack_rag(_document_store: BaseDocumentStore, _retriever: EmbeddingRetriever, openai_key):
100
  prompt_node = PromptNode(default_prompt_template="deepset/question-answering",
101
  model_name_or_path=model_configs['GENERATIVE_MODEL'],
102
- api_key=openai_key)
 
103
  pipe = Pipeline()
104
 
105
  pipe.add_node(component=_retriever, name="Retriever", inputs=["Query"])
@@ -118,3 +120,5 @@ def initialize_pipeline(task, document_store, retriever, reader, openai_key = ""
118
  return start_haystack_extractive(document_store, retriever, reader)
119
  elif task == 'rag':
120
  return start_haystack_rag(document_store, retriever, openai_key)
 
 
 
6
  from haystack.document_stores import BaseDocumentStore
7
  from haystack.document_stores import InMemoryDocumentStore, OpenSearchDocumentStore, WeaviateDocumentStore
8
  from haystack.nodes import EmbeddingRetriever, FARMReader, PromptNode, PreProcessor
9
+ #from haystack.nodes import TextConverter, FileTypeClassifier, PDFToTextConverter
10
  from milvus_haystack import MilvusDocumentStore
11
  #Use this file to set up your Haystack pipeline and querying
12
 
 
100
  def start_haystack_rag(_document_store: BaseDocumentStore, _retriever: EmbeddingRetriever, openai_key):
101
  prompt_node = PromptNode(default_prompt_template="deepset/question-answering",
102
  model_name_or_path=model_configs['GENERATIVE_MODEL'],
103
+ api_key=openai_key,
104
+ max_length=500)
105
  pipe = Pipeline()
106
 
107
  pipe.add_node(component=_retriever, name="Retriever", inputs=["Query"])
 
120
  return start_haystack_extractive(document_store, retriever, reader)
121
  elif task == 'rag':
122
  return start_haystack_rag(document_store, retriever, openai_key)
123
+
124
+