Spaces:

arslan-ahmed
/

talk-to-your-docs

Running

File size: 13,739 Bytes

import gdown
from dotenv import load_dotenv
import datetime
import openai
import uuid
import gradio as gr
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.chains import ConversationalRetrievalChain
from langchain.chains import RetrievalQA

import os
from langchain.chat_models import ChatOpenAI
from langchain import OpenAI
from langchain.document_loaders import WebBaseLoader, TextLoader, Docx2txtLoader, PyMuPDFLoader
from whatsapp_chat_custom import WhatsAppChatLoader # use this instead of from langchain.document_loaders import WhatsAppChatLoader

from collections import deque
import re
from bs4 import BeautifulSoup
import requests
from urllib.parse import urlparse
import mimetypes
from pathlib import Path
import tiktoken
from ttyd_functions import *
from ttyd_consts import *

###############################################################################################

load_dotenv()

# select the mode at runtime when starting container - modes options are in ttyd_consts.py
if (os.getenv("TTYD_MODE",'')).split('_')[0]=='personalBot':
    mode = mode_arslan
    gDriveUrl = (os.getenv("GDRIVE_FOLDER_URL",'')).replace('?usp=sharing','')
    # output folder of googe drive folder will be taken as input dir of personalBot
    gdown.download_folder(url=gDriveUrl, output=mode.inputDir, quiet=True)
    if os.getenv("TTYD_MODE",'')!='personalBot_arslan':
        mode.title=''
        mode.welcomeMsg=''

elif os.getenv("TTYD_MODE",'')=='nustian':
    mode = mode_nustian
else:
    mode = mode_general


if mode.type!='userInputDocs':
    # local vector store as opposed to gradio state vector store
    vsDict_hard = localData_vecStore(os.getenv("OPENAI_API_KEY"), inputDir=mode.inputDir, file_list=mode.file_list, url_list=mode.url_list)

###############################################################################################

                                    # Gradio

###############################################################################################

def generateExamples(api_key_st, vsDict_st):
    qa_chain = RetrievalQA.from_llm(llm=ChatOpenAI(openai_api_key=api_key_st, temperature=0), 
                    retriever=vsDict_st['chromaClient'].as_retriever(search_type="similarity", search_kwargs={"k": 4}))

    result = qa_chain({'query': exp_query})
    answer = result['result'].strip('\n')
    grSamples = [[]]
    if answer.startswith('1. '):
        lines = answer.split("\n")  # split the answers into individual lines
        list_items = [line.split(". ")[1] for line in lines]  # extract each answer after the numbering
        grSamples = [[x] for x in list_items] # gr takes list of each item as a list

    return grSamples

# initialize chatbot function sets the QA Chain, and also sets/updates any other components to start chatting. updateQaChain function only updates QA chain and will be called whenever Adv Settings are updated.
def initializeChatbot(temp, k, modelName, stdlQs, api_key_st, vsDict_st, progress=gr.Progress()):
    progress(0.1, waitText_initialize)
    qa_chain_st = updateQaChain(temp, k, modelName, stdlQs, api_key_st, vsDict_st)
    progress(0.5, waitText_initialize)
    #generate welcome message
    if mode.welcomeMsg:
        welMsg = mode.welcomeMsg
    else:
        welMsg = qa_chain_st({'question': initialize_prompt, 'chat_history':[]})['answer']
    print('Chatbot initialized at ', datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    # exSamples = generateExamples(api_key_st, vsDict_st)
    # exSamples_vis = True if exSamples[0] else False

    return qa_chain_st, btn.update(interactive=True), initChatbot_btn.update('Chatbot ready. Now visit the chatbot Tab.', interactive=False)\
        , aKey_tb.update(), gr.Tabs.update(selected='cb'), chatbot.update(value=[('', welMsg)])


def setApiKey(api_key):
    api_key = transformApi(api_key)
    try:
        openai.Model.list(api_key=api_key) # test the API key
        api_key_st = api_key
        return aKey_tb.update('API Key accepted', interactive=False, type='text'), aKey_btn.update(interactive=False), api_key_st
    except Exception as e:
        return aKey_tb.update(str(e), type='text'), *[x.update() for x in [aKey_btn, api_key_state]]
    
# convert user uploaded data to vectorstore
def uiData_vecStore(userFiles, userUrls, api_key_st, vsDict_st={}, progress=gr.Progress()):
    opComponents = [data_ingest_btn, upload_fb, urls_tb]
    # parse user data
    file_paths = []
    documents = []
    if userFiles is not None:
        if not isinstance(userFiles, list): userFiles = [userFiles]
        file_paths = [file.name for file in userFiles]
    userUrls = [x.strip() for x in userUrls.split(",")] if userUrls else []
    #create documents
    documents = data_ingestion(file_list=file_paths, url_list=userUrls, prog=progress)
    if documents:
        for file in file_paths:
            os.remove(file)
    else:
        return {}, '', *[x.update() for x in opComponents]
    # Splitting and Chunks
    docs = split_docs(documents)
    # Embeddings
    try:
        api_key_st='Null' if api_key_st is None or api_key_st=='' else api_key_st
        openai.Model.list(api_key=api_key_st) # test the API key
        embeddings = OpenAIEmbeddings(openai_api_key=api_key_st)
    except Exception as e:
        return {}, str(e), *[x.update() for x in opComponents]
    
    progress(0.5, 'Creating Vector Database')
    vsDict_st = getVsDict(embeddings, docs, vsDict_st)
    # get sources from metadata
    src_str = getSourcesFromMetadata(vsDict_st['chromaClient'].get()['metadatas'])
    src_str = str(src_str[1]) + ' source document(s) successfully loaded in vector store.'+'\n\n' + src_str[0]
    
    progress(1, 'Data loaded')
    return vsDict_st, src_str, *[x.update(interactive=False) for x in [data_ingest_btn, upload_fb]], urls_tb.update(interactive=False, placeholder='')

# just update the QA Chain, no updates to any UI
def updateQaChain(temp, k, modelName, stdlQs, api_key_st, vsDict_st):
    # if we are not adding data from ui, then use vsDict_hard as vectorstore
    if vsDict_st=={} and mode.type!='userInputDocs': vsDict_st=vsDict_hard
    modelName = modelName.split('(')[0].strip() # so we can provide any info in brackets
    # check if the input model is chat model or legacy model
    try:
        ChatOpenAI(openai_api_key=api_key_st, temperature=0,model_name=modelName,max_tokens=1).predict('')
        llm = ChatOpenAI(openai_api_key=api_key_st, temperature=float(temp),model_name=modelName)
    except:
        OpenAI(openai_api_key=api_key_st, temperature=0,model_name=modelName,max_tokens=1).predict('')
        llm = OpenAI(openai_api_key=api_key_st, temperature=float(temp),model_name=modelName)
    # settingsUpdated = 'Settings updated:'+ ' Model=' + modelName + ', Temp=' + str(temp)+ ', k=' + str(k)
    # gr.Info(settingsUpdated)
    
    # Now create QA Chain using the LLM
    if stdlQs==0: # 0th index i.e. first option
        qa_chain_st = RetrievalQA.from_llm(
                    llm=llm, 
                    retriever=vsDict_st['chromaClient'].as_retriever(search_type="similarity", search_kwargs={"k": int(k)}),
                    return_source_documents=True,
                    input_key = 'question', output_key='answer' # to align with ConversationalRetrievalChain for downstream functions
                )
    else:
        rephQs = False if stdlQs==1 else True
        qa_chain_st = ConversationalRetrievalChain.from_llm(
                    llm=llm, 
                    retriever=vsDict_st['chromaClient'].as_retriever(search_type="similarity", search_kwargs={"k": int(k)}),
                    rephrase_question=rephQs,
                    return_source_documents=True,
                    return_generated_question=True
                )
    
    return qa_chain_st
        

def respond(message, chat_history, qa_chain):
    result = qa_chain({'question': message, "chat_history": [tuple(x) for x in chat_history]})
    src_docs = getSourcesFromMetadata([x.metadata for x in result["source_documents"]], sourceOnly=False)[0]
    # streaming
    streaming_answer = ""
    for ele in "".join(result['answer']):
        streaming_answer += ele
        yield "", chat_history + [(message, streaming_answer)], src_docs, btn.update('Please wait...', interactive=False)
    
    chat_history.extend([(message, result['answer'])])
    yield "", chat_history, src_docs, btn.update('Send Message', interactive=True)

#####################################################################################################

with gr.Blocks(theme=gr.themes.Default(primary_hue='orange', secondary_hue='gray', neutral_hue='blue'), css="footer {visibility: hidden}") as demo:

    # Initialize state variables - stored in this browser session - these can only be used within input or output of .click/.submit etc, not as a python var coz they are not stored in backend, only as a frontend gradio component
    # but if you initialize it with a default value, that value will be stored in backend and accessible across all users. You can also change it with statear.value='newValue'
    qa_state = gr.State()
    api_key_state = gr.State(os.getenv("OPENAI_API_KEY") if mode.type=='personalBot' else 'Null')
    chromaVS_state = gr.State({})


    # Setup the Gradio Layout
    gr.Markdown(mode.title)
    with gr.Tabs() as tabs:
        with gr.Tab('Initialization', id='init'):
            with gr.Row():
                with gr.Column():
                    aKey_tb = gr.Textbox(label="OpenAI API Key", type='password'\
                            , info='You can find OpenAI API key at https://platform.openai.com/account/api-keys'\
                            , placeholder='Enter your API key here and hit enter to begin chatting')
                    aKey_btn = gr.Button("Submit API Key")
            with gr.Row(visible=mode.uiAddDataVis):
                upload_fb = gr.Files(scale=5, label="Upload (multiple) Files - pdf/txt/docx supported", file_types=['.doc', '.docx', 'text', '.pdf', '.csv'])
                urls_tb = gr.Textbox(scale=5, label="Enter URLs starting with https (comma separated)"\
                                    , info=url_tb_info\
                                    , placeholder=url_tb_ph)
                data_ingest_btn = gr.Button("Load Data")
            status_tb = gr.TextArea(label='Status bar', show_label=False, visible=mode.uiAddDataVis)
            initChatbot_btn = gr.Button("Initialize Chatbot", variant="primary")

        with gr.Tab('Chatbot', id='cb'):
            with gr.Row():
                chatbot = gr.Chatbot(label="Chat History", scale=2)
                srcDocs = gr.TextArea(label="References")
            msg = gr.Textbox(label="User Input",placeholder="Type your questions here")
            with gr.Row():
                btn = gr.Button("Send Message", interactive=False, variant="primary")
                clear = gr.ClearButton(components=[msg, chatbot, srcDocs], value="Clear chat history")
            # exp_comp = gr.Dataset(scale=0.7, samples=[['123'],['456'], ['123'],['456'],['456']], components=[msg], label='Examples (auto generated by LLM)', visible=False)
            # gr.Examples(examples=exps,  inputs=msg)
            with gr.Accordion("Advance Settings - click to expand", open=False):
                with gr.Row():
                    with gr.Column():
                        temp_sld = gr.Slider(minimum=0, maximum=1, step=0.1, value=0.7, label="Temperature", info='Sampling temperature to use when calling LLM. Defaults to 0.7')
                        k_sld = gr.Slider(minimum=1, maximum=10, step=1, value=mode.k, label="K", info='Number of relavant documents to return from Vector Store. Defaults to 4')
                        model_dd = gr.Dropdown(label='Model Name'\
                                , choices=model_dd_choices\
                                , value=model_dd_choices[0], allow_custom_value=True\
                                , info=model_dd_info)
                    stdlQs_rb = gr.Radio(label='Standalone Question', info=stdlQs_rb_info\
                            , type='index', value=stdlQs_rb_choices[1]\
                            , choices=stdlQs_rb_choices)
                   
    ### Setup the Gradio Event Listeners

    # API button
    aKey_btn_args = {'fn':setApiKey, 'inputs':[aKey_tb], 'outputs':[aKey_tb, aKey_btn, api_key_state]}
    aKey_btn.click(**aKey_btn_args)
    aKey_tb.submit(**aKey_btn_args)

    # Data Ingest Button
    data_ingest_event = data_ingest_btn.click(uiData_vecStore, [upload_fb, urls_tb, api_key_state, chromaVS_state], [chromaVS_state, status_tb, data_ingest_btn, upload_fb, urls_tb])

    # Adv Settings
    advSet_args = {'fn':updateQaChain, 'inputs':[temp_sld, k_sld, model_dd, stdlQs_rb, api_key_state, chromaVS_state], 'outputs':[qa_state]}
    temp_sld.release(**advSet_args)
    k_sld.release(**advSet_args)
    model_dd.change(**advSet_args)
    stdlQs_rb.change(**advSet_args)

    # Initialize button
    initCb_args = {'fn':initializeChatbot, 'inputs':[temp_sld, k_sld, model_dd, stdlQs_rb, api_key_state, chromaVS_state], 'outputs':[qa_state, btn, initChatbot_btn, aKey_tb, tabs, chatbot]}
    if mode.type=='personalBot':
        demo.load(**initCb_args) # load Chatbot UI directly on startup
    initChatbot_btn.click(**initCb_args)

    # Chatbot submit button
    chat_btn_args = {'fn':respond, 'inputs':[msg, chatbot,  qa_state], 'outputs':[msg, chatbot, srcDocs, btn]}
    btn.click(**chat_btn_args)
    msg.submit(**chat_btn_args)

demo.queue(concurrency_count=10)
demo.launch(show_error=True)