import streamlit as st
from dotenv import load_dotenv 
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.llms import HuggingFaceHub
from html_template import css, bot_template, user_template
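
# Chat-with-your-PDFs Streamlit app: extract text from uploaded PDFs, split it
# into overlapping chunks, embed the chunks into a FAISS vector store, and
# answer questions with a ConversationalRetrievalChain that keeps the chat
# history in memory. Assumes a .env file providing OPENAI_API_KEY (for
# ChatOpenAI) and, if the HuggingFaceHub LLM is swapped in,
# HUGGINGFACEHUB_API_TOKEN; the local html_template module is expected to
# define css, bot_template and user_template, the templates containing a
# {{MSG}} placeholder.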


def get_pdf_text(pdf_docs):
    """Concatenate the text of every page of every uploaded PDF."""
    text = ''
    for pdf in pdf_docs:
        reader = PdfReader(pdf)
        for page in reader.pages:
            # extract_text() can return None for pages without a text layer
            text += page.extract_text() or ''
    return text

def get_text_chunks(raw_text):
    """Split the raw text into 1000-character chunks with 200 characters of overlap."""
    text_splitter = CharacterTextSplitter(
        separator='\n',
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len
    )
    chunks = text_splitter.split_text(raw_text)
    return chunks

def get_vector_store(text_chunks):
    """Embed the chunks and index them in an in-memory FAISS vector store."""
    # embeddings = OpenAIEmbeddings()  # paid-API alternative
    # instructor-xl runs locally; it needs the InstructorEmbedding and
    # sentence-transformers packages and is slow without a GPU
    embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    vector_store = FAISS.from_texts(text_chunks, embeddings)
    return vector_store

def get_conversation_chain(vectorstore):
    """Build a retrieval chain that answers from the vector store and remembers the chat."""
    llm = ChatOpenAI(temperature=0.2)
    # llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature": 0.2, "max_length": 512})
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        # to limit retrieved chunks: vectorstore.as_retriever(search_kwargs={"k": 1})
        memory=memory,
    )
    return conversation_chain

def handle_user_question(user_question):
    """Run the question through the chain and render the full chat history."""
    if st.session_state.conversation is None:
        st.warning('Please upload and process your PDFs first.')
        return
    response = st.session_state.conversation({"question": user_question})
    st.session_state.chat_history = response['chat_history']

    # the history alternates user / assistant messages
    for i, message in enumerate(st.session_state.chat_history):
        template = user_template if i % 2 == 0 else bot_template
        st.write(template.replace("{{MSG}}", message.content), unsafe_allow_html=True)

def main():
    load_dotenv()
    st.set_page_config(page_title='Chat with your PDFs', page_icon='📂', layout='wide')
    st.header('Chat with multiple PDFs :books:')
    st.write(css, unsafe_allow_html=True)

    if 'conversation' not in st.session_state:
        st.session_state.conversation = None
    if 'chat_history' not in st.session_state:
        st.session_state.chat_history = None

    with st.sidebar:
        st.subheader('Documents')
        pdf_docs = st.file_uploader('Upload your PDFs here and click on Process', accept_multiple_files=True)

        if st.button('Process'):
            with st.spinner('Processing...'):
                raw_text = get_pdf_text(pdf_docs)            # extract pdf text
                text_chunks = get_text_chunks(raw_text)      # split into chunks
                vectorstore = get_vector_store(text_chunks)  # embed and index
                # keep the chain in session state so it survives reruns
                st.session_state.conversation = get_conversation_chain(vectorstore)

    user_question = st.text_input('Ask a question about your PDFs')
    if user_question:
        handle_user_question(user_question)

if __name__ == '__main__':
    main()
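
# To run locally (assuming this file is saved as app.py):
#   pip install streamlit python-dotenv PyPDF2 langchain faiss-cpu openai
#   pip install InstructorEmbedding sentence-transformers  # for the instructor embeddings
#   streamlit run app.py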