HarshSanghavi committed on
Commit
8f4d57a
1 Parent(s): f527632

code setup for chatbot

Files changed (8)
  1. .gitattributes +3 -0
  2. Document.pdf +3 -0
  3. GPT OUTPUT.docx +3 -0
  4. GPT OUTPUT.pdf +3 -0
  5. app.py +104 -0
  6. app_config.py +19 -0
  7. functions.py +63 -0
  8. requirements.txt +11 -0
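Taken together, the commit wires up a Streamlit RAG chatbot: app.py holds the UI and chat loop, functions.py builds the Chroma vector store, app_config.py carries the system prompt and model constants, and the LFS-tracked documents are the content the bot answers from (GPT OUTPUT.docx feeds the vector store, GPT OUTPUT.pdf backs the in-app viewer).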
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ Document.pdf filter=lfs diff=lfs merge=lfs -text
+ GPT[[:space:]]OUTPUT.docx filter=lfs diff=lfs merge=lfs -text
+ GPT[[:space:]]OUTPUT.pdf filter=lfs diff=lfs merge=lfs -text
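Note: git lfs track escapes literal spaces in tracked filenames as the POSIX character class [[:space:]], which is why the two GPT OUTPUT entries appear in this escaped form.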
Document.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6e3f9050436b8378c016a68fed3dc1496fedfc2e2eb0e993895d234e3aaabb3a
+ size 7575218
GPT OUTPUT.docx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d1a876e10d48280e2e27551bcb4357c4dfd6339b5201c0343c074574372dd6e2
+ size 1386219
GPT OUTPUT.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:78f662edaf06fef24c4aa0953ee776d8fad70eee2a4b433209029d08bc75ff17
+ size 1351401
app.py ADDED
@@ -0,0 +1,104 @@
+ import streamlit as st
+ import random
+ from app_config import SYSTEM_PROMPT, NLP_MODEL_NAME, NUMBER_OF_VECTORS_FOR_RAG, NLP_MODEL_TEMPERATURE, NLP_MODEL_MAX_TOKENS, VECTOR_MAX_TOKENS
+ from functions import get_vectorstore_with_doc_from_pdf, tiktoken_len, get_vectorstore_with_doc_from_word
+ from langchain.memory import ConversationSummaryBufferMemory
+ from langchain_core.messages import SystemMessage, HumanMessage, AIMessage
+ from langchain.chains.summarize import load_summarize_chain
+ from langchain.prompts import PromptTemplate
+ from langchain_groq import ChatGroq
+ from dotenv import load_dotenv
+ from pathlib import Path
+ import os
+ from streamlit_pdf_viewer import pdf_viewer
+ env_path = Path('.') / '.env'
+ load_dotenv(dotenv_path=env_path)
+
+ def response_generator(prompt: str) -> str:
+     """Answer general questions related to Hitachi support using the retrieved context.
+
+     Args:
+         prompt (str): user query
+
+     Returns:
+         str: answer to the query
+     """
+
+     try:
+         retriever = st.session_state.retriever
+         docs = retriever.invoke(prompt)
+         my_context = [doc.page_content for doc in docs]
+         my_context = '\n\n'.join(my_context)
+
+         system_message = SystemMessage(content=SYSTEM_PROMPT.format(context=my_context, previous_message_summary=st.session_state.rag_memory.moving_summary_buffer))
+         chat_messages = (system_message + st.session_state.rag_memory.chat_memory.messages + HumanMessage(content=prompt)).messages
+         print("total tokens: ", tiktoken_len(str(chat_messages)))
+         # print("my_context*********", my_context)
+         response = st.session_state.llm.invoke(chat_messages)
+         return response.content
+
+     except Exception as error:
+         print(error)
+         return "Oops! Something went wrong, please try again."
+
+
+ st.markdown(
+     """
+     <style>
+     .st-emotion-cache-janbn0 {
+         flex-direction: row-reverse;
+         text-align: right;
+     }
+     </style>
+     """,
+     unsafe_allow_html=True,
+ )
+
+ # When the user gives input
+ with st.sidebar:
+     st.header("Hitachi Support Bot")
+     button = st.toggle("View Doc file.")
+
+ if button:
+     pdf_viewer("GPT OUTPUT.pdf")
+ else:
+     print("SYSTEM MESSAGE")
+     if "messages" not in st.session_state:
+         st.session_state.messages = [{"role": "system", "content": SYSTEM_PROMPT}]
+
+     print("SYSTEM MODEL")
+     if "llm" not in st.session_state:
+         st.session_state.llm = ChatGroq(temperature=NLP_MODEL_TEMPERATURE, groq_api_key=str(os.getenv('GROQ_API_KEY')), model_name=NLP_MODEL_NAME)
+
+     print("rag")
+     if "rag_memory" not in st.session_state:
+         st.session_state.rag_memory = ConversationSummaryBufferMemory(llm=st.session_state.llm, max_token_limit=5000)
+
+     print("retrieval")
+     if "retriever" not in st.session_state:
+         # vector_store = get_vectorstore_with_doc_from_pdf('GPT OUTPUT.pdf')
+         vector_store = get_vectorstore_with_doc_from_word('GPT OUTPUT.docx')
+         st.session_state.retriever = vector_store.as_retriever(search_kwargs={"k": NUMBER_OF_VECTORS_FOR_RAG})
+
+     print("container")
+     # Display chat messages from history
+     container = st.container(height=700)
+     for message in st.session_state.messages:
+         if message["role"] != "system":
+             with container.chat_message(message["role"]):
+                 st.write(message["content"])
+
+     if prompt := st.chat_input("Enter your query here... "):
+         with container.chat_message("user"):
+             st.write(prompt)
+         st.session_state.messages.append({"role": "user", "content": prompt})
+
+         with container.chat_message("assistant"):
+             response = response_generator(prompt=prompt)
+             print("******************************************************** Response ********************************************************")
+             print("MY RESPONSE IS:", response)
+             st.write(response)
+
+         print("Response is:", response)
+         st.session_state.rag_memory.save_context({'input': prompt}, {'output': response})
+         st.session_state.messages.append({"role": "assistant", "content": response})
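The app reads its Groq credentials from a .env file in the repo root via python-dotenv. A minimal pre-flight sketch (GROQ_API_KEY is the variable name app.py expects; everything else here is illustrative):

import os
from pathlib import Path

from dotenv import load_dotenv

# app.py expects a .env file in the repo root containing a GROQ_API_KEY=<key> line.
load_dotenv(dotenv_path=Path('.') / '.env')
assert os.getenv('GROQ_API_KEY'), "GROQ_API_KEY missing; add it to .env"

# If the assertion passes, launch the UI with: streamlit run app.py
print("Groq key found.")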
app_config.py ADDED
@@ -0,0 +1,19 @@
+
+ SYSTEM_PROMPT = """
+ 1. You are a support bot for Hitachi Corporation. You must answer user questions using the provided context only.
+ 2. If you cannot answer a question from the context, reply only with: "Thank you for your question! I'm here to help with information related to Hitachi Corporation. The answer to this question is not given in this document. If you have any queries about those topics, feel free to ask. For other questions, I recommend reaching out to the appropriate source." and nothing else.
+ 3. The user may also send greetings such as "thank you", "welcome", "please", "sorry", etc., so handle them appropriately without adding unnecessary information the user did not ask for.
+ 4. Every answer must come from the provided context only; you must not answer outside of the context.
+ previous message summary: {previous_message_summary}
+ context: {context}
+ """
+
+
+ NLP_MODEL_NAME = "llama3-70b-8192"
+ REASONING_MODEL_NAME = "mixtral-8x7b-32768"
+ REASONING_MODEL_TEMPERATURE = 0
+ NLP_MODEL_TEMPERATURE = 0
+ NLP_MODEL_MAX_TOKENS = 5400
+ VECTOR_MAX_TOKENS = 6000
+ VECTORS_TOKEN_OVERLAP_SIZE = 20
+ NUMBER_OF_VECTORS_FOR_RAG = 1
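For reference, response_generator in app.py fills both template fields at query time. A small sketch of that call, assuming app_config.py is importable; the context and summary strings are illustrative placeholders, not real retrieval output:

from app_config import SYSTEM_PROMPT

# Both placeholders are supplied by app.py: context comes from the retriever,
# previous_message_summary from ConversationSummaryBufferMemory.
filled = SYSTEM_PROMPT.format(
    context="Page : 3 ... excerpt retrieved from GPT OUTPUT.docx ...",
    previous_message_summary="The user previously asked about setup steps.",
)
print(filled)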
functions.py ADDED
@@ -0,0 +1,63 @@
+ import tiktoken
+ from langchain_text_splitters import CharacterTextSplitter
+ from langchain_chroma import Chroma
+ from langchain_community.embeddings import HuggingFaceBgeEmbeddings
+ from langchain_community.document_loaders import PyMuPDFLoader, Docx2txtLoader
+ from transformers import pipeline
+ from app_config import VECTOR_MAX_TOKENS, VECTORS_TOKEN_OVERLAP_SIZE
+ from langchain_core.documents import Document
+ from dotenv import load_dotenv
+ from pathlib import Path
+ import os
+ env_path = Path('.') / '.env'
+ load_dotenv(dotenv_path=env_path)
+
+ tokenizer = tiktoken.get_encoding('cl100k_base')
+
+ # create the length function
+ def tiktoken_len(text):
+     tokens = tokenizer.encode(
+         text,
+         disallowed_special=()
+     )
+     return len(tokens)
+
+
+ def get_vectorstore_with_doc_from_pdf(pdf_path):
+     model_name = "BAAI/bge-small-en"
+     model_kwargs = {"device": "cpu"}
+     encode_kwargs = {"normalize_embeddings": True}
+     hf = HuggingFaceBgeEmbeddings(
+         model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
+     )
+
+     loader = PyMuPDFLoader(pdf_path)
+     documents = loader.load()
+     print(len(documents))
+
+     all_splits = [doc.page_content for doc in documents]
+
+     vectorstore = Chroma.from_texts(texts=all_splits, embedding=hf)
+     return vectorstore
+
+ def get_vectorstore_with_doc_from_word(word_path):
+     model_name = "BAAI/bge-small-en"
+     model_kwargs = {"device": "cpu"}
+     encode_kwargs = {"normalize_embeddings": True}
+     hf = HuggingFaceBgeEmbeddings(
+         model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
+     )
+
+     loader = Docx2txtLoader(word_path)
+     documents = loader.load()
+     text_splitter = CharacterTextSplitter(
+         separator="Page :",
+     )
+
+     # all_splits = text_splitter.split_text(data)
+     print(len(documents))
+     print("all splits ........................")
+     all_splits = text_splitter.split_text(documents[0].page_content)
+     print(len(all_splits))
+     vectorstore = Chroma.from_texts(texts=all_splits, embedding=hf)
+     return vectorstore
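A hedged usage sketch of the Word-document path; the query string is illustrative, and the first call downloads the BAAI/bge-small-en embedding model:

from functions import get_vectorstore_with_doc_from_word

# Builds a Chroma store from chunks split on the "Page :" separator.
store = get_vectorstore_with_doc_from_word("GPT OUTPUT.docx")

# Chroma's similarity_search returns the k chunks closest to the query.
docs = store.similarity_search("How do I raise a support ticket?", k=1)
print(docs[0].page_content[:300])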
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ streamlit
+ langchain
+ langchain_groq
+ python-dotenv
+ langchain_community
+ langchain_chroma
+ tiktoken
+ sentence_transformers
+ pymupdf
+ docx2txt
+ streamlit_pdf_viewer
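None of the dependencies are version-pinned, so pip install -r requirements.txt pulls the latest release of each package.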