vidhiparikh committed
Commit 51a81da
Parent: d1418c2

Upload app.py

The code implements a conversational AI chatbot that combines Langchain's document processing, text splitting, and retrieval capabilities with the LlamaCpp language model. It uses PyPDF2 for PDF parsing and a HuggingFace SentenceTransformer model for embeddings, and exposes the chatbot through a Gradio interface, so users can ask natural-language questions and receive answers grounded in passages retrieved from the documents.
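For orientation, the pipeline in app.py reduces to a few calls. The sketch below reuses the functions defined in the file; the query string is a hypothetical example:

content, metadata = extract_documents_from_pdf(pdf_files)
split_documents = split_documents_into_chunks(content, metadata)
vector_database = ingest_into_vector_database(split_documents)
conversation_chain = create_conversational_chain(vector_database)
# 'answer' is the generated reply; 'source_documents' are the retrieved passages
response = conversation_chain({"question": "What is the candidate's most recent role?"})
print(response['answer'])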

Files changed (1)
app.py +146 -0
app.py ADDED
@@ -0,0 +1,146 @@
+ import PyPDF2
+ import gradio as gr
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.llms import LlamaCpp
+
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain.memory import ConversationBufferMemory
+ from langchain.chains import ConversationalRetrievalChain
+ from langchain_community.vectorstores import FAISS
+
+ from langchain.prompts import PromptTemplate
+ from sentence_transformers import SentenceTransformer, util
+ from langchain.callbacks.manager import CallbackManager
+ from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
+
+ # Customized file paths
+ pdf_files = ["C:/Users/vidhi/OneDrive/Desktop/CVs/final/CV_Vidhi_Parikh.pdf"]
+
+ # Function to extract documents from PDF files
+ def extract_documents_from_pdf(pdf_files):
+     documents = []
+     metadata = []
+     content = []
+     for pdf in pdf_files:
+         pdf_reader = PyPDF2.PdfReader(pdf)
+         for index, page in enumerate(pdf_reader.pages):
+             document_page = {'title': pdf + " page " + str(index + 1), 'content': page.extract_text()}
+             documents.append(document_page)
+     for doc in documents:
+         content.append(doc["content"])
+         metadata.append({
+             "title": doc["title"]
+         })
+     print("Documents extracted from PDF files.")
+     return content, metadata
+
+ # Function to split documents into text chunks
+ def split_documents_into_chunks(content, metadata):
+     # Token-based splitting (requires tiktoken); the 256-token overlap
+     # preserves context across chunk boundaries
+     text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
+         chunk_size=512,
+         chunk_overlap=256,
+     )
+     split_documents = text_splitter.create_documents(content, metadatas=metadata)
+     print(f"Documents split into {len(split_documents)} passages.")
+     return split_documents
+
+ # Function to ingest split documents into the vector database
+ def ingest_into_vector_database(split_documents):
+     embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
+     database = FAISS.from_documents(split_documents, embeddings)
+     DB_PATH = 'vectorstore/vector_database'
+     database.save_local(DB_PATH)
+     return database
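+
+ # Note: this file always rebuilds the index at startup; a later run could
+ # instead reload the persisted store, roughly FAISS.load_local(DB_PATH, embeddings)
+ # (newer langchain_community versions may also require
+ # allow_dangerous_deserialization=True).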
+
+ # Customized conversation template
+ template = """[INST]
+ As an AI, provide accurate and relevant information based on the provided document. Your responses should adhere to the following guidelines:
+ - Answer the question based on the provided documents.
+ - Be concise and factual, limited to 50 words and 2-3 sentences. Begin your response without introductory phrases like yes, no, etc.
+ - Maintain an ethical and unbiased tone, avoiding harmful or offensive content.
+ - If the document does not contain relevant information, state "I cannot provide an answer based on the provided document."
+ - Avoid confirmatory phrases like "Yes, you are correct" or any similar validation in your responses.
+ - Do not fabricate information or include questions in your responses.
+ - Do not prompt the user to select answers and do not ask additional questions.
+ - Cite where in the document the information in your response is found.
+ {question}
+ [/INST]
+ """
+
+ # Callback manager for handling callbacks (streams tokens to stdout)
+ callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
+
+ # Function to create a conversational chain
+ def create_conversational_chain(database):
+     llama_llm = LlamaCpp(
+         model_path="llama-2-7b-chat.Q8_0.gguf",
+         temperature=0.75,
+         max_tokens=200,
+         top_p=1,
+         callback_manager=callback_manager,
+         n_ctx=3000)
+
+     retriever = database.as_retriever()
+     CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(template)
+
+     memory = ConversationBufferMemory(
+         memory_key='chat_history', return_messages=True, output_key='answer')
+
+     conversation_chain = ConversationalRetrievalChain.from_llm(
+         llm=llama_llm,
+         retriever=retriever,
+         # condense_question_prompt=CONDENSE_QUESTION_PROMPT,
+         memory=memory,
+         return_source_documents=True)
+     print("Conversational Chain created.")
+     return conversation_chain
+
+ # Function to validate the answer against source documents
+ def validate_answer(response_answer, source_documents):
+     model = SentenceTransformer('all-MiniLM-L6-v2')
+     similarity_threshold = 0.5
+     source_texts = [doc.page_content for doc in source_documents]
+
+     answer_embedding = model.encode(response_answer, convert_to_tensor=True)
+     source_embeddings = model.encode(source_texts, convert_to_tensor=True)
+
+     cosine_scores = util.pytorch_cos_sim(answer_embedding, source_embeddings)
+
+     # The answer is accepted if it is similar enough to any retrieved passage
+     if any(score.item() > similarity_threshold for score in cosine_scores[0]):
+         return True
+     return False
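+
+ # Note: validate_answer is defined but never called in this file; a
+ # hypothetical wiring inside chat_with_bot (not part of this commit) could be:
+ #     if not validate_answer(response['answer'], response['source_documents']):
+ #         return "I cannot provide an answer based on the provided document."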
+
+ # Extract documents from PDF files
+ content, metadata = extract_documents_from_pdf(pdf_files)
+
+ # Split documents into text chunks
+ split_documents = split_documents_into_chunks(content, metadata)
+
+ # Ingest split documents into the vector database
+ vector_database = ingest_into_vector_database(split_documents)
+ print("Vector database created.")
+
+ # Create the conversation chain
+ conversation_chain = create_conversational_chain(vector_database)
+
+ # Function for the chatbot
+ def chat_with_bot(input_text):
+     user_query = input_text
+     response = conversation_chain({"question": user_query})
+     print("Response:", response)
+     print("Answer:", response['answer'])
+     return response['answer']
+
+ # Create Gradio interface
+ iface = gr.Interface(
+     fn=chat_with_bot,
+     inputs=gr.Textbox(lines=2, label="User Input"),
+     outputs="text",
+     title="Simple Chatbot",
+     description="Enter your message and the chatbot will respond."
+ )
+
+ # Launch the interface
+ iface.launch()
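
To run the app locally, both the GGUF weights (llama-2-7b-chat.Q8_0.gguf, resolved relative to the working directory) and the hard-coded CV path need to exist. The dependency list below is inferred from the imports rather than pinned by the commit, so treat it as a starting point; tiktoken is required because the splitter uses from_tiktoken_encoder:

pip install PyPDF2 gradio langchain langchain-community sentence-transformers faiss-cpu llama-cpp-python tiktoken
python app.py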