import os

import PyPDF2
import gradio as gr
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.llms import LlamaCpp
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from sentence_transformers import SentenceTransformer, util
from ctransformers import AutoModelForCausalLM

# Customized file paths
pdf_files = ["CV_Vidhi_Parikh.pdf"]

# Function to extract documents from PDF files
def extract_documents_from_pdf(pdf_files):
    documents = []
    metadata = []
    content = []
    for pdf in pdf_files:
        pdf_reader = PyPDF2.PdfReader(pdf)
        for index, page in enumerate(pdf_reader.pages):
            document_page = {
                'title': f"{pdf} page {index + 1}",
                'content': page.extract_text(),
            }
            documents.append(document_page)
    for doc in documents:
        content.append(doc["content"])
        metadata.append({
            "title": doc["title"]
        })
    print("Documents extracted from PDF files.")
    return content, metadata

# Function to split documents into text chunks
def split_documents_into_chunks(content, metadata):
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=512,
        chunk_overlap=256,
    )
    split_documents = text_splitter.create_documents(content, metadatas=metadata)
    print(f"Documents split into {len(split_documents)} passages.")
    return split_documents

# Function to ingest split documents into the vector database
def ingest_into_vector_database(split_documents):
    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    database = FAISS.from_documents(split_documents, embeddings)
    DB_PATH = 'vectorstore/vector_database'
    database.save_local(DB_PATH)
    return database
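
# A minimal sketch (assuming the same embedding model as above) of how the index saved by
# ingest_into_vector_database could be reloaded on a later run instead of re-ingesting the
# PDFs. Note: recent LangChain releases also require allow_dangerous_deserialization=True
# when calling FAISS.load_local on a pickled index.
def load_vector_database(db_path='vectorstore/vector_database'):
    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    return FAISS.load_local(db_path, embeddings)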

# Customized conversation template
template = """[INST]
As an AI, provide accurate and relevant information based on the provided document. Your responses should adhere to the following guidelines:
- Answer the question based on the provided documents.
- Be concise and factual, limited to 50 words and 2-3 sentences. Begin your response without using introductory phrases like yes, no, etc.
- Maintain an ethical and unbiased tone, avoiding harmful or offensive content.
- If the document does not contain relevant information, state "I cannot provide an answer based on the provided document."
- Avoid using confirmatory phrases like "Yes, you are correct" or any similar validation in your responses.
- Do not fabricate information or include questions in your responses.
- Do not prompt to select answers. Do not ask additional questions.
- Cite where in the provided document the information you use is found.
{question}
[/INST]
"""

# Callback manager for handling callbacks
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# Function to create a conversational chain
def create_conversational_chain(database):
    model_name = "llama-7b-hf"
    model_directory = "models"
    # Check whether the model file exists in the specified directory
    model_file = os.path.join(model_directory, model_name)
    if os.path.exists(model_file):
        model_path = model_file
        print("Model file found in the directory. Using the local model file.")
    else:
        model_path = model_name
        print("Model file not found in the directory. Downloading the model from the repository.")
    # Load the model with ctransformers (note: LlamaCpp below loads the weights itself
    # from model_path, so this call mainly checks that the model is available)
    model = AutoModelForCausalLM.from_pretrained(model_path)
    print(model_path)
    llama_llm = LlamaCpp(
        # LlamaCpp expects a local GGUF model file (e.g. llama-2-7b-chat.Q8_0.gguf).
        # Pass n_gpu_layers to offload layers to the GPU, or leave it unset / set it to 0
        # if no GPU acceleration is available on your system.
        model_path=model_path,
        temperature=0.75,
        max_tokens=200,
        top_p=1,
        callback_manager=callback_manager,
        n_ctx=3000,
    )

    retriever = database.as_retriever()
    CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(template)

    memory = ConversationBufferMemory(
        memory_key='chat_history', return_messages=True, output_key='answer')

    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llama_llm,
        retriever=retriever,
        # condense_question_prompt=CONDENSE_QUESTION_PROMPT,
        memory=memory,
        return_source_documents=True,
    )
    print("Conversational Chain created.")
    return conversation_chain
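
# A hedged sketch: the instruction template above is wired in (when uncommented) as the
# question-condensing prompt. If it is meant to shape the final answer instead, it can be
# passed to the answer-generation step via combine_docs_chain_kwargs; that prompt must
# expose {context} and {question} placeholders. Names here are illustrative only.
def create_chain_with_answer_prompt(database, llama_llm):
    answer_prompt = PromptTemplate.from_template(
        "[INST] Answer the question using only the context below.\n"
        "Context:\n{context}\n\nQuestion: {question} [/INST]"
    )
    memory = ConversationBufferMemory(
        memory_key='chat_history', return_messages=True, output_key='answer')
    return ConversationalRetrievalChain.from_llm(
        llm=llama_llm,
        retriever=database.as_retriever(),
        memory=memory,
        combine_docs_chain_kwargs={"prompt": answer_prompt},
        return_source_documents=True,
    )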

# Function to validate the answer against source documents
def validate_answer(response_answer, source_documents):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    similarity_threshold = 0.5  
    source_texts = [doc.page_content for doc in source_documents]

    answer_embedding = model.encode(response_answer, convert_to_tensor=True)
    source_embeddings = model.encode(source_texts, convert_to_tensor=True)

    cosine_scores = util.pytorch_cos_sim(answer_embedding, source_embeddings)

    # Accept the answer only if it is sufficiently similar to at least one source passage
    return any(score.item() > similarity_threshold for score in cosine_scores[0])

# Extract documents from PDF files
content, metadata = extract_documents_from_pdf(pdf_files)

# Split documents into text chunks
split_documents = split_documents_into_chunks(content, metadata)

# Ingest split documents into the vector database
vector_database = ingest_into_vector_database(split_documents)
print("Vector database created.")

# Create the conversation chain
conversation_chain = create_conversational_chain(vector_database)

# Function for the chatbot
def chat_with_bot(input_text):
    user_query = input_text
    response = conversation_chain({"question": user_query})
    print("Response:", response)
    print("Answer:", response['answer'])
    return response['answer']
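
# validate_answer is defined above but never called; a minimal sketch of how it could be
# wired into the chat flow, flagging answers that are not grounded in the retrieved
# passages (the wording of the fallback message is an assumption).
def chat_with_bot_validated(input_text):
    response = conversation_chain({"question": input_text})
    if validate_answer(response['answer'], response['source_documents']):
        return response['answer']
    return "I cannot provide an answer based on the provided document."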

# Create Gradio interface
iface = gr.Interface(
    fn=chat_with_bot,
    inputs=gr.Textbox(lines=2, label="User Input"),
    outputs="text",
    title="Simple Chatbot",
    description="Enter your message and the chatbot will respond."
)

# Launch the interface
iface.launch()