File size: 5,067 Bytes
51a81da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5f0d8ce
51a81da
f8ac855
51a81da
 
 
 
 
f8ac855
 
51a81da
 
 
 
 
 
f8ac855
51a81da
 
f8ac855
51a81da
 
 
 
 
 
 
 
f8ac855
51a81da
 
f8ac855
51a81da
 
 
 
 
 
f8ac855
51a81da
 
 
 
 
f8ac855
51a81da
 
 
 
 
 
f8ac855
51a81da
37393b4
 
 
 
 
 
 
f8ac855
51a81da
 
 
 
 
 
 
 
 
 
 
f8ac855
51a81da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f8ac855
 
 
51a81da
f8ac855
51a81da
f8ac855
51a81da
 
f8ac855
 
51a81da
 
 
f8ac855
c2c21c4
51a81da
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import functools

import gradio as gr
import PyPDF2
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.llms import LlamaCpp
from langchain_community.vectorstores import FAISS
from sentence_transformers import SentenceTransformer, util

# PDF files to index. Each entry is handed unchanged to PyPDF2.PdfReader in
# extract_documents and is also used as the prefix of every page title.
pdf_files = ["CV_Vidhi_Parikh.pdf"]

def extract_documents(pdf_files):
    """Extract the text of every page of the given PDF files.

    Args:
        pdf_files: Iterable of PDF paths; each one is opened with
            ``PyPDF2.PdfReader`` and formatted into the page titles.

    Returns:
        A ``(content, metadata)`` tuple of two parallel lists. ``content``
        holds the extracted text of each page, in file order then page order.
        ``metadata`` holds one ``{"title": "<pdf> page <n>"}`` dict per page,
        where ``n`` is the 1-based page number.
    """
    content = []
    metadata = []
    for pdf in pdf_files:
        pdf_reader = PyPDF2.PdfReader(pdf)
        # Use the page object that enumerate yields instead of indexing the
        # reader a second time; start=1 gives the 1-based number directly.
        for page_number, page in enumerate(pdf_reader.pages, start=1):
            content.append(page.extract_text())
            metadata.append({"title": f"{pdf} page {page_number}"})
    print("Content and metadata extracted from the documents.")
    return content, metadata

def split_text_chunks(content, metadata, chunk_size=512, chunk_overlap=256):
    """Split page texts into overlapping passages.

    Args:
        content: List of text strings to split.
        metadata: List of metadata dicts, parallel to ``content``; passed to
            the splitter as ``metadatas``.
        chunk_size: Maximum passage size given to
            ``RecursiveCharacterTextSplitter.from_tiktoken_encoder``.
        chunk_overlap: Overlap between consecutive passages, in the same unit
            as ``chunk_size``.

    Returns:
        The list of documents produced by the splitter's
        ``create_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    split_documents = text_splitter.create_documents(content, metadatas=metadata)
    print(f"Documents split into {len(split_documents)} passages.")
    return split_documents

def ingest_into_database(
    split_documents,
    db_path='vectorstore/db_faiss',
    model_name='sentence-transformers/all-MiniLM-L6-v2',
):
    """Embed the passages into a FAISS index and save it to disk.

    Args:
        split_documents: Documents to index, as returned by
            ``split_text_chunks``.
        db_path: Location passed to ``save_local`` for the persisted index.
        model_name: Model name passed to ``HuggingFaceEmbeddings``.

    Returns:
        The FAISS vector store built from ``split_documents``.
    """
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    database = FAISS.from_documents(split_documents, embeddings)
    database.save_local(db_path)
    return database

# Instruction prompt wrapped in [INST] ... [/INST] tags; `{question}` is its
# only placeholder. create_conversation_chain turns this text into a
# PromptTemplate.
template = """[INST]
As an AI, provide accurate and relevant information based on the provided document. Your responses should adhere to the following guidelines:
- Answer the question based on the provided documents.
- Be direct and factual, limited to 50 words and 2-3 sentences. Begin your response without using introductory phrases like yes, no etc.
- Maintain an ethical and unbiased tone, avoiding harmful or offensive content.
- If the document does not contain relevant information, state "I cannot provide an answer based on the provided document."
- Avoid using confirmatory phrases like "Yes, you are correct" or any similar validation in your responses.
- Do not fabricate information or include questions in your responses.
- Do not prompt to select answers. Do not ask additional questions.
- Cite the source of where exactly is the information in the document and mention it in your responses.
{question}
[/INST]
"""

# Shared callback manager handed to LlamaCpp in create_conversation_chain.
# NOTE(review): StreamingStdOutCallbackHandler presumably echoes generated
# tokens to stdout as they are produced - confirm against the LangChain docs.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

def create_conversation_chain(database):
    """Build the conversational retrieval chain on top of the vector store.

    Args:
        database: Vector store exposing ``as_retriever()``.

    Returns:
        A ``ConversationalRetrievalChain`` wired to a local LlamaCpp model,
        the store's retriever and a buffer memory, configured to return the
        source documents alongside the answer.
    """
    llm = LlamaCpp(
        model_path="llama-2-7b-chat.Q8_0.gguf",
        temperature=0.75,
        max_tokens=200,
        top_p=1,
        callback_manager=callback_manager,
        n_ctx=3000,
    )
    doc_retriever = database.as_retriever()

    # NOTE: the prompt is still constructed from the module-level template,
    # but it is not handed to the chain - no condense_question_prompt
    # argument is passed below.
    _condense_question_prompt = PromptTemplate.from_template(template)

    # The memory stores the chain's 'answer' output under 'chat_history'.
    chat_memory = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
        output_key='answer',
    )

    chain_kwargs = {
        "llm": llm,
        "retriever": doc_retriever,
        "memory": chat_memory,
        "return_source_documents": True,
    }
    chain = ConversationalRetrievalChain.from_llm(**chain_kwargs)
    print("Conversational Chain created for the LLM using the vector store.")
    return chain

@functools.lru_cache(maxsize=1)
def _load_similarity_model(model_name='all-MiniLM-L6-v2'):
    """Load the SentenceTransformer model once and reuse it on later calls."""
    return SentenceTransformer(model_name)


def validate_answer(response_answer, source_documents, similarity_threshold=0.5):
    """Check whether an answer is similar enough to any source document.

    Args:
        response_answer: Answer text to validate.
        source_documents: Documents exposing a ``page_content`` attribute;
            may be empty or ``None``.
        similarity_threshold: Cosine-similarity score that at least one
            source must exceed for the answer to count as supported.

    Returns:
        ``True`` if the cosine similarity between the answer embedding and at
        least one source embedding is strictly greater than
        ``similarity_threshold``, otherwise ``False``. With no source
        documents the result is ``False``.
    """
    source_texts = [doc.page_content for doc in source_documents or ()]
    # Nothing to compare against: bail out before loading the model or
    # encoding an empty batch.
    if not source_texts:
        return False

    # The model is cached so repeated validations do not reload it each call.
    model = _load_similarity_model()
    answer_embedding = model.encode(response_answer, convert_to_tensor=True)
    source_embeddings = model.encode(source_texts, convert_to_tensor=True)

    # Row 0 holds the scores of the single answer against every source.
    cosine_scores = util.pytorch_cos_sim(answer_embedding, source_embeddings)
    return bool((cosine_scores[0] > similarity_threshold).any())

# Build the retrieval pipeline once, at module load: extract the page text of
# the PDFs, split it into passages, index them in FAISS (also saved to disk by
# ingest_into_database), then create the chain that chat() queries.
content, metadata = extract_documents(pdf_files)
split_documents = split_text_chunks(content, metadata)
database = ingest_into_database(split_documents)
print("Vector database created.")
# Module-level chain object; chat() reads it as a global.
conversation_chain = create_conversation_chain(database)

def chat(input_text):
    """Answer a user message with the module-level conversation chain.

    Args:
        input_text: The message typed by the user.

    Returns:
        The ``answer`` entry of the chain's response, or a short prompt asking
        for a question when ``input_text`` is ``None``, empty or only
        whitespace.
    """
    # A blank message has nothing to retrieve or answer, so skip the
    # retrieval and generation call entirely.
    if not input_text or not input_text.strip():
        return "Please enter a question."

    response = conversation_chain({"question": input_text})
    # Console logging of the full response dict and of the answer alone.
    print("Answer: ", response)
    print("    Only answer:", response['answer'])
    return response['answer']

# Gradio UI: a two-line text box whose content is sent to chat() and whose
# reply is shown as plain text.
# The former `layout="vertical"` argument is no longer passed: it is a
# deprecated gr.Interface option that newer Gradio releases do not accept.
iface = gr.Interface(
    fn=chat,
    inputs=gr.Textbox(lines=2, label="User Input"),
    outputs="text",
    title="Simple Chatbot",
    description="Enter your message and the chatbot will respond.",
)

iface.launch()