Spaces:
Sleeping
Sleeping
File size: 5,063 Bytes
23f71e4 63dccee 5cff118 23f71e4 e0c369a 23f71e4 a394a1a 5cff118 a394a1a 23f71e4 e732ed3 f53001a e732ed3 f53001a e732ed3 23f71e4 5cff118 23f71e4 2c39139 23f71e4 5cff118 23f71e4 5cff118 23f71e4 4f43089 23f71e4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 |
import yaml
import fitz
import torch
import gradio as gr
import weaviate
import os
from PIL import Image
from config import MODEL_CONFIG
from langchain_openai import OpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_weaviate.vectorstores import WeaviateVectorStore
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import ConversationalRetrievalChain
from langchain_community.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
# Re-export the API credentials into this process's environment.
# os.environ only accepts str values, so assigning the result of os.getenv()
# directly raises TypeError at import time whenever a variable is unset.
# Copy a value only when one is actually present; a missing key then surfaces
# later, from the client that needs it, instead of crashing the import.
for _env_name in ("HUGGINGFACE_API_TOKEN", "OPENAI_API_KEY"):
    _env_value = os.getenv(_env_name)
    if _env_value is not None:
        os.environ[_env_name] = _env_value
class PDFChatBot:
    """
    Chat with an uploaded PDF through a conversational retrieval chain.

    The PDF is loaded with PyPDFLoader, split into chunks, embedded with
    OpenAI embeddings, stored in a Weaviate vector store and queried through
    a LangChain ConversationalRetrievalChain.
    """

    def __init__(self):
        """
        Initialize the PDFChatBot instance.
        """
        self.processed = False  # True once process_file() has run
        self.page = 0  # Index of the PDF page shown by render_file()
        self.chat_history = []  # (query, answer) pairs passed to the chain
        # Initialize other attributes to None
        self.prompt = None
        self.documents = None
        self.embeddings = None
        self.vectordb = None
        self.tokenizer = None
        self.model = None
        self.pipeline = None
        self.chain = None

    def add_text(self, history: list, text: str) -> list:
        """
        Add user-entered text to the chat history.

        Parameters:
            history (list): List of chat history entries; mutated in place.
            text (str): User-entered text.

        Returns:
            list: The same history list with a new (text, '') entry appended.

        Raises:
            gr.Error: If text is empty.
        """
        if not text:
            raise gr.Error('Enter text')
        history.append((text, ''))
        return history

    def create_prompt_template(self):
        """
        Create the prompt template used to condense the chat history and the
        follow-up question, and store it in self.prompt.
        """
        # Built from adjacent literals so the runtime text stays free of
        # source-code indentation.
        template = (
            "\n"
            "You are an AI Assistant that help user answer question from user.\n"
            "Combine the chat history and follow up question into a standalone question.\n"
            "Chat History: {chat_history}\n"
            "Question: {question}\n"
            "Answer: "
        )
        self.prompt = PromptTemplate.from_template(template)

    def load_embeddings(self):
        """
        Create the OpenAI embeddings client, using the model name configured
        in MODEL_CONFIG.MODEL_EMBEDDINGS, and store it in self.embeddings.
        """
        self.embeddings = OpenAIEmbeddings(model=MODEL_CONFIG.MODEL_EMBEDDINGS)

    def load_vectordb(self):
        """
        Build the Weaviate vector store from the loaded documents and the
        embeddings, and store it in self.vectordb.

        The connection settings are read from the WEAVIATE_URL and
        WEAVIATE_API_KEY environment variables.
        """
        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
        docs = text_splitter.split_documents(self.documents)
        # The client is intentionally left open: the vector store keeps using
        # it for every later retrieval.
        weaviate_client = weaviate.connect_to_wcs(
            cluster_url=os.getenv("WEAVIATE_URL"),
            auth_credentials=weaviate.auth.AuthApiKey(os.getenv("WEAVIATE_API_KEY")),
        )
        self.vectordb = WeaviateVectorStore.from_documents(
            docs, self.embeddings, client=weaviate_client
        )

    def create_chain(self):
        """
        Create the Conversational Retrieval Chain and store it in self.chain.

        The retriever returns a single document per query (k=1) and the
        chain is configured to return its source documents.
        """
        llm = OpenAI(openai_api_key=os.getenv("OPENAI_API_KEY"))
        self.chain = ConversationalRetrievalChain.from_llm(
            llm,
            chain_type="stuff",
            retriever=self.vectordb.as_retriever(search_kwargs={"k": 1}),
            condense_question_prompt=self.prompt,
            return_source_documents=True,
        )

    def process_file(self, file):
        """
        Process the uploaded PDF file and initialize the necessary
        components: prompt template, documents, embeddings, vector store and
        retrieval chain.

        Parameters:
            file: The uploaded PDF file; only its `name` attribute (the path
                handed to PyPDFLoader) is read.
        """
        self.create_prompt_template()
        self.documents = PyPDFLoader(file.name).load()
        self.load_embeddings()
        self.load_vectordb()
        self.create_chain()

    def generate_response(self, history: list, query: str, file) -> tuple:
        """
        Generate a response based on user query and chat history.

        The file is processed only on the first call; later calls reuse the
        chain that was built then.

        Parameters:
            history (list): List of chat history entries; the last entry is
                the pending turn that receives the answer.
            query (str): User's query.
            file: The uploaded PDF file.

        Returns:
            tuple: Updated chat history and a single space.

        Raises:
            gr.Error: If query is empty or no file was provided.
        """
        if not query:
            raise gr.Error(message='Submit a question')
        if not file:
            raise gr.Error(message='Upload a PDF')
        if not self.processed:
            self.process_file(file)
            self.processed = True
        result = self.chain(
            {"question": query, 'chat_history': self.chat_history},
            return_only_outputs=True,
        )
        answer = result["answer"]
        self.chat_history.append((query, answer))
        self.page = 0
        if history:
            # add_text() stores turns as immutable tuples, so rebuild the last
            # turn as a list instead of assigning into it. The answer is
            # appended in one step rather than character by character.
            last_turn = list(history[-1])
            last_turn[-1] = (last_turn[-1] or '') + answer
            history[-1] = last_turn
        else:
            # No pending turn to complete: record the whole exchange.
            history.append([query, answer])
        return history, " "

    def render_file(self, file):
        """
        Render the page at index self.page of a PDF file as an image.

        Parameters:
            file: The PDF file; only its `name` attribute is read.

        Returns:
            PIL.Image.Image: The rendered page as an image.
        """
        # The context manager closes the document even if rendering fails.
        with fitz.open(file.name) as doc:
            page = doc[self.page]
            # PDF units are 1/72 inch, so this scale renders at 300 DPI.
            pix = page.get_pixmap(matrix=fitz.Matrix(300 / 72, 300 / 72))
            image = Image.frombytes('RGB', (pix.width, pix.height), pix.samples)
        return image