Documentation
Browse files
- app.py +26 -2
- utils/document_parsing.py +83 -2
- utils/llm_generation.py +46 -3
- utils/retrieval.py +12 -1
app.py
CHANGED
@@ -16,15 +16,39 @@ llm_model_name = "gpt-4o-mini"
llm_generator = None


-def set_api_key(api_key: str):
+def set_api_key(api_key: str) -> None:
+    """
+    Sets the OpenAI API key as an environment variable.
+
+    Parameters:
+        api_key (str): The OpenAI API key to be set.
+
+    Returns:
+        None: This function does not return any value.
+
+    Raises:
+        gr.Error: If the provided API key is empty or consists only of whitespace characters.
+    """
    if api_key.strip():
        os.environ["OPENAI_API_KEY"] = api_key
    else:
        raise gr.Error("Please provide a valid API key")


-def process_inputs(api_key: str, pdf_file, questions: str):
+def process_inputs(api_key: str, pdf_file, questions: str) -> str:
+    """
+    Processes the inputs: sets up the API key, validates the PDF file, parses the PDF into chunks,
+    creates a vector store, builds the LLM generator, validates the questions, retrieves the top
+    similar chunks, generates answers, and returns the output in JSON format.
+
+    Parameters:
+        api_key (str): The OpenAI API key for accessing the LLM model.
+        pdf_file (File): The uploaded PDF file.
+        questions (str): The list of questions, one per line.

+    Returns:
+        str: The output in JSON format containing the answers to the questions.
+    """
    # Set up the API key
    set_api_key(api_key)

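Taken together, the new `process_inputs` docstring describes the whole question-answering flow. Below is a minimal sketch of that flow using the classes documented in the utils diffs that follow; the embedding model name, the exact validation steps, and the question-to-answer JSON layout are assumptions, not code taken from the repository.

```python
import json
import os

from utils.document_parsing import DocParsing
from utils.llm_generation import LLMGeneration
from utils.retrieval import Retrieval

EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # assumed model name


def answer_questions(api_key: str, pdf_path: str, questions: str) -> str:
    # What set_api_key does: expose the key to the OpenAI client via the environment.
    os.environ["OPENAI_API_KEY"] = api_key

    # Parse the PDF into token-bounded chunks.
    parser = DocParsing(pdf_path, model_name=EMBEDDING_MODEL)
    chunks = parser.process_pdf()

    # Embed the chunks and build the FAISS vector store.
    retriever = Retrieval(model_name=EMBEDDING_MODEL)
    retriever.create_vector_store(chunks)

    # Answer each question (one per line) from its top-matching chunks.
    generator = LLMGeneration(llm_model_name="gpt-4o-mini")
    answers = {}
    for question in filter(None, (line.strip() for line in questions.splitlines())):
        docs = retriever.search(question, k=10)
        answers[question] = generator.generate_answer(question, docs)

    # Return the answers as a JSON string (assumed question -> answer layout).
    return json.dumps(answers, indent=2)
```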
utils/document_parsing.py
CHANGED
@@ -10,6 +10,17 @@ class DocParsing:
    chunk_overlap = 50

    def __init__(self, file_path, model_name, max_model_tokens=384):
+        """
+        Initialize the DocParsing class with the provided file path, model name, and maximum model tokens.
+
+        Parameters:
+            file_path (str): The path to the PDF file to be processed.
+            model_name (str): The name of the transformer model to be used for tokenization.
+            max_model_tokens (int, optional): The maximum number of tokens allowed for each chunk. Defaults to 384.
+
+        Returns:
+            None
+        """
        self.file_path = file_path

        # Initialize the tokenizer for all-MiniLM
@@ -18,16 +29,59 @@
        self.max_model_tokens = max_model_tokens

    def process_pdf(self):
+        """
+        Process the PDF file by loading it, splitting it into chunks, and returning the chunks.
+
+        This function first calls the `load_pdf` method to load the PDF file into a list of Document objects.
+        Then, it calls the `create_chunks` method to split each Document into smaller chunks based on the specified
+        chunk size and overlap. Finally, it returns the list of chunks.
+
+        Parameters:
+            None
+
+        Returns:
+            list: A list of Document objects, where each Document represents a chunk of the PDF file.
+        """
        self.load_pdf()
        self.create_chunks()
        return self.chunks

    def load_pdf(self):
+        """
+        Load the PDF file specified by the file_path attribute into a list of Document objects.
+
+        This function uses the PyPDFLoader class from the langchain library to load the PDF file.
+        The loaded Document objects are stored in the self.documents attribute.
+
+        Parameters:
+            None
+
+        Returns:
+            None
+
+        Raises:
+            FileNotFoundError: If the specified file_path does not exist or cannot be accessed.
+        """
        loader = PyPDFLoader(self.file_path)
        self.documents = loader.load()

    def create_chunks(self):
-
+        """
+        Split the loaded PDF documents into smaller chunks based on the specified chunk size and overlap.
+
+        This function iterates through each Document object in the self.documents list and calls the
+        token_split_document method to split the Document into smaller chunks. The resulting chunks are
+        then appended to the self.chunks list.
+
+        Parameters:
+            None
+
+        Returns:
+            None
+
+        Attributes:
+            self.chunks (list): A list of Document objects, where each Document represents a chunk of the PDF file.
+        """
        self.chunks = []
        for doc in self.documents:
            self.chunks.extend(
@@ -37,10 +91,37 @@
            )

    def tokenize(self, text):
+        """
+        Tokenize the input text using the transformer model's tokenizer.
+
+        This method uses the tokenizer provided by the transformer model to encode the input text.
+        The special tokens are not added to the encoded tokens.
+
+        Parameters:
+            text (str): The input text to be tokenized.
+
+        Returns:
+            list: A list of integers representing the tokenized input text.
+        """
        return self.tokenizer.encode(text, add_special_tokens=False)

    def token_split_document(self, doc: Document, chunk_size=350, chunk_overlap=50):
-        """
+        """
+        Split a single Document into multiple chunks based on token length.
+
+        This function tokenizes the input Document's page content, then splits the tokens into smaller chunks
+        of specified size. Overlapping chunks are created by moving the start index forward by the difference
+        between chunk size and overlap. Each chunk is then decoded back into text and a new Document is created
+        with the same metadata but truncated text.
+
+        Parameters:
+            doc (Document): The input Document to be split into chunks.
+            chunk_size (int, optional): The size of each chunk in tokens. Defaults to 350.
+            chunk_overlap (int, optional): The overlap between chunks in tokens. Defaults to 50.
+
+        Returns:
+            list: A list of Document objects, where each Document represents a chunk of the input Document.
+        """
        tokens = self.tokenize(doc.page_content)
        chunks = []
        start = 0
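The `token_split_document` docstring is the one place where the chunking arithmetic matters: each chunk holds at most `chunk_size` tokens and the start index advances by `chunk_size - chunk_overlap`, so consecutive chunks share 50 tokens by default. Here is a standalone sketch of that loop, assuming the all-MiniLM tokenizer and the classic LangChain `Document` import path; it mirrors the documented behaviour rather than reproducing the file's exact code.

```python
from typing import List

from langchain.docstore.document import Document
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")


def token_split(doc: Document, chunk_size: int = 350, chunk_overlap: int = 50) -> List[Document]:
    # Encode the page content without special tokens, as DocParsing.tokenize does.
    tokens = tokenizer.encode(doc.page_content, add_special_tokens=False)

    chunks = []
    start = 0
    while start < len(tokens):
        # Take a window of at most chunk_size tokens and decode it back to text.
        window = tokens[start:start + chunk_size]
        text = tokenizer.decode(window)
        # Copy the metadata so each chunk still points back to its source page.
        chunks.append(Document(page_content=text, metadata=dict(doc.metadata)))
        # Advance by chunk_size - chunk_overlap so neighbouring chunks overlap.
        start += chunk_size - chunk_overlap
    return chunks
```

With the defaults, a 1,000-token page yields windows starting at tokens 0, 300, 600, and 900, each capped at 350 tokens.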
utils/llm_generation.py
CHANGED
@@ -26,6 +26,15 @@ json_schema = {

class LLMGeneration:
    def __init__(self, llm_model_name="gpt-4o-mini"):
+        """
+        Initialize the LLMGeneration class with a specified LLM model.
+
+        Parameters:
+            llm_model_name (str): The name of the LLM model to be used. Default is "gpt-4o-mini".
+
+        Returns:
+            None
+        """
        self.llm_model_name = llm_model_name
        self.llm = ChatOpenAI(
            model_name=self.llm_model_name,
@@ -41,6 +50,18 @@
        self.create_initial_prompt()

    def create_initial_prompt(self):
+        """
+        Prepares the initial prompt for the LLMChain.
+
+        This function creates a system message and few-shot examples for the LLMChain.
+        The system message instructs the assistant to use the provided context to answer the user's question,
+        and to follow a structured JSON format for the answer. It also specifies the conditions for providing an answer.
+
+        The few-shot examples include a context and a question, along with the expected answer in JSON format.
+
+        Returns:
+            None. The initial prompt messages are stored in the `initial_prompt_messages` attribute of the class instance.
+        """
        # System message for the chain
        system_message = SystemMessage(
            content=(
@@ -61,8 +82,21 @@

        self.initial_prompt_messages = [system_message] + few_shots

-    def create_human_message_prompt(self, query: str, docs: List[Document]):
+    def create_human_message_prompt(self, query: str, docs: List[Document]) -> HumanMessagePromptTemplate:
+        """
+        Prepares a human message prompt for the LLMChain.
+
+        This function constructs a human message that includes the provided context and a question.
+        The context is extracted from the list of documents and formatted as per the required structure.
+        The question is included in the human message.

+        Parameters:
+            query (str): The user's question for which an answer needs to be generated.
+            docs (List[Document]): A list of documents retrieved from the search engine. Each document contains a 'page_content' attribute.
+
+        Returns:
+            HumanMessagePromptTemplate: A human message prompt template that can be used with the LLMChain.
+        """
        # Prepare the context from the retrieved chunks
        context = "\n\n".join(
            [f"<context>{doc.page_content}</context>" for doc in docs]
@@ -76,15 +110,24 @@

        return HumanMessagePromptTemplate.from_template(human_message)

-    def generate_answer(self, query: str, docs: List[Document]):
+    def generate_answer(self, query: str, docs: List[Document]) -> str:
+        """
+        Generate an answer to the user's query using the provided documents and the LLM model.
+
+        Parameters:
+            query (str): The user's question for which an answer needs to be generated.
+            docs (List[Document]): A list of documents retrieved from the search engine. Each document contains a 'page_content' attribute.

+        Returns:
+            str: The answer to the user's query. If no answer is found, returns an empty string.
+        """
        # Create the prompt template
        prompt = ChatPromptTemplate.from_messages(
            self.initial_prompt_messages
            + [self.create_human_message_prompt(query, docs)]
        )

-        # Create and run the chain with the
+        # Create and run the chain with the gpt-4o-mini model
        chain = LLMChain(
            llm=self.llm,
            prompt=prompt,
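The docstrings added here describe one pipeline: a fixed system message plus few-shot examples, a per-query human message that wraps every retrieved chunk in `<context>` tags, and an `LLMChain` that runs the assembled `ChatPromptTemplate`. The sketch below shows that assembly with the legacy LangChain classes the file already imports. The system message text is a stub (the real one also carries the few-shot examples and the JSON answer schema), and the sketch passes the context and question as template variables rather than baking them into the template string, so treat it as an illustration of the shape, not the file's exact code.

```python
from typing import List

from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.docstore.document import Document
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.schema import SystemMessage

llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)  # temperature is an assumption

# Stand-in for the system message built in create_initial_prompt.
system_message = SystemMessage(
    content=(
        "Answer the user's question using only the provided context, "
        "and reply in the required JSON format."
    )
)


def generate_answer(query: str, docs: List[Document]) -> str:
    # Wrap every retrieved chunk in <context> tags, as the docstrings describe.
    context = "\n\n".join(f"<context>{doc.page_content}</context>" for doc in docs)

    human_prompt = HumanMessagePromptTemplate.from_template(
        "{context}\n\nQuestion: {question}"
    )
    prompt = ChatPromptTemplate.from_messages([system_message, human_prompt])

    # Run the chain and return the model's (JSON-formatted) answer text.
    chain = LLMChain(llm=llm, prompt=prompt)
    return chain.predict(context=context, question=query)
```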
utils/retrieval.py
CHANGED
@@ -6,6 +6,16 @@ from typing import List

class Retrieval:
    def __init__(self, model_name, max_model_tokens=384):
+        """
+        Initialize Retrieval class with HuggingFace embeddings and FAISS vector store.
+
+        Parameters:
+            model_name (str): The name of the HuggingFace model to use for embeddings.
+            max_model_tokens (int, optional): The maximum number of tokens to use for encoding. Defaults to 384.
+
+        Returns:
+            None
+        """
        self.model_name = model_name
        self.embeddings = HuggingFaceEmbeddings(
            model_name=model_name,
@@ -13,12 +23,13 @@ class Retrieval:
        )

    def create_vector_store(self, chunks: List[Document]):
-
+        """Creates a new vector store for similarity search"""
        self.chunks = chunks
        # Create FAISS vector store
        self.vectorstore = FAISS.from_documents(self.chunks, self.embeddings)

    def search(self, query, k=10) -> List[Document]:
+        """Search top matching documents"""
        # Retrieve top 10 similar chunks
        similar_docs = self.vectorstore.similarity_search(query, k)

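For completeness, the two one-line docstrings above cover the whole retrieval path: embed the chunks once into a FAISS index, then run a similarity search per question. A short usage sketch with the same LangChain and FAISS calls the class wraps; the embedding model name and the sample documents are made up for illustration.

```python
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Assumed embedding model; Retrieval receives the name as a constructor argument.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# In the app these chunks come from DocParsing.process_pdf().
chunks = [
    Document(page_content="The warranty period is 24 months.", metadata={"page": 3}),
    Document(page_content="Support is available on business days.", metadata={"page": 5}),
]

# Build the FAISS vector store once, then query it for each question.
vectorstore = FAISS.from_documents(chunks, embeddings)
top_docs = vectorstore.similarity_search("How long is the warranty?", k=2)

for doc in top_docs:
    print(doc.metadata["page"], doc.page_content)
```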