agoyal496 committed
Commit 24412da
Parent: d16857c

Documentation

app.py CHANGED
@@ -16,15 +16,39 @@ llm_model_name = "gpt-4o-mini"
 llm_generator = None
 
 
-def set_api_key(api_key: str):
+def set_api_key(api_key: str) -> None:
+    """
+    Sets the OpenAI API key as an environment variable.
+
+    Parameters:
+        api_key (str): The OpenAI API key to be set.
+
+    Returns:
+        None: This function does not return any value.
+
+    Raises:
+        gr.Error: If the provided API key is empty or consists only of whitespace characters.
+    """
     if api_key.strip():
         os.environ["OPENAI_API_KEY"] = api_key
     else:
         raise gr.Error("Please provide a valid API key")
 
 
-def process_inputs(api_key: str, pdf_file, questions: str):
+def process_inputs(api_key: str, pdf_file, questions: str) -> str:
+    """
+    Processes the inputs: sets up the API key, validates and parses the PDF file,
+    creates a vector store and an LLM generator, validates the questions,
+    retrieves the top similar chunks, generates answers, and returns the output in JSON format.
+
+    Parameters:
+        api_key (str): The OpenAI API key for accessing the LLM model.
+        pdf_file (File): The uploaded PDF file.
+        questions (str): The list of questions, one per line.
 
+    Returns:
+        str: The output in JSON format containing the answers to the questions.
+    """
     # Setup Api KEY
     set_api_key(api_key)
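
For reference, the process_inputs docstring describes the app's whole request flow. A minimal sketch of that flow, assuming it runs inside app.py (where set_api_key is defined) and that the utils modules below are importable; the embedding model name and the JSON assembly are illustrative assumptions, not verbatim app code:

import json

from utils.document_parsing import DocParsing
from utils.llm_generation import LLMGeneration
from utils.retrieval import Retrieval

def answer_questions(api_key: str, pdf_path: str, questions_text: str) -> str:
    # Hypothetical glue mirroring the pipeline described in the docstring.
    set_api_key(api_key)                                     # key into the environment
    model_name = "sentence-transformers/all-MiniLM-L6-v2"    # assumed embedding model
    chunks = DocParsing(pdf_path, model_name).process_pdf()  # PDF -> overlapping chunks
    retriever = Retrieval(model_name)
    retriever.create_vector_store(chunks)                    # embed chunks into FAISS
    generator = LLMGeneration("gpt-4o-mini")
    answers = {}
    for question in questions_text.splitlines():             # one question per line
        docs = retriever.search(question, k=10)              # top similar chunks
        answers[question] = generator.generate_answer(question, docs)
    return json.dumps(answers, indent=2)                     # JSON output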
 
utils/document_parsing.py CHANGED
@@ -10,6 +10,17 @@ class DocParsing:
     chunk_overlap = 50
 
     def __init__(self, file_path, model_name, max_model_tokens=384):
+        """
+        Initialize the DocParsing class with the provided file path, model name, and maximum model tokens.
+
+        Parameters:
+            file_path (str): The path to the PDF file to be processed.
+            model_name (str): The name of the transformer model to be used for tokenization.
+            max_model_tokens (int, optional): The maximum number of tokens allowed for each chunk. Defaults to 384.
+
+        Returns:
+            None
+        """
         self.file_path = file_path
 
         # Initialize the tokenizer for all-MiniLM
@@ -18,16 +29,59 @@ class DocParsing:
         self.max_model_tokens = max_model_tokens
 
     def process_pdf(self):
+        """
+        Process the PDF file by loading it, splitting it into chunks, and returning the chunks.
+
+        This function first calls the `load_pdf` method to load the PDF file into a list of Document objects.
+        Then, it calls the `create_chunks` method to split each Document into smaller chunks based on the
+        specified chunk size and overlap. Finally, it returns the list of chunks.
+
+        Parameters:
+            None
+
+        Returns:
+            list: A list of Document objects, where each Document represents a chunk of the PDF file.
+        """
         self.load_pdf()
         self.create_chunks()
         return self.chunks
 
     def load_pdf(self):
+        """
+        Load the PDF file specified by the file_path attribute into a list of Document objects.
+
+        This function uses the PyPDFLoader class from the langchain library to load the PDF file.
+        The loaded Document objects are stored in the self.documents attribute.
+
+        Parameters:
+            None
+
+        Returns:
+            None
+
+        Raises:
+            FileNotFoundError: If the specified file_path does not exist or cannot be accessed.
+        """
         loader = PyPDFLoader(self.file_path)
         self.documents = loader.load()
 
     def create_chunks(self):
-        # Split documents into chunks
+        """
+        Split the loaded PDF documents into smaller chunks based on the specified chunk size and overlap.
+
+        This function iterates through each Document object in the self.documents list and calls the
+        token_split_document method to split the Document into smaller chunks. The resulting chunks are
+        then appended to the self.chunks list.
+
+        Parameters:
+            None
+
+        Returns:
+            None
+
+        Attributes:
+            self.chunks (list): A list of Document objects, where each Document represents a chunk of the PDF file.
+        """
         self.chunks = []
         for doc in self.documents:
             self.chunks.extend(
@@ -37,10 +91,37 @@ class DocParsing:
         )
 
     def tokenize(self, text):
+        """
+        Tokenize the input text using the transformer model's tokenizer.
+
+        This method uses the tokenizer provided by the transformer model to encode the input text.
+        Special tokens are not added to the encoded tokens.
+
+        Parameters:
+            text (str): The input text to be tokenized.
+
+        Returns:
+            list: A list of integers representing the tokenized input text.
+        """
         return self.tokenizer.encode(text, add_special_tokens=False)
 
     def token_split_document(self, doc: Document, chunk_size=350, chunk_overlap=50):
-        """Split a single Document into multiple Documents based on token length."""
+        """
+        Split a single Document into multiple chunks based on token length.
+
+        This function tokenizes the input Document's page content, then splits the tokens into smaller chunks
+        of the specified size. Overlapping chunks are created by moving the start index forward by the difference
+        between chunk size and overlap. Each chunk is then decoded back into text, and a new Document is created
+        with the same metadata and the chunk's text.
+
+        Parameters:
+            doc (Document): The input Document to be split into chunks.
+            chunk_size (int, optional): The size of each chunk in tokens. Defaults to 350.
+            chunk_overlap (int, optional): The overlap between chunks in tokens. Defaults to 50.
+
+        Returns:
+            list: A list of Document objects, where each Document represents a chunk of the input Document.
+        """
         tokens = self.tokenize(doc.page_content)
         chunks = []
         start = 0
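
The key detail in token_split_document is the sliding window: the start index advances by chunk_size minus chunk_overlap, so consecutive chunks share chunk_overlap tokens. A dependency-free sketch of the same loop, using plain words in place of MiniLM token ids (the real method encodes and decodes with self.tokenizer):

def sliding_window_split(tokens, chunk_size=350, chunk_overlap=50):
    # Mirrors the loop in token_split_document: each window starts
    # chunk_size - chunk_overlap positions after the previous one.
    chunks = []
    start = 0
    while start < len(tokens):
        chunks.append(tokens[start:start + chunk_size])
        start += chunk_size - chunk_overlap
    return chunks

words = "the quick brown fox jumps over the lazy dog".split()
print(sliding_window_split(words, chunk_size=4, chunk_overlap=1))
# [['the', 'quick', 'brown', 'fox'], ['fox', 'jumps', 'over', 'the'], ['the', 'lazy', 'dog']]
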
utils/llm_generation.py CHANGED
@@ -26,6 +26,15 @@ json_schema = {
 
 class LLMGeneration:
     def __init__(self, llm_model_name="gpt-4o-mini"):
+        """
+        Initialize the LLMGeneration class with a specified LLM model.
+
+        Parameters:
+            llm_model_name (str): The name of the LLM model to be used. Default is "gpt-4o-mini".
+
+        Returns:
+            None
+        """
         self.llm_model_name = llm_model_name
         self.llm = ChatOpenAI(
             model_name=self.llm_model_name,
@@ -41,6 +50,18 @@ class LLMGeneration:
         self.create_initial_prompt()
 
     def create_initial_prompt(self):
+        """
+        Prepares the initial prompt for the LLMChain.
+
+        This function creates a system message and few-shot examples for the LLMChain.
+        The system message instructs the assistant to use the provided context to answer the user's question
+        and to follow a structured JSON format for the answer. It also specifies the conditions for providing an answer.
+
+        The few-shot examples include a context and a question, along with the expected answer in JSON format.
+
+        Returns:
+            None. The initial prompt messages are stored in the `initial_prompt_messages` attribute of the class instance.
+        """
         # System message for the chain
         system_message = SystemMessage(
             content=(
@@ -61,8 +82,21 @@ class LLMGeneration:
 
         self.initial_prompt_messages = [system_message] + few_shots
 
-    def create_human_message_prompt(self, query: str, docs: List[Document]):
+    def create_human_message_prompt(self, query: str, docs: List[Document]) -> HumanMessagePromptTemplate:
+        """
+        Prepares a human message prompt for the LLMChain.
+
+        This function constructs a human message that includes the provided context and the user's question.
+        The context is extracted from the list of documents and formatted into the required structure,
+        and the question is appended after the context.
 
+        Parameters:
+            query (str): The user's question for which an answer needs to be generated.
+            docs (List[Document]): A list of documents retrieved from the vector store. Each document contains a 'page_content' attribute.
+
+        Returns:
+            HumanMessagePromptTemplate: A human message prompt template that can be used with the LLMChain.
+        """
         # Prepare the context from the retrieved chunks
         context = "\n\n".join(
             [f"<context>{doc.page_content}</context>" for doc in docs]
@@ -76,15 +110,24 @@ class LLMGeneration:
 
         return HumanMessagePromptTemplate.from_template(human_message)
 
-    def generate_answer(self, query: str, docs: List[Document]):
+    def generate_answer(self, query: str, docs: List[Document]) -> str:
+        """
+        Generate an answer to the user's query using the provided documents and the LLM model.
+
+        Parameters:
+            query (str): The user's question for which an answer needs to be generated.
+            docs (List[Document]): A list of documents retrieved from the vector store. Each document contains a 'page_content' attribute.
 
+        Returns:
+            str: The answer to the user's query. If no answer is found, returns an empty string.
+        """
         # Create the prompt template
         prompt = ChatPromptTemplate.from_messages(
             self.initial_prompt_messages
             + [self.create_human_message_prompt(query, docs)]
         )
 
-        # Create and run the chain with the hypothetical gpt-40-mini model
+        # Create and run the chain with the gpt-4o-mini model
         chain = LLMChain(
             llm=self.llm,
             prompt=prompt,
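
The context formatting that create_human_message_prompt documents is visible in the hunk itself: each retrieved chunk is wrapped in <context> tags and the blocks are joined by blank lines, with the question following. A dependency-free sketch of that assembly, using plain strings in place of langchain Document objects and prompt templates (the exact question layout is an assumption):

def build_human_message(query: str, page_contents: list) -> str:
    # Wrap each retrieved chunk in <context> tags, as in create_human_message_prompt
    context = "\n\n".join(f"<context>{text}</context>" for text in page_contents)
    # Append the user's question after the context blocks (assumed layout)
    return f"{context}\n\nQuestion: {query}"

print(build_human_message(
    "What is the refund window?",
    ["Refunds are accepted within 30 days.", "Shipping takes 5 business days."],
))
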
utils/retrieval.py CHANGED
@@ -6,6 +6,16 @@ from typing import List
 
 class Retrieval:
     def __init__(self, model_name, max_model_tokens=384):
+        """
+        Initialize the Retrieval class with HuggingFace embeddings; the FAISS vector store is built later by create_vector_store.
+
+        Parameters:
+            model_name (str): The name of the HuggingFace model to use for embeddings.
+            max_model_tokens (int, optional): The maximum number of tokens to use for encoding. Defaults to 384.
+
+        Returns:
+            None
+        """
         self.model_name = model_name
         self.embeddings = HuggingFaceEmbeddings(
             model_name=model_name,
@@ -13,12 +23,13 @@ class Retrieval:
         )
 
     def create_vector_store(self, chunks: List[Document]):
-
+        """Create a new FAISS vector store over the given chunks for similarity search."""
         self.chunks = chunks
         # Create FAISS vector store
         self.vectorstore = FAISS.from_documents(self.chunks, self.embeddings)
 
     def search(self, query, k=10) -> List[Document]:
+        """Return the top k documents most similar to the query."""
         # Retrieve top 10 similar chunks
         similar_docs = self.vectorstore.similarity_search(query, k)
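
Usage follows directly from the two documented methods: build the store once from the parsed chunks, then query it per question. A sketch assuming langchain, faiss, and sentence-transformers are installed; the Document import path varies across langchain versions, and the model name is an assumption:

from langchain.schema import Document  # import path varies by langchain version
from utils.retrieval import Retrieval

retriever = Retrieval("sentence-transformers/all-MiniLM-L6-v2")  # assumed model name
retriever.create_vector_store([
    Document(page_content="Refunds are accepted within 30 days."),
    Document(page_content="Shipping takes 5 business days."),
])
for doc in retriever.search("How long do I have to return an item?", k=2):
    print(doc.page_content)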