Spaces:

FridayMaster
/

CHATBOT1

Sleeping

App Files Community

FridayMaster committed on Aug 5, 2024

Commit

e4261d6

verified ·

1 Parent(s): 20f2cfe

Update app.py

Browse files

Files changed (1) hide show

app.py +17 -39

app.py CHANGED Viewed

@@ -1,27 +1,25 @@
 import pandas as pd
-import fitz  # PyMuPDF for PDF extraction
 import spacy
-from nltk.corpus import stopwords
-from transformers import AutoTokenizer, AutoModel
 import torch
 import gradio as gr
-import numpy as np
-from faiss import IndexFlatL2, normalize_L2
-from langchain.llms import OpenAI
-from langchain.chains import ConversationalRetrievalChain
 # Load and preprocess PDF text
 def extract_text_from_pdf(pdf_path):
     text = ""
-    with fitz.open(pdf_path) as pdf_document:
-        for page_num in range(len(pdf_document)):
-            page = pdf_document.load_page(page_num)
-            text += page.get_text()
     return text
 # Extract text from the PDF
-pdf_path = 'Getting_Started_with_Ubuntu_16.04.pdf'  # Reference to the PDF file in the same directory
-pdf_text = extract_text_from_pdf(pdf_path)
 # Convert the text to a DataFrame
 df = pd.DataFrame({'text': [pdf_text]})
@@ -38,7 +36,7 @@ class CustomEmbeddingModel:
             embeddings = self.model(**inputs).last_hidden_state.mean(dim=1)
         return embeddings[0].numpy()
-embedding_model = CustomEmbeddingModel('FridayMaster/fine_tune_embedding')  # Replace with your model name
 # Load Spacy model for preprocessing
 nlp = spacy.load("en_core_web_sm")
@@ -53,34 +51,13 @@ df['text'] = df['text'].apply(preprocess_text)
 df['text_embeddings'] = df['text'].apply(lambda x: embedding_model.embed_text(x))
 # Create FAISS vector store
-class SimpleFAISSIndex:
-    def __init__(self, embeddings):
-        self.index = IndexFlatL2(embeddings.shape[1])
-        normalize_L2(embeddings)
-        self.index.add(embeddings)
-    def search(self, query_embedding, k=1):
-        normalize_L2(query_embedding)
-        distances, indices = self.index.search(query_embedding, k)
-        return indices[0], distances[0]
-embeddings = np.array(df['text_embeddings'].tolist())
-vector_store = SimpleFAISSIndex(embeddings)
 # Create LangChain model and chain
 llm_model = OpenAI('gpt-3.5-turbo')  # You can replace this with a different LLM if desired
-class SimpleRetriever:
-    def __init__(self, vector_store, documents):
-        self.vector_store = vector_store
-        self.documents = documents
-    def retrieve(self, query):
-        query_embedding = embedding_model.embed_text(query).reshape(1, -1)
-        indices, _ = self.vector_store.search(query_embedding)
-        return [self.documents[idx] for idx in indices]
-retriever = SimpleRetriever(vector_store, df['text'].tolist())
 chain = ConversationalRetrievalChain.from_llm(llm_model, retriever=retriever)
 # Function to generate a response
@@ -102,3 +79,4 @@ if __name__ == "__main__":
     iface.launch()

 import pandas as pd
+import PyPDF2  # For PDF extraction
 import spacy
+from langchain.chains import ConversationalRetrievalChain
+from langchain.llms import OpenAI
+from langchain.vectorstores import FAISS
 import torch
+from transformers import AutoTokenizer, AutoModel
 import gradio as gr
 # Load and preprocess PDF text
 def extract_text_from_pdf(pdf_path):
     text = ""
+    with open(pdf_path, 'rb') as pdf_file:
+        pdf_reader = PyPDF2.PdfReader(pdf_file)
+        for page_num in range(len(pdf_reader.pages)):
+            page = pdf_reader.pages[page_num]
+            text += page.extract_text()
     return text
 # Extract text from the PDF
+pdf_text = extract_text_from_pdf('Getting_Started_with_Ubuntu_16.04.pdf')  # Replace with your PDF path
 # Convert the text to a DataFrame
 df = pd.DataFrame({'text': [pdf_text]})
             embeddings = self.model(**inputs).last_hidden_state.mean(dim=1)
         return embeddings[0].numpy()
+embedding_model = CustomEmbeddingModel('distilbert-base-uncased')  # Replace with your model name
 # Load Spacy model for preprocessing
 nlp = spacy.load("en_core_web_sm")
 df['text_embeddings'] = df['text'].apply(lambda x: embedding_model.embed_text(x))
 # Create FAISS vector store
+documents = df['text'].tolist()
+embeddings = df['text_embeddings'].tolist()
+vector_store = FAISS.from_documents(documents, embeddings)
 # Create LangChain model and chain
 llm_model = OpenAI('gpt-3.5-turbo')  # You can replace this with a different LLM if desired
+retriever = vector_store.as_retriever()
 chain = ConversationalRetrievalChain.from_llm(llm_model, retriever=retriever)
 # Function to generate a response
     iface.launch()