FridayMaster committed on
Commit
e4261d6
·
verified ·
1 Parent(s): 20f2cfe

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -39
app.py CHANGED
@@ -1,27 +1,25 @@
1
  import pandas as pd
2
- import fitz # PyMuPDF for PDF extraction
3
  import spacy
4
- from nltk.corpus import stopwords
5
- from transformers import AutoTokenizer, AutoModel
 
6
  import torch
 
7
  import gradio as gr
8
- import numpy as np
9
- from faiss import IndexFlatL2, normalize_L2
10
- from langchain.llms import OpenAI
11
- from langchain.chains import ConversationalRetrievalChain
12
 
13
  # Load and preprocess PDF text
14
  def extract_text_from_pdf(pdf_path):
15
  text = ""
16
- with fitz.open(pdf_path) as pdf_document:
17
- for page_num in range(len(pdf_document)):
18
- page = pdf_document.load_page(page_num)
19
- text += page.get_text()
 
20
  return text
21
 
22
  # Extract text from the PDF
23
- pdf_path = 'Getting_Started_with_Ubuntu_16.04.pdf' # Reference to the PDF file in the same directory
24
- pdf_text = extract_text_from_pdf(pdf_path)
25
 
26
  # Convert the text to a DataFrame
27
  df = pd.DataFrame({'text': [pdf_text]})
@@ -38,7 +36,7 @@ class CustomEmbeddingModel:
38
  embeddings = self.model(**inputs).last_hidden_state.mean(dim=1)
39
  return embeddings[0].numpy()
40
 
41
- embedding_model = CustomEmbeddingModel('FridayMaster/fine_tune_embedding') # Replace with your model name
42
 
43
  # Load Spacy model for preprocessing
44
  nlp = spacy.load("en_core_web_sm")
@@ -53,34 +51,13 @@ df['text'] = df['text'].apply(preprocess_text)
53
  df['text_embeddings'] = df['text'].apply(lambda x: embedding_model.embed_text(x))
54
 
55
  # Create FAISS vector store
56
- class SimpleFAISSIndex:
57
- def __init__(self, embeddings):
58
- self.index = IndexFlatL2(embeddings.shape[1])
59
- normalize_L2(embeddings)
60
- self.index.add(embeddings)
61
-
62
- def search(self, query_embedding, k=1):
63
- normalize_L2(query_embedding)
64
- distances, indices = self.index.search(query_embedding, k)
65
- return indices[0], distances[0]
66
-
67
- embeddings = np.array(df['text_embeddings'].tolist())
68
- vector_store = SimpleFAISSIndex(embeddings)
69
 
70
  # Create LangChain model and chain
71
  llm_model = OpenAI('gpt-3.5-turbo') # You can replace this with a different LLM if desired
72
-
73
- class SimpleRetriever:
74
- def __init__(self, vector_store, documents):
75
- self.vector_store = vector_store
76
- self.documents = documents
77
-
78
- def retrieve(self, query):
79
- query_embedding = embedding_model.embed_text(query).reshape(1, -1)
80
- indices, _ = self.vector_store.search(query_embedding)
81
- return [self.documents[idx] for idx in indices]
82
-
83
- retriever = SimpleRetriever(vector_store, df['text'].tolist())
84
  chain = ConversationalRetrievalChain.from_llm(llm_model, retriever=retriever)
85
 
86
  # Function to generate a response
@@ -102,3 +79,4 @@ if __name__ == "__main__":
102
  iface.launch()
103
 
104
 
 
 
1
  import pandas as pd
2
+ import PyPDF2 # For PDF extraction
3
  import spacy
4
+ from langchain.chains import ConversationalRetrievalChain
5
+ from langchain.llms import OpenAI
6
+ from langchain.vectorstores import FAISS
7
  import torch
8
+ from transformers import AutoTokenizer, AutoModel
9
  import gradio as gr
 
 
 
 
10
 
11
  # Load and preprocess PDF text
12
  def extract_text_from_pdf(pdf_path):
13
  text = ""
14
+ with open(pdf_path, 'rb') as pdf_file:
15
+ pdf_reader = PyPDF2.PdfReader(pdf_file)
16
+ for page_num in range(len(pdf_reader.pages)):
17
+ page = pdf_reader.pages[page_num]
18
+ text += page.extract_text()
19
  return text
20
 
21
  # Extract text from the PDF
22
+ pdf_text = extract_text_from_pdf('Getting_Started_with_Ubuntu_16.04.pdf') # Replace with your PDF path
 
23
 
24
  # Convert the text to a DataFrame
25
  df = pd.DataFrame({'text': [pdf_text]})
 
36
  embeddings = self.model(**inputs).last_hidden_state.mean(dim=1)
37
  return embeddings[0].numpy()
38
 
39
+ embedding_model = CustomEmbeddingModel('distilbert-base-uncased') # Replace with your model name
40
 
41
  # Load Spacy model for preprocessing
42
  nlp = spacy.load("en_core_web_sm")
 
51
  df['text_embeddings'] = df['text'].apply(lambda x: embedding_model.embed_text(x))
52
 
53
  # Create FAISS vector store
54
+ documents = df['text'].tolist()
55
+ embeddings = df['text_embeddings'].tolist()
56
+ vector_store = FAISS.from_documents(documents, embeddings)
 
 
 
 
 
 
 
 
 
 
57
 
58
  # Create LangChain model and chain
59
  llm_model = OpenAI('gpt-3.5-turbo') # You can replace this with a different LLM if desired
60
+ retriever = vector_store.as_retriever()
 
 
 
 
 
 
 
 
 
 
 
61
  chain = ConversationalRetrievalChain.from_llm(llm_model, retriever=retriever)
62
 
63
  # Function to generate a response
 
79
  iface.launch()
80
 
81
 
82
+