FridayMaster committed
Commit 1842c48 · verified · 1 parent: 90198de

Update app.py

Files changed (1)
  1. app.py +15 -16
app.py CHANGED
@@ -1,12 +1,12 @@
 import pandas as pd
 import fitz  # PyMuPDF for PDF extraction
 import spacy
-from langchain.vectorstores import FAISS
+from langchain.chains import ConversationalRetrievalChain
+from langchain.llms import OpenAI
+from langchain_community.vectorstores import FAISS  # Updated import
 import torch
 from transformers import AutoTokenizer, AutoModel
 import gradio as gr
-from langchain_community.vectorstores import FAISS
-
 
 # Load and preprocess PDF text
 def extract_text_from_pdf(pdf_path):
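The import cleanup above also removes a duplicate: the old file pulled FAISS from both the deprecated `langchain.vectorstores` path and `langchain_community.vectorstores`. Since the LangChain 0.1 split, third-party integrations such as FAISS live in the separate `langchain-community` package, so only the second path still works. A minimal sketch, not part of the commit, of a defensive import for code that may run against either release line:

```python
# Sketch only: prefer the langchain-community location, fall back to
# the legacy path on LangChain releases that predate the 0.1 split.
try:
    from langchain_community.vectorstores import FAISS
except ImportError:
    from langchain.vectorstores import FAISS  # deprecated pre-0.1 path
```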
@@ -18,13 +18,12 @@ def extract_text_from_pdf(pdf_path):
     return text
 
 # Extract text from the PDF
-pdf_text = extract_text_from_pdf('Getting Started with Ubuntu 16.04.pdf')
-
+pdf_text = extract_text_from_pdf('Getting Started with Ubuntu 16.04.pdf')  # Ensure this path is correct
 
 # Convert the text to a DataFrame
 df = pd.DataFrame({'text': [pdf_text]})
 
-# Define your custom embedding model
+# Load the custom embedding model
 class CustomEmbeddingModel:
     def __init__(self, model_name):
         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
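The body of `extract_text_from_pdf` (lines 13–17) is collapsed out of this hunk. A minimal sketch of what a PyMuPDF loop that returns `text` typically looks like, assuming page-by-page plain-text extraction; the committed body may differ:

```python
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    # Concatenate the plain text of every page into one string.
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text()
    return text
```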
@@ -39,7 +38,7 @@ class CustomEmbeddingModel:
 embedding_model = CustomEmbeddingModel('distilbert-base-uncased')  # Replace with your model name
 
 # Load Spacy model for preprocessing
-nlp = spacy.load("en_core_web_sm")
+nlp = spacy.load("en_core_web_sm")  # Ensure the model is installed
 
 def preprocess_text(text):
     doc = nlp(text)
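The hunk also hides the rest of `CustomEmbeddingModel`, including its `embed_text` method (lines 31–38). A sketch of the common recipe such a method follows, assuming mean pooling over DistilBERT's last hidden state; the committed implementation is not visible here:

```python
import torch
from transformers import AutoTokenizer, AutoModel

class CustomEmbeddingModel:
    def __init__(self, model_name):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def embed_text(self, text):
        # Tokenize, run the encoder, and mean-pool the token vectors
        # into a single fixed-size embedding.
        inputs = self.tokenizer(text, return_tensors="pt",
                                truncation=True, max_length=512)
        with torch.no_grad():
            outputs = self.model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).squeeze(0)
```

The "Ensure the model is installed" comment in the hunk refers to the spaCy download step, `python -m spacy download en_core_web_sm`.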
@@ -55,16 +54,15 @@ documents = df['text'].tolist()
 embeddings = df['text_embeddings'].tolist()
 vector_store = FAISS.from_documents(documents, embeddings)
 
+# Create LangChain model and chain
+llm_model = OpenAI('gpt-3.5-turbo')  # You can replace this with a different LLM if desired
+retriever = vector_store.as_retriever()
+chain = ConversationalRetrievalChain.from_llm(llm_model, retriever=retriever)
+
 # Function to generate a response
-def generate_response(query):
-    preprocessed_query = preprocess_text(query)
-    query_embedding = embedding_model.embed_text(preprocessed_query)
-    # Find the closest document in the vector store
-    distances, indices = vector_store.search(query_embedding, k=1)  # k=1 for the closest document
-    if indices:
-        response = documents[indices[0]]
-    else:
-        response = "No relevant information found."
+def generate_response(prompt):
+    result = chain({"query": prompt})
+    response = result["result"]
     return response
 
 # Gradio interface
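Several calls in this hunk are unlikely to run as written against LangChain's API: `FAISS.from_documents` expects `Document` objects plus an `Embeddings` object, not raw strings and precomputed vectors; `OpenAI('gpt-3.5-turbo')` passes a chat model's name positionally to the completion-style wrapper; and `ConversationalRetrievalChain` is keyed on "question"/"chat_history" and returns its text under "answer" ("query"/"result" belong to `RetrievalQA`). A sketch of a working equivalent, assuming the 0.1-era `langchain-community` APIs and reusing the script's `documents` list, with `HuggingFaceEmbeddings` standing in for `CustomEmbeddingModel`:

```python
from langchain.chains import ConversationalRetrievalChain
from langchain_community.chat_models import ChatOpenAI
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Build the index from raw strings plus an Embeddings object.
embedding = HuggingFaceEmbeddings(model_name="distilbert-base-uncased")
vector_store = FAISS.from_texts(documents, embedding)

# gpt-3.5-turbo is a chat model, so use the chat wrapper.
llm_model = ChatOpenAI(model_name="gpt-3.5-turbo")
chain = ConversationalRetrievalChain.from_llm(
    llm_model, retriever=vector_store.as_retriever()
)

def generate_response(prompt):
    # This chain expects "question" and "chat_history" inputs and
    # returns the generated text under "answer".
    result = chain({"question": prompt, "chat_history": []})
    return result["answer"]
```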
@@ -79,3 +77,4 @@ iface = gr.Interface(
 if __name__ == "__main__":
     iface.launch()
 
+
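The `gr.Interface(...)` construction itself (lines 71–78) is collapsed in the diff. A plausible sketch of the wiring around `generate_response`, with hypothetical labels and title since the committed values are not visible:

```python
import gradio as gr

iface = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(label="Question"),  # hypothetical label
    outputs=gr.Textbox(label="Answer"),   # hypothetical label
    title="Ubuntu 16.04 Manual Q&A",      # hypothetical title
)

if __name__ == "__main__":
    iface.launch()
```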