FridayMaster committed
Commit 4c4e926
Parent(s): 7dbc572

Update app.py

Files changed (1)
  1. app.py +33 -11
app.py CHANGED
@@ -1,12 +1,14 @@
import pandas as pd
import fitz  # PyMuPDF for PDF extraction
import spacy
- from langchain.chains import ConversationalRetrievalChain  # Ensure this class is available or use an alternative
- from langchain.llms import OpenAI
- from langchain.vectorstores import FAISS
- import torch
+ from nltk.corpus import stopwords
from transformers import AutoTokenizer, AutoModel
+ import torch
import gradio as gr
+ import numpy as np
+ from faiss import IndexFlatL2, normalize_L2
+ from langchain.llms import OpenAI
+ from langchain.chains import ConversationalRetrievalChain

# Load and preprocess PDF text
def extract_text_from_pdf(pdf_path):
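Review note on the import hunk: `from nltk.corpus import stopwords` imports a lazy corpus reader, and the first call to `stopwords.words(...)` raises a `LookupError` unless the corpus has been downloaded. A minimal guard, using only standard NLTK calls, would be:

    import nltk

    # One-time fetch of the stopwords corpus; a no-op if already present.
    nltk.download('stopwords', quiet=True)

    from nltk.corpus import stopwords
    english_stopwords = set(stopwords.words('english'))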
@@ -18,7 +20,8 @@ def extract_text_from_pdf(pdf_path):
    return text

# Extract text from the PDF
- pdf_text = extract_text_from_pdf('Getting_Started_with_Ubuntu_16.04.pdf')  # Replace with your PDF path
+ pdf_path = 'Getting_Started_with_Ubuntu_16.04.pdf'  # Reference to the PDF file in the same directory
+ pdf_text = extract_text_from_pdf(pdf_path)

# Convert the text to a DataFrame
df = pd.DataFrame({'text': [pdf_text]})
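The body of `extract_text_from_pdf` sits outside this hunk; only its `return text` line is visible as context. For orientation, a typical PyMuPDF implementation consistent with that line is sketched below (an assumption about the unchanged code, not a quote of it):

    import fitz  # PyMuPDF

    def extract_text_from_pdf(pdf_path):
        # Concatenate the plain text of every page in the document.
        text = ""
        with fitz.open(pdf_path) as doc:
            for page in doc:
                text += page.get_text()
        return text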
@@ -35,7 +38,7 @@ class CustomEmbeddingModel:
        embeddings = self.model(**inputs).last_hidden_state.mean(dim=1)
        return embeddings[0].numpy()

- embedding_model = CustomEmbeddingModel('distilbert-base-uncased')  # Replace with your model name
+ embedding_model = CustomEmbeddingModel('FridayMaster/fine_tune_embedding')  # Replace with your model name

# Load Spacy model for preprocessing
nlp = spacy.load("en_core_web_sm")
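Only the tail of `CustomEmbeddingModel` is visible here. Given the `AutoTokenizer`/`AutoModel` imports and the mean-pooled `last_hidden_state` in the context lines, the full class plausibly looks like the sketch below. The `torch.no_grad()` block is worth checking for in the actual file: without it, the `.numpy()` call fails on output tensors that carry gradient history.

    import torch
    from transformers import AutoTokenizer, AutoModel

    class CustomEmbeddingModel:
        def __init__(self, model_name):
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.model = AutoModel.from_pretrained(model_name)

        def embed_text(self, text):
            inputs = self.tokenizer(text, return_tensors='pt', truncation=True)
            # no_grad avoids building a graph and allows the .numpy() call.
            with torch.no_grad():
                embeddings = self.model(**inputs).last_hidden_state.mean(dim=1)
            return embeddings[0].numpy()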
@@ -50,15 +53,34 @@ df['text'] = df['text'].apply(preprocess_text)
df['text_embeddings'] = df['text'].apply(lambda x: embedding_model.embed_text(x))

# Create FAISS vector store
- documents = df['text'].tolist()
- embeddings = df['text_embeddings'].tolist()
- vector_store = FAISS.from_documents(documents, embeddings)
+ class SimpleFAISSIndex:
+     def __init__(self, embeddings):
+         self.index = IndexFlatL2(embeddings.shape[1])
+         normalize_L2(embeddings)
+         self.index.add(embeddings)
+
+     def search(self, query_embedding, k=1):
+         normalize_L2(query_embedding)
+         distances, indices = self.index.search(query_embedding, k)
+         return indices[0], distances[0]
+
+ embeddings = np.array(df['text_embeddings'].tolist())
+ vector_store = SimpleFAISSIndex(embeddings)

# Create LangChain model and chain
llm_model = OpenAI('gpt-3.5-turbo')  # You can replace this with a different LLM if desired
- retriever = vector_store.as_retriever()

- # Create a conversational chain
+ class SimpleRetriever:
+     def __init__(self, vector_store, documents):
+         self.vector_store = vector_store
+         self.documents = documents
+
+     def retrieve(self, query):
+         query_embedding = embedding_model.embed_text(query).reshape(1, -1)
+         indices, _ = self.vector_store.search(query_embedding)
+         return [self.documents[idx] for idx in indices]
+
+ retriever = SimpleRetriever(vector_store, df['text'].tolist())
chain = ConversationalRetrievalChain.from_llm(llm_model, retriever=retriever)

# Function to generate a response
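Two review notes on this hunk. First, `faiss.normalize_L2` mutates its argument in place and expects a contiguous float32 matrix; `np.array(df['text_embeddings'].tolist())` is float32 here only because the model outputs float32, so an explicit `astype('float32')` would be more robust. With normalized vectors, `IndexFlatL2` ranks results the same way cosine similarity does, so the metric choice itself is sound. A self-contained round-trip check, with an illustrative dimension:

    import numpy as np
    from faiss import IndexFlatL2, normalize_L2

    embeddings = np.random.rand(3, 768).astype('float32')  # faiss wants float32
    normalize_L2(embeddings)                 # in-place; returns None
    index = IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    query = embeddings[:1].copy()            # shape (1, d), as in SimpleRetriever
    distances, indices = index.search(query, 1)
    assert indices[0][0] == 0                # a vector is its own nearest neighbor

Second, `ConversationalRetrievalChain.from_llm` expects a LangChain `BaseRetriever` (the chain calls `get_relevant_documents`, not `retrieve`), so passing this duck-typed `SimpleRetriever` will likely fail at runtime. Similarly, LangChain's `OpenAI` wrapper takes the model as `model_name='gpt-3.5-turbo'` rather than a positional argument, and gpt-3.5-turbo is normally used through `ChatOpenAI`. Both are worth verifying before merging.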