tahirsher committed on
Commit
dd56502
β€’
1 Parent(s): e3d9b11

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -12
app.py CHANGED
@@ -15,18 +15,25 @@ GROQ_API_KEY = os.getenv("GROQ_Api_Key")
15
  tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
16
  embedding_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
17
 
18
- # List of GitHub PDF URLs
19
  PDF_URLS = [
20
- "https://github.com/TahirSher/GenAI_Lawyers_Guide/blob/main/bi-partite.pdf",
21
- "https://github.com/TahirSher/GenAI_Lawyers_Guide/blob/main/bi%20pat%20graphs.pdf",
22
  # Add more document links as needed
23
  ]
24
 
25
- # Fetch and extract text from PDF files hosted on GitHub
26
- def fetch_pdf_text_from_github(urls):
 
 
 
 
 
 
27
  text = ""
28
  for url in urls:
29
- response = requests.get(url)
 
30
  if response.status_code == 200:
31
  pdf_file = BytesIO(response.content)
32
  try:
@@ -61,15 +68,13 @@ def compute_embeddings(text_chunks):
61
  # Create a FAISS vector store with embeddings
62
  @st.cache_resource
63
  def load_or_create_vector_store(text_chunks):
64
- # Compute embeddings for text chunks
65
  embeddings = compute_embeddings(text_chunks)
66
- # Create FAISS vector store
67
  vector_store = FAISS.from_texts(text_chunks, embeddings)
68
  return vector_store
69
 
70
  # Call Groq API for generating summary based on the query and retrieved text
71
  def generate_summary_with_groq(query, retrieved_text):
72
- url = "https://api.groq.com/v1/chat/completions" # Update with actual Groq API endpoint
73
  headers = {
74
  "Authorization": f"Bearer {GROQ_API_KEY}",
75
  "Content-Type": "application/json"
@@ -96,10 +101,10 @@ def user_input(user_question, vector_store):
96
  # Main function to run the Streamlit app
97
  def main():
98
  st.set_page_config(page_title="RAG-based PDF Chat", layout="centered", page_icon="πŸ“„")
99
- st.title("πŸ“„ Query PDF Documents on GitHub")
100
 
101
- # Load documents from GitHub
102
- raw_text = fetch_pdf_text_from_github(PDF_URLS)
103
  text_chunks = get_text_chunks(raw_text)
104
  vector_store = load_or_create_vector_store(text_chunks)
105
 
 
15
  tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
16
  embedding_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
17
 
18
+ # List of Hugging Face PDF URLs
19
  PDF_URLS = [
20
+ "https://huggingface.co/spaces/tahirsher/GenAI_Lawyers_Guide/blob/main/bi-partite.pdf",
21
+ "https://huggingface.co/spaces/tahirsher/GenAI_Lawyers_Guide/blob/main/bi%20pat%20graphs.pdf",
22
  # Add more document links as needed
23
  ]
24
 
25
+ # Helper function to convert Hugging Face blob URLs to direct download URLs
26
+ def get_huggingface_raw_url(url):
27
+ if "huggingface.co" in url and "/blob/" in url:
28
+ return url.replace("/blob/", "/resolve/")
29
+ return url
30
+
31
+ # Fetch and extract text from PDF files hosted on Hugging Face
32
+ def fetch_pdf_text_from_huggingface(urls):
33
  text = ""
34
  for url in urls:
35
+ raw_url = get_huggingface_raw_url(url) # Convert to direct download link
36
+ response = requests.get(raw_url)
37
  if response.status_code == 200:
38
  pdf_file = BytesIO(response.content)
39
  try:
 
68
  # Create a FAISS vector store with embeddings
69
  @st.cache_resource
70
  def load_or_create_vector_store(text_chunks):
 
71
  embeddings = compute_embeddings(text_chunks)
 
72
  vector_store = FAISS.from_texts(text_chunks, embeddings)
73
  return vector_store
74
 
75
  # Call Groq API for generating summary based on the query and retrieved text
76
  def generate_summary_with_groq(query, retrieved_text):
77
+ url = "https://api.groq.com/v1/chat/completions"
78
  headers = {
79
  "Authorization": f"Bearer {GROQ_API_KEY}",
80
  "Content-Type": "application/json"
 
101
  # Main function to run the Streamlit app
102
  def main():
103
  st.set_page_config(page_title="RAG-based PDF Chat", layout="centered", page_icon="πŸ“„")
104
+ st.title("πŸ“„ Query PDF Documents on Hugging Face")
105
 
106
+ # Load documents from Hugging Face
107
+ raw_text = fetch_pdf_text_from_huggingface(PDF_URLS)
108
  text_chunks = get_text_chunks(raw_text)
109
  vector_store = load_or_create_vector_store(text_chunks)
110