tahirsher committed on
Commit
65033a4
1 Parent(s): f8e6141

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -36
app.py CHANGED
@@ -6,7 +6,7 @@ from PyPDF2 import PdfReader
6
  from langchain.text_splitter import RecursiveCharacterTextSplitter
7
  from langchain.embeddings import HuggingFaceEmbeddings
8
  from langchain.vectorstores import FAISS
9
- from transformers import pipeline, AutoModel, AutoTokenizer
10
  import torch
11
 
12
  # Set up the page configuration as the first Streamlit command
@@ -15,16 +15,29 @@ st.set_page_config(page_title="RAG-based PDF Chat", layout="centered", page_icon
15
  # Load the summarization pipeline model
16
  @st.cache_resource
17
  def load_summarization_pipeline():
18
- summarizer = pipeline("summarization", model="facebook/bart-large-cnn") # Use a summarization model
19
  return summarizer
20
 
21
  summarizer = load_summarization_pipeline()
22
 
23
- # List of Hugging Face PDF URLs
24
- PDF_URLS = [
25
- "https://huggingface.co/spaces/tahirsher/GenAI_Lawyers_Guide/blob/main/administrator92ada0936848e501425591b4ad0cd417.pdf",
26
- "https://huggingface.co/spaces/tahirsher/GenAI_Lawyers_Guide/blob/main/Pakistan%20Penal%20Code.pdf",
27
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
  # Helper function to convert Hugging Face blob URLs to direct download URLs
30
  def get_huggingface_raw_url(url):
@@ -32,25 +45,28 @@ def get_huggingface_raw_url(url):
32
  return url.replace("/blob/", "/resolve/")
33
  return url
34
 
35
- # Fetch and extract text from PDF files hosted on Hugging Face
36
- def fetch_pdf_text_from_huggingface(urls):
37
- text = ""
38
- for url in urls:
39
- raw_url = get_huggingface_raw_url(url)
40
- response = requests.get(raw_url)
41
- if response.status_code == 200:
42
- pdf_file = BytesIO(response.content)
43
- try:
44
- pdf_reader = PdfReader(pdf_file)
45
- for page in pdf_reader.pages:
46
- page_text = page.extract_text()
47
- if page_text:
48
- text += page_text
49
- except Exception as e:
50
- st.error(f"Failed to read PDF from URL {url}: {e}")
51
- else:
52
- st.error(f"Failed to fetch PDF from URL: {url}")
53
- return text
 
 
 
54
 
55
  # Split text into manageable chunks
56
  @st.cache_data
@@ -70,14 +86,9 @@ def load_or_create_vector_store(text_chunks):
70
 
71
  # Generate summary based on the retrieved text
72
  def generate_summary_with_huggingface(query, retrieved_text):
73
- # Concatenate query and retrieved text for summarization
74
  summarization_input = f"{query}\n\nRelated information:\n{retrieved_text}"
75
-
76
- # Truncate input to fit within the model’s token length limit (approximately 1024 tokens)
77
  max_input_length = 1024
78
  summarization_input = summarization_input[:max_input_length]
79
-
80
- # Generate the summary
81
  summary = summarizer(summarization_input, max_length=500, min_length=50, do_sample=False)
82
  return summary[0]["summary_text"]
83
 
@@ -90,13 +101,10 @@ def user_input(user_question, vector_store):
90
  # Main function to run the Streamlit app
91
  def main():
92
  st.title("📄 Gen AI Lawyers Guide")
93
-
94
- # Load documents from Hugging Face
95
- raw_text = fetch_pdf_text_from_huggingface(PDF_URLS)
96
  text_chunks = get_text_chunks(raw_text)
97
  vector_store = load_or_create_vector_store(text_chunks)
98
 
99
- # User question input
100
  user_question = st.text_input("Ask a Question:", placeholder="Type your question here...")
101
 
102
  if st.button("Get Response"):
@@ -108,4 +116,4 @@ def main():
108
  st.markdown(f"**🤖 AI:** {answer}")
109
 
110
  if __name__ == "__main__":
111
- main()
 
6
  from langchain.text_splitter import RecursiveCharacterTextSplitter
7
  from langchain.embeddings import HuggingFaceEmbeddings
8
  from langchain.vectorstores import FAISS
9
+ from transformers import pipeline
10
  import torch
11
 
12
  # Set up the page configuration as the first Streamlit command
 
15
  # Load the summarization pipeline model
16
  @st.cache_resource
17
  def load_summarization_pipeline():
18
+ summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
19
  return summarizer
20
 
21
  summarizer = load_summarization_pipeline()
22
 
23
+ # Dictionary of Hugging Face PDF URLs grouped by folders
24
# Source URLs on the Hugging Face Space, grouped by folder name.
# Each key is a display label; each value is the list of URLs fetched for it.
# NOTE(review): only the first group holds direct `/blob/` PDF links. The other
# groups point at `/tree/` directory pages — presumably these must be expanded
# to individual PDF file URLs before they can be parsed; confirm with the loader.
PDF_FOLDERS = {
    "PPC and Administration": [
        "https://huggingface.co/spaces/tahirsher/GenAI_Lawyers_Guide/blob/main/administrator92ada0936848e501425591b4ad0cd417.pdf",
        "https://huggingface.co/spaces/tahirsher/GenAI_Lawyers_Guide/blob/main/Pakistan%20Penal%20Code.pdf",
    ],
    "IHC": [
        "https://huggingface.co/spaces/tahirsher/GenAI_Lawyers_Guide/tree/main/IHC",
    ],
    "LHC": [
        "https://huggingface.co/spaces/tahirsher/GenAI_Lawyers_Guide/tree/main/LHC",
    ],
    "Lahore High Court Rules and Orders": [
        "https://huggingface.co/spaces/tahirsher/GenAI_Lawyers_Guide/tree/main/Lahore%20High%20Court%20Rules%20and%20Orders",
    ],
    "PHC": [
        "https://huggingface.co/spaces/tahirsher/GenAI_Lawyers_Guide/tree/main/PHC",
    ],
    "SC": [
        "https://huggingface.co/spaces/tahirsher/GenAI_Lawyers_Guide/tree/main/SC",
    ],
}
41
 
42
  # Helper function to convert Hugging Face blob URLs to direct download URLs
43
  def get_huggingface_raw_url(url):
 
45
  return url.replace("/blob/", "/resolve/")
46
  return url
47
 
48
+ # Fetch and extract text from all PDFs in specified folders
49
def fetch_pdf_text_from_folders(pdf_folders, timeout=30):
    """Download every listed PDF and return the extracted text as one string.

    Args:
        pdf_folders: Mapping of folder name to an iterable of URLs.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        The concatenation, in mapping order, of one section per folder. Each
        section starts with a ``"\\n[Folder: <name>]\\n"`` header followed by
        the text of every page that yielded any text.

    Failures (network error, non-200 status, unreadable PDF) are reported via
    ``st.error`` and the offending URL is skipped; nothing is raised for them.
    """
    sections = []
    for folder_name, urls in pdf_folders.items():
        # Collect pieces in a list and join once instead of repeated `+=`.
        parts = [f"\n[Folder: {folder_name}]\n"]
        for url in urls:
            raw_url = get_huggingface_raw_url(url)
            try:
                # A timeout keeps one unreachable URL from hanging the app.
                response = requests.get(raw_url, timeout=timeout)
            except requests.RequestException as e:
                st.error(f"Failed to fetch PDF from URL: {url} ({e})")
                continue
            if response.status_code != 200:
                st.error(f"Failed to fetch PDF from URL: {url}")
                continue
            try:
                pdf_reader = PdfReader(BytesIO(response.content))
                for page in pdf_reader.pages:
                    page_text = page.extract_text()
                    if page_text:
                        parts.append(page_text)
            except Exception as e:
                # Best-effort: pages extracted before the failure are kept.
                st.error(f"Failed to read PDF from URL {url}: {e}")
        sections.append("".join(parts))
    return "".join(sections)
70
 
71
  # Split text into manageable chunks
72
  @st.cache_data
 
86
 
87
  # Generate summary based on the retrieved text
88
  def generate_summary_with_huggingface(query, retrieved_text):
 
89
  summarization_input = f"{query}\n\nRelated information:\n{retrieved_text}"
 
 
90
  max_input_length = 1024
91
  summarization_input = summarization_input[:max_input_length]
 
 
92
  summary = summarizer(summarization_input, max_length=500, min_length=50, do_sample=False)
93
  return summary[0]["summary_text"]
94
 
 
101
  # Main function to run the Streamlit app
102
  def main():
103
  st.title("📄 Gen AI Lawyers Guide")
104
+ raw_text = fetch_pdf_text_from_folders(PDF_FOLDERS)
 
 
105
  text_chunks = get_text_chunks(raw_text)
106
  vector_store = load_or_create_vector_store(text_chunks)
107
 
 
108
  user_question = st.text_input("Ask a Question:", placeholder="Type your question here...")
109
 
110
  if st.button("Get Response"):
 
116
  st.markdown(f"**🤖 AI:** {answer}")
117
 
118
  if __name__ == "__main__":
119
+ main()