Spaces:

tahirsher
/

GenAI_Lawyers_Guide

Sleeping

App Files Files Community

GenAI_Lawyers_Guide / app.py

tahirsher

Update app.py

cf97ba5 verified 14 days ago

raw

history blame

4.65 kB

	import os
	import requests
	import streamlit as st
	from io import BytesIO
	from PyPDF2 import PdfReader
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain.embeddings import HuggingFaceEmbeddings
	from langchain.vectorstores import FAISS
	from transformers import pipeline
	import torch

	# Configure the browser tab title, the centered layout and the page icon.
	# This call is intentionally placed ahead of every other Streamlit command
	# in the script (see the original author's note: "as the first Streamlit
	# command"), so keep it above any other `st.*` usage.
	st.set_page_config(page_title="RAG-based PDF Chat", layout="centered", page_icon="📄")

	# Summarization model loader. Decorated with st.cache_resource so the
	# pipeline object is created once and reused by later calls.
	@st.cache_resource
	def load_summarization_pipeline():
	    """Return a `transformers` summarization pipeline for facebook/bart-large-cnn."""
	    return pipeline("summarization", model="facebook/bart-large-cnn")

	# Module-level summarizer instance; generate_summary_with_huggingface()
	# calls this object to produce the final answer text.
	summarizer = load_summarization_pipeline()

	# Hugging Face URLs grouped by folder name. Each key is a folder label and
	# each value is the list of URLs that fetch_pdf_text_from_folders() will try
	# to download and parse as PDFs.
	# NOTE(review): every entry below is a repository "tree" (folder listing)
	# URL, not a link to an individual PDF file — confirm these return PDF
	# bytes; per-file ".../blob/..." or ".../resolve/..." URLs are presumably
	# what is needed here.
	PDF_FOLDERS = {
	    "PPC and Administration": [
	        "https://huggingface.co/spaces/tahirsher/GenAI_Lawyers_Guide/tree/main/PPC%20and%20Administration",
	    ],
	    "IHC": [
	        "https://huggingface.co/spaces/tahirsher/GenAI_Lawyers_Guide/tree/main/IHC",
	    ],
	    "LHC": [
	        "https://huggingface.co/spaces/tahirsher/GenAI_Lawyers_Guide/tree/main/LHC",
	    ],
	    "Lahore High Court Rules and Orders": [
	        "https://huggingface.co/spaces/tahirsher/GenAI_Lawyers_Guide/tree/main/Lahore%20High%20Court%20Rules%20and%20Orders",
	    ],
	    "PHC": [
	        "https://huggingface.co/spaces/tahirsher/GenAI_Lawyers_Guide/tree/main/PHC",
	    ],
	    "SC": [
	        "https://huggingface.co/spaces/tahirsher/GenAI_Lawyers_Guide/tree/main/SC",
	    ],
	}

	# Helper function to convert Hugging Face blob URLs to direct download URLs
	def get_huggingface_raw_url(url):
	    """Return the direct-download form of a Hugging Face "blob" URL.

	    Args:
	        url: Any URL string.

	    Returns:
	        For a URL that contains both "huggingface.co" and a "/blob/" segment,
	        the same URL with the first "/blob/" replaced by "/resolve/". Any
	        other URL (including "tree" folder URLs and the empty string) is
	        returned unchanged.
	    """
	    if "huggingface.co" not in url or "/blob/" not in url:
	        return url
	    # Replace only the first "/blob/" segment: a later "/blob/" would be part
	    # of the file path inside the repository and must stay intact.
	    return url.replace("/blob/", "/resolve/", 1)

	# Fetch and extract text from all PDFs in specified folders
	def fetch_pdf_text_from_folders(pdf_folders):
	    """Download every URL in *pdf_folders* and return the extracted PDF text.

	    Args:
	        pdf_folders: Mapping of folder name to a list of URL strings.

	    Returns:
	        One string. For each folder, in mapping order, it contains a
	        "\n[Folder: <name>]\n" header followed by the text of every page that
	        could be extracted from that folder's URLs. A folder whose downloads
	        all fail still contributes its header.

	    Failures (network error, non-200 status, unreadable PDF) are reported
	    with st.error and the URL is skipped; they never raise to the caller.
	    """
	    # Collect pieces in a list and join once instead of repeated string +=.
	    parts = []
	    for folder_name, urls in pdf_folders.items():
	        parts.append(f"\n[Folder: {folder_name}]\n")
	        for url in urls:
	            raw_url = get_huggingface_raw_url(url)
	            try:
	                # A timeout keeps one unresponsive host from hanging the app.
	                response = requests.get(raw_url, timeout=30)
	            except requests.RequestException as e:
	                st.error(f"Failed to fetch PDF from URL: {url} ({e})")
	                continue
	            if response.status_code != 200:
	                st.error(f"Failed to fetch PDF from URL: {url}")
	                continue
	            try:
	                pdf_reader = PdfReader(BytesIO(response.content))
	                for page in pdf_reader.pages:
	                    page_text = page.extract_text()
	                    if page_text:
	                        parts.append(page_text)
	            except Exception as e:
	                # Deliberately broad: a malformed document is reported and
	                # skipped (best effort); pages already extracted are kept.
	                st.error(f"Failed to read PDF from URL {url}: {e}")
	    return "".join(parts)

	# Chunking step. Decorated with st.cache_data, so the result is cached per
	# input text.
	@st.cache_data
	def get_text_chunks(text):
	    """Split *text* into chunks of up to 10000 characters with 1000 overlap.

	    Returns whatever RecursiveCharacterTextSplitter.split_text produces for
	    the given text.
	    """
	    splitter = RecursiveCharacterTextSplitter(
	        chunk_size=10000,
	        chunk_overlap=1000,
	    )
	    return splitter.split_text(text)

	# Initialize embedding function.
	# Streamlit re-executes this script on every interaction, so the model is
	# built inside an st.cache_resource loader (the same pattern used for the
	# summarization pipeline) instead of being re-instantiated on each run.
	@st.cache_resource
	def _load_embedding_function():
	    """Return the sentence-transformers/all-MiniLM-L6-v2 embedding model wrapper."""
	    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

	# Module-level handle used by load_or_create_vector_store().
	embedding_function = _load_embedding_function()

	# Vector index builder. Decorated with st.cache_resource, so the FAISS
	# store is cached rather than rebuilt for the same chunks.
	@st.cache_resource
	def load_or_create_vector_store(text_chunks):
	    """Embed *text_chunks* with the module-level embedding function and
	    return the resulting FAISS vector store."""
	    return FAISS.from_texts(text_chunks, embedding=embedding_function)

	# Generate summary based on the retrieved text
	def generate_summary_with_huggingface(query, retrieved_text):
	    """Summarize *retrieved_text* in the context of *query*.

	    Args:
	        query: The user's question; placed first so it survives truncation.
	        retrieved_text: Context text returned by the vector search.

	    Returns:
	        The "summary_text" string of the first result produced by the
	        module-level `summarizer` pipeline.
	    """
	    summarization_input = f"{query}\n\nRelated information:\n{retrieved_text}"
	    # Cheap first cut: keep at most 1024 characters of the prompt.
	    max_input_length = 1024
	    summarization_input = summarization_input[:max_input_length]
	    # A character cap does not bound the token count (some text tokenizes to
	    # several tokens per character), so also ask the pipeline to truncate at
	    # the tokenizer level instead of failing on over-long input.
	    summary = summarizer(
	        summarization_input,
	        max_length=500,
	        min_length=50,
	        do_sample=False,
	        truncation=True,
	    )
	    return summary[0]["summary_text"]

	# Answer a single question: retrieve matching chunks, then summarize them.
	def user_input(user_question, vector_store):
	    """Return a generated answer for *user_question*.

	    Runs a similarity search on *vector_store*, joins the page content of
	    the returned documents with single spaces, and passes the question plus
	    that context to generate_summary_with_huggingface().
	    """
	    matches = vector_store.similarity_search(user_question)
	    context_text = " ".join(match.page_content for match in matches)
	    return generate_summary_with_huggingface(user_question, context_text)

	# Main function to run the Streamlit app
	def main():
	    """Render the app: build the vector index, read a question, show the answer.

	    Downloads and chunks the PDFs listed in PDF_FOLDERS, builds the vector
	    store, then waits for the user to submit a question. Empty or
	    whitespace-only questions produce a warning instead of a model call.
	    """
	    st.title("📄 Gen AI Lawyers Guide")
	    # NOTE(review): fetch_pdf_text_from_folders has no caching decorator, so
	    # this download step runs on every execution of main() — consider
	    # caching it.
	    raw_text = fetch_pdf_text_from_folders(PDF_FOLDERS)
	    text_chunks = get_text_chunks(raw_text)
	    vector_store = load_or_create_vector_store(text_chunks)
	    user_question = st.text_input("Ask a Question:", placeholder="Type your question here...")
	    if not st.button("Get Response"):
	        return
	    # Strip first so a whitespace-only entry is treated as "no question"
	    # rather than being sent to the retriever.
	    question = (user_question or "").strip()
	    if not question:
	        st.warning("Please enter a question before submitting.")
	        return
	    with st.spinner("Generating response..."):
	        answer = user_input(question, vector_store)
	    st.markdown(f"🤖 AI: {answer}")

	# Script entry point: start the app only when this file is executed as the
	# main module, not when it is imported.
	if __name__ == "__main__":
	    main()