import os
import re

import fitz  # PyMuPDF
import numpy as np
import streamlit as st
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from ultralytics import YOLO

# Load the trained YOLO layout-detection model
model = YOLO("best.pt")

openai_api_key = os.environ.get("openai_api_key")

# Class indices for figures and tables in the YOLO model
figure_class_index = 4  # class index for figures
table_class_index = 3   # class index for tables

# Module-level cache for the currently loaded PDF (used by qa_pdf)
global_embeddings = None
global_split_contents = None
global_pdf_path = None


def clean_text(text):
    """Collapse runs of whitespace into single spaces."""
    return re.sub(r'\s+', ' ', text).strip()


def remove_references(text):
    """Truncate the text at the first reference-section heading, if any."""
    reference_patterns = [
        r'\bReferences\b', r'\bBibliography\b', r'\bCitations\b',
        r'\bWorks Cited\b', r'\bReference\b',
    ]
    lines = text.split('\n')
    for i, line in enumerate(lines):
        if any(re.search(pattern, line, re.IGNORECASE) for pattern in reference_patterns):
            return '\n'.join(lines[:i])
    return text


def save_uploaded_file(uploaded_file):
    """Persist the uploaded file to disk and return its path."""
    with open(uploaded_file.name, 'wb') as f:
        f.write(uploaded_file.getbuffer())
    return uploaded_file.name
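
# A minimal sketch of remove_references on an illustrative string (not from a
# real paper): everything from the first reference heading onward is cut.
#
#   sample = "Body text.\nReferences\n[1] A. Author, 2020."
#   remove_references(sample)  # -> "Body text."
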
def summarize_pdf(pdf_file_path, num_clusters=10, similarity_threshold=0.6):
    embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=openai_api_key)
    llm = ChatOpenAI(model="gpt-3.5-turbo", api_key=openai_api_key, temperature=0.3)
    prompt = ChatPromptTemplate.from_template(
        """Could you please provide a concise and comprehensive summary of the given Contexts?
The summary should capture the main points and key details of the text while conveying the
author's intended meaning accurately. Please ensure that the summary is well-organized and
easy to read, with clear headings and subheadings to guide the reader through each section.
The length of the summary should be appropriate to capture the main points and key details
of the text, without including unnecessary information or becoming overly long.

Example of summary format:
## Summary:
## Key points:

Contexts: {topic}"""
    )
    output_parser = StrOutputParser()
    chain = prompt | llm | output_parser

    # Load the PDF, strip the reference section, and normalize whitespace
    loader = PyMuPDFLoader(pdf_file_path)
    docs = loader.load()
    full_text = "\n".join(doc.page_content for doc in docs)
    cleaned_full_text = clean_text(remove_references(full_text))

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500, chunk_overlap=0, separators=["\n\n", "\n", ".", " "]
    )
    split_contents = text_splitter.split_text(cleaned_full_text)
    embeddings = embeddings_model.embed_documents(split_contents)

    # Cluster the chunk embeddings and keep the chunk closest to each
    # centroid as a representative passage for the summary prompt
    X = np.array(embeddings)
    kmeans = KMeans(n_clusters=num_clusters, init='k-means++', random_state=0).fit(X)
    closest_point_indices = [
        int(np.argmin(np.linalg.norm(X - center, axis=1)))
        for center in kmeans.cluster_centers_
    ]
    extracted_contents = [split_contents[idx] for idx in closest_point_indices]

    results = chain.invoke({"topic": ' '.join(extracted_contents)})

    # Attach citations: embed each summary sentence and link it back to the
    # most similar extracted chunk when similarity clears the threshold
    summary_sentences = re.split(r'(?<=[.!?])\s+', results)
    sentence_embeddings = embeddings_model.embed_documents(summary_sentences)
    similarity_matrix = cosine_similarity(sentence_embeddings, X[closest_point_indices])

    cited_results = results
    relevant_sources = []
    source_mapping = {}
    sentence_to_source = {}

    for i, sentence in enumerate(summary_sentences):
        if np.max(similarity_matrix[i]) >= similarity_threshold:
            most_similar_idx = np.argmax(similarity_matrix[i])
            if most_similar_idx not in source_mapping:
                source_mapping[most_similar_idx] = len(relevant_sources) + 1
                relevant_sources.append((most_similar_idx, extracted_contents[most_similar_idx]))
            citation_idx = source_mapping[most_similar_idx]
            citation = f"([Source {citation_idx}](#source-{citation_idx}))"
            cited_sentence = re.sub(r'([.!?])$', f" {citation}\\1", sentence)
            sentence_to_source[sentence] = citation_idx
            cited_results = cited_results.replace(sentence, cited_sentence)
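
# Why k-means in summarize_pdf: picking the chunk nearest each centroid yields
# num_clusters spread-out, representative passages rather than the first N
# chunks. A toy sketch of just that selection step (illustrative data only):
#
#   import numpy as np
#   from sklearn.cluster import KMeans
#   X = np.random.rand(20, 8)                        # stand-in "embeddings"
#   km = KMeans(n_clusters=3, random_state=0).fit(X)
#   reps = [int(np.argmin(np.linalg.norm(X - c, axis=1)))
#           for c in km.cluster_centers_]            # one chunk per cluster
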
    # Format the sources for markdown rendering, with anchors matching the
    # (#source-N) citation links
    sources_list = "\n\n## Sources:\n"
    for idx, (original_idx, content) in enumerate(relevant_sources):
        sources_list += f"""
<a id="source-{idx + 1}"></a>**Source {idx + 1}:** {content}
"""
    cited_results += sources_list
    return cited_results


def qa_pdf(pdf_file_path, query, num_clusters=5, similarity_threshold=0.6):
    global global_embeddings, global_split_contents, global_pdf_path

    # Initialize models and embeddings
    embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=openai_api_key)
    llm = ChatOpenAI(model="gpt-3.5-turbo", api_key=openai_api_key, temperature=0.3)
    prompt = ChatPromptTemplate.from_template(
        """Please provide a detailed and accurate answer to the given question based on the
provided contexts. Ensure that the answer is comprehensive and directly addresses the query.
If necessary, include relevant examples or details from the text.

Question: {question}

Contexts: {contexts}"""
    )
    output_parser = StrOutputParser()
    chain = prompt | llm | output_parser

    # Load and embed the PDF once; re-embed only when a different file arrives
    if global_embeddings is None or global_split_contents is None or global_pdf_path != pdf_file_path:
        loader = PyMuPDFLoader(pdf_file_path)
        docs = loader.load()
        full_text = "\n".join(doc.page_content for doc in docs)
        cleaned_full_text = clean_text(remove_references(full_text))
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=800, chunk_overlap=0, separators=["\n\n", "\n", ".", " "]
        )
        global_split_contents = text_splitter.split_text(cleaned_full_text)
        global_embeddings = embeddings_model.embed_documents(global_split_contents)
        global_pdf_path = pdf_file_path

    # Embed the query and select the most similar chunks as context
    query_embedding = embeddings_model.embed_query(query)
    similarity_scores = cosine_similarity([query_embedding], global_embeddings)[0]
    top_indices = np.argsort(similarity_scores)[-num_clusters:]
    relevant_contents = [global_split_contents[i] for i in top_indices]

    # Generate the answer using the LLM chain
    results = chain.invoke({"question": query, "contexts": ' '.join(relevant_contents)})

    # Attach citations: embed each answer sentence and link it to the most
    # similar retrieved chunk when similarity clears the threshold
    answer_sentences = re.split(r'(?<=[.!?])\s+', results)
    sentence_embeddings = embeddings_model.embed_documents(answer_sentences)
    relevant_embeddings = [global_embeddings[i] for i in top_indices]
    similarity_matrix = cosine_similarity(sentence_embeddings, relevant_embeddings)

    cited_results = results
    relevant_sources = []
    source_mapping = {}
    sentence_to_source = {}

    for i, sentence in enumerate(answer_sentences):
        if np.max(similarity_matrix[i]) >= similarity_threshold:
            most_similar_idx = np.argmax(similarity_matrix[i])
            if most_similar_idx not in source_mapping:
                source_mapping[most_similar_idx] = len(relevant_sources) + 1
                relevant_sources.append((most_similar_idx, relevant_contents[most_similar_idx]))
            citation_idx = source_mapping[most_similar_idx]
            citation = f"[Source {citation_idx}]"
            cited_sentence = re.sub(r'([.!?])$', f" {citation}\\1", sentence)
            sentence_to_source[sentence] = citation_idx
            cited_results = cited_results.replace(sentence, cited_sentence)

    # Format the sources for markdown rendering
    sources_list = "\n\n## Sources:\n"
    for idx, (original_idx, content) in enumerate(relevant_sources):
        sources_list += f"""
**Source {idx + 1}:** {content}
"""
    cited_results += sources_list
    return cited_results
def infer_image_and_get_boxes(image, confidence_threshold=0.6):
    """Run the YOLO model and keep figure/table boxes above the confidence threshold."""
    results = model.predict(image)
    boxes = [
        (int(box.xyxy[0][0]), int(box.xyxy[0][1]),
         int(box.xyxy[0][2]), int(box.xyxy[0][3]), int(box.cls[0]))
        for result in results
        for box in result.boxes
        if int(box.cls[0]) in {figure_class_index, table_class_index}
        and box.conf[0] > confidence_threshold
    ]
    return boxes


def crop_images_from_boxes(image, boxes, scale_factor):
    """Crop detected regions from a high-resolution page image.

    Box coordinates come from the low-resolution render, so they are scaled
    up by scale_factor before indexing into the high-resolution image.
    """
    figures = []
    tables = []
    for (x1, y1, x2, y2, cls) in boxes:
        cropped_img = image[
            int(y1 * scale_factor):int(y2 * scale_factor),
            int(x1 * scale_factor):int(x2 * scale_factor),
        ]
        if cls == figure_class_index:
            figures.append(cropped_img)
        elif cls == table_class_index:
            tables.append(cropped_img)
    return figures, tables


def process_pdf(pdf_file_path):
    """Detect figures/tables on cheap low-DPI renders, then crop from high-DPI renders."""
    doc = fitz.open(pdf_file_path)
    all_figures = []
    all_tables = []
    low_dpi = 50
    high_dpi = 300
    scale_factor = high_dpi / low_dpi

    low_res_pixmaps = [page.get_pixmap(dpi=low_dpi) for page in doc]
    for page_num, low_res_pix in enumerate(low_res_pixmaps):
        low_res_img = np.frombuffer(low_res_pix.samples, dtype=np.uint8).reshape(
            low_res_pix.height, low_res_pix.width, 3
        )
        boxes = infer_image_and_get_boxes(low_res_img)
        if boxes:
            # Only pages with detections pay the cost of a 300-DPI render
            high_res_pix = doc[page_num].get_pixmap(dpi=high_dpi)
            high_res_img = np.frombuffer(high_res_pix.samples, dtype=np.uint8).reshape(
                high_res_pix.height, high_res_pix.width, 3
            )
            figures, tables = crop_images_from_boxes(high_res_img, boxes, scale_factor)
            all_figures.extend(figures)
            all_tables.extend(tables)
    return all_figures, all_tables
# Set the page configuration for a modern look
st.set_page_config(page_title="PDF Reading Assistant", page_icon="📄", layout="wide")

# Add some custom CSS for a modern look (styles for .chat-message, .user,
# and .bot go here)
st.markdown("""
""", unsafe_allow_html=True)

# Streamlit interface
st.title("📄 PDF Reading Assistant")
st.markdown("### Extract tables, figures, summaries, and answers from your PDF files easily.")

uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
if uploaded_file:
    file_path = save_uploaded_file(uploaded_file)

    with st.container():
        st.markdown("<h3>Extract Tables and Figures</h3>", unsafe_allow_html=True)
        with st.expander("Click to Extract Tables and Figures", expanded=True):
            with st.container():
                extract_button = st.button("Extract")
                if extract_button:
                    figures, tables = process_pdf(file_path)
                    col1, col2 = st.columns(2)
                    with col1:
                        st.write("### Figures")
                        if figures:
                            for figure in figures:
                                st.image(figure, use_column_width=True)
                        else:
                            st.write("No figures found.")
                    with col2:
                        st.write("### Tables")
                        if tables:
                            for table in tables:
                                st.image(table, use_column_width=True)
                        else:
                            st.write("No tables found.")
    with st.container():
        st.markdown("<h3>Get Summary</h3>", unsafe_allow_html=True)
        with st.expander("Click to Generate Summary", expanded=True):
            with st.container():
                summary_button = st.button("Generate Summary")
                if summary_button:
                    summary = summarize_pdf(file_path)
                    st.markdown(summary, unsafe_allow_html=True)
    with st.container():
        st.markdown("<h3>Chat with your PDF</h3>", unsafe_allow_html=True)
        if 'chat_history' not in st.session_state:
            st.session_state['chat_history'] = []

        # Render the conversation so far as styled chat bubbles
        for chat in st.session_state['chat_history']:
            chat_user_class = "user" if chat["user"] else ""
            chat_bot_class = "bot" if chat["bot"] else ""
            st.markdown(f'<div class="chat-message {chat_user_class}">{chat["user"]}</div>',
                        unsafe_allow_html=True)
            st.markdown(f'<div class="chat-message {chat_bot_class}">{chat["bot"]}</div>',
                        unsafe_allow_html=True)

        with st.form(key="chat_form", clear_on_submit=True):
            user_input = st.text_area("Ask a question about the PDF:", key="user_input")
            submit_button = st.form_submit_button(label="Send")
            if submit_button and user_input:
                st.session_state['chat_history'].append({"user": user_input, "bot": None})
                answer = qa_pdf(file_path, user_input)
                st.session_state['chat_history'][-1]["bot"] = answer
                st.experimental_rerun()