"""Rachel.AI backend.

Gradio app that uploads legal PDFs, summarizes them chunk-by-chunk with an
LLM, indexes the summaries in a persistent ChromaDB vector store, and answers
questions against that store.
"""

import csv
import json
import os
import shutil

import chromadb
import gradio as gr
import PyPDF2
import requests
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings

# --- Constants --------------------------------------------------------------
# SECURITY NOTE(review): an API key was hard-coded here.  It is now read from
# the TOGETHER_API_KEY environment variable first; the committed literal is
# kept only as a backward-compatible fallback and MUST be rotated and removed.
API_KEY = os.environ.get(
    "TOGETHER_API_KEY",
    "c0165440493846b339438fab762683835cf8b78a9c2d3c1216555e491565ca6a",
)
BASE_URL = "https://api.together.xyz/v1/chat/completions"
# NOTE: RecursiveCharacterTextSplitter measures CHARACTERS, not words.
CHUNK_SIZE = 6000  # Maximum characters per chunk
TEMP_SUMMARY_FILE = "temp_summaries.txt"
COLLECTIONS_FILE = "collections.csv"
EMBEDDING_MODEL = "thenlper/gte-small"
DB_PATH = "./db"
LLM_MODEL = "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo"
LLM_TIMEOUT_SECONDS = 300  # Keep a stalled API call from hanging the UI forever.


# Function to convert PDF to text
def pdf_to_text(file_path):
    """Return the concatenated text of every page of the PDF at *file_path*.

    Pages with no extractable text (e.g. scanned images) contribute an empty
    string: ``extract_text()`` may return ``None``, which previously raised a
    TypeError when concatenated.
    """
    with open(file_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)


# Function to summarize text using LLM
def summarize_text(text):
    """Ask the LLM for a concise legal summary of *text* and return it."""
    user_prompt = f"""
You are an expert in legal language and document summarization.
Your task is to provide a concise and accurate summary of the given document.
Keep the summary concise, ideally in 2000 words, while covering all essential points.
Here is the document to summarize:
{text}
"""
    return call_llm(user_prompt)


# Function to handle file upload, summarization, and saving to ChromaDB
def handle_file_upload(files, collection_name):
    """Upload PDFs, summarize them chunk-by-chunk, and index the summaries.

    Args:
        files: Gradio file objects for the uploaded PDFs.
        collection_name: Unique name for the new ChromaDB collection.

    Returns:
        A human-readable status string for the Gradio UI.
    """
    if not collection_name:
        return "Please provide a collection name."

    os.makedirs('uploaded_pdfs', exist_ok=True)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=100)
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
    client = chromadb.PersistentClient(path=DB_PATH)
    try:
        collection = client.create_collection(name=collection_name)
    except ValueError as e:
        # Raised when the collection already exists — names must be unique.
        return f"Error creating collection: {str(e)}. Please try a different collection name."

    file_names = []
    try:
        with open(TEMP_SUMMARY_FILE, 'w', encoding='utf-8') as temp_file:
            for file in files:
                file_name = os.path.basename(file.name)
                file_names.append(file_name)
                file_path = os.path.join('uploaded_pdfs', file_name)
                shutil.copy(file.name, file_path)

                text = pdf_to_text(file_path)
                # Summarize each chunk separately to stay within the LLM context window.
                chunks = text_splitter.split_text(text)
                for i, chunk in enumerate(chunks):
                    summary = summarize_text(chunk)
                    temp_file.write(f"Summary of {file_name} (Part {i+1}):\n{summary}\n\n")

        # Process the temporary file and add to ChromaDB.
        with open(TEMP_SUMMARY_FILE, 'r', encoding='utf-8') as temp_file:
            summaries = temp_file.read()
        summary_chunks = text_splitter.split_text(summaries)
        for i, chunk in enumerate(summary_chunks):
            vector = embeddings.embed_query(chunk)
            collection.add(
                embeddings=[vector],
                documents=[chunk],
                ids=[f"summary_{i}"]
            )
    finally:
        # Always clean up the scratch file, even if an LLM call failed mid-way
        # (previously a failure left temp_summaries.txt behind).
        if os.path.exists(TEMP_SUMMARY_FILE):
            os.remove(TEMP_SUMMARY_FILE)

    # Update collections.csv
    update_collections_csv(collection_name, file_names)
    return "Files uploaded, summarized, and processed successfully."


# Function to update collections.csv
def update_collections_csv(collection_name, file_names):
    """Append one row (collection name, comma-joined file names) to the registry CSV."""
    file_names_str = ", ".join(file_names)
    with open(COLLECTIONS_FILE, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow([collection_name, file_names_str])


# Function to read collections.csv
def read_collections():
    """Return a human-readable listing of every registered collection and its files."""
    if not os.path.exists(COLLECTIONS_FILE):
        return "No collections found."
    with open(COLLECTIONS_FILE, 'r') as csvfile:
        reader = csv.reader(csvfile)
        collections = [f"Collection: {row[0]}\nFiles: {row[1]}\n\n" for row in reader]
    return "".join(collections)


# Function to search vector database
def search_vector_database(query, collection_name):
    """Embed *query* and return the two closest summary chunks, joined by blank lines.

    Returns an error string (not an exception) when the collection name is
    missing or unknown, so the Gradio UI can display it directly.
    """
    if not collection_name:
        return "Please provide a collection name."
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
    client = chromadb.PersistentClient(path=DB_PATH)
    try:
        collection = client.get_collection(name=collection_name)
    except ValueError as e:
        return f"Error accessing collection: {str(e)}. Make sure the collection name is correct."

    query_vector = embeddings.embed_query(query)
    results = collection.query(query_embeddings=[query_vector], n_results=2, include=["documents"])
    return "\n\n".join(results["documents"][0])


# Function to call LLM
def call_llm(prompt):
    """Send *prompt* to the Together chat-completions API and return the reply text.

    Raises:
        requests.HTTPError: if the API responds with an error status.
        requests.exceptions.Timeout: if the API does not answer in time.
    """
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json"
    }
    data = {
        "model": LLM_MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.7,
        "top_p": 0.7,
        "top_k": 50,
        "repetition_penalty": 1,
        "stop": ["\"\""],
        "stream": False
    }
    # timeout= added so a stalled request cannot hang the UI indefinitely.
    response = requests.post(BASE_URL, headers=headers, data=json.dumps(data),
                             timeout=LLM_TIMEOUT_SECONDS)
    response.raise_for_status()
    return response.json()['choices'][0]['message']['content']


# Function to answer questions using Rachel.AI
def answer_question(question, collection_name):
    """Retrieve context for *question* from the vector store and ask the LLM."""
    context = search_vector_database(question, collection_name)
    prompt = f"""
You are a paralegal AI assistant. Your role is to assist with legal inquiries by providing clear and concise answers based on the provided question and legal context. Always maintain a highly professional tone, ensuring that your responses are well-reasoned and legally accurate.

Question: {question}

Legal Context: {context}

Please provide a detailed response considering the above information.
"""
    return call_llm(prompt)


# Gradio interface
def gradio_interface():
    """Build and launch the Gradio UI (upload/search, Rachel.AI Q&A, collections tabs)."""
    with gr.Blocks(theme='gl198976/The-Rounded') as interface:
        gr.Markdown("# rachel.ai backend")
        gr.Markdown("""
        ### Warning
        If you encounter an error when uploading files, try changing the collection name and upload again. Each collection name must be unique.
        """)

        with gr.Tab("Document Upload and Search"):
            with gr.Row():
                with gr.Column():
                    collection_name_input = gr.Textbox(label="Collection Name", placeholder="Enter a unique name for this collection")
                    file_upload = gr.Files(file_types=[".pdf"], label="Upload PDFs")
                    upload_btn = gr.Button("Upload, Summarize, and Process Files")
                    upload_status = gr.Textbox(label="Upload Status", interactive=False)
                with gr.Column():
                    search_query_input = gr.Textbox(label="Search Query")
                    search_collection_name = gr.Textbox(label="Collection Name for Search", placeholder="Enter the collection name to search")
                    search_output = gr.Textbox(label="Search Results", lines=10)
                    search_btn = gr.Button("Search")
                    api_details = gr.Markdown("""
                    ### API Endpoint Details
                    - **URL:** http://0.0.0.0:7860/search_vector_database
                    - **Method:** POST
                    - **Example Usage:**
                    ```python
                    from gradio_client import Client

                    client = Client("http://0.0.0.0:7860/")
                    result = client.predict(
                        "search query",  # str in 'Search Query' Textbox component
                        "name of collection given in ui",  # str in 'Collection Name' Textbox component
                        api_name="/search_vector_database"
                    )
                    print(result)
                    ```
                    """)

        with gr.Tab("Rachel.AI"):
            question_input = gr.Textbox(label="Ask a question")
            rachel_collection_name = gr.Textbox(label="Collection Name", placeholder="Enter the collection name to search")
            answer_output = gr.Textbox(label="Answer", lines=10)
            ask_btn = gr.Button("Ask Rachel.AI")
            rachel_api_details = gr.Markdown("""
            ### API Endpoint Details for Rachel.AI
            - **URL:** http://0.0.0.0:7860/answer_question
            - **Method:** POST
            - **Example Usage:**
            ```python
            from gradio_client import Client

            client = Client("http://0.0.0.0:7860/")
            result = client.predict(
                "question",  # str in 'Ask a question' Textbox component
                "collection_name",  # str in 'Collection Name' Textbox component
                api_name="/answer_question"
            )
            print(result)
            ```
            """)

        with gr.Tab("Collections"):
            collections_output = gr.Textbox(label="Collections and Files", lines=20)
            refresh_btn = gr.Button("Refresh Collections")

        upload_btn.click(handle_file_upload, inputs=[file_upload, collection_name_input], outputs=[upload_status])
        search_btn.click(search_vector_database, inputs=[search_query_input, search_collection_name], outputs=[search_output])
        ask_btn.click(answer_question, inputs=[question_input, rachel_collection_name], outputs=[answer_output])
        refresh_btn.click(read_collections, inputs=[], outputs=[collections_output])

    interface.launch(server_name="0.0.0.0", server_port=7860)


if __name__ == "__main__":
    gradio_interface()