Spaces:

shresthasingh
/

rachel.ai

Sleeping

App Files Files Community

shresthasingh committed on Aug 1

Commit

c7ea556

•

1 Parent(s): d97eaee

Create app.py

Browse files

Files changed (1) hide show

app.py +239 -0

app.py ADDED Viewed

	@@ -0,0 +1,239 @@

+import os
+import shutil
+import requests
+import json
+import gradio as gr
+import PyPDF2
+import chromadb
+import csv
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_huggingface import HuggingFaceEmbeddings
+# Constants
+API_KEY = "c0165440493846b339438fab762683835cf8b78a9c2d3c1216555e491565ca6a"
+BASE_URL = "https://api.together.xyz/v1/chat/completions"
+CHUNK_SIZE = 6000  # Maximum words per chunk
+TEMP_SUMMARY_FILE = "temp_summaries.txt"
+COLLECTIONS_FILE = "collections.csv"
+# Function to convert PDF to text
+def pdf_to_text(file_path):
+    with open(file_path, 'rb') as pdf_file:
+        pdf_reader = PyPDF2.PdfReader(pdf_file)
+        text = ""
+        for page in pdf_reader.pages:
+            text += page.extract_text()
+    return text
+# Function to summarize text using LLM
+def summarize_text(text):
+    user_prompt = f"""
+    You are an expert in legal language and document summarization. Your task is to provide a concise and accurate summary of the given document.
+    Keep the summary concise, ideally in 2000 words, while covering all essential points. Here is the document to summarize:
+    {text}
+    """
+    return call_llm(user_prompt)
+# Function to handle file upload, summarization, and saving to ChromaDB
+def handle_file_upload(files, collection_name):
+    if not collection_name:
+        return "Please provide a collection name."
+    os.makedirs('uploaded_pdfs', exist_ok=True)
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=100)
+    embeddings = HuggingFaceEmbeddings(model_name="thenlper/gte-small")
+    client = chromadb.PersistentClient(path="./db")
+    try:
+        collection = client.create_collection(name=collection_name)
+    except ValueError as e:
+        return f"Error creating collection: {str(e)}. Please try a different collection name."
+    file_names = []
+    with open(TEMP_SUMMARY_FILE, 'w', encoding='utf-8') as temp_file:
+        for file in files:
+            file_name = os.path.basename(file.name)
+            file_names.append(file_name)
+            file_path = os.path.join('uploaded_pdfs', file_name)
+            shutil.copy(file.name, file_path)
+            text = pdf_to_text(file_path)
+            chunks = text_splitter.split_text(text)
+            for i, chunk in enumerate(chunks):
+                summary = summarize_text(chunk)
+                temp_file.write(f"Summary of {file_name} (Part {i+1}):\n{summary}\n\n")
+    # Process the temporary file and add to ChromaDB
+    with open(TEMP_SUMMARY_FILE, 'r', encoding='utf-8') as temp_file:
+        summaries = temp_file.read()
+        summary_chunks = text_splitter.split_text(summaries)
+        for i, chunk in enumerate(summary_chunks):
+            vector = embeddings.embed_query(chunk)
+            collection.add(
+                embeddings=[vector],
+                documents=[chunk],
+                ids=[f"summary_{i}"]
+            )
+    os.remove(TEMP_SUMMARY_FILE)
+    # Update collections.csv
+    update_collections_csv(collection_name, file_names)
+    return "Files uploaded, summarized, and processed successfully."
+# Function to update collections.csv
+def update_collections_csv(collection_name, file_names):
+    file_names_str = ", ".join(file_names)
+    with open(COLLECTIONS_FILE, 'a', newline='') as csvfile:
+        writer = csv.writer(csvfile)
+        writer.writerow([collection_name, file_names_str])
+# Function to read collections.csv
+def read_collections():
+    if not os.path.exists(COLLECTIONS_FILE):
+        return "No collections found."
+    with open(COLLECTIONS_FILE, 'r') as csvfile:
+        reader = csv.reader(csvfile)
+        collections = [f"Collection: {row[0]}\nFiles: {row[1]}\n\n" for row in reader]
+    return "".join(collections)
+# Function to search vector database
+def search_vector_database(query, collection_name):
+    if not collection_name:
+        return "Please provide a collection name."
+    embeddings = HuggingFaceEmbeddings(model_name="thenlper/gte-small")
+    client = chromadb.PersistentClient(path="./db")
+    try:
+        collection = client.get_collection(name=collection_name)
+    except ValueError as e:
+        return f"Error accessing collection: {str(e)}. Make sure the collection name is correct."
+    query_vector = embeddings.embed_query(query)
+    results = collection.query(query_embeddings=[query_vector], n_results=2, include=["documents"])
+    return "\n\n".join(results["documents"][0])
+# Function to call LLM
+def call_llm(prompt):
+    headers = {
+        "Authorization": f"Bearer {API_KEY}",
+        "Content-Type": "application/json"
+    }
+    data = {
+        "model": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
+        "messages": [{"role": "user", "content": prompt}],
+        "temperature": 0.7,
+        "top_p": 0.7,
+        "top_k": 50,
+        "repetition_penalty": 1,
+        "stop": ["\"\""],
+        "stream": False
+    }
+    response = requests.post(BASE_URL, headers=headers, data=json.dumps(data))
+    response.raise_for_status()
+    return response.json()['choices'][0]['message']['content']
+# Function to answer questions using Rachel.AI
+def answer_question(question, collection_name):
+    context = search_vector_database(question, collection_name)
+    prompt = f"""
+    You are a paralegal AI assistant. Your role is to assist with legal inquiries by providing clear and concise answers based on the provided question and legal context. Always maintain a highly professional tone, ensuring that your responses are well-reasoned and legally accurate.
+    Question: {question}
+    Legal Context: {context}
+    Please provide a detailed response considering the above information.
+    """
+    return call_llm(prompt)
+# Gradio interface
+def gradio_interface():
+    """Build the Gradio Blocks UI and launch it on 0.0.0.0:7860.
+
+    The UI has three tabs: "Document Upload and Search" (upload PDFs into a
+    named collection and run a similarity search), "Rachel.AI" (ask a
+    question against a collection) and "Collections" (list what
+    collections.csv records). The button handlers are wired at the end of
+    the Blocks context, after all components exist.
+    """
+    with gr.Blocks(theme='gl198976/The-Rounded') as interface:
+        gr.Markdown("# rachel.ai backend")
+        gr.Markdown("""
+        ### Warning
+        If you encounter an error when uploading files, try changing the collection name and upload again.
+        Each collection name must be unique.
+        """)
+        # Tab 1: upload/summarize on the left column, raw vector search on the right.
+        with gr.Tab("Document Upload and Search"):
+            with gr.Row():
+                with gr.Column():
+                    collection_name_input = gr.Textbox(label="Collection Name", placeholder="Enter a unique name for this collection")
+                    file_upload = gr.Files(file_types=[".pdf"], label="Upload PDFs")
+                    upload_btn = gr.Button("Upload, Summarize, and Process Files")
+                    upload_status = gr.Textbox(label="Upload Status", interactive=False)
+                with gr.Column():
+                    search_query_input = gr.Textbox(label="Search Query")
+                    search_collection_name = gr.Textbox(label="Collection Name for Search", placeholder="Enter the collection name to search")
+                    search_output = gr.Textbox(label="Search Results", lines=10)
+                    search_btn = gr.Button("Search")
+            # Static usage notes for the search endpoint; the variable is not read again.
+            # NOTE(review): the api_name shown presumably matches Gradio's default of
+            # naming the endpoint after the handler function - confirm.
+            api_details = gr.Markdown("""
+                ### API Endpoint Details
+                - **URL:** http://0.0.0.0:7860/search_vector_database
+                - **Method:** POST
+                - **Example Usage:**
+                ```python
+                from gradio_client import Client
+                client = Client("http://0.0.0.0:7860/")
+                result = client.predict(
+                    "search query",  # str in 'Search Query' Textbox component
+                    "name of collection given in ui",  # str in 'Collection Name' Textbox component
+                    api_name="/search_vector_database"
+                )
+                print(result)
+                ```
+            """)
+        # Tab 2: question answering on top of the retrieved context.
+        with gr.Tab("Rachel.AI"):
+            question_input = gr.Textbox(label="Ask a question")
+            rachel_collection_name = gr.Textbox(label="Collection Name", placeholder="Enter the collection name to search")
+            answer_output = gr.Textbox(label="Answer", lines=10)
+            ask_btn = gr.Button("Ask Rachel.AI")
+            # Static usage notes for the Q&A endpoint; the variable is not read again.
+            rachel_api_details = gr.Markdown("""
+                ### API Endpoint Details for Rachel.AI
+                - **URL:** http://0.0.0.0:7860/answer_question
+                - **Method:** POST
+                - **Example Usage:**
+                ```python
+                from gradio_client import Client
+                client = Client("http://0.0.0.0:7860/")
+                result = client.predict(
+                    "question",  # str in 'Ask a question' Textbox component
+                    "collection_name",  # str in 'Collection Name' Textbox component
+                    api_name="/answer_question"
+                )
+                print(result)
+                ```
+            """)
+        # Tab 3: read-only listing of recorded collections.
+        with gr.Tab("Collections"):
+            collections_output = gr.Textbox(label="Collections and Files", lines=20)
+            refresh_btn = gr.Button("Refresh Collections")
+        # Wire each button to its handler; inputs map positionally onto the handler's parameters.
+        upload_btn.click(handle_file_upload, inputs=[file_upload, collection_name_input], outputs=[upload_status])
+        search_btn.click(search_vector_database, inputs=[search_query_input, search_collection_name], outputs=[search_output])
+        ask_btn.click(answer_question, inputs=[question_input, rachel_collection_name], outputs=[answer_output])
+        refresh_btn.click(read_collections, inputs=[], outputs=[collections_output])
+    # Bind to all interfaces on port 7860.
+    interface.launch(server_name="0.0.0.0", server_port=7860)
+# Start the UI only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    gradio_interface()