Spaces:

shresthasingh
/

rachel.ai

Running

File size: 9,579 Bytes

c7ea556

import os
import shutil
import requests
import json
import gradio as gr
import PyPDF2
import chromadb
import csv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings

# Constants
# SECURITY: this API key is committed to source control. The literal is kept
# only as a fallback so existing deployments keep working; rotate the key and
# supply the real one through the TOGETHER_API_KEY environment variable.
API_KEY = os.environ.get(
    "TOGETHER_API_KEY",
    "c0165440493846b339438fab762683835cf8b78a9c2d3c1216555e491565ca6a",
)
BASE_URL = "https://api.together.xyz/v1/chat/completions"
# Passed as chunk_size to RecursiveCharacterTextSplitter, which measures
# length in characters (len) by default, not words.
CHUNK_SIZE = 6000  # Maximum characters per chunk
TEMP_SUMMARY_FILE = "temp_summaries.txt"
COLLECTIONS_FILE = "collections.csv"

# Function to convert PDF to text
def pdf_to_text(file_path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order, with no separator
        inserted between pages.
    """
    with open(file_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Join once instead of growing a string page by page; `or ""` keeps a
        # page that yields no text from breaking the concatenation.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)

# Function to summarize text using LLM
def summarize_text(text):
    """Return the LLM's summary of *text*.

    The text is embedded in a fixed legal-summarization prompt and sent
    through ``call_llm``; whatever ``call_llm`` returns is returned unchanged.
    """
    return call_llm(f"""
    You are an expert in legal language and document summarization. Your task is to provide a concise and accurate summary of the given document. 
    Keep the summary concise, ideally in 2000 words, while covering all essential points. Here is the document to summarize:

    {text}
    """)

# Function to handle file upload, summarization, and saving to ChromaDB
def handle_file_upload(files, collection_name):
    """Summarize uploaded PDFs and index the summaries in a new ChromaDB collection.

    Each PDF is copied into ``uploaded_pdfs/``, converted to text, split into
    chunks and summarized chunk by chunk with the LLM. The combined summaries
    are split again, embedded and stored in a freshly created collection, and
    the collection is then recorded in the collections CSV.

    Args:
        files: Uploaded files. Each item is either an object exposing the
            file path as ``name`` or a plain path string.
        collection_name: Name of the ChromaDB collection to create.

    Returns:
        A status message suitable for display in the UI.

    Raises:
        Exception: Any error raised while reading, summarizing or embedding
            the files is re-raised after the partially filled collection has
            been deleted, so the same name can be used again.
    """
    if not collection_name:
        return "Please provide a collection name."
    if not files:
        # Nothing was uploaded (None or an empty list); avoid creating an
        # empty collection that would only consume the name.
        return "Please upload at least one PDF file."

    client = chromadb.PersistentClient(path="./db")
    try:
        collection = client.create_collection(name=collection_name)
    except Exception as e:
        # The exception type for an invalid or duplicate name differs between
        # chromadb releases, so report any failure here instead of crashing.
        return f"Error creating collection: {str(e)}. Please try a different collection name."

    try:
        os.makedirs('uploaded_pdfs', exist_ok=True)
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=100)
        embeddings = HuggingFaceEmbeddings(model_name="thenlper/gte-small")

        file_names = []
        # Summaries are kept in memory rather than in a shared temp file, so
        # concurrent uploads cannot overwrite each other's intermediate data.
        summaries = []
        for file in files:
            # NOTE(review): assumes the UI may hand over either file objects
            # with a `name` path or bare path strings - confirm for the
            # installed Gradio version.
            source_path = getattr(file, "name", file)
            file_name = os.path.basename(source_path)
            file_names.append(file_name)
            file_path = os.path.join('uploaded_pdfs', file_name)
            shutil.copy(source_path, file_path)

            text = pdf_to_text(file_path)
            for i, chunk in enumerate(text_splitter.split_text(text)):
                summary = summarize_text(chunk)
                summaries.append(f"Summary of {file_name} (Part {i+1}):\n{summary}\n\n")

        # Re-split the combined summaries and store one embedded entry per chunk.
        summary_chunks = text_splitter.split_text("".join(summaries))
        for i, chunk in enumerate(summary_chunks):
            vector = embeddings.embed_query(chunk)
            collection.add(
                embeddings=[vector],
                documents=[chunk],
                ids=[f"summary_{i}"]
            )
    except Exception:
        # Do not leave a half-populated collection behind on failure.
        client.delete_collection(name=collection_name)
        raise

    # Update collections.csv
    update_collections_csv(collection_name, file_names)

    return "Files uploaded, summarized, and processed successfully."

# Function to update collections.csv
def update_collections_csv(collection_name, file_names):
    """Append one row (collection name, comma-joined file names) to the collections CSV.

    Args:
        collection_name: Value written to the first column.
        file_names: Iterable of file-name strings; joined with ", " into the
            second column.
    """
    row = [collection_name, ", ".join(file_names)]
    with open(COLLECTIONS_FILE, 'a', newline='') as handle:
        csv.writer(handle).writerow(row)

# Function to read collections.csv
def read_collections():
    """Return a human-readable listing of every recorded collection.

    Returns:
        One "Collection: ... / Files: ..." entry per CSV row, or
        "No collections found." when the CSV file is missing or contains no
        usable rows.
    """
    if not os.path.exists(COLLECTIONS_FILE):
        return "No collections found."

    # newline='' lets the csv module handle line endings itself.
    with open(COLLECTIONS_FILE, 'r', newline='') as csvfile:
        collections = [
            # A row without a second column is shown with an empty file list
            # instead of raising IndexError.
            f"Collection: {row[0]}\nFiles: {row[1] if len(row) > 1 else ''}\n\n"
            for row in csv.reader(csvfile)
            if row  # blank lines are returned as empty rows; skip them
        ]

    return "".join(collections) or "No collections found."

# Function to search vector database
def search_vector_database(query, collection_name):
    """Return the two stored summary chunks most similar to *query*.

    Args:
        query: Free-text search query; it is embedded with the same model
            used when the collection was populated.
        collection_name: Name of an existing ChromaDB collection.

    Returns:
        The matching documents joined by a blank line, an empty string when
        nothing matches, or a human-readable error message when the collection
        name is missing or the collection cannot be opened.
    """
    if not collection_name:
        return "Please provide a collection name."

    client = chromadb.PersistentClient(path="./db")
    try:
        collection = client.get_collection(name=collection_name)
    except Exception as e:
        # The exception raised for a missing collection differs between
        # chromadb releases, so report any failure here instead of crashing.
        return f"Error accessing collection: {str(e)}. Make sure the collection name is correct."

    # Load the embedding model only after the cheap collection lookup succeeded.
    embeddings = HuggingFaceEmbeddings(model_name="thenlper/gte-small")
    query_vector = embeddings.embed_query(query)
    results = collection.query(query_embeddings=[query_vector], n_results=2, include=["documents"])

    # One query vector was sent, so the hits for it are in the first entry.
    documents = results["documents"]
    if not documents:
        return ""
    return "\n\n".join(documents[0])

# Function to call LLM
def call_llm(prompt, *, timeout=300):
    """Send *prompt* as a single user message to the chat-completions API.

    Args:
        prompt: Text sent as the content of the user message.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled connection would block the calling
            handler indefinitely.

    Returns:
        The ``content`` of the first choice's message in the JSON response.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the request does not complete within *timeout*.
    """
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json"
    }

    data = {
        "model": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.7,
        "top_p": 0.7,
        "top_k": 50,
        "repetition_penalty": 1,
        "stop": ["\"\""],
        "stream": False
    }

    # `json=` lets requests serialize the payload itself.
    response = requests.post(BASE_URL, headers=headers, json=data, timeout=timeout)
    response.raise_for_status()
    return response.json()['choices'][0]['message']['content']

# Function to answer questions using Rachel.AI
def answer_question(question, collection_name):
    """Answer *question* with the LLM, using retrieved summaries as legal context.

    Args:
        question: The user's question; also used as the retrieval query.
        collection_name: Name of the ChromaDB collection to search.

    Returns:
        The LLM's answer, or the retrieval error message when the collection
        name is missing or the collection cannot be opened. In those cases the
        LLM is not called, so an error message is never passed to it as
        "Legal Context".
    """
    if not collection_name:
        return "Please provide a collection name."

    context = search_vector_database(question, collection_name)
    # search_vector_database reports a failed collection lookup as a message
    # with this prefix rather than raising; surface it directly.
    if context.startswith("Error accessing collection:"):
        return context

    prompt = f"""
    You are a paralegal AI assistant. Your role is to assist with legal inquiries by providing clear and concise answers based on the provided question and legal context. Always maintain a highly professional tone, ensuring that your responses are well-reasoned and legally accurate.
    Question: {question}
    Legal Context: {context}
    Please provide a detailed response considering the above information.
    """

    return call_llm(prompt)

# Gradio interface
def gradio_interface():
    """Build the Gradio Blocks UI, wire its buttons to the handlers, and launch it.

    The UI has three tabs: document upload and search, question answering,
    and a listing of recorded collections. The server is launched on
    0.0.0.0, port 7860.
    """
    with gr.Blocks(theme='gl198976/The-Rounded') as interface:
        gr.Markdown("# rachel.ai backend")

        gr.Markdown("""
        ### Warning
        If you encounter an error when uploading files, try changing the collection name and upload again.
        Each collection name must be unique.
        """)
        
        # Tab 1: upload/summarize PDFs on the left, search a collection on the right.
        with gr.Tab("Document Upload and Search"):
            with gr.Row():
                with gr.Column():
                    collection_name_input = gr.Textbox(label="Collection Name", placeholder="Enter a unique name for this collection")
                    file_upload = gr.Files(file_types=[".pdf"], label="Upload PDFs")
                    upload_btn = gr.Button("Upload, Summarize, and Process Files")
                    upload_status = gr.Textbox(label="Upload Status", interactive=False)
                with gr.Column():
                    search_query_input = gr.Textbox(label="Search Query")
                    search_collection_name = gr.Textbox(label="Collection Name for Search", placeholder="Enter the collection name to search")
                    search_output = gr.Textbox(label="Search Results", lines=10)
                    search_btn = gr.Button("Search")
            
            # Static usage notes for the search endpoint; the variable itself is not used afterwards.
            api_details = gr.Markdown("""
                ### API Endpoint Details
                - **URL:** http://0.0.0.0:7860/search_vector_database
                - **Method:** POST
                - **Example Usage:**
                
                ```python
                from gradio_client import Client

                client = Client("http://0.0.0.0:7860/")
                result = client.predict(
                    "search query",  # str in 'Search Query' Textbox component
                    "name of collection given in ui",  # str in 'Collection Name' Textbox component
                    api_name="/search_vector_database"
                )
                print(result)
                ```
            """)

        # Tab 2: ask a question that is answered with context from a collection.
        with gr.Tab("Rachel.AI"):
            question_input = gr.Textbox(label="Ask a question")
            rachel_collection_name = gr.Textbox(label="Collection Name", placeholder="Enter the collection name to search")
            answer_output = gr.Textbox(label="Answer", lines=10)
            ask_btn = gr.Button("Ask Rachel.AI")
            
            # Static usage notes for the question-answering endpoint; not used afterwards.
            rachel_api_details = gr.Markdown("""
                ### API Endpoint Details for Rachel.AI
                - **URL:** http://0.0.0.0:7860/answer_question
                - **Method:** POST
                - **Example Usage:**
                
                ```python
                from gradio_client import Client

                client = Client("http://0.0.0.0:7860/")
                result = client.predict(
                    "question",  # str in 'Ask a question' Textbox component
                    "collection_name",  # str in 'Collection Name' Textbox component
                    api_name="/answer_question"
                )
                print(result)
                ```
            """)

        # Tab 3: show the contents of the collections CSV on demand.
        with gr.Tab("Collections"):
            collections_output = gr.Textbox(label="Collections and Files", lines=20)
            refresh_btn = gr.Button("Refresh Collections")

        # Event wiring: the order of each `inputs` list matches the parameter
        # order of the handler it is bound to.
        upload_btn.click(handle_file_upload, inputs=[file_upload, collection_name_input], outputs=[upload_status])
        search_btn.click(search_vector_database, inputs=[search_query_input, search_collection_name], outputs=[search_output])
        ask_btn.click(answer_question, inputs=[question_input, rachel_collection_name], outputs=[answer_output])
        refresh_btn.click(read_collections, inputs=[], outputs=[collections_output])

    # Listen on all interfaces on port 7860.
    interface.launch(server_name="0.0.0.0", server_port=7860)

# Start the Gradio app only when this file is run as a script, not on import.
if __name__ == "__main__":
    gradio_interface()