import gradio as gr
from huggingface_hub import InferenceClient
import json
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import logging
import time
# Set up logging
logging.basicConfig(level=logging.INFO)
"""
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""
# Load embeddings from a JSON file
def load_embeddings(file_path):
logging.info(f"Loading embeddings from {file_path}")
with open(file_path, 'r', encoding='utf-8') as file:
embeddings = json.load(file)
logging.info(f"Loaded {len(embeddings)} embeddings")
return embeddings
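# For reference, a sketch of the embeddings-file layout this loader assumes
# (field names inferred from how entries are used below; the real file may differ):
# [
#   {"name": "Article 1", "content": "...", "embedding": [0.012, -0.034, ...]},
#   ...
# ]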
# Function to get relevant articles based on user query
def get_relevant_documents(query, embeddings_data, model, top_k=3):
logging.info(f"Received query: {query}")
start_time = time.time()
query_embedding = model.encode(query)
similarities = []
for i, entry in enumerate(embeddings_data):
embedding = np.array(entry['embedding'])
similarity = cosine_similarity([query_embedding], [embedding])[0][0]
similarities.append((entry, similarity))
if i % 100 == 0: # Log every 100 iterations
logging.debug(f"Processed {i} embeddings")
logging.info("Sorting similarities")
similarities.sort(key=lambda x: x[1], reverse=True)
top_entries = [entry for entry, _ in similarities[:top_k]]
end_time = time.time()
duration = end_time - start_time
logging.info(f"Query processed in {duration:.2f} seconds")
logging.info(f"Top {top_k} documents returned with similarities: {[sim[1] for sim in similarities[:top_k]]}")
return top_entries
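# Note: for larger corpora the per-entry loop above could be collapsed into a single
# vectorised call; a sketch, assuming every stored embedding has the same dimension:
#   matrix = np.array([entry['embedding'] for entry in embeddings_data])
#   scores = cosine_similarity([query_embedding], matrix)[0]
#   top_entries = [embeddings_data[i] for i in np.argsort(scores)[::-1][:top_k]]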
# Function to format relevant documents into a string
def format_documents(documents):
logging.info(f"Formatting {len(documents)} documents")
formatted = ""
for doc in documents:
formatted += f"Relevant article: {doc['name']}\n{doc['content']}\n\n"
return formatted
# Main chatbot function that integrates RAG
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    embeddings_data,
    model,
):
logging.info(f"New user query: {message}")
start_time = time.time()
# Search for relevant documents based on user input
relevant_docs = get_relevant_documents(message, embeddings_data, tokenizer, model)
retrieved_context = format_documents(relevant_docs)
# Log the statistics about the retrieved documents
logging.info(f"Total documents retrieved: {len(relevant_docs)}")
logging.info(f"Documents: " + str([doc['name'] for doc in relevant_docs]))
# Add the retrieved context as part of the system message
system_message_with_context = system_message + "\n\n" + "Relevant documents:\n" + retrieved_context
logging.info("System message updated with retrieved context")
messages = [{"role": "system", "content": system_message_with_context}]
for val in history:
if val[0]:
messages.append({"role": "user", "content": val[0]})
if val[1]:
messages.append({"role": "assistant", "content": val[1]})
messages.append({"role": "user", "content": message})
logging.info("Messages prepared for InferenceClient")
    client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
    logging.info("Sending request to InferenceClient")
    response = ""
    # Collect the full response instead of yielding each token
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = chunk.choices[0].delta.content
        if token:  # skip empty/None deltas so concatenation never fails
            response += token
    end_time = time.time()
    total_duration = end_time - start_time
    logging.info(f"Response generated in {total_duration:.2f} seconds")
    return response  # Return the complete response as a string
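# If token-by-token display in the Gradio UI were preferred, the streaming loop above
# could `yield response` after each appended token (gr.ChatInterface accepts generator
# functions); this version deliberately collects and returns the complete string.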
# Load embeddings and model once at startup
embeddings_file = 'Code Civil vectorised.json'
logging.info("Starting application, loading embeddings and model")
embeddings_data = load_embeddings(embeddings_file)
embedding_model = SentenceTransformer('Lajavaness/bilingual-embedding-small', trust_remote_code=True)
logging.info("Model and embeddings loaded successfully")
# Gradio interface
demo = gr.ChatInterface(
    lambda message, history, system_message, max_tokens, temperature, top_p: respond(
        message, history, system_message, max_tokens, temperature, top_p, embeddings_data, embedding_model
    ),
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)
if __name__ == "__main__":
logging.info("Launching Gradio app")
demo.launch()