Spaces:

kheopss
/

METROPOLE_CHATBOT_FINAL

Sleeping

File size: 5,027 Bytes

import nest_asyncio
import gradio as gr
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.postprocessor import LLMRerank
import logging
import sys
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.legacy.llms.huggingface import HuggingFaceInferenceAPI, HuggingFaceLLM
from llama_index.core import Settings
from llama_index.llms.huggingface import HuggingFaceLLM
import torch
from transformers import BitsAndBytesConfig
from llama_index.core.prompts import PromptTemplate
from llama_index.llms.openai import OpenAI
import os
import pandas as pd
from llama_index.core import Document
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core import QueryBundle
import time
from huggingface_hub import login

# Apply the nest_asyncio patch so asyncio event loops can be nested.
nest_asyncio.apply()

# The Hugging Face access token is read from the `hf_token` environment
# variable instead of being hard-coded in this file.
hf_token = os.getenv('hf_token')

if hf_token:
    login(token=hf_token)
else:
    # Calling login() without a token falls back to an interactive prompt,
    # which cannot be answered in a headless deployment; skip it and let any
    # later download report its own authentication error if one is needed.
    print("WARNING: environment variable 'hf_token' is not set; skipping Hugging Face login.")
# 4-bit NF4 quantization (double quantization, float16 compute) to reduce the
# memory needed to hold the model.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# The same Hub repository is used for both the model and its tokenizer.
_llm_repo_id = "kheopss/kheops_hermes-e1-v0.11-bnb-16bit"

# NOTE(review): no `do_sample` flag is passed; confirm that temperature /
# top_k / top_p actually take effect under the backend's default decoding.
_generation_options = {"temperature": 0.1, "top_k": 50, "top_p": 0.95}

# NOTE(review): context_window (3900) minus max_new_tokens (2560) presumably
# leaves about 1340 tokens for the prompt and documents -- verify this is
# the intended budget.
llm = HuggingFaceLLM(
    model_name=_llm_repo_id,
    tokenizer_name=_llm_repo_id,
    device_map="cuda:0",
    context_window=3900,
    max_new_tokens=2560,
    generate_kwargs=_generation_options,
    model_kwargs={"quantization_config": quantization_config},
)

embed_model = HuggingFaceEmbedding(model_name="kheopss/kheops_embedding_e5_v3")

# Register both models as the LlamaIndex global defaults.
Settings.llm = llm
Settings.embed_model = embed_model
# Corpus: a JSON file loaded with pandas. Each row provides the `values`
# (document text), `file_name` and `file_description` fields used below.
file_path = 'response_metropo_cleaned.json'
data = pd.read_json(file_path)

# One Document per row, keeping the file name and description as metadata.
documents = [
    Document(
        text=row['values'],
        metadata={
            "filename": row['file_name'],
            "description": row['file_description'],
        },
    )
    for _, row in data.iterrows()
]

# Vector index over all documents; queried by get_retrieved_nodes().
index = VectorStoreIndex.from_documents(documents, show_progress=True)

def get_retrieved_nodes(
    query_str, vector_top_k=10, reranker_top_n=3, with_reranker=False
):
    """Retrieve nodes for a query from the module-level index, optionally reranked.

    Args:
        query_str: Query text; wrapped in a ``QueryBundle``.
        vector_top_k: Passed to the retriever as ``similarity_top_k``.
        reranker_top_n: Passed to ``LLMRerank`` as ``top_n`` when reranking.
        with_reranker: When true, the retrieved nodes are post-processed
            with ``LLMRerank`` before being returned.

    Returns:
        The retrieved nodes (after reranking when ``with_reranker`` is true).

    The wall-clock duration of each phase is printed to stdout.
    """
    bundle = QueryBundle(query_str)

    # Phase 01: vector retrieval (the timer includes building the retriever).
    retrieval_started = time.time()
    nodes = VectorIndexRetriever(
        index=index,
        similarity_top_k=vector_top_k,
    ).retrieve(bundle)
    retrieval_finished = time.time()
    print(f"Phase 01 <RETRIEVING> took  : {retrieval_finished-retrieval_started}")

    # Phase 02: optional reranking. The timing line is printed even when
    # reranking is skipped.
    rerank_started = time.time()
    if with_reranker:
        nodes = LLMRerank(
            choice_batch_size=5,
            top_n=reranker_top_n,
        ).postprocess_nodes(nodes, bundle)
    rerank_finished = time.time()
    print(f"Phase 02 <RERANKING> took  : {rerank_finished-rerank_started}")

    return nodes

def get_all_text(new_nodes):
    """Concatenate the text of every node into one numbered string.

    Each node contributes ``"\\nDocument <i> : <text>"`` (numbering starts at
    1, text comes from ``node.get_text()``); the pieces are joined with a
    single space. An empty input yields an empty string.
    """
    return ' '.join(
        f"\nDocument {position} : {item.get_text()}"
        for position, item in enumerate(new_nodes, start=1)
    )
def completion_to(text,user_p):
    """Build the full chat prompt sent to the LLM.

    Args:
        text: Document text inserted after the system instructions.
        user_p: The user's question.

    Returns:
        One string holding a system turn (instructions plus documents), a
        user turn, and an opening assistant tag, each delimited with
        ``<|im_start|>`` / ``<|im_end|>`` markers.
    """
    # NOTE(review): there is no newline between "system" and the instructions
    # and no newline between turns; presumably this matches the template the
    # model was tuned on -- confirm before normalising the markup.
    instructions = (
        "You are a conversational AI assistant tasked with helping public agents "
        "in Nice guide residents and citizens to appropriate services. "
        "You will respond to user queries using information from provided documents. "
        "Your answer mode can be 'Grounded' or 'Mixed'. "
        "In 'Grounded' mode, use only exact facts from the documents, "
        "citing them with <co: doc_id></co> tags. "
        "In 'Mixed' mode, you can incorporate both document facts and your own knowledge. "
        "Always respond in French, keeping your answers grounded in the document text "
        "and engaging in conversation to assist based on user questions."
    )
    system_turn = f"<|im_start|>system{instructions}\n DOCUMENTS : \n {text}\n <|im_end|>"
    user_turn = f"<|im_start|>user \n{user_p}\n<|im_end|>"
    assistant_open = "<|im_start|>assistant"
    return system_turn + user_turn + assistant_open


def process_final(user_prom, history):
    """Answer one chat message: retrieve, rerank, then stream the LLM output.

    Used as the ``fn`` callback of the ``gr.ChatInterface`` defined below.

    Args:
        user_prom: The user's message.
        history: Conversation history passed by the chat interface; it is
            not used, so every message is answered independently.

    Yields:
        ``response.text`` for each item produced by ``llm.stream_complete``.
    """
    all_process_start = time.time()
    new_nodes = get_retrieved_nodes(
        user_prom,
        vector_top_k=5,
        reranker_top_n=3,
        with_reranker=True,
    )
    prompting = completion_to(get_all_text(new_nodes), user_prom)

    print("PHASE 03 passing to LLM\n")
    phase_03_start = time.time()
    # The prompt already carries the chat markup built by completion_to();
    # formatted=True presumably tells the wrapper not to re-template it --
    # confirm against the LLM wrapper's documentation.
    for response in llm.stream_complete(formatted=True, prompt=prompting):
        yield response.text

    # The end timestamps are taken only after the stream has been fully
    # consumed, so they cover generation (and the time the consumer spends
    # between chunks), not just the creation of the stream.
    phase_03_end = time.time()
    print(f"Phase 03 (LLM) took {phase_03_end - phase_03_start} seconds")
    print(f"All process took {phase_03_end - all_process_start} seconds")
# HTML passed as the chat interface description: the Métropole logo
# followed by a right-aligned credit line.
description = """
<p>
<center>
<img src="https://www.nicecotedazur.org/wp-content/themes/mnca/images/logo-metropole-nca.png" alt="Logo Métropole Nice Côte d'Azur" width="250"/>
</center>
</p>
<p style="text-align:right"> Made by KHEOPS AI</p>
"""

# Chat UI whose replies are streamed from process_final().
demo = gr.ChatInterface(
    fn=process_final,
    title="METROPOLE CHATBOT",
    description=description,
)
demo.launch(share=True, debug=True)