import nest_asyncio
import gradio as gr
import os
import time

import pandas as pd
import torch
from huggingface_hub import login
from transformers import BitsAndBytesConfig

from llama_index.core import Document, QueryBundle, Settings, VectorStoreIndex
from llama_index.core.postprocessor import LLMRerank
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM

nest_asyncio.apply()

# Read the Hugging Face API token from the environment (set `hf_token` before
# running) and authenticate so the models can be downloaded.
hf_token = os.getenv('hf_token')
login(token=hf_token)

# Quantize the model to 4-bit NF4 with float16 compute to save GPU memory.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

llm = HuggingFaceLLM(
    model_name="kheopss/kheops_hermes-e1-v0.11-bnb-16bit",
    tokenizer_name="kheopss/kheops_hermes-e1-v0.11-bnb-16bit",
    context_window=3900,
    max_new_tokens=2560,
    model_kwargs={"quantization_config": quantization_config},
    generate_kwargs={"temperature": 0.1, "top_k": 50, "top_p": 0.95},
    device_map="cuda:0",
)

embed_model = HuggingFaceEmbedding(
    model_name="kheopss/kheops_embedding_e5_v3",
)

Settings.llm = llm
Settings.embed_model = embed_model

# Load the cleaned knowledge base and wrap each record in a Document, keeping
# the file name and description as metadata.
file_path = 'response_metropo_cleaned.json'
data = pd.read_json(file_path)
documents = [
    Document(
        text=row['values'],
        metadata={
            "filename": row['file_name'],
            "description": row['file_description'],
        },
    )
    for _, row in data.iterrows()
]

index = VectorStoreIndex.from_documents(documents, show_progress=True)


def get_retrieved_nodes(
    query_str, vector_top_k=10, reranker_top_n=3, with_reranker=False
):
    query_bundle = QueryBundle(query_str)

    # Phase 01: dense retrieval of the top-k candidate nodes.
    phase_01_start = time.time()
    retriever = VectorIndexRetriever(
        index=index,
        similarity_top_k=vector_top_k,
    )
    retrieved_nodes = retriever.retrieve(query_bundle)
    phase_01_end = time.time()
    print(f"Phase 01 took : {phase_01_end - phase_01_start}")

    # Phase 02: optional LLM-based reranking down to the top-n nodes.
    phase_02_start = time.time()
    if with_reranker:
        reranker = LLMRerank(
            choice_batch_size=5,
            top_n=reranker_top_n,
        )
        retrieved_nodes = reranker.postprocess_nodes(retrieved_nodes, query_bundle)
    phase_02_end = time.time()
    print(f"Phase 02 took : {phase_02_end - phase_02_start}")

    return retrieved_nodes


def get_all_text(new_nodes):
    # Concatenate the retrieved node texts, numbering each as "Document i".
    texts = []
    for i, node in enumerate(new_nodes, 1):
        texts.append(f"\nDocument {i} : {node.get_text()}")
    return ' '.join(texts)


def completion_to(text, user_p):
    system_p = "You are a conversational AI assistant tasked with helping public agents in Nice guide residents and citizens to appropriate services. You will respond to user queries using information from provided documents. Your answer mode can be 'Grounded' or 'Mixed'. In 'Grounded' mode, use only exact facts from the documents, citing them with tags. In 'Mixed' mode, you can incorporate both document facts and your own knowledge. Always respond in French, keeping your answers grounded in the document text and engaging in conversation to assist based on user questions."
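    # The prompt below follows the ChatML template (<|im_start|> / <|im_end|>)
    # used by Hermes-style chat models: a system turn carrying the retrieved
    # documents, a user turn, then an open assistant turn for the model to
    # complete.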
return f"<|im_start|>system{system_p}\n DOCUMENTS : \n {text}\n <|im_end|><|im_start|>user \n{user_p}\n<|im_end|><|im_start|>assistant" def process_final(user_prom, history): import time all_process_start = time.time() new_nodes = get_retrieved_nodes( user_prom, vector_top_k=5, reranker_top_n=3, with_reranker=True, ) get_texts = get_all_text(new_nodes) prompting = completion_to(get_texts,user_prom) print("PHASE 03 passing to LLM\n") phase_03_start = time.time() gen =llm.stream_complete(formatted=True, prompt=prompting) # phase_03_end = time.time() # all_process_end = time.time() # print(f"Phase 03 (LLM) took {phase_03_end - phase_03_start} seconds") # print(f"All process took {all_process_end - all_process_start} seconds") # llm.stream_complete(formatted=True, prompt=prompting) for response in gen: yield response.text description = """


Made by KHEOPS AI

""" demo = gr.ChatInterface( fn=process_final, title="METROPOLE CHATBOT", description=description, ) demo.launch(share=True, debug =True)