Spaces:

kheopss
/

METROPOLE_CHATBOT_FINAL

Sleeping

App Files Files Community

METROPOLE_CHATBOT_FINAL / app.py

kheopss

Update app.py

fff1463 verified 3 months ago

raw

history blame

5.03 kB

	import nest_asyncio
	import gradio as gr
	from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
	from llama_index.core.postprocessor import LLMRerank
	import logging
	import sys
	from llama_index.embeddings.huggingface import HuggingFaceEmbedding
	from llama_index.legacy.llms.huggingface import HuggingFaceInferenceAPI, HuggingFaceLLM
	from llama_index.core import Settings
	from llama_index.llms.huggingface import HuggingFaceLLM
	import torch
	from transformers import BitsAndBytesConfig
	from llama_index.core.prompts import PromptTemplate
	from llama_index.llms.openai import OpenAI
	import os
	import pandas as pd
	from llama_index.core import Document
	from llama_index.core.retrievers import VectorIndexRetriever
	from llama_index.core import QueryBundle
	import time
	from huggingface_hub import login

	nest_asyncio.apply()
	hf_token = os.getenv('hf_token')


	# Replace 'your_token_here' with your actual Hugging Face API token
	login(token=hf_token)
	# quantize to save memory
	quantization_config = BitsAndBytesConfig(
	load_in_4bit=True,
	bnb_4bit_compute_dtype=torch.float16,
	bnb_4bit_quant_type="nf4",
	bnb_4bit_use_double_quant=True,
	)

	llm = HuggingFaceLLM(
	model_name="kheopss/kheops_hermes-e1-v0.11-bnb-16bit",
	tokenizer_name="kheopss/kheops_hermes-e1-v0.11-bnb-16bit",
	context_window=3900,
	max_new_tokens=2560,
	model_kwargs={"quantization_config": quantization_config},
	generate_kwargs={"temperature": 0.1, "top_k": 50, "top_p": 0.95},
	device_map="cuda:0",

	)

	embed_model = HuggingFaceEmbedding(
	model_name="kheopss/kheops_embedding_e5_v3",
	)
	Settings.llm=llm
	Settings.embed_model=embed_model
	# Replace 'file_path.json' with the path to your JSON file


	file_path = 'response_metropo_cleaned.json'

	data = pd.read_json(file_path)

	documents = [Document(text=row['values'],metadata={"filename": row['file_name'], "description":row['file_description']},) for index, row in data.iterrows()]
	index = VectorStoreIndex.from_documents(documents, show_progress=True)

	def get_retrieved_nodes(
	query_str, vector_top_k=10, reranker_top_n=3, with_reranker=False
	):
	query_bundle = QueryBundle(query_str)
	# configure retriever
	phase_01_start = time.time()
	retriever = VectorIndexRetriever(
	index=index,
	similarity_top_k=vector_top_k,
	)
	retrieved_nodes = retriever.retrieve(query_bundle)
	phase_01_end = time.time()
	print(f"Phase 01 <RETRIEVING> took : {phase_01_end-phase_01_start}")
	phase_02_start = time.time()
	if with_reranker:
	# configure reranker
	reranker = LLMRerank(

	choice_batch_size=5,
	top_n=reranker_top_n,
	)
	retrieved_nodes = reranker.postprocess_nodes(
	retrieved_nodes, query_bundle
	)
	phase_02_end = time.time()
	print(f"Phase 02 <RERANKING> took : {phase_02_end-phase_02_start}")
	return retrieved_nodes

	def get_all_text(new_nodes):
	texts = []
	for i, node in enumerate(new_nodes, 1):
	texts.append(f"\nDocument {i} : {node.get_text()}")
	return ' '.join(texts)
	def completion_to(text,user_p):
	system_p = "You are a conversational AI assistant tasked with helping public agents in Nice guide residents and citizens to appropriate services. You will respond to user queries using information from provided documents. Your answer mode can be 'Grounded' or 'Mixed'. In 'Grounded' mode, use only exact facts from the documents, citing them with <co: doc_id></co> tags. In 'Mixed' mode, you can incorporate both document facts and your own knowledge. Always respond in French, keeping your answers grounded in the document text and engaging in conversation to assist based on user questions."
	return f"<\|im_start\|>system{system_p}\n DOCUMENTS : \n {text}\n <\|im_end\|><\|im_start\|>user \n{user_p}\n<\|im_end\|><\|im_start\|>assistant"


	def process_final(user_prom, history):
	import time
	all_process_start = time.time()
	new_nodes = get_retrieved_nodes(
	user_prom,
	vector_top_k=5,
	reranker_top_n=3,
	with_reranker=True,
	)
	get_texts = get_all_text(new_nodes)
	prompting = completion_to(get_texts,user_prom)
	print("PHASE 03 passing to LLM\n")
	phase_03_start = time.time()
	gen =llm.stream_complete(formatted=True, prompt=prompting)
	# phase_03_end = time.time()
	# all_process_end = time.time()
	# print(f"Phase 03 (LLM) took {phase_03_end - phase_03_start} seconds")
	# print(f"All process took {all_process_end - all_process_start} seconds")
	# llm.stream_complete(formatted=True, prompt=prompting)

	for response in gen:
	yield response.text
	description = """
	<p>
	<center>
	<img src="https://www.nicecotedazur.org/wp-content/themes/mnca/images/logo-metropole-nca.png" alt="rick" width="250"/>
	</center>
	</p>
	<p style="text-align:right"> Made by KHEOPS AI</p>
	"""
	demo = gr.ChatInterface(
	fn=process_final,
	title="METROPOLE CHATBOT",
	description=description,
	)
	demo.launch(share=True, debug =True)