import logging
import sys

# Log llama-index activity to stdout.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

import torch
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.llms import HuggingFaceLLM
from llama_index.prompts.prompts import SimpleInputPrompt

# Load every document in the Data/ directory.
documents = SimpleDirectoryReader("Data").load_data()

system_prompt = (
    "You are a medical chatbot designed to provide general medical information "
    "and guidance to patients. You are not a substitute for a licensed healthcare "
    "professional. Please always consult with a doctor or other qualified "
    "healthcare provider for any medical concerns. Here are some specific "
    "examples of how you can be helpful: answer general medical questions about "
    "symptoms, conditions, and treatments; provide information on healthy "
    "lifestyle habits and preventive care; help patients find reliable medical "
    "resources and support groups; offer emotional support and reassurance to "
    "patients facing medical challenges. Please keep in mind the following: "
    "always direct patients to seek professional medical attention; use clear "
    "and concise language that is easy for patients to understand; be respectful "
    "and sensitive to patients' individual needs and concerns. I am confident "
    "that you can be a valuable tool for patients seeking medical information "
    "and guidance. Thank you for your dedication to helping others."
)

# This wraps the default prompts that are internal to llama-index.
query_wrapper_prompt = SimpleInputPrompt("<|USER|>{query_str}<|ASSISTANT|>")

# Authenticate with the Hugging Face Hub first if the model is gated:
#   !huggingface-cli login

llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=256,
    # With do_sample=False decoding is greedy, so temperature has no effect.
    generate_kwargs={"temperature": 0.5, "do_sample": False},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="NousResearch/Llama-2-7b-chat-hf",
    model_name="NousResearch/Llama-2-7b-chat-hf",
    device_map="auto",
    # Reduces memory usage on CUDA; load_in_8bit requires bitsandbytes.
    model_kwargs={"torch_dtype": torch.float16, "load_in_8bit": True},
)

from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.embeddings.langchain import LangchainEmbedding

# Embed document chunks with a sentence-transformers model via LangChain.
embed_model = LangchainEmbedding(
    HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
)

service_context = ServiceContext.from_defaults(
    chunk_size=1024,
    llm=llm,
    embed_model=embed_model,
)

# Build the vector index and a query engine over it.
index = VectorStoreIndex.from_documents(documents, service_context=service_context)
query_engine = index.as_query_engine()

# Quick sanity check before wiring up the UI.
response = query_engine.query("What are several common symptoms of cataracts?")
print(response)

import gradio as gr

# A console REPL like the one below blocks forever and would keep the Gradio
# app from launching, so leave it commented out unless you want terminal-only chat:
#   while True:
#       print(query_engine.query(input()))

def answer(query):
    # Return a string so the response renders cleanly in the Gradio Textbox.
    return str(query_engine.query(query))

demo = gr.Blocks()
with demo:
    query_input = gr.Textbox(label="Ask a question:")
    response_output = gr.Textbox(label="Answer:")
    query_input.submit(answer, inputs=query_input, outputs=response_output)

demo.launch()
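# --- Optional: persist the index so it isn't rebuilt on every run ---
# A minimal sketch using llama-index's storage API (same pre-0.10 import style
# as the script above); the "./storage" directory name is an assumption. In
# practice this would replace the VectorStoreIndex.from_documents() step and
# must run before demo.launch(), so it is left commented out here:
#
#   from llama_index import StorageContext, load_index_from_storage
#
#   index.storage_context.persist(persist_dir="./storage")  # save after first build
#   storage_context = StorageContext.from_defaults(persist_dir="./storage")
#   index = load_index_from_storage(storage_context, service_context=service_context)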