import streamlit as st
from llama_cpp import Llama
import warnings
warnings.filterwarnings(action='ignore')
import datetime
import random
import string
from time import sleep
import tiktoken
# required for HF SPACES
from huggingface_hub import hf_hub_download
import os

# for counting the tokens in the prompt and in the result
# context_count = len(encoding.encode(yourtext))
encoding = tiktoken.get_encoding("r50k_base")

verbosity = False
nCTX = 8192
sTOPS = ['']
modelname = "Gemma2-2B-it"

# Set the webpage title
st.set_page_config(
    page_title=f"Your LocalGPT ✨ with {modelname}",
    page_icon="🌟",
    layout="wide")

if "hf_model" not in st.session_state:
    st.session_state.hf_model = "Gemma2-2B-it"
# Initialize the chat history sent to the LLM
if "messages" not in st.session_state:
    st.session_state.messages = []
# Initialize the chat messages used for visualization only
if "chatMessages" not in st.session_state:
    st.session_state.chatMessages = []
if "repeat" not in st.session_state:
    st.session_state.repeat = 1.35
if "temperature" not in st.session_state:
    st.session_state.temperature = 0.1
if "maxlength" not in st.session_state:
    st.session_state.maxlength = 500
if "speed" not in st.session_state:
    st.session_state.speed = 0.0
if "numOfTurns" not in st.session_state:
    st.session_state.numOfTurns = 0
if "maxTurns" not in st.session_state:
    st.session_state.maxTurns = 5  # must be an odd number, greater than or equal to 5

def writehistory(filename, text):
    # Append one entry to the log file; the context manager closes the file
    with open(filename, 'a', encoding='utf-8') as f:
        f.write(text)
        f.write('\n')

def genRANstring(n):
    """
    n = int number of characters to randomize
    """
    res = ''.join(random.choices(string.ascii_uppercase + string.digits, k=n))
    return res

# @st.cache_resource
def create_chat():
    # Select the HF repo and GGUF file to load
    from llama_cpp import Llama
    #modelfile = hf_hub_download(
    #    repo_id=os.environ.get("REPO_ID", "bartowski/gemma-2-2b-it-GGUF"),
    #    filename=os.environ.get("MODEL_FILE", "gemma-2-2b-it-Q5_K_M.gguf"),
    #)
    client = Llama.from_pretrained(
        repo_id="bartowski/gemma-2-2b-it-GGUF",
        filename="gemma-2-2b-it-Q4_K_S.gguf",
        #model_path=modelfile,
        #n_gpu_layers=-1,  # enable GPU
        n_threads=2,
        temperature=0.24,
        n_ctx=nCTX,
        max_tokens=600,
        repeat_penalty=1.176,
        stop=sTOPS,
        flash_attn=True,
        verbose=verbosity,
    )
    print('loading gemma-2-2b-it-Q4_K_S.gguf with LlamaCPP...')
    return client
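# Note: Llama.from_pretrained() fetches the GGUF through the huggingface_hub
# cache, so the download only happens on the first run. Re-enabling the
# @st.cache_resource decorator above should also keep the loaded model alive
# across Streamlit reruns instead of reloading it on every interaction.
# The sampling arguments passed to the constructor (temperature, max_tokens,
# repeat_penalty, stop) appear to be absorbed as extra keyword arguments; the
# effective generation settings are the ones supplied on each
# create_chat_completion() call below.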
# Create the session states
if "logfilename" not in st.session_state:
    ## Logger file
    logfile = f'{genRANstring(5)}_log.txt'
    st.session_state.logfilename = logfile
    # Write the first two entries into the history file
    writehistory(st.session_state.logfilename, f'{str(datetime.datetime.now())}\n\nYour own LocalGPT with 🌀 {modelname}\n---\n🧠🫡: You are a helpful assistant.')
    writehistory(st.session_state.logfilename, f'🌀: How may I help you today?')

# AVATARS: a URL, a local path like './man.png', or a single emoji
# such as "🧑‍💻", "🤖", "🦖"; shortcodes are not supported
av_us = 'https://github.com/fabiomatricardi/Gemma2-2b-it-chatbot/raw/main/images/user.png'
av_ass = 'https://github.com/fabiomatricardi/Gemma2-2b-it-chatbot/raw/main/images/assistant2.png'  # or './robot.png'

### START STREAMLIT UI
# Create a header element
st.image('https://github.com/fabiomatricardi/Gemma2-2b-it-chatbot/raw/main/images/Gemma-2-Banner.original.jpg', use_column_width=True)
mytitle = f'> *🌟 {modelname} with {nCTX} tokens Context window* - Turn based Chat available with max capacity of :orange[**{st.session_state.maxTurns} messages**].'
st.markdown(mytitle, unsafe_allow_html=True)
#st.markdown('> Local Chat ')
#st.markdown('---')

# CREATE THE SIDEBAR
with st.sidebar:
    st.image('https://github.com/fabiomatricardi/Gemma2-2b-it-chatbot/raw/main/images/banner.png', use_column_width=True)
    st.session_state.temperature = st.slider('Temperature:', min_value=0.0, max_value=1.0, value=0.65, step=0.01)
    st.session_state.maxlength = st.slider('Length reply:', min_value=150, max_value=2000, value=550, step=50)
    st.session_state.repeat = st.slider('Repeat Penalty:', min_value=0.0, max_value=2.0, value=1.176, step=0.02)
    st.session_state.turns = st.toggle('Turn based', value=False,
                                       help='Activate Conversational Turn Chat with History',
                                       disabled=False, label_visibility="visible")
    st.markdown(f"*Number of Max Turns*: {st.session_state.maxTurns}")
    actualTurns = st.markdown(f"*Chat History Length*: :green[Good]")
    statspeed = st.markdown(f'💫 speed: {st.session_state.speed} t/s')
    btnClear = st.button("Clear History", type="primary", use_container_width=True)
    st.markdown(f"**Logfile**: {st.session_state.logfilename}")

# Wire up the Clear History button (intent assumed from its label):
# reset both histories and the turn counter on click
if btnClear:
    st.session_state.messages = []
    st.session_state.chatMessages = []
    st.session_state.numOfTurns = 0

llm = create_chat()
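# Two parallel histories are kept: st.session_state.messages holds the plain
# role/content dicts sent to the model, while st.session_state.chatMessages
# also carries the generation stats appended to each reply, for display only.
# When the 'Turn based' toggle is on and the history grows beyond maxTurns,
# only the last maxTurns messages are sent to the model; keeping maxTurns odd
# means the trimmed window always starts on a user message.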
# Display chat messages from history on app rerun
for message in st.session_state.chatMessages:
    if message["role"] == "user":
        with st.chat_message(message["role"], avatar=av_us):
            st.markdown(message["content"])
    else:
        with st.chat_message(message["role"], avatar=av_ass):
            st.markdown(message["content"])

# Accept user input
if myprompt := st.chat_input("What is an AI model?"):
    # Add user message to chat history
    st.session_state.messages.append({"role": "user", "content": myprompt})
    st.session_state.chatMessages.append({"role": "user", "content": myprompt})
    st.session_state.numOfTurns = len(st.session_state.messages)
    # Display user message in chat message container
    with st.chat_message("user", avatar=av_us):
        st.markdown(myprompt)
    usertext = f"user: {myprompt}"
    writehistory(st.session_state.logfilename, usertext)
    # Display assistant response in chat message container
    with st.chat_message("assistant", avatar=av_ass):
        message_placeholder = st.empty()
        with st.spinner("Thinking..."):
            start = datetime.datetime.now()
            response = ''
            conv_messages = []
            if st.session_state.turns:
                if st.session_state.numOfTurns > st.session_state.maxTurns:
                    # Keep only the last maxTurns messages
                    conv_messages = st.session_state.messages[-st.session_state.maxTurns:]
                    actualTurns.markdown(f"*Chat History Length*: :red[Trimmed]")
                else:
                    conv_messages = st.session_state.messages
            else:
                # Single-turn mode: send only the latest user message
                conv_messages.append(st.session_state.messages[-1])
            full_response = ""
            for chunk in llm.create_chat_completion(
                messages=conv_messages,
                temperature=st.session_state.temperature,
                repeat_penalty=st.session_state.repeat,
                stop=sTOPS,
                max_tokens=st.session_state.maxlength,
                stream=True,):
                try:
                    if chunk["choices"][0]["delta"]["content"]:
                        full_response += chunk["choices"][0]["delta"]["content"]
                        message_placeholder.markdown(full_response + "🟡")
                        delta = datetime.datetime.now() - start
                        totalseconds = delta.total_seconds()
                        prompttokens = len(encoding.encode(myprompt))
                        assistanttokens = len(encoding.encode(full_response))
                        totaltokens = prompttokens + assistanttokens
                        st.session_state.speed = totaltokens / totalseconds
                        statspeed.markdown(f'💫 speed: {st.session_state.speed:.2f} t/s')
                except (KeyError, IndexError):
                    # The first and last stream chunks carry no "content" delta
                    pass
            delta = datetime.datetime.now() - start
            totalseconds = delta.total_seconds()
            prompttokens = len(encoding.encode(myprompt))
            assistanttokens = len(encoding.encode(full_response))
            totaltokens = prompttokens + assistanttokens
            st.session_state.speed = totaltokens / totalseconds
            statspeed.markdown(f'💫 speed: {st.session_state.speed:.2f} t/s')
            toregister = full_response + f"""
```
🧾 prompt tokens: {prompttokens}
📈 generated tokens: {assistanttokens}
⏳ generation time: {delta}
💫 speed: {st.session_state.speed:.3f}  t/s
```"""
            message_placeholder.markdown(toregister)
            asstext = f"assistant: {toregister}"
            writehistory(st.session_state.logfilename, asstext)
            st.session_state.messages.append({"role": "assistant", "content": full_response})
            st.session_state.chatMessages.append({"role": "assistant", "content": toregister})
            st.session_state.numOfTurns = len(st.session_state.messages)
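# To run the app locally (assuming this file is saved as app.py):
#   streamlit run app.py
# Dependencies imported above: streamlit, llama-cpp-python, tiktoken,
# huggingface-hub (the model is downloaded from the Hugging Face Hub on first run).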