import gradio as gr
import os
from typing import Iterator
import sambanova


def generate(
    message: str,
    chat_history: list[tuple[str, str]],
    system_message: str,
    max_tokens: int = 1024,
    temperature: float = 0.6,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.2,
) -> Iterator[str]:
    """Stream the assistant's reply to ``message``.

    Builds a role/content conversation from the system prompt, the prior
    turns in ``chat_history`` and the new user ``message``, then streams the
    completion from ``sambanova.Streamer``.

    Args:
        message: The user's current prompt.
        chat_history: Earlier turns as ``(user_text, assistant_text)`` pairs;
            empty/falsy entries in a pair are skipped.
        system_message: Content of the leading system message.
        max_tokens: Forwarded to the streamer as ``new_tokens``.
        temperature: Sampling temperature, forwarded to the streamer.
        top_p: Nucleus-sampling threshold, forwarded to the streamer.
        top_k: Top-k cutoff, forwarded to the streamer.
        repetition_penalty: Accepted for interface compatibility but not
            forwarded to the streamer.
            NOTE(review): confirm whether ``sambanova.Streamer`` supports it.

    Yields:
        The reply accumulated so far, growing by one streamed chunk per
        iteration.
    """
    conversation = [{"role": "system", "content": system_message}]

    for turn in chat_history:
        user_text, assistant_text = turn[0], turn[1]
        if user_text:
            conversation.append({"role": "user", "content": user_text})
        if assistant_text:
            conversation.append({"role": "assistant", "content": assistant_text})

    # The current prompt must be the final user turn; without it the model
    # only ever sees the history and never the question being asked.
    conversation.append({"role": "user", "content": message})

    chunks = []
    stream = sambanova.Streamer(
        conversation,
        new_tokens=max_tokens,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
    )
    for text in stream:
        chunks.append(text)
        # Yield the whole reply so far, not just the newest chunk.
        yield "".join(chunks)


# Upper bound and initial value of the "Max tokens" slider.
MAX_MAX_TOKENS = 2048
DEFAULT_MAX_TOKENS = 1024
# Read from the environment, falling back to 4096 when the variable is unset.
MAX_INPUT_TOKEN_LENGTH = int(os.environ.get("MAX_INPUT_TOKEN_LENGTH", "4096"))

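# NOTE: The commented-out block below is an earlier ChatInterface configuration,
# kept for reference only. It is stale: it refers to MAX_MAX_NEW_TOKENS and
# DEFAULT_MAX_NEW_TOKENS (the constants are now MAX_MAX_TOKENS and
# DEFAULT_MAX_TOKENS) and it has no "System message" input, so its additional
# inputs no longer line up with generate()'s parameters.
# chat_interface = gr.ChatInterface(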
#     fn=generate,
#     additional_inputs=[
#         gr.Slider(
#             label="Max new tokens",
#             minimum=1,
#             maximum=MAX_MAX_NEW_TOKENS,
#             step=1,
#             value=DEFAULT_MAX_NEW_TOKENS,
#         ),
#         gr.Slider(
#             label="Temperature",
#             minimum=0.1,
#             maximum=4.0,
#             step=0.1,
#             value=0.6,
#         ),
#         gr.Slider(
#             label="Top-p (nucleus sampling)",
#             minimum=0.05,
#             maximum=1.0,
#             step=0.05,
#             value=0.9,
#         ),
#         gr.Slider(
#             label="Top-k",
#             minimum=1,
#             maximum=1000,
#             step=1,
#             value=50,
#         ),
#         gr.Slider(
#             label="Repetition penalty",
#             minimum=1.0,
#             maximum=2.0,
#             step=0.05,
#             value=1.2,
#         ),
#     ],
#     stop_btn=None,
#     fill_height=True,
#     examples=[
#         ["Which one is bigger? 4.9 or 4.11"],
#         [
#             "Can you explain briefly to me what is the Python programming language?"
#         ],
#         ["Explain the plot of Cinderella in a sentence."],
#         ["How many hours does it take a man to eat a Helicopter?"],
#         [
#             "Write a 100-word article on 'Benefits of Open-Source in AI research'"
#         ],
#     ],
#     cache_examples=False,
# )

# Chat UI backed by generate(). The additional inputs are listed in the same
# order as generate()'s parameters after (message, chat_history):
# system_message, max_tokens, temperature, top_p, top_k. There is no control
# for repetition_penalty, so generate() uses its default for that argument.
chat_interface = gr.ChatInterface(
    fn=generate,
    additional_inputs=[
        gr.Textbox(label="System message", value="You are a friendly Chatbot."),
        gr.Slider(
            label="Max tokens",
            value=DEFAULT_MAX_TOKENS,
            minimum=1,
            maximum=MAX_MAX_TOKENS,
            step=1,
        ),
        gr.Slider(label="Temperature", value=0.6, minimum=0.1, maximum=4.0, step=0.1),
        gr.Slider(
            label="Top-p (nucleus sampling)",
            value=0.9,
            minimum=0.05,
            maximum=1.0,
            step=0.05,
        ),
        gr.Slider(label="Top-k", value=50, minimum=1, maximum=1000, step=1),
    ],
    # Each example is a single-element row holding only the chat message.
    examples=[
        ["Which one is bigger? 4.9 or 4.11"],
        ["Can you explain briefly to me what is the Python programming language?"],
        ["Explain the plot of Cinderella in a sentence."],
        ["How many hours does it take a man to eat a Helicopter?"],
        ["Write a 100-word article on 'Benefits of Open-Source in AI research'"],
    ],
    cache_examples=False,
)

# Page layout: a title heading followed by the chat interface defined above.
with gr.Blocks() as demo:
    gr.Markdown("# Sambanova model inference LLAMA 405B")
    chat_interface.render()

if __name__ == "__main__":
    # Enable the request queue (at most 20 queued requests), then start the server.
    demo.queue(max_size=20)
    demo.launch()