# SambaNova-fast / app.py
# Xianbao QIAN
# fill height for the block as well!
# e2bc778
# raw
# history blame contribute delete
# No virus
# 4.12 kB
import gradio as gr
import os
from typing import Iterator
import sambanova
def generate(
    message: str,
    chat_history: list[tuple[str, str]],
    system_message,
    max_tokens: int = 1024,
    temperature: float = 0.6,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.2,
) -> Iterator[str]:
    """Stream the reply to ``message``, yielding the text accumulated so far.

    A role-tagged conversation is assembled from the system message, the
    prior turns and the new user message, then handed to
    ``sambanova.Streamer``. Every item the streamer produces is appended to
    the reply, and the whole reply so far is yielded after each item.

    Args:
        message: The new user message; always added as the last turn.
        chat_history: Earlier turns, each indexed as ``[0]`` (user text) and
            ``[1]`` (assistant text). Falsy entries are left out.
        system_message: Content of the leading ``system`` turn.
        max_tokens: Passed to the streamer as ``new_tokens``.
        temperature: Passed to the streamer unchanged.
        top_p: Passed to the streamer unchanged.
        top_k: Passed to the streamer unchanged.
        repetition_penalty: Accepted for interface compatibility; it is not
            forwarded to the streamer.

    Yields:
        The concatenation of all streamed pieces received so far.
    """
    turns = [{"role": "system", "content": system_message}]

    # Replay the history in order, user side first, dropping empty sides.
    for exchange in chat_history:
        for role, slot in (("user", 0), ("assistant", 1)):
            content = exchange[slot]
            if content:
                turns.append({"role": role, "content": content})

    turns.append({"role": "user", "content": message})

    stream = sambanova.Streamer(
        turns,
        new_tokens=max_tokens,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
    )

    pieces = []
    for piece in stream:
        pieces.append(piece)
        # Yield the full reply so far, not just the newest piece.
        yield "".join(pieces)
# Initial value and upper bound of the "Max tokens" slider in the chat UI.
DEFAULT_MAX_TOKENS = 1024
MAX_MAX_TOKENS = 2048

# Input-length setting taken from the MAX_INPUT_TOKEN_LENGTH environment
# variable, falling back to 4096 when it is unset.
# NOTE(review): not referenced anywhere in the visible part of this file.
MAX_INPUT_TOKEN_LENGTH = int(os.environ.get("MAX_INPUT_TOKEN_LENGTH", "4096"))
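# NOTE: The commented-out block below is a disabled ChatInterface
# configuration. It refers to MAX_MAX_NEW_TOKENS and DEFAULT_MAX_NEW_TOKENS,
# which are not defined in the visible part of this file (the constants here
# are named MAX_MAX_TOKENS and DEFAULT_MAX_TOKENS), so it cannot be re-enabled
# as-is.
# chat_interface = gr.ChatInterface(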
# fn=generate,
# additional_inputs=[
# gr.Slider(
# label="Max new tokens",
# minimum=1,
# maximum=MAX_MAX_NEW_TOKENS,
# step=1,
# value=DEFAULT_MAX_NEW_TOKENS,
# ),
# gr.Slider(
# label="Temperature",
# minimum=0.1,
# maximum=4.0,
# step=0.1,
# value=0.6,
# ),
# gr.Slider(
# label="Top-p (nucleus sampling)",
# minimum=0.05,
# maximum=1.0,
# step=0.05,
# value=0.9,
# ),
# gr.Slider(
# label="Top-k",
# minimum=1,
# maximum=1000,
# step=1,
# value=50,
# ),
# gr.Slider(
# label="Repetition penalty",
# minimum=1.0,
# maximum=2.0,
# step=0.05,
# value=1.2,
# ),
# ],
# stop_btn=None,
# fill_height=True,
# examples=[
# ["Which one is bigger? 4.9 or 4.11"],
# [
# "Can you explain briefly to me what is the Python programming language?"
# ],
# ["Explain the plot of Cinderella in a sentence."],
# ["How many hours does it take a man to eat a Helicopter?"],
# [
# "Write a 100-word article on 'Benefits of Open-Source in AI research'"
# ],
# ],
# cache_examples=False,
# )
# Chat UI wired to `generate`. The additional inputs are passed to `generate`
# after (message, chat_history), in this order: system message, max tokens,
# temperature, top-p, top-k.
chat_interface = gr.ChatInterface(
    fn=generate,
    additional_inputs=[
        gr.Textbox(label="System message", value="You are a friendly Chatbot."),
        # One slider per sampling control: (label, min, max, step, initial).
        *(
            gr.Slider(
                label=label,
                minimum=lowest,
                maximum=highest,
                step=increment,
                value=initial,
            )
            for label, lowest, highest, increment, initial in (
                ("Max tokens", 1, MAX_MAX_TOKENS, 1, DEFAULT_MAX_TOKENS),
                ("Temperature", 0.1, 4.0, 0.1, 0.6),
                ("Top-p (nucleus sampling)", 0.05, 1.0, 0.05, 0.9),
                ("Top-k", 1, 1000, 1, 50),
            )
        ),
    ],
    # Each example is a one-element row holding only the message text.
    examples=[
        [prompt]
        for prompt in (
            "Which one is bigger? 4.9 or 4.11",
            "Can you explain briefly to me what is the Python programming language?",
            "Explain the plot of Cinderella in a sentence.",
            "How many hours does it take a man to eat a Helicopter?",
            "Write a 100-word article on 'Benefits of Open-Source in AI research'",
        )
    ],
    cache_examples=False,
)
# Page layout: a title heading followed by the chat interface defined above.
with gr.Blocks(fill_height=True) as demo:
    gr.Markdown(value="# Sambanova model inference LLAMA 405B")
    chat_interface.render()

# When run as a script, enable the queue (max_size=20) and start the app.
if __name__ == "__main__":
    demo.queue(max_size=20)
    demo.launch()