# gemma-2-9b-it
#
# Runtime error
#
# File size: 3,937 Bytes

import os
from threading import Thread
from typing import Iterator

import gradio as gr
import spaces
import torch
from transformers import (
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    GemmaTokenizerFast,
    TextIteratorStreamer,
)

# Markdown header for the demo page; it is rendered with gr.Markdown(DESCRIPTION)
# inside the Blocks layout at the bottom of this file.
# NOTE(review): the text and links advertise `google/gemma-2-9b-it`, while the
# checkpoint loaded via `model_id` is "ehristoforu/Gemma2-9b-it-train1" --
# confirm which one the page should describe.
DESCRIPTION = """\
# Gemma 2 9B IT

Gemma 2 is Google's latest iteration of open LLMs.
This is a demo of [`google/gemma-2-9b-it`](https://huggingface.co/google/gemma-2-9b-it), fine-tuned for instruction following.
For more details, please check [our post](https://huggingface.co/blog/gemma2).

👉 Looking for a larger and more powerful version? Try the 27B version in [HuggingChat](https://huggingface.co/chat/models/google/gemma-2-27b-it).
"""

# Generation limits. The two *_NEW_TOKENS values feed the "Max new tokens"
# slider; MAX_INPUT_TOKEN_LENGTH caps the prompt length inside generate() and
# can be overridden through the environment variable of the same name.
MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = int(os.environ.get("MAX_INPUT_TOKEN_LENGTH", "4096"))

# Prefer the first CUDA device when one is visible, otherwise fall back to CPU.
# NOTE(review): nothing else in this file reads `device`; generate() moves its
# tensors to `model.device` instead.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Tokenizer and model are loaded once at import time and shared by every
# request. The weights are loaded with an 8-bit bitsandbytes configuration and
# placement is delegated to device_map="auto".
model_id = "ehristoforu/Gemma2-9b-it-train1"
tokenizer = GemmaTokenizerFast.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
)
model.config.sliding_window = 4096
model.eval()


def _encode_prompt(message: str, system_prompt: str) -> torch.Tensor:
    """Tokenize a single-turn conversation with the tokenizer's chat template.

    Args:
        message: The user's message.
        system_prompt: Optional instructions for the model. Blank or ``None``
            values are ignored.

    Returns:
        The templated prompt as returned by ``apply_chat_template`` with
        ``return_tensors="pt"`` (indexed as ``(batch, tokens)`` by the caller).

    Raises:
        Exception: Whatever ``apply_chat_template`` raises for the plain
            user-only conversation; only the system-role attempt is retried.
    """

    def render(conversation):
        return tokenizer.apply_chat_template(
            conversation,
            add_generation_prompt=True,
            return_tensors="pt",
        )

    system_text = (system_prompt or "").strip()
    if not system_text:
        # An empty system turn carries no instruction and only gives the
        # template a reason to fail, so send the user turn on its own.
        return render([{"role": "user", "content": message}])

    try:
        # Preferred path: hand the prompt over unchanged as a real system turn.
        return render(
            [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": message},
            ]
        )
    except Exception:
        # NOTE(review): Gemma chat templates commonly reject the "system" role
        # with a template error. The error type belongs to the template engine,
        # which this file does not import, hence the broad catch. Nothing is
        # swallowed: the retry below re-raises any failure that is unrelated to
        # the system role.
        folded = f"{system_text}\n\n{message}"
        return render([{"role": "user", "content": folded}])


@spaces.GPU(duration=50)
def generate(
    message: str,
    system_prompt: str,
    max_new_tokens: int = 1024,
    temperature: float = 0.6,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.2,
) -> Iterator[str]:
    """Stream a sampled reply to ``message`` from the module-level model.

    Args:
        message: The user's message.
        system_prompt: Optional instructions; skipped when blank, and folded
            into the user turn if the chat template refuses a system role.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature forwarded to ``model.generate``.
        top_p: Nucleus-sampling threshold forwarded to ``model.generate``.
        top_k: Top-k cutoff forwarded to ``model.generate``.
        repetition_penalty: Repetition penalty forwarded to ``model.generate``.

    Yields:
        The full reply accumulated so far, once per chunk produced by the
        streamer, so each yielded value replaces the previous one.
    """
    input_ids = _encode_prompt(message, system_prompt)
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        # Keep the most recent tokens; the beginning of the prompt is dropped.
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
    input_ids = input_ids.to(model.device)

    streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = {
        "input_ids": input_ids,
        "streamer": streamer,
        "max_new_tokens": max_new_tokens,
        "do_sample": True,
        "top_p": top_p,
        "top_k": top_k,
        "temperature": temperature,
        "num_beams": 1,
        "repetition_penalty": repetition_penalty,
    }
    # model.generate blocks until generation finishes, so it runs in a worker
    # thread while this generator drains the streamer it writes to.
    worker = Thread(target=model.generate, kwargs=generate_kwargs)
    worker.start()

    outputs = []
    for text in streamer:
        outputs.append(text)
        yield "".join(outputs)

# --- Free-text inputs -------------------------------------------------------
message = gr.Textbox(label="Message", lines=2, max_lines=5, interactive=True)

system_prompt = gr.Textbox(label="System prompt", lines=2, max_lines=5, interactive=True)

# --- Sampling controls --------------------------------------------------------
# Each slider's initial value equals the matching default in generate()'s
# signature.
max_tokens = gr.Slider(
    minimum=1,
    maximum=MAX_MAX_NEW_TOKENS,
    value=DEFAULT_MAX_NEW_TOKENS,
    step=1,
    label="Max new tokens",
)
temperature = gr.Slider(
    minimum=0.1,
    maximum=4.0,
    value=0.6,
    step=0.1,
    label="Temperature",
)
top_p = gr.Slider(
    minimum=0.05,
    maximum=1.0,
    value=0.9,
    step=0.05,
    label="Top-p (nucleus sampling)",
)
top_k = gr.Slider(
    minimum=1,
    maximum=1000,
    value=50,
    step=1,
    label="Top-k",
)
repeat_penalty = gr.Slider(
    minimum=1.0,
    maximum=2.0,
    value=1.2,
    step=0.05,
    label="Repetition penalty",
)

# --- Output -------------------------------------------------------------------
# Receives the text streamed back by generate().
output = gr.Textbox(label="Output", lines=10, max_lines=16, interactive=True)

# Bind generate() to the components defined above. The `inputs` list is passed
# positionally, so its order must match generate()'s parameter order:
# message, system_prompt, max_new_tokens, temperature, top_p, top_k,
# repetition_penalty.
# NOTE(review): api_name begins with a slash -- confirm the resulting endpoint
# name is the one API clients are expected to call.
chat_interface = gr.Interface(
    fn=generate,
    inputs=[message, system_prompt, max_tokens, temperature, top_p, top_k, repeat_penalty],
    outputs=output,
    api_name="/run",

)

# Page layout: description header, a "duplicate this Space" button, then the
# interface itself. Statement order inside the `with` block is the render order.
# NOTE(review): "style.css" is referenced by relative path and is not part of
# this file -- presumably it ships next to this script.
with gr.Blocks(css="style.css", fill_height=True) as demo:
    gr.Markdown(DESCRIPTION)
    gr.DuplicateButton(value="Duplicate Space for private use", elem_id="duplicate-button")
    chat_interface.render()

# Launch only when executed as a script; the request queue is configured with
# max_size=20 before the server starts.
if __name__ == "__main__":
    demo.queue(max_size=20).launch()