import gradio as gr
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TextIteratorStreamer,
    BitsAndBytesConfig,
)
import os
from threading import Thread
import spaces

token = os.environ["HF_TOKEN"]

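# Load the weights in 4-bit via bitsandbytes, computing in float16, to cut GPU memory use.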
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16
)

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-128k-instruct", quantization_config=quantization_config, token=token
)
tok = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct", token=token)


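# Report the available device; model inputs are moved here before generation.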
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using GPU: {torch.cuda.get_device_name(device)}")
else:
    device = torch.device("cpu")
    print("Using CPU")

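# On ZeroGPU Spaces, @spaces.GPU requests a GPU for the call, here for up to 150 seconds.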
@spaces.GPU(duration=150)
def chat(message, history, temperature, do_sample, max_tokens):
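    # Rebuild the running conversation as role/content messages for the chat template.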
    chat = []
    for item in history:
        chat.append({"role": "user", "content": item[0]})
        if item[1] is not None:
            chat.append({"role": "assistant", "content": item[1]})
    chat.append({"role": "user", "content": message})
    messages = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    model_inputs = tok([messages], return_tensors="pt").to(device)
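    # Stream decoded tokens back as they are generated, skipping the prompt and special tokens.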
    streamer = TextIteratorStreamer(
        tok, timeout=10.0, skip_prompt=True, skip_special_tokens=True
    )
    generate_kwargs = dict(
        model_inputs,
        streamer=streamer,
        max_new_tokens=max_tokens,
        do_sample=do_sample,
        temperature=temperature,
    )

    # A temperature of 0 makes sampling degenerate, so fall back to greedy decoding.
    if temperature == 0:
        generate_kwargs["do_sample"] = False
    
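    # Run generation on a background thread so this generator can yield partial text as it arrives.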
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

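    # Accumulate streamed chunks, yielding the growing reply so the UI updates live.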
    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        yield partial_text


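# Chat UI with generation controls collected in a collapsible accordion.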
demo = gr.ChatInterface(
    fn=chat,
    examples=[["Write me a poem about Machine Learning."]],
    # multimodal=False,
    additional_inputs_accordion=gr.Accordion(
        label="⚙️ Parameters", open=False, render=False
    ),
    additional_inputs=[
        gr.Slider(
            minimum=0, maximum=1, step=0.1, value=0.9, label="Temperature", render=False
        ),
        gr.Checkbox(label="Sampling", value=True, render=False),
        gr.Slider(
            minimum=128,
            maximum=4096,
            step=1,
            value=512,
            label="Max new tokens",
            render=False,
        ),
    ],
    stop_btn="Stop Generation",
    title="Chat With LLMs",
    description="Now running [Microsoft Phi-3-mini-128k-instruct](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct) in 4-bit",
)
demo.launch()