tinyllama-chat

Runtime error

File size: 1,980 Bytes

762c182
86acf2f
762c182
354bd03
 
 
a6dfd28
 
354bd03
 
 
 
 
710ee23
354bd03
 
 
 
 
 
 
 
 
 
 
82fe858
354bd03
86acf2f
354bd03
86acf2f
 
354bd03
 
 
82fe858
354bd03
86acf2f
 
 
 
 
354bd03
 
 
 
 
86acf2f
354bd03
710ee23
 
354bd03
86acf2f
354bd03
 
 
 
 
 
 
ed610fa
 
 
 
 
354bd03
19fcd2b
354bd03

import gradio as gr
import json

from huggingface_hub import snapshot_download
from llama_cpp import Llama

# Hugging Face Hub repository and the single quantized weights file used from it.
repo_name = "PY007/TinyLlama-1.1B-Chat-v0.2-GGUF"
model_name = "ggml-model-q4_0.gguf"

# Prompt wrapper: the user's message is substituted for "{}" and the text ends
# right after the assistant marker so the model writes the reply.
template = "<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"

# Download only the file matching `model_name` into the current directory, so
# the relative `model_path` passed to Llama below resolves to it.
snapshot_download(
    repo_id=repo_name,
    allow_patterns=model_name,
    local_dir=".",
)

# Loaded once at import time and shared by every call to generate().
model = Llama(model_path=model_name, n_ctx=1024, n_parts=1)


def generate(
    input=None,
    temperature=0.1,
    top_p=0.75,
    top_k=40,
    max_tokens=512,
):
    """Stream a completion for a single user message.

    The message is substituted into the module-level ``template`` and the
    resulting prompt is passed to ``model.create_completion`` with
    ``stream=True``. Generation stops at the ``<|im_end|>`` marker.

    Args:
        input: The user's message. ``None`` is treated as an empty message.
            (The name shadows the builtin; it is kept so keyword callers
            keep working.)
        temperature: Forwarded to ``create_completion`` as ``temperature``.
        top_p: Forwarded to ``create_completion`` as ``top_p``.
        top_k: Forwarded to ``create_completion`` as ``top_k``.
        max_tokens: Forwarded to ``create_completion`` as ``max_tokens``.

    Yields:
        The cumulative text generated so far, once per streamed chunk, so a
        consumer can simply display the latest value.

    Returns:
        The complete generated text (available as the generator's
        ``StopIteration`` value).
    """
    # Without this guard the default ``None`` would be formatted into the
    # prompt as the literal text "None".
    user_message = "" if input is None else input
    prompt = template.format(user_message)

    output = ""
    stream = model.create_completion(
        prompt,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        max_tokens=max_tokens,
        stop=["<|im_end|>"],
        echo=False,
        stream=True,
    )
    for chunk in stream:
        # Each streamed chunk carries only the newly generated text fragment.
        output += chunk["choices"][0]["text"]
        yield output
    return output

# Web UI: one prompt box plus sampling controls, wired to generate().
# The inputs are passed positionally, in this order, to
# generate(input, temperature, top_p, top_k, max_tokens).
g = gr.Interface(
    fn=generate,
    inputs=[
        gr.components.Textbox(
            lines=2, label="Prompt", value = "What is Huggingface?"
        ),
        gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Temperature"),
        gr.components.Slider(minimum=0, maximum=1, value=1, label="Top p"),
        gr.components.Slider(minimum=0, maximum=100, step=1, value=50, label="Top k"),
        gr.components.Slider(minimum=1, maximum=1024, step=1, value=256, label="Max tokens"),
    ],
    outputs=[
        gr.Textbox(
            lines=10,
            label="Output",
        )
    ],
    title = "TinyLlama 1.1B Chat GGUF",
    description =  """
                    original model: [PY007/TinyLlama-1.1B-Chat-v0.2](https://huggingface.co/PY007/TinyLlama-1.1B-Chat-v0.2)
                    quantized_model: [kirp/TinyLlama-1.1B-Chat-v0.2-gguf](https://huggingface.co/kirp/TinyLlama-1.1B-Chat-v0.2-gguf)
                    """
)

# Allow one generation at a time: every request uses the single module-level
# `model`. Gradio 4+ rejects the old `concurrency_count` keyword at startup
# and uses `default_concurrency_limit` instead; older releases do not know
# the new keyword and raise TypeError, in which case the legacy spelling is used.
try:
    g.queue(default_concurrency_limit=1)
except TypeError:
    g.queue(concurrency_count=1)
g.launch()