File size: 1,980 Bytes
762c182
86acf2f
762c182
354bd03
 
 
a6dfd28
 
354bd03
 
 
 
 
710ee23
354bd03
 
 
 
 
 
 
 
 
 
 
82fe858
354bd03
86acf2f
354bd03
86acf2f
 
354bd03
 
 
82fe858
354bd03
86acf2f
 
 
 
 
354bd03
 
 
 
 
86acf2f
354bd03
710ee23
 
354bd03
86acf2f
354bd03
 
 
 
 
 
 
ed610fa
 
 
 
 
354bd03
19fcd2b
354bd03
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import gradio as gr
import json

from huggingface_hub import snapshot_download
from llama_cpp import Llama

# Hub repository holding the quantized weights, and the one GGUF file in it
# that this app loads.
repo_name = "PY007/TinyLlama-1.1B-Chat-v0.2-GGUF"
model_name = "ggml-model-q4_0.gguf"

# Download into the current directory, restricted to files matching
# `model_name`, so the relative `model_path` used below resolves.
snapshot_download(
    allow_patterns=model_name,
    local_dir=".",
    repo_id=repo_name,
)

# Load the model once at import time; `generate` reuses this instance.
model = Llama(n_parts=1, n_ctx=1024, model_path=model_name)


# Single-turn prompt layout: the user message replaces `{}`, and the prompt
# ends right where the assistant's reply is expected to begin.
template = "<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"

def _as_int(value):
    """Coerce a numeric UI value to ``int``; ``None`` is passed through."""
    return value if value is None else int(value)


def generate(
    input=None,
    temperature=0.1,
    top_p=0.75,
    top_k=40,
    max_tokens=512,
):
    """Stream a completion for a single user message.

    The message is inserted into the module-level ``template`` and sent to
    the module-level ``model`` with streaming enabled.  Generation is asked
    to stop at the ``<|im_end|>`` marker and the prompt is not echoed back.

    Args:
        input: The user message.  ``None`` is treated as an empty message
            instead of being formatted as the literal text ``"None"``.  (The
            name shadows the builtin but is kept so existing keyword callers
            keep working.)
        temperature: Forwarded unchanged as ``temperature``.
        top_p: Forwarded unchanged as ``top_p``.
        top_k: Forwarded as ``top_k`` after coercion to ``int``; slider
            widgets may deliver whole numbers as floats.
        max_tokens: Forwarded as ``max_tokens`` after coercion to ``int``.

    Yields:
        str: The text accumulated so far, once per streamed chunk, so a
        consumer can simply display the latest value.

    Returns:
        The final accumulated text.  This is only observable through
        ``yield from`` / ``StopIteration.value``; plain iteration ignores it.
    """
    user_message = "" if input is None else input
    prompt = template.format(user_message)

    stream = model.create_completion(
        prompt,
        temperature=temperature,
        top_k=_as_int(top_k),
        top_p=top_p,
        max_tokens=_as_int(max_tokens),
        stop=["<|im_end|>"],
        echo=False,
        stream=True,
    )

    output = ""
    for chunk in stream:
        # Each chunk carries only the newly generated piece of text.
        output += chunk["choices"][0]["text"]
        yield output
    return output

# Input widgets, in the same order as the positional parameters of
# `generate`: prompt, temperature, top_p, top_k, max_tokens.
_input_widgets = [
    gr.components.Textbox(
        lines=2,
        label="Prompt",
        value="What is Huggingface?",
    ),
    gr.components.Slider(label="Temperature", minimum=0, maximum=1, value=0.1),
    gr.components.Slider(label="Top p", minimum=0, maximum=1, value=1),
    gr.components.Slider(label="Top k", minimum=0, maximum=100, step=1, value=50),
    gr.components.Slider(label="Max tokens", minimum=1, maximum=1024, step=1, value=256),
]

# A single text area that shows the text yielded by `generate`.
_output_widgets = [gr.Textbox(lines=10, label="Output")]

_description = """
                    original model: [PY007/TinyLlama-1.1B-Chat-v0.2](https://huggingface.co/PY007/TinyLlama-1.1B-Chat-v0.2)
                    quantized_model: [kirp/TinyLlama-1.1B-Chat-v0.2-gguf](https://huggingface.co/kirp/TinyLlama-1.1B-Chat-v0.2-gguf)
                    """

g = gr.Interface(
    fn=generate,
    inputs=_input_widgets,
    outputs=_output_widgets,
    title="TinyLlama 1.1B Chat GGUF",
    description=_description,
)

# Enable the request queue with `concurrency_count=1`, then start the app.
g.queue(concurrency_count=1)
g.launch()