# tinyllama-chat / app.py
import gradio as gr
from huggingface_hub import snapshot_download
from llama_cpp import Llama
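
# Download only the quantized GGUF weights file from the Hub into the working directory.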
repo_name = "kirp/TinyLlama-1.1B-Chat-v0.2-gguf"
model_name = "ggml-model-q4_k_m.gguf"
snapshot_download(repo_id=repo_name, local_dir=".", allow_patterns=model_name)
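
# Load the model with a 2048-token context window.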
model = Llama(
    model_path=model_name,
    n_ctx=2048,
    n_parts=1,
)
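
# ChatML-style prompt wrapper expected by TinyLlama-1.1B-Chat-v0.2.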
template = "<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"


def generate(
    input=None,
    temperature=0.1,
    top_p=0.75,
    top_k=40,
    max_new_tokens=512,
):
    # Wrap the user input in the chat template and run the model.
    prompt = template.format(input)
    output = model(
        prompt,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        # max_tokens bounds the completion only, so the slider value is used directly.
        max_tokens=max_new_tokens,
        stop=["<|im_end|>"],
        echo=True,
    )
    output = output["choices"][0]["text"]
    # echo=True returns prompt + completion, so keep only the assistant's reply.
    return output.split("assistant\n")[1]
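
# Gradio UI: a prompt box plus sampling-parameter sliders, with a single text output.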
g = gr.Interface(
    fn=generate,
    inputs=[
        gr.components.Textbox(
            lines=2, label="Prompt", placeholder="Tell me about huggingface."
        ),
        gr.components.Slider(minimum=0, maximum=1, value=0.7, label="Temperature"),
        gr.components.Slider(minimum=0, maximum=1, value=0.8, label="Top p"),
        gr.components.Slider(minimum=0, maximum=100, step=1, value=50, label="Top k"),
        gr.components.Slider(
            minimum=1, maximum=2048, step=1, value=512, label="Max tokens"
        ),
    ],
    outputs=[
        gr.Textbox(
            lines=10,
            label="Output",
        )
    ],
    title="TinyLlama 1.1B Chat GGUF",
    description="""
original model: [PY007/TinyLlama-1.1B-Chat-v0.2](https://huggingface.co/PY007/TinyLlama-1.1B-Chat-v0.2)
quantized model: [kirp/TinyLlama-1.1B-Chat-v0.2-gguf](https://huggingface.co/kirp/TinyLlama-1.1B-Chat-v0.2-gguf)
""",
)
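
# Allow up to two requests to be processed concurrently.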
g.queue(concurrency_count=2)
g.launch()