Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -3,6 +3,7 @@ import copy
 import gradio as gr
 import spaces
 from llama_cpp import Llama
+import llama_cpp.llama_tokenizer
 import os
 from huggingface_hub import hf_hub_download
 
@@ -21,8 +22,9 @@ llm = Llama(
         filename=MODEL_FILE,
     ),
     n_ctx=4096,
-    n_gpu_layers=-1,
-
+    n_gpu_layers=-1,
+    tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(MODEL_ID),
+    verbose=False,
 )
 
 TITLE = "<h1><center>Chatbox</center></h1>"
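For context, a minimal sketch of what this constructor change amounts to in llama-cpp-python: the GGUF weights are fetched with `hf_hub_download`, every layer is offloaded to the GPU, and the tokenizer embedded in the GGUF is swapped for the model's original Hugging Face tokenizer. The `MODEL_ID` and `MODEL_FILE` values below are placeholders, not this Space's actual configuration:

```python
import llama_cpp.llama_tokenizer
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Placeholder identifiers; the Space defines its own MODEL_ID and MODEL_FILE.
MODEL_ID = "some-org/some-model"
MODEL_FILE = "model.Q4_K_M.gguf"

llm = Llama(
    # Fetch the GGUF weights from the Hub and load them from the local cache.
    model_path=hf_hub_download(
        repo_id=MODEL_ID,
        filename=MODEL_FILE,
    ),
    n_ctx=4096,
    n_gpu_layers=-1,  # -1 offloads all layers to the GPU
    # Tokenize with the model's original Hugging Face tokenizer rather than
    # the vocabulary embedded in the GGUF file.
    tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(MODEL_ID),
    verbose=False,  # silence llama.cpp's load-time logging
)
```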
@@ -61,14 +63,14 @@ def stream_chat(message: str, history: list, temperature: float, max_new_tokens:
 
     print(f"Conversation is -\n{conversation}")
 
-    output = llm
+    output = llm(
         messages=conversation,
         top_k=top_k,
         top_p=top_p,
         repeat_penalty=penalty,
         max_tokens=max_new_tokens,
         stream=True,
-        temperature=temperature,
+        temperature=temperature,
     )
 
     for out in output:
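One detail worth flagging: in llama-cpp-python, the `messages=` keyword belongs to `create_chat_completion`, while calling the `Llama` object directly (`llm(...)`) is the completion-style API that expects a `prompt` string. Below is a minimal sketch of the streaming handler this hunk appears to be building, written against `create_chat_completion`. The signature follows the truncated `stream_chat` header above, and the `(user, assistant)` tuple history format is an assumption about the surrounding Gradio code:

```python
def stream_chat(message: str, history: list, temperature: float,
                max_new_tokens: int, top_p: float, top_k: int, penalty: float):
    # Rebuild the conversation in OpenAI chat format from the Gradio history
    # (assumed here to be a list of (user, assistant) tuples).
    conversation = []
    for user_turn, assistant_turn in history:
        conversation.append({"role": "user", "content": user_turn})
        conversation.append({"role": "assistant", "content": assistant_turn})
    conversation.append({"role": "user", "content": message})

    output = llm.create_chat_completion(
        messages=conversation,
        top_k=top_k,
        top_p=top_p,
        repeat_penalty=penalty,
        max_tokens=max_new_tokens,
        stream=True,
        temperature=temperature,
    )

    # Each streamed chunk carries an OpenAI-style delta; accumulate the text and
    # yield the running string, since Gradio streaming handlers are expected to
    # yield the full response so far on each step.
    buffer = ""
    for out in output:
        delta = out["choices"][0].get("delta", {})
        buffer += delta.get("content") or ""
        yield buffer
```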