Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -3,6 +3,7 @@ import copy
 import gradio as gr
 import spaces
 from llama_cpp import Llama
+import llama_cpp.llama_tokenizer
 import os
 from huggingface_hub import hf_hub_download
 
@@ -21,8 +22,9 @@ llm = Llama(
         filename=MODEL_FILE,
     ),
     n_ctx=4096,
-    n_gpu_layers=-1,
-
+    n_gpu_layers=-1,
+    tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(MODEL_ID),
+    verbose=False,
 )
 
 TITLE = "<h1><center>Chatbox</center></h1>"
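For context, a minimal sketch of what this constructor change amounts to in llama-cpp-python: the GGUF weights are fetched with `hf_hub_download`, every layer is offloaded to the GPU, and the tokenizer embedded in the GGUF is swapped for the model's original Hugging Face tokenizer. The `MODEL_ID` and `MODEL_FILE` values below are placeholders, not this Space's actual configuration:

```python
import llama_cpp.llama_tokenizer
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Placeholder identifiers; the Space defines its own MODEL_ID and MODEL_FILE.
MODEL_ID = "some-org/some-model"
MODEL_FILE = "model.Q4_K_M.gguf"

llm = Llama(
    # Fetch the GGUF weights from the Hub and load them from the local cache.
    model_path=hf_hub_download(
        repo_id=MODEL_ID,
        filename=MODEL_FILE,
    ),
    n_ctx=4096,
    n_gpu_layers=-1,  # -1 offloads all layers to the GPU
    # Tokenize with the model's original Hugging Face tokenizer rather than
    # the vocabulary embedded in the GGUF file.
    tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(MODEL_ID),
    verbose=False,  # silence llama.cpp's load-time logging
)
```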
@@ -61,14 +63,14 @@ def stream_chat(message: str, history: list, temperature: float, max_new_tokens:
 
     print(f"Conversation is -\n{conversation}")
 
-    output = llm
+    output = llm(
         messages=conversation,
         top_k=top_k,
         top_p=top_p,
         repeat_penalty=penalty,
         max_tokens=max_new_tokens,
         stream=True,
-        temperature=temperature,
+        temperature=temperature,
     )
 
     for out in output:
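One detail worth flagging: in llama-cpp-python, the `messages=` keyword belongs to `create_chat_completion`, while calling the `Llama` object directly (`llm(...)`) is the completion-style API that expects a `prompt` string. Below is a minimal sketch of the streaming handler this hunk appears to be building, written against `create_chat_completion`. The signature follows the truncated `stream_chat` header above, and the `(user, assistant)` tuple history format is an assumption about the surrounding Gradio code:

```python
def stream_chat(message: str, history: list, temperature: float,
                max_new_tokens: int, top_p: float, top_k: int, penalty: float):
    # Rebuild the conversation in OpenAI chat format from the Gradio history
    # (assumed here to be a list of (user, assistant) tuples).
    conversation = []
    for user_turn, assistant_turn in history:
        conversation.append({"role": "user", "content": user_turn})
        conversation.append({"role": "assistant", "content": assistant_turn})
    conversation.append({"role": "user", "content": message})

    output = llm.create_chat_completion(
        messages=conversation,
        top_k=top_k,
        top_p=top_p,
        repeat_penalty=penalty,
        max_tokens=max_new_tokens,
        stream=True,
        temperature=temperature,
    )

    # Each streamed chunk carries an OpenAI-style delta; accumulate the text and
    # yield the running string, since Gradio streaming handlers are expected to
    # yield the full response so far on each step.
    buffer = ""
    for out in output:
        delta = out["choices"][0].get("delta", {})
        buffer += delta.get("content") or ""
        yield buffer
```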