add token counter
app.py CHANGED
@@ -8,9 +8,9 @@ import time
 import subprocess
 
 MIN_TOKENS=128
-MAX_TOKENS=
-DEFAULT_TOKENS=
-DURATION=
+MAX_TOKENS=128000
+DEFAULT_TOKENS=4096
+DURATION=120
 
 # Install flash attention
 subprocess.run(
@@ -36,6 +36,19 @@ model.to(device)
 def handle_error(error):
     return {"error": str(error)}
 
+# Define a custom component for the token counter
+class TokenCounter(gr.components.Textbox):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.token_count = 0
+
+    def update(self, message):
+        self.token_count = len(tokenizer.encode(message, truncation=True))
+        self.update_text(f"Token count: {self.token_count}")
+
+# Create a token counter component
+token_counter = TokenCounter(label="Token Count", interactive=False)
+
 # Define chat function with input validation and error handling
 @spaces.GPU(duration=DURATION)
 def chat(message, history, temperature, do_sample, max_tokens):
@@ -56,6 +69,9 @@ def chat(message, history, temperature, do_sample, max_tokens):
         chat.append({"role": "assistant", "content": item[1]})
     chat.append({"role": "user", "content": message})
 
+    # Update the token counter
+    token_counter.update(message)
+
     # Generate response
     messages = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
     model_inputs = tokenizer([messages], return_tensors="pt").to(device)
@@ -105,6 +121,7 @@ demo = gr.ChatInterface(
             label="Max new tokens",
             render=False,
         ),
+        token_counter,  # Add the token counter component
     ],
     stop_btn="Stop Generation",
     title="Chat With LLMs",
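
Note that the TokenCounter subclass above calls self.update_text, which is not a method on Gradio's Textbox, and appending token_counter to additional_inputs makes gr.ChatInterface pass one more argument than chat(message, history, temperature, do_sample, max_tokens) accepts, so this change is likely to fail at runtime. Below is a minimal sketch of a more conventional wiring, assuming a loaded Hugging Face tokenizer; the count_tokens helper and the "gpt2" checkpoint are illustrative placeholders, not part of this Space. In Gradio, components are updated by returning new values from an event handler rather than by mutating the component object.

# A minimal sketch of an alternative wiring (illustrative, not the Space's code).
# Assumes a Hugging Face tokenizer; "gpt2" is a placeholder checkpoint.
import gradio as gr
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

def count_tokens(message: str) -> str:
    # Encode the draft message and report its length in tokens.
    return f"Token count: {len(tokenizer.encode(message))}"

with gr.Blocks() as demo:
    msg = gr.Textbox(label="Message")
    counter = gr.Textbox(label="Token Count", interactive=False)
    # Gradio refreshes components from handler return values, so the counter
    # is driven by a change event on the textbox rather than from inside chat().
    msg.change(count_tokens, inputs=msg, outputs=counter)

demo.launch()

Wired this way, the count refreshes as the user edits the message, and the chat function itself needs no changes.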