add token counter
app.py CHANGED
@@ -8,9 +8,9 @@ import time
 import subprocess
 
 MIN_TOKENS=128
-MAX_TOKENS=
-DEFAULT_TOKENS=
-DURATION=
+MAX_TOKENS=128000
+DEFAULT_TOKENS=4096
+DURATION=120
 
 # Install flash attention
 subprocess.run(
@@ -36,6 +36,19 @@ model.to(device)
 def handle_error(error):
     return {"error": str(error)}
 
+# Define a custom component for the token counter
+class TokenCounter(gr.components.Textbox):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.token_count = 0
+
+    def update(self, message):
+        self.token_count = len(tokenizer.encode(message, truncation=True))
+        self.update_text(f"Token count: {self.token_count}")
+
+# Create a token counter component
+token_counter = TokenCounter(label="Token Count", interactive=False)
+
 # Define chat function with input validation and error handling
 @spaces.GPU(duration=DURATION)
 def chat(message, history, temperature, do_sample, max_tokens):
@@ -56,6 +69,9 @@ def chat(message, history, temperature, do_sample, max_tokens):
         chat.append({"role": "assistant", "content": item[1]})
     chat.append({"role": "user", "content": message})
 
+    # Update the token counter
+    token_counter.update(message)
+
     # Generate response
     messages = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
     model_inputs = tokenizer([messages], return_tensors="pt").to(device)
@@ -105,6 +121,7 @@ demo = gr.ChatInterface(
             label="Max new tokens",
             render=False,
         ),
+        token_counter,  # Add the token counter component
     ],
     stop_btn="Stop Generation",
     title="Chat With LLMs",
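
Note that the TokenCounter subclass above calls self.update_text, which is not a method on Gradio's Textbox, and appending token_counter to additional_inputs makes gr.ChatInterface pass one more argument than chat(message, history, temperature, do_sample, max_tokens) accepts, so this change is likely to fail at runtime. Below is a minimal sketch of a more conventional wiring, assuming a loaded Hugging Face tokenizer; the count_tokens helper and the "gpt2" checkpoint are illustrative placeholders, not part of this Space. In Gradio, components are updated by returning new values from an event handler rather than by mutating the component object.

# A minimal sketch of an alternative wiring (illustrative, not the Space's code).
# Assumes a Hugging Face tokenizer; "gpt2" is a placeholder checkpoint.
import gradio as gr
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

def count_tokens(message: str) -> str:
    # Encode the draft message and report its length in tokens.
    return f"Token count: {len(tokenizer.encode(message))}"

with gr.Blocks() as demo:
    msg = gr.Textbox(label="Message")
    counter = gr.Textbox(label="Token Count", interactive=False)
    # Gradio refreshes components from handler return values, so the counter
    # is driven by a change event on the textbox rather than from inside chat().
    msg.change(count_tokens, inputs=msg, outputs=counter)

demo.launch()

Wired this way, the count refreshes as the user edits the message, and the chat function itself needs no changes.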