crang committed on
Commit
0bcc140
·
1 Parent(s): 86689e5

add token counter

Browse files
Files changed (1) hide show
  1. app.py +20 -3
app.py CHANGED
@@ -8,9 +8,9 @@ import time
8
  import subprocess
9
 
10
  MIN_TOKENS=128
11
- MAX_TOKENS=8192
12
- DEFAULT_TOKENS=2048
13
- DURATION=60
14
 
15
  # Install flash attention
16
  subprocess.run(
@@ -36,6 +36,19 @@ model.to(device)
36
  def handle_error(error):
37
  return {"error": str(error)}
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  # Define chat function with input validation and error handling
40
  @spaces.GPU(duration=DURATION)
41
  def chat(message, history, temperature, do_sample, max_tokens):
@@ -56,6 +69,9 @@ def chat(message, history, temperature, do_sample, max_tokens):
56
  chat.append({"role": "assistant", "content": item[1]})
57
  chat.append({"role": "user", "content": message})
58
 
 
 
 
59
  # Generate response
60
  messages = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
61
  model_inputs = tokenizer([messages], return_tensors="pt").to(device)
@@ -105,6 +121,7 @@ demo = gr.ChatInterface(
105
  label="Max new tokens",
106
  render=False,
107
  ),
 
108
  ],
109
  stop_btn="Stop Generation",
110
  title="Chat With LLMs",
 
8
  import subprocess
9
 
10
  MIN_TOKENS=128
11
+ MAX_TOKENS=128000
12
+ DEFAULT_TOKENS=4096
13
+ DURATION=120
14
 
15
  # Install flash attention
16
  subprocess.run(
 
36
  def handle_error(error):
37
  return {"error": str(error)}
38
 
39
+ # Define a custom component for the token counter
40
+ class TokenCounter(gr.components.Textbox):
41
+ def __init__(self, **kwargs):
42
+ super().__init__(**kwargs)
43
+ self.token_count = 0
44
+
45
+ def update(self, message):
46
+ self.token_count = len(tokenizer.encode(message, truncation=True))
47
+ self.update_text(f"Token count: {self.token_count}")
48
+
49
+ # Create a token counter component
50
+ token_counter = TokenCounter(label="Token Count", interactive=False)
51
+
52
  # Define chat function with input validation and error handling
53
  @spaces.GPU(duration=DURATION)
54
  def chat(message, history, temperature, do_sample, max_tokens):
 
69
  chat.append({"role": "assistant", "content": item[1]})
70
  chat.append({"role": "user", "content": message})
71
 
72
+ # Update the token counter
73
+ token_counter.update(message)
74
+
75
  # Generate response
76
  messages = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
77
  model_inputs = tokenizer([messages], return_tensors="pt").to(device)
 
121
  label="Max new tokens",
122
  render=False,
123
  ),
124
+ token_counter, # Add the token counter component
125
  ],
126
  stop_btn="Stop Generation",
127
  title="Chat With LLMs",