Spaces:
Runtime error
Update app.py
app.py
CHANGED
@@ -8,9 +8,9 @@ import time
 import subprocess
 
 MIN_TOKENS=128
-MAX_TOKENS=
-DEFAULT_TOKENS=
-DURATION=
+MAX_TOKENS=8192
+DEFAULT_TOKENS=2048
+DURATION=60
 
 # Install flash attention
 subprocess.run(
@@ -36,19 +36,6 @@ model.to(device)
 def handle_error(error):
     return {"error": str(error)}
 
-# Define a custom component for the token counter
-class TokenCounter(gr.components.Textbox):
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-        self.token_count = 0
-
-    def update(self, message):
-        self.token_count = len(tokenizer.encode(message, truncation=True))
-        self.update_text(f"Token count: {self.token_count}")
-
-# Create a token counter component
-token_counter = TokenCounter(label="Token Count", interactive=False)
-
 # Define chat function with input validation and error handling
 @spaces.GPU(duration=DURATION)
 def chat(message, history, temperature, do_sample, max_tokens):
@@ -69,9 +56,6 @@ def chat(message, history, temperature, do_sample, max_tokens):
         chat.append({"role": "assistant", "content": item[1]})
     chat.append({"role": "user", "content": message})
 
-    # Update the token counter
-    token_counter.update(message)
-
     # Generate response
     messages = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    model_inputs = tokenizer([messages], return_tensors="pt").to(device)
@@ -121,7 +105,6 @@ demo = gr.ChatInterface(
             label="Max new tokens",
             render=False,
         ),
-        token_counter,  # Add the token counter component
     ],
     stop_btn="Stop Generation",
     title="Chat With LLMs",
@@ -129,4 +112,4 @@ demo = gr.ChatInterface(
 )
 
 # Launch Gradio app
-demo.launch()
+demo.launch()
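This commit plausibly addresses two separate crashes. First, the empty assignments MAX_TOKENS=, DEFAULT_TOKENS=, and DURATION= are Python syntax errors, so app.py could not even be imported; they now get concrete values. Second, the removed TokenCounter was fragile on its own: subclassing a built-in component is not a supported way to define custom components in Gradio 4, and update_text is not a method of gr.Textbox, so the update() call would raise an AttributeError. If a live token counter is still wanted, a minimal sketch using only supported Gradio API (and assuming the same tokenizer object loaded elsewhere in app.py) wires a plain gr.Textbox to a .change event instead:

# Minimal sketch, not part of this commit: a token counter built from a
# plain gr.Textbox and a .change listener instead of a component subclass.
# Assumes `tokenizer` is the Hugging Face tokenizer already loaded in app.py.
import gradio as gr

def count_tokens(message):
    # encode() returns the token ids for the draft message; len() is the count
    return f"Token count: {len(tokenizer.encode(message, truncation=True))}"

with gr.Blocks() as counter_demo:
    msg = gr.Textbox(label="Message")
    token_count = gr.Textbox(label="Token Count", interactive=False)
    # Recompute the count on every edit to the message box
    msg.change(count_tokens, inputs=msg, outputs=token_count)

This also keeps the counter out of additional_inputs, which gr.ChatInterface passes as extra arguments to the chat function rather than treating as display-only outputs.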