Spaces:
Runtime error
Update app.py
app.py
CHANGED
@@ -8,9 +8,9 @@ import time
 import subprocess
 
 MIN_TOKENS=128
-MAX_TOKENS=
-DEFAULT_TOKENS=
-DURATION=
+MAX_TOKENS=8192
+DEFAULT_TOKENS=2048
+DURATION=60
 
 # Install flash attention
 subprocess.run(
@@ -36,19 +36,6 @@ model.to(device)
 def handle_error(error):
     return {"error": str(error)}
 
-# Define a custom component for the token counter
-class TokenCounter(gr.components.Textbox):
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-        self.token_count = 0
-
-    def update(self, message):
-        self.token_count = len(tokenizer.encode(message, truncation=True))
-        self.update_text(f"Token count: {self.token_count}")
-
-# Create a token counter component
-token_counter = TokenCounter(label="Token Count", interactive=False)
-
 # Define chat function with input validation and error handling
 @spaces.GPU(duration=DURATION)
 def chat(message, history, temperature, do_sample, max_tokens):
@@ -69,9 +56,6 @@ def chat(message, history, temperature, do_sample, max_tokens):
         chat.append({"role": "assistant", "content": item[1]})
     chat.append({"role": "user", "content": message})
 
-    # Update the token counter
-    token_counter.update(message)
-
     # Generate response
     messages = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    model_inputs = tokenizer([messages], return_tensors="pt").to(device)
@@ -121,7 +105,6 @@ demo = gr.ChatInterface(
             label="Max new tokens",
             render=False,
         ),
-        token_counter,  # Add the token counter component
     ],
     stop_btn="Stop Generation",
     title="Chat With LLMs",
@@ -129,4 +112,4 @@ demo = gr.ChatInterface(
 )
 
 # Launch Gradio app
-demo.launch()
+demo.launch()
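This commit plausibly addresses two separate crashes. First, the empty assignments MAX_TOKENS=, DEFAULT_TOKENS=, and DURATION= are Python syntax errors, so app.py could not even be imported; they now get concrete values. Second, the removed TokenCounter was fragile on its own: subclassing a built-in component is not a supported way to define custom components in Gradio 4, and update_text is not a method of gr.Textbox, so the update() call would raise an AttributeError. If a live token counter is still wanted, a minimal sketch using only supported Gradio API (and assuming the same tokenizer object loaded elsewhere in app.py) wires a plain gr.Textbox to a .change event instead:

# Minimal sketch, not part of this commit: a token counter built from a
# plain gr.Textbox and a .change listener instead of a component subclass.
# Assumes `tokenizer` is the Hugging Face tokenizer already loaded in app.py.
import gradio as gr

def count_tokens(message):
    # encode() returns the token ids for the draft message; len() is the count
    return f"Token count: {len(tokenizer.encode(message, truncation=True))}"

with gr.Blocks() as counter_demo:
    msg = gr.Textbox(label="Message")
    token_count = gr.Textbox(label="Token Count", interactive=False)
    # Recompute the count on every edit to the message box
    msg.change(count_tokens, inputs=msg, outputs=token_count)

This also keeps the counter out of additional_inputs, which gr.ChatInterface passes as extra arguments to the chat function rather than treating as display-only outputs.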