mgoin committed
Commit: 24e1981
Parent(s): ba83872

Update app.py

Files changed (1):
  1. app.py (+5 -3)

app.py CHANGED
@@ -5,7 +5,7 @@ import gradio as gr
 # import spaces
 import torch
 from transformers import AutoTokenizer
-from vllm import LLM, SamplingParams
+from vllm import AsyncLLMEngine, AsyncEngineArgs, SamplingParams
 
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
@@ -19,7 +19,9 @@ if not torch.cuda.is_available():
     raise ValueError("Running on CPU 🥶 This demo does not work on CPU.")
 
 model_id = "neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50"
-model = LLM(model_id, sparsity="sparse_w16a16", max_model_len=MAX_INPUT_TOKEN_LENGTH)
+engine_args = AsyncEngineArgs(model=model_id, sparsity="sparse_w16a16", max_model_len=MAX_INPUT_TOKEN_LENGTH)
+engine = AsyncLLMEngine.from_engine_args(engine_args)
+
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 tokenizer.use_default_system_prompt = False
 
@@ -51,7 +53,7 @@ async def generate(
         repetition_penalty=repetition_penalty,
     )
 
-    stream = await model.add_request(uuid.uuid4().hex, formatted_conversation, sampling_params)
+    stream = await engine.add_request(uuid.uuid4().hex, formatted_conversation, sampling_params)
 
     async for request_output in stream:
         text = request_output.outputs[0].text
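In context: the synchronous `LLM` class the old code imported has no awaitable `add_request`, which is why the demo needed vLLM's async engine to stream tokens. A minimal sketch of the pattern the new code sets up (assuming the vLLM/nm-vllm API of this era; `sparsity=` is an nm-vllm extension rather than upstream vLLM, and the helper name and parameter values below are illustrative):

```python
import uuid

from vllm import AsyncLLMEngine, AsyncEngineArgs, SamplingParams

# Engine construction as in the diff; sparsity="sparse_w16a16" runs the
# 50%-pruned checkpoint through nm-vllm's sparse kernels (not upstream vLLM).
engine_args = AsyncEngineArgs(
    model="neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50",
    sparsity="sparse_w16a16",
    max_model_len=4096,  # stand-in for MAX_INPUT_TOKEN_LENGTH, not shown in the diff
)
engine = AsyncLLMEngine.from_engine_args(engine_args)

async def stream_completion(prompt: str) -> str:
    """Illustrative helper: stream one completion and return the final text."""
    sampling_params = SamplingParams(temperature=0.7, max_tokens=1024)
    # add_request registers the prompt under a unique request id and returns
    # an async stream that yields a RequestOutput per decoding step.
    stream = await engine.add_request(uuid.uuid4().hex, prompt, sampling_params)
    text = ""
    async for request_output in stream:
        # outputs[0].text holds the full text generated so far, not a delta.
        text = request_output.outputs[0].text
    return text
```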
 
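For completeness, a hypothetical way to exercise the sketch above outside the Gradio app (the real Space wires the stream into a chat UI instead):

```python
import asyncio

# Hypothetical one-shot driver; stream_completion is the illustrative
# helper defined in the sketch above.
if __name__ == "__main__":
    result = asyncio.run(stream_completion("Explain weight sparsity in one sentence."))
    print(result)
```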