Benjamin Gonzalez committed on
Commit
c4f947a
·
1 Parent(s): 62a0c90

try to implement streaming

Browse files
Files changed (1) hide show
  1. app.py +44 -7
app.py CHANGED
@@ -1,5 +1,11 @@
1
  import torch
2
- from transformers import AutoTokenizer, AutoModelForCausalLM
 
 
 
 
 
 
3
  import gradio as gr
4
 
5
  if torch.cuda.is_available():
@@ -13,11 +19,39 @@ model = AutoModelForCausalLM.from_pretrained(
13
  )
14
 
15
 
16
- def generate(prompt, length):
17
- inputs = tokenizer(prompt, return_tensors="pt", return_attention_mask=False)
18
- input_token_len = len(inputs.tokens())
19
- outputs = model.generate(**inputs, max_length=length if length >= input_token_len else input_token_len)
20
- return tokenizer.batch_decode(outputs)[0]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
 
23
  demo = gr.Interface(
@@ -27,7 +61,7 @@ demo = gr.Interface(
27
  label="prompt",
28
  value="Write a detailed analogy between mathematics and a lighthouse.",
29
  ),
30
- gr.Number(value=100, label="max length", maximum=500),
31
  ],
32
  outputs="text",
33
  examples=[
@@ -50,6 +84,9 @@ demo = gr.Interface(
50
  """\n''',
51
  100,
52
  ],
 
 
 
53
  ],
54
  title="Microsoft Phi-2",
55
  description="Unofficial demo of Microsoft Phi-2, a high performing model with only 2.7B parameters.",
 
1
  import torch
2
+ from transformers import (
3
+ AutoTokenizer,
4
+ AutoModelForCausalLM,
5
+ TextIteratorStreamer,
6
+ StoppingCriteriaList,
7
+ )
8
+ from threading import Thread
9
  import gradio as gr
10
 
11
  if torch.cuda.is_available():
 
19
  )
20
 
21
 
22
def Phi2StoppingCriteria(
    input_ids: torch.LongTensor, score: torch.FloatTensor, **kwargs
) -> bool:
    """Decide whether generation should stop after the newest token.

    Generation stops as soon as the sequence produced so far ends with the
    token ids of one of the stop strings listed below.

    Args:
        input_ids: Token ids produced so far (prompt included). Expected as
            a ``(batch, sequence)`` tensor; only the first row is inspected
            because ``generate`` in this file tokenizes a single prompt.
        score: Prediction scores supplied by the stopping-criteria
            machinery; unused here.
        **kwargs: Extra arguments supplied by the caller; ignored.

    Returns:
        True when the inspected sequence ends with one of the stop
        sequences, False otherwise.
    """
    stop_list = ["Exercise", "Exercises", "<|endoftext|>"]
    # Index the sequence, not the batch: ``input_ids[-1]`` would select the
    # last batch row rather than the most recent token.
    sequence = input_ids[0] if input_ids.dim() > 1 else input_ids
    for stop in stop_list:
        # Plain Python ints so the tail comparison below is an ordinary
        # list equality instead of an ambiguous tensor truth value.
        # NOTE(review): a stop word tokenized on its own may not produce the
        # same ids the model emits mid-sentence (e.g. with a leading space)
        # -- confirm against the tokenizer.
        stop_ids = tokenizer(stop, add_special_tokens=False).input_ids
        if not stop_ids or len(stop_ids) > sequence.shape[-1]:
            continue
        # A stop string may span several tokens, so compare the whole tail.
        if sequence[-len(stop_ids):].tolist() == stop_ids:
            return True
    return False


# Criteria list handed to ``model.generate`` through ``stopping_criteria=``.
stopping_criteria = StoppingCriteriaList([Phi2StoppingCriteria])
35
+
36
+
37
def generate(prompt, max_new_tokens):
    """Stream a sampled completion for *prompt*.

    ``model.generate`` runs in a background thread and pushes decoded text
    into a ``TextIteratorStreamer``; this generator drains the streamer and
    yields the accumulated text after every new chunk so the UI can update
    incrementally.

    Args:
        prompt: Text to feed to the model.
        max_new_tokens: Upper bound on the number of generated tokens. It is
            coerced to ``int`` because the value comes from a numeric UI
            field and may arrive as a float.

    Yields:
        The text streamed so far, growing with each chunk.
    """
    inputs = tokenizer(prompt, return_tensors="pt")
    # thanks https://huggingface.co/spaces/joaogante/transformers_streaming/blob/main/app.py
    # The streamer decodes token ids back into text, so it must be given the
    # tokenizer itself rather than the encoded prompt.
    streamer = TextIteratorStreamer(tokenizer)
    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=int(max_new_tokens),
        do_sample=True,
        stopping_criteria=stopping_criteria,
    )
    # Generation blocks until finished, so it runs on a worker thread while
    # this generator consumes the streamer.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    model_output = ""
    for new_text in streamer:
        model_output += new_text
        yield model_output
    # The streamer is exhausted only once generation has ended; join so the
    # worker thread is cleaned up before the generator finishes.
    thread.join()
55
 
56
 
57
  demo = gr.Interface(
 
61
  label="prompt",
62
  value="Write a detailed analogy between mathematics and a lighthouse.",
63
  ),
64
+ gr.Number(value=100, label="max new tokens", maximum=500),
65
  ],
66
  outputs="text",
67
  examples=[
 
84
  """\n''',
85
  100,
86
  ],
87
+ ["User: How does sleep affect mood?\nAI:", 125],
88
+ ["Who was Ada Lovelace?", 100],
89
+ ["Explain the concept of skip lists.", 125],
90
  ],
91
  title="Microsoft Phi-2",
92
  description="Unofficial demo of Microsoft Phi-2, a high performing model with only 2.7B parameters.",