Benjamin G committed on
Commit
0439661
·
1 Parent(s): c4f947a

added streaming

Browse files
Files changed (2) hide show
  1. app.py +79 -22
  2. requirements.txt +0 -15
app.py CHANGED
@@ -3,47 +3,70 @@ from transformers import (
3
  AutoTokenizer,
4
  AutoModelForCausalLM,
5
  TextIteratorStreamer,
6
- StoppingCriteriaList,
7
  )
8
  from threading import Thread
9
  import gradio as gr
10
 
11
- if torch.cuda.is_available():
12
- torch.set_default_device("cuda")
 
 
 
13
 
14
  tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)
15
  model = AutoModelForCausalLM.from_pretrained(
16
  "microsoft/phi-2",
17
- torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
 
 
18
  trust_remote_code=True,
19
  )
20
 
21
 
22
- def Phi2StoppingCriteria(
23
- input_ids: torch.LongTensor, score: torch.FloatTensor, **kwargs
24
- ) -> bool:
25
- stop_list = ["Exercise", "Exercises", "<|endoftext|>"]
26
- stop_tokens = []
27
- for stop in stop_list:
28
- stop_tokens.append(
29
- tokenizer(stop, add_special_tokens=False, return_tensors="pt").input_ids
30
- )
31
- return input_ids[-1] in stop_tokens
32
-
33
 
34
- stopping_criteria = StoppingCriteriaList([Phi2StoppingCriteria])
 
 
 
 
 
35
 
36
 
37
- def generate(prompt, max_new_tokens):
 
 
 
 
 
 
 
 
38
  inputs = tokenizer(prompt, return_tensors="pt")
39
  # thanks https://huggingface.co/spaces/joaogante/transformers_streaming/blob/main/app.py
40
- streamer = TextIteratorStreamer(inputs)
41
  generation_kwargs = dict(
42
  inputs,
43
  streamer=streamer,
44
  max_new_tokens=max_new_tokens,
45
- do_sample=True,
46
- stopping_criteria=stopping_criteria,
 
 
 
 
 
47
  )
48
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
49
  thread.start()
@@ -61,7 +84,41 @@ demo = gr.Interface(
61
  label="prompt",
62
  value="Write a detailed analogy between mathematics and a lighthouse.",
63
  ),
64
- gr.Number(value=100, label="max new tokens", maximum=500),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  ],
66
  outputs="text",
67
  examples=[
@@ -84,7 +141,7 @@ demo = gr.Interface(
84
  """\n''',
85
  100,
86
  ],
87
- ["User: How does sleep affect mood?\nAI:", 125],
88
  ["Who was Ada Lovelace?", 100],
89
  ["Explain the concept of skip lists.", 125],
90
  ],
 
3
  AutoTokenizer,
4
  AutoModelForCausalLM,
5
  TextIteratorStreamer,
6
+ StoppingCriteria,
7
  )
8
  from threading import Thread
9
  import gradio as gr
10
 
11
+ # has_gpu = torch.cuda.is_available()
12
+ has_gpu = False
13
+ device = "cuda" if has_gpu else "cpu"
14
+
15
+ torch.set_default_device(device)
16
 
17
  tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)
18
  model = AutoModelForCausalLM.from_pretrained(
19
  "microsoft/phi-2",
20
+ # torch_dtype=torch.float16 if has_gpu else torch.float32,
21
+ torch_dtype=torch.float32,
22
+ device_map=device,
23
  trust_remote_code=True,
24
  )
25
 
26
 
27
+ # custom stopping criteria (avoid generating hallucinated prompts)
28
+ # still includes these tokens in the output but stops generating after them
29
class Phi2StoppingCriteria(StoppingCriteria):
    """Stop generation once the output ends with a known stop phrase.

    The stop phrases ("Exercise", "Exercises", "<|endoftext|>") are tokenized
    once at construction time with the module-level ``tokenizer``. The
    matching tokens are still part of the generated output; generation simply
    does not continue past them.
    """

    def __init__(self):
        super().__init__()
        stop_list = ["Exercise", "Exercises", "<|endoftext|>"]
        # add_special_tokens=False keeps each phrase as its bare token ids, so
        # it can be compared directly against the tail of the generated ids.
        self.tokenphrases = [
            tokenizer(phrase, add_special_tokens=False)["input_ids"]
            for phrase in stop_list
        ]

    def __call__(
        self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
    ) -> bool:
        """Return True when the first sequence ends with any stop phrase.

        Args:
            input_ids: Token ids generated so far; only row 0 is inspected.
            scores: Unused; accepted to satisfy the stopping-criteria call
                signature.
            **kwargs: Unused extra arguments passed by the caller.

        Returns:
            True if the tail of ``input_ids[0]`` equals one of the tokenized
            stop phrases, otherwise False (never None).
        """
        # Convert once instead of once per phrase.
        generated = input_ids[0].tolist()
        # The `phrase and ...` guard skips an empty phrase, whose `[-0:]`
        # slice would otherwise select the whole sequence.
        return any(
            bool(phrase) and generated[-len(phrase):] == phrase
            for phrase in self.tokenphrases
        )
45
 
46
 
47
+ def generate(
48
+ prompt,
49
+ max_new_tokens,
50
+ avoid_hallucinated_prompts,
51
+ sampling,
52
+ temperature,
53
+ top_k,
54
+ top_p,
55
+ ):
56
  inputs = tokenizer(prompt, return_tensors="pt")
57
  # thanks https://huggingface.co/spaces/joaogante/transformers_streaming/blob/main/app.py
58
+ streamer = TextIteratorStreamer(tokenizer)
59
  generation_kwargs = dict(
60
  inputs,
61
  streamer=streamer,
62
  max_new_tokens=max_new_tokens,
63
+ do_sample=sampling,
64
+ stopping_criteria=[Phi2StoppingCriteria()]
65
+ if avoid_hallucinated_prompts
66
+ else None,
67
+ temperature=temperature,
68
+ top_k=top_k,
69
+ top_p=top_p,
70
  )
71
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
72
  thread.start()
 
84
  label="prompt",
85
  value="Write a detailed analogy between mathematics and a lighthouse.",
86
  ),
87
+ gr.Slider(minimum=0, maximum=500, step=1, value=100, label="max new tokens"),
88
+ gr.Checkbox(
89
+ value=True,
90
+ label="avoid hallucinated prompts",
91
+ info="stop generation after getting tokens like 'Exercise' or '<|endoftext|>, but will not remove them.",
92
+ ),
93
+ gr.Checkbox(
94
+ label="do sampling",
95
+ info="introduce randomness for non-deterministic results. required for below options",
96
+ value=True,
97
+ ),
98
+ gr.Slider(
99
+ label="temperature",
100
+ info="higher temperature means more randomness",
101
+ value=1.0,
102
+ minimum=0.1,
103
+ maximum=1.5,
104
+ step=0.1,
105
+ ),
106
+ gr.Slider(
107
+ label="top-k",
108
+ info="consider only the k most likely tokens",
109
+ value=50,
110
+ minimum=1,
111
+ maximum=100,
112
+ step=1,
113
+ ),
114
+ gr.Slider(
115
+ label="top-p",
116
+ info="choose from the smallest possible set of words whose cumulative probability exceeds the probability p",
117
+ value=1.0,
118
+ minimum=0.1,
119
+ maximum=1.0,
120
+ step=0.1,
121
+ ),
122
  ],
123
  outputs="text",
124
  examples=[
 
141
  """\n''',
142
  100,
143
  ],
144
+ ["User: How does sleep affect mood?\nAI:", 75],
145
  ["Who was Ada Lovelace?", 100],
146
  ["Explain the concept of skip lists.", 125],
147
  ],
requirements.txt CHANGED
@@ -1,20 +1,5 @@
1
- mlflow==2.6.0
2
- cloudpickle==2.2.1
3
- jsonpickle==3.0.1
4
- mlflow-skinny==2.6.0
5
- azureml-core==1.51.0.post1
6
- azureml-mlflow==1.51.0
7
- azureml-metrics[all]==0.0.32
8
  scikit-learn==1.2.2
9
- cryptography==41.0.1
10
- python-dateutil==2.8.2
11
- datasets==2.14.6
12
- soundfile==0.12.1
13
- librosa==0.10.1
14
  diffusers==0.21.4
15
- sentencepiece==0.1.99
16
  transformers==4.34.0
17
  accelerate==0.23.0
18
- Pillow==9.4.0
19
  einops
20
- azureml-evaluate-mlflow==0.0.32
 
 
 
 
 
 
 
 
1
  scikit-learn==1.2.2
 
 
 
 
 
2
  diffusers==0.21.4
 
3
  transformers==4.34.0
4
  accelerate==0.23.0
 
5
  einops