phi-2 / app.py
randomblock1's picture
Update app.py
29a2d0c verified
raw
history blame
4.48 kB
import torch
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
TextIteratorStreamer,
StoppingCriteria,
)
from threading import Thread
import gradio as gr
# Checkpoint id shared by the tokenizer and the model below.
_CHECKPOINT = "microsoft/phi-2"

# Use CUDA when torch reports it as available, otherwise fall back to the CPU,
# and make that choice the default device for newly created tensors.
has_gpu = torch.cuda.is_available()
if has_gpu:
    device = "cuda"
else:
    device = "cpu"
torch.set_default_device(device)

tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT, trust_remote_code=True)

# Weights are loaded as float32 on every device; a "float16 when a GPU is
# present" alternative used to sit here as commented-out code.
model = AutoModelForCausalLM.from_pretrained(
    _CHECKPOINT,
    torch_dtype=torch.float32,
    device_map=device,
    trust_remote_code=True,
)
# custom stopping criteria (avoid generating hallucinated prompts)
# still includes these tokens in the output but stops generating after them
class Phi2StoppingCriteria(StoppingCriteria):
    """Stop generation once the sequence ends with one of a few marker phrases.

    The marker phrases ("Exercise", "Exercises", "exercises:" and
    "<|endoftext|>") are tokenized once at construction time with the
    module-level ``tokenizer``. The criterion only signals *when* to stop;
    the matching tokens are already part of the sequence and are not removed.
    """

    def __init__(self):
        super().__init__()
        stop_list = ["Exercise", "Exercises", "exercises:", "<|endoftext|>"]
        # One list of token ids per stop phrase.
        self.tokenphrases = [
            tokenizer(phrase, return_tensors="pt").input_ids[0].tolist()
            for phrase in stop_list
        ]

    def __call__(
        self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
    ) -> bool:
        """Return True when the first sequence ends with any stop phrase.

        Args:
            input_ids: Token ids produced so far; only row 0 is inspected.
            scores: Unused; accepted to satisfy the stopping-criteria call
                signature.
            **kwargs: Unused.

        Returns:
            True if the tail of ``input_ids[0]`` equals one of the tokenized
            stop phrases, otherwise False (never None).
        """
        # Convert once instead of once per stop phrase.
        generated = input_ids[0].tolist()
        for tokenphrase in self.tokenphrases:
            # An empty phrase would make the ``[-0:]`` slice cover the whole
            # sequence, so it is skipped explicitly.
            if tokenphrase and generated[-len(tokenphrase):] == tokenphrase:
                return True
        # Explicit False: callers combine the result with other criteria, and
        # an implicit None is not a valid boolean operand there.
        return False
def generate(
    prompt,
    max_new_tokens=75,
    terminate_hallucinated_prompts=True,
    sampling=False,
    temperature=1.0,
    top_k=50,
    top_p=1.0,
):
    """Stream text generated by the model for ``prompt``.

    Generation runs in a background thread and its output is consumed through
    a ``TextIteratorStreamer``; this function is a generator that yields the
    text accumulated so far each time a new chunk arrives.

    Args:
        prompt: Text to feed to the model.
        max_new_tokens: Upper bound on the number of newly generated tokens.
        terminate_hallucinated_prompts: When true, attach
            ``Phi2StoppingCriteria`` so generation halts after marker phrases.
        sampling: Passed as ``do_sample``; enables the three options below.
        temperature: Sampling temperature (only forwarded when ``sampling``).
        top_k: Top-k cutoff (only forwarded when ``sampling``).
        top_p: Nucleus-sampling cutoff (only forwarded when ``sampling``).

    Yields:
        The cumulative text received from the streamer so far.

    Returns:
        The final accumulated text (as the generator's return value).
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    # thanks https://huggingface.co/spaces/joaogante/transformers_streaming/blob/main/app.py
    streamer = TextIteratorStreamer(tokenizer)
    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        # UI sliders may hand back a float; the token budget must be an int.
        max_new_tokens=int(max_new_tokens),
        do_sample=sampling,
        stopping_criteria=(
            [Phi2StoppingCriteria()] if terminate_hallucinated_prompts else None
        ),
    )
    if sampling:
        # These knobs only affect sample-based decoding; passing them with
        # do_sample=False has no effect on the output and only produces
        # configuration warnings, so they are forwarded only when used.
        generation_kwargs.update(
            temperature=temperature,
            top_k=int(top_k),
            top_p=top_p,
        )

    # model.generate blocks until done, so it runs in a worker thread while
    # this generator drains the streamer.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    model_output = ""
    for new_text in streamer:
        model_output += new_text
        yield model_output

    # The stream is exhausted, so the worker is finishing; reap it rather
    # than leaving an unjoined thread behind for every request.
    thread.join()
    return model_output
# Gradio UI: one input component per parameter of generate(), in the same
# order as its signature, and a plain-text output that receives the streamed
# completion.
demo = gr.Interface(
    fn=generate,
    inputs=[
        gr.Text(
            label="prompt",
            value="Write a detailed analogy between mathematics and a lighthouse.",
        ),
        # Minimum is 1: a budget of 0 new tokens leaves nothing to generate.
        gr.Slider(minimum=1, maximum=500, step=1, value=50, label="max new tokens"),
        gr.Checkbox(
            value=True,
            label="terminate hallucinated prompts",
            info="stop generation after getting tokens like 'Exercise' or '<|endoftext|>', but will not remove them.",
        ),
        gr.Checkbox(
            label="do sampling",
            info="introduce randomness for non-deterministic results. required for below options",
            value=True,
        ),
        gr.Slider(
            label="temperature",
            info="higher temperature means more randomness",
            value=1.0,
            minimum=0.1,
            maximum=1.5,
            step=0.1,
        ),
        gr.Slider(
            label="top-k",
            info="consider only the k most likely tokens",
            value=50,
            minimum=1,
            maximum=100,
            step=1,
        ),
        gr.Slider(
            label="top-p",
            info="choose from the smallest possible set of words whose cumulative probability exceeds the probability p",
            value=1.0,
            minimum=0.1,
            maximum=1.0,
            step=0.1,
        ),
    ],
    outputs="text",
    # Each example fills only the first two inputs (prompt, max new tokens).
    examples=[
        [
            "Write a detailed analogy between mathematics and a lighthouse.",
            75,
        ],
        [
            "Instruct: Write a detailed analogy between mathematics and a lighthouse.\nOutput:",
            100,
        ],
        [
            "Alice: I don't know why, I'm struggling to maintain focus while studying. Any suggestions?\n\nBob: ",
            150,
        ],
        # NOTE(review): the docstring lines inside this code prompt carry no
        # indentation here -- confirm whether that is intended.
        [
            '''```
def print_prime(n):
"""
Print all primes between 1 and n
"""\n''',
            125,
        ],
    ],
    title="Microsoft Phi-2",
    description="Unofficial demo of Microsoft Phi-2, a high performing model with only 2.7B parameters.",
)
if __name__ == "__main__":
    # Enable request queuing first, then start the server with the API page
    # hidden.
    queued_demo = demo.queue()
    queued_demo.launch(show_api=False)