import gradio as gr

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TextIteratorStreamer,
    pipeline,
    AutoConfig,
)
from threading import Thread

# The Hugging Face model id for Microsoft's phi-2 model
checkpoint = "microsoft/phi-2"

# Download and load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    checkpoint, torch_dtype=torch.float32, device_map="cpu", trust_remote_code=True
)
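# Note: full-precision float32 on CPU keeps the demo runnable without a GPU,
# at the cost of slow generation.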

# Alternative (commented out): load the GPTQ-quantized phi-2 variant instead.
# To use a different branch, change revision,
# e.g. revision="gptq-4bit-32g-actorder_True".
#
# model_name_or_path = "TheBloke/phi-2-GPTQ"
#
# config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True)
# config.quantization_config["use_exllama"] = False
#
# model = AutoModelForCausalLM.from_pretrained(
#     model_name_or_path,
#     device_map="cpu",
#     trust_remote_code=True,
#     revision="main",
#     config=config,
# )
#
# tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)


# Text generation pipeline. pad_token_id is set to the EOS token because the
# phi-2 tokenizer does not ship a dedicated padding token.
phi2 = pipeline(
    "text-generation",
    tokenizer=tokenizer,
    model=model,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    device_map="cpu",
)


# Generator that builds a phi-2 prompt from the chat history and streams back the response
def generate(message, chat_history, max_new_tokens):
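    """Stream a response from the phi-2 pipeline.

    message: the latest user message from the chat box.
    chat_history: list of (user, assistant) pairs supplied by gr.ChatInterface.
    max_new_tokens: generation budget taken from the "Maximum new tokens" slider.

    Yields the partial response as it is generated so Gradio can stream it.
    """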
    #instruction = "You are a helpful assistant to 'User'. You do not respond as 'User' or pretend to be 'User'. You only respond once as 'Assistant'."
    instruction = "You are a helpful assistant to 'User'. You will answer any question for 'User'."
    final_prompt = f"Instruction: {instruction}\n"

    for sent, received in chat_history:
        final_prompt += "User: " + sent + "\n"
        final_prompt += "Assistant: " + received + "\n"

    final_prompt += "User: " + message + "\n"
    final_prompt += "Output:"

    # If the prompt plus the generation budget would exceed the model's context
    # window, swap in a short prompt that tells the user to clear the history.
    if len(tokenizer.tokenize(final_prompt)) >= tokenizer.model_max_length - max_new_tokens:
        final_prompt = (
            "Instruction: Say 'Input exceeded context size, "
            "please clear the chat history and retry!' Output:"
        )

    # Stream tokens as they are generated: the pipeline runs in a background
    # thread and pushes decoded text into the streamer, which is iterated below.
    streamer = TextIteratorStreamer(
        tokenizer=tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=300.0
    )
    thread = Thread(
        target=phi2,
        kwargs={
            "text_inputs": final_prompt,
            "max_new_tokens": max_new_tokens,
            "streamer": streamer,
        },
    )
    thread.start()

    # Accumulate streamed text and yield a cleaned-up response after each new chunk.
    generated_text = ""
    for word in streamer:
        generated_text += word
        response = generated_text.strip()

        if "User:" in response:
            response = response.split("User:")[0].strip()

        if "Assistant:" in response:
            response = response.split("Assistant:")[1].strip()

        yield response


# Chat interface with gradio
with gr.Blocks() as demo:
    gr.Markdown(
        """
  # Phi-2 Chatbot Demo
  This chatbot runs Microsoft's 2.7 billion parameter [phi-2](https://huggingface.co/microsoft/phi-2) model on CPU.

  To reduce response time on this hardware, set `max_new_tokens` to a lower value with the slider below.
  """
    )

    tokens_slider = gr.Slider(
        8,
        128,
        value=128,
        label="Maximum new tokens",
        info="A larger `max_new_tokens` parameter value gives you longer text responses but at the cost of a slower response time.",
    )

    chatbot = gr.ChatInterface(
        fn=generate,
        additional_inputs=[tokens_slider],
        stop_btn=None,
        examples=[["Who is Leonhard Euler?"]],
    )

demo.queue().launch()