|
|
|
|
|
|
|
|
|
|
|
from config import SUPPORTED_LLM_MODELS |
|
|
|
from transformers import AutoModelForCausalLM, AutoConfig |
|
from optimum.intel.openvino import OVModelForCausalLM |
|
import openvino as ov |
|
from pathlib import Path |
|
import shutil |
|
import torch |
|
import logging |
|
import nncf |
|
import gc |
|
from converter import converters, register_configs |
|
|
|
register_configs() |
|
|
|
model_id = "llama-2-chat-7b" |
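# model_id must match one of the keys in SUPPORTED_LLM_MODELS (defined in config.py).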
|
|
|
|
|
|
|
|
|
model_configuration = SUPPORTED_LLM_MODELS[model_id] |
|
print(f"Selected model {model_id}") |
|
|
|
prepare_int4_model = True |
|
prepare_int8_model = False |
|
prepare_fp16_model = False |
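# These flags select which precisions the convert_to_* helpers below will export.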
|
|
|
from optimum.intel import OVWeightQuantizationConfig |
|
|
|
nncf.set_log_level(logging.ERROR) |
|
|
|
DIRNAME = "temp"
|
|
|
pt_model_id = model_configuration["model_id"] |
|
pt_model_name = model_id.split("-")[0] |
|
model_type = AutoConfig.from_pretrained(pt_model_id, trust_remote_code=True).model_type |
|
fp16_model_dir = Path(DIRNAME) / Path(model_id) / "FP16" |
|
int8_model_dir = Path(DIRNAME) / Path(model_id) / "INT8_compressed_weights" |
|
int4_model_dir = Path(DIRNAME) / Path(model_id) / "INT4_compressed_weights" |
|
|
|
|
|
def convert_to_fp16(): |
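    """Export the model to OpenVINO IR in FP16; skip if the IR already exists."""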
|
if (fp16_model_dir / "openvino_model.xml").exists(): |
|
return |
|
if not model_configuration["remote"]: |
|
remote_code = model_configuration.get("remote_code", False) |
|
model_kwargs = {} |
|
if remote_code: |
|
model_kwargs = { |
|
"trust_remote_code": True, |
|
"config": AutoConfig.from_pretrained(pt_model_id, trust_remote_code=True) |
|
} |
|
ov_model = OVModelForCausalLM.from_pretrained( |
|
pt_model_id, export=True, compile=False, load_in_8bit=False, **model_kwargs |
|
) |
|
ov_model.half() |
|
ov_model.save_pretrained(fp16_model_dir) |
|
del ov_model |
|
else: |
|
model_kwargs = {} |
|
if "revision" in model_configuration: |
|
model_kwargs["revision"] = model_configuration["revision"] |
|
model = AutoModelForCausalLM.from_pretrained( |
|
model_configuration["model_id"], |
|
torch_dtype=torch.float32, |
|
trust_remote_code=True, |
|
**model_kwargs |
|
) |
|
converters[pt_model_name](model, fp16_model_dir) |
|
del model |
|
gc.collect() |
|
|
|
|
|
def convert_to_int8(): |
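    """Export the model to OpenVINO IR with INT8 weight compression; skip if the IR already exists."""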
|
if (int8_model_dir / "openvino_model.xml").exists(): |
|
return |
|
int8_model_dir.mkdir(parents=True, exist_ok=True) |
|
if not model_configuration["remote"]: |
|
remote_code = model_configuration.get("remote_code", False) |
|
model_kwargs = {} |
|
if remote_code: |
|
model_kwargs = { |
|
"trust_remote_code": True, |
|
"config": AutoConfig.from_pretrained(pt_model_id, trust_remote_code=True) |
|
} |
|
ov_model = OVModelForCausalLM.from_pretrained( |
|
pt_model_id, export=True, compile=False, load_in_8bit=True, **model_kwargs |
|
) |
|
ov_model.save_pretrained(int8_model_dir) |
|
del ov_model |
|
else: |
|
convert_to_fp16() |
|
ov_model = ov.Core().read_model(fp16_model_dir / "openvino_model.xml") |
|
shutil.copy(fp16_model_dir / "config.json", int8_model_dir / "config.json") |
|
configuration_file = fp16_model_dir / f"configuration_{model_type}.py" |
|
if configuration_file.exists(): |
|
shutil.copy( |
|
configuration_file, int8_model_dir / f"configuration_{model_type}.py" |
|
) |
|
compressed_model = nncf.compress_weights(ov_model) |
|
ov.save_model(compressed_model, int8_model_dir / "openvino_model.xml") |
|
del ov_model |
|
del compressed_model |
|
gc.collect() |
|
|
|
|
|
def convert_to_int4(): |
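    """Export the model to OpenVINO IR with INT4 weight compression; skip if the IR already exists."""
    # Per-model compression settings: "sym" chooses symmetric vs. asymmetric quantization,
    # "group_size" is the quantization group size, and "ratio" is the fraction of weights
    # compressed to 4 bits (the remainder stays in 8-bit precision).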
|
compression_configs = { |
|
"zephyr-7b-beta": { |
|
"sym": True, |
|
"group_size": 64, |
|
"ratio": 0.6, |
|
}, |
|
"mistral-7b": { |
|
"sym": True, |
|
"group_size": 64, |
|
"ratio": 0.6, |
|
}, |
|
"minicpm-2b-dpo": { |
|
"sym": True, |
|
"group_size": 64, |
|
"ratio": 0.6, |
|
}, |
|
"gemma-2b-it": { |
|
"sym": True, |
|
"group_size": 64, |
|
|
|
"ratio": 0.6, |
|
}, |
|
"notus-7b-v1": { |
|
"sym": True, |
|
"group_size": 64, |
|
"ratio": 0.6, |
|
}, |
|
"neural-chat-7b-v3-1": { |
|
"sym": True, |
|
"group_size": 64, |
|
"ratio": 0.6, |
|
}, |
|
"llama-2-chat-7b": { |
|
"sym": True, |
|
|
|
"group_size": 128, |
|
"ratio": 0.8, |
|
|
|
}, |
|
"gemma-7b-it": { |
|
"sym": True, |
|
"group_size": 128, |
|
"ratio": 1.0, |
|
}, |
|
"chatglm2-6b": { |
|
"sym": True, |
|
"group_size": 128, |
|
"ratio": 0.72, |
|
}, |
|
"qwen-7b-chat": { |
|
"sym": True, |
|
"group_size": 128, |
|
"ratio": 0.6 |
|
}, |
|
        "red-pajama-3b-chat": {
|
"sym": False, |
|
"group_size": 128, |
|
"ratio": 0.5, |
|
}, |
|
"default": { |
|
"sym": False, |
|
"group_size": 128, |
|
"ratio": 0.8, |
|
}, |
|
} |
|
|
|
model_compression_params = compression_configs.get( |
|
model_id, compression_configs["default"] |
|
) |
|
if (int4_model_dir / "openvino_model.xml").exists(): |
|
return |
|
int4_model_dir.mkdir(parents=True, exist_ok=True) |
|
if not model_configuration["remote"]: |
|
remote_code = model_configuration.get("remote_code", False) |
|
model_kwargs = {} |
|
if remote_code: |
|
model_kwargs = { |
|
"trust_remote_code" : True, |
|
"config": AutoConfig.from_pretrained(pt_model_id, trust_remote_code=True) |
|
} |
|
ov_model = OVModelForCausalLM.from_pretrained( |
|
pt_model_id, export=True, compile=False, |
|
quantization_config=OVWeightQuantizationConfig(bits=4, **model_compression_params), |
|
**model_kwargs |
|
) |
|
ov_model.save_pretrained(int4_model_dir) |
|
del ov_model |
|
else: |
|
convert_to_fp16() |
|
ov_model = ov.Core().read_model(fp16_model_dir / "openvino_model.xml") |
|
shutil.copy(fp16_model_dir / "config.json", int4_model_dir / "config.json") |
|
configuration_file = fp16_model_dir / f"configuration_{model_type}.py" |
|
if configuration_file.exists(): |
|
shutil.copy( |
|
configuration_file, int4_model_dir / f"configuration_{model_type}.py" |
|
) |
|
mode = nncf.CompressWeightsMode.INT4_SYM if model_compression_params["sym"] else \ |
|
nncf.CompressWeightsMode.INT4_ASYM |
|
del model_compression_params["sym"] |
|
compressed_model = nncf.compress_weights(ov_model, mode=mode, **model_compression_params) |
|
ov.save_model(compressed_model, int4_model_dir / "openvino_model.xml") |
|
del ov_model |
|
del compressed_model |
|
gc.collect() |
|
|
|
|
|
if prepare_fp16_model: |
|
convert_to_fp16() |
|
if prepare_int8_model: |
|
convert_to_int8() |
|
if prepare_int4_model: |
|
convert_to_int4() |
|
|
|
|
|
|
|
|
fp16_weights = fp16_model_dir / "openvino_model.bin" |
|
int8_weights = int8_model_dir / "openvino_model.bin" |
|
int4_weights = int4_model_dir / "openvino_model.bin" |
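# Report the on-disk size of each exported model and the compression rate relative to FP16.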
|
|
|
if fp16_weights.exists(): |
|
print(f"Size of FP16 model is {fp16_weights.stat().st_size / 1024 / 1024:.2f} MB") |
|
for precision, compressed_weights in zip([8, 4], [int8_weights, int4_weights]): |
|
if compressed_weights.exists(): |
|
print( |
|
f"Size of model with INT{precision} compressed weights is {compressed_weights.stat().st_size / 1024 / 1024:.2f} MB" |
|
) |
|
if compressed_weights.exists() and fp16_weights.exists(): |
|
print( |
|
f"Compression rate for INT{precision} model: {fp16_weights.stat().st_size / compressed_weights.stat().st_size:.3f}" |
|
) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import ipywidgets as widgets

core = ov.Core()
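# Select the inference device; "AUTO" lets OpenVINO choose among the available devices.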
|
device = widgets.Dropdown( |
|
options=core.available_devices + ["AUTO"], |
|
value="CPU", |
|
description="Device:", |
|
disabled=False, |
|
) |
|
|
|
device |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from ov_llm_model import model_classes |
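# model_classes maps model families to the custom loading classes used when a configuration is flagged as "remote".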
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
available_models = [] |
|
if int4_model_dir.exists(): |
|
available_models.append("INT4") |
|
if int8_model_dir.exists(): |
|
available_models.append("INT8") |
|
if fp16_model_dir.exists(): |
|
available_models.append("FP16") |
|
|
|
model_to_run = widgets.Dropdown( |
|
options=available_models, |
|
value=available_models[0], |
|
description="Model to run:", |
|
disabled=False, |
|
) |
|
|
|
model_to_run |
|
|
|
|
|
|
|
|
|
|
|
from transformers import AutoTokenizer |
|
|
|
if model_to_run.value == "INT4": |
|
model_dir = int4_model_dir |
|
elif model_to_run.value == "INT8": |
|
model_dir = int8_model_dir |
|
else: |
|
model_dir = fp16_model_dir |
|
print(f"Loading model from {model_dir}") |
|
|
|
ov_config = {"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1", "CACHE_DIR": ""} |
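# The LATENCY hint with a single stream favors response time over throughput; CACHE_DIR is left empty so no model cache is written.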
|
|
|
|
|
|
|
if model_id == "red-pajama-3b-chat" and "GPU" in core.available_devices and device.value in ["GPU", "AUTO"]: |
|
ov_config["INFERENCE_PRECISION_HINT"] = "f32" |
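    # The FP32 hint avoids potential accuracy degradation for this model with the default FP16 GPU precision.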
|
|
|
model_name = model_configuration["model_id"] |
|
class_key = model_id.split("-")[0] |
|
tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) |
|
|
|
model_class = ( |
|
OVModelForCausalLM |
|
if not model_configuration["remote"] |
|
else model_classes[class_key] |
|
) |
|
ov_model = model_class.from_pretrained( |
|
model_dir, |
|
device=device.value, |
|
ov_config=ov_config, |
|
config=AutoConfig.from_pretrained(model_dir, trust_remote_code=True), |
|
trust_remote_code=True, |
|
) |
|
|
|
|
|
|
|
|
|
|
|
tokenizer_kwargs = model_configuration.get("tokenizer_kwargs", {}) |
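# Quick sanity check: generate a couple of tokens to confirm the loaded model works.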
|
test_string = "2 + 2 =" |
|
input_tokens = tok(test_string, return_tensors="pt", **tokenizer_kwargs) |
|
answer = ov_model.generate(**input_tokens, max_new_tokens=2) |
|
print(tok.batch_decode(answer, skip_special_tokens=True)[0]) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from threading import Event, Thread |
|
from uuid import uuid4 |
|
from typing import List, Tuple |
|
import gradio as gr |
|
from transformers import ( |
|
AutoTokenizer, |
|
StoppingCriteria, |
|
StoppingCriteriaList, |
|
TextIteratorStreamer, |
|
) |
|
|
|
|
|
model_name = model_configuration["model_id"] |
|
start_message = model_configuration["start_message"] |
|
history_template = model_configuration.get("history_template") |
|
current_message_template = model_configuration.get("current_message_template") |
|
stop_tokens = model_configuration.get("stop_tokens") |
|
roles = model_configuration.get("roles") |
|
tokenizer_kwargs = model_configuration.get("tokenizer_kwargs", {}) |
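# Prompt templates, stop tokens, and role markers are taken from the per-model configuration.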
|
|
|
chinese_examples = [ |
|
["你好!"], |
|
["你是谁?"], |
|
["请介绍一下上海"], |
|
["请介绍一下英特尔公司"], |
|
["晚上睡不着怎么办?"], |
|
["给我讲一个年轻人奋斗创业最终取得成功的故事。"], |
|
["给这个故事起一个标题。"], |
|
] |
|
|
|
english_examples = [ |
|
["Hello there! How are you doing?"], |
|
["What is OpenVINO?"], |
|
["Who are you?"], |
|
["Can you explain to me briefly what is Python programming language?"], |
|
["Explain the plot of Cinderella in a sentence."], |
|
["What are some common mistakes to avoid when writing code?"], |
|
    [
        "Write a 100-word blog post on “Benefits of Artificial Intelligence and OpenVINO”"
    ],
|
] |
|
|
|
japanese_examples = [ |
|
["こんにちは!調子はどうですか?"], |
|
["OpenVINOとは何ですか?"], |
|
["あなたは誰ですか?"], |
|
["Pythonプログラミング言語とは何か簡単に説明してもらえますか?"], |
|
["シンデレラのあらすじを一文で説明してください。"], |
|
["コードを書くときに避けるべきよくある間違いは何ですか?"], |
|
["人工知能と「OpenVINOの利点」について100語程度のブログ記事を書いてください。"], |
|
] |
|
|
|
examples = ( |
|
chinese_examples |
|
if ("qwen" in model_id or "chatglm" in model_id or "baichuan" in model_id) |
|
else japanese_examples |
|
if ("youri" in model_id) |
|
else english_examples |
|
) |
|
|
|
max_new_tokens = 256 |
|
|
|
|
|
class StopOnTokens(StoppingCriteria): |
|
def __init__(self, token_ids): |
|
self.token_ids = token_ids |
|
|
|
def __call__( |
|
self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs |
|
) -> bool: |
|
for stop_id in self.token_ids: |
|
if input_ids[0][-1] == stop_id: |
|
return True |
|
return False |
|
|
|
|
|
if stop_tokens is not None: |
|
if isinstance(stop_tokens[0], str): |
|
stop_tokens = tok.convert_tokens_to_ids(stop_tokens) |
|
|
|
stop_tokens = [StopOnTokens(stop_tokens)] |
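    # stop_tokens now holds StoppingCriteria objects; string stop tokens were converted to ids above.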
|
|
|
|
|
def default_partial_text_processor(partial_text: str, new_text: str):
    """
    Default helper for updating the partially generated answer.

    Params:
      partial_text: text buffer storing the previously generated text
      new_text: text update for the current step
    Returns:
      updated text string
    """
|
partial_text += new_text |
|
return partial_text |
|
|
|
|
|
text_processor = model_configuration.get( |
|
"partial_text_processor", default_partial_text_processor |
|
) |
|
|
|
|
|
def convert_history_to_token(history: List[Tuple[str, str]], roles=None):
    """
    Convert a history of (user, assistant) message pairs into tokens,
    following the conversation template expected by the model.

    Params:
      history: dialogue history
      roles: optional role markers for models that use a custom chat format
    Returns:
      history in token format
    """
|
if roles is None: |
|
text = start_message + "".join( |
|
[ |
|
"".join( |
|
[ |
|
history_template.format( |
|
num=round, user=item[0], assistant=item[1] |
|
) |
|
] |
|
) |
|
for round, item in enumerate(history[:-1]) |
|
] |
|
) |
|
text += "".join( |
|
[ |
|
"".join( |
|
[ |
|
current_message_template.format( |
|
num=len(history) + 1, |
|
user=history[-1][0], |
|
assistant=history[-1][1], |
|
) |
|
] |
|
) |
|
] |
|
) |
|
input_token = tok(text, return_tensors="pt", **tokenizer_kwargs).input_ids |
|
elif pt_model_name == "chatglm3": |
|
input_ids = [] |
|
input_ids.extend(tok.build_single_message(roles[0], "", start_message)) |
|
for old_query, response in history[:-1]: |
|
input_ids.extend(tok.build_single_message(roles[1], "", old_query)) |
|
input_ids.extend(tok.build_single_message(roles[2], "", response)) |
|
input_ids.extend(tok.build_single_message( |
|
roles[1], "", history[-1][0])) |
|
input_ids.extend([tok.get_command(f"<|{roles[2]}|>")]) |
|
input_token = tok.batch_encode_plus( |
|
[input_ids], return_tensors="pt", is_split_into_words=True |
|
).input_ids |
|
else: |
|
system_tokens = tok.encode(start_message) |
|
history_tokens = [] |
|
for (old_query, response) in history[:-1]: |
|
round_tokens = [] |
|
round_tokens.append(roles[0]) |
|
round_tokens.extend(tok.encode(old_query)) |
|
round_tokens.append(roles[1]) |
|
round_tokens.extend(tok.encode(response)) |
|
history_tokens = round_tokens + history_tokens |
|
input_tokens = system_tokens + history_tokens |
|
input_tokens.append(roles[0]) |
|
input_tokens.extend(tok.encode(history[-1][0])) |
|
input_tokens.append(roles[1]) |
|
input_token = torch.LongTensor([input_tokens]) |
|
return input_token |
|
|
|
|
|
def user(message, history):
    """
    Callback for updating user messages in the interface on submit button click.

    Params:
      message: current message
      history: conversation history
    Returns:
      an empty string (to clear the message box) and the updated history
    """
    return "", history + [[message, ""]]
|
|
|
|
|
def bot(history, temperature, top_p, top_k, repetition_penalty, conversation_id):
    """
    Callback for running the chatbot on submit button click.

    Params:
      history: conversation history
      temperature: controls the level of creativity in the generated text.
        Adjusting the temperature reshapes the model's probability distribution,
        making the output more focused or more diverse.
      top_p: restricts sampling to the smallest set of tokens whose cumulative
        probability exceeds top_p (nucleus sampling).
      top_k: restricts sampling to the top_k most probable tokens.
      repetition_penalty: penalizes tokens based on how frequently they already
        occur in the text.
      conversation_id: unique conversation identifier.
    Yields:
      conversation history updated with the partially generated answer
    """
|
|
|
|
|
|
|
input_ids = convert_history_to_token(history, roles) |
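    # If the tokenized prompt gets too long, keep only the latest exchange to bound the context size.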
|
if input_ids.shape[1] > 2000: |
|
history = [history[-1]] |
|
input_ids = convert_history_to_token(history, roles) |
|
streamer = TextIteratorStreamer( |
|
tok, timeout=30.0, skip_prompt=True, skip_special_tokens=True |
|
) |
|
generate_kwargs = dict( |
|
input_ids=input_ids, |
|
max_new_tokens=max_new_tokens, |
|
temperature=temperature, |
|
do_sample=temperature > 0.0, |
|
top_p=top_p, |
|
top_k=top_k, |
|
repetition_penalty=repetition_penalty, |
|
streamer=streamer, |
|
) |
|
if stop_tokens is not None: |
|
generate_kwargs["stopping_criteria"] = StoppingCriteriaList( |
|
stop_tokens) |
|
|
|
stream_complete = Event() |
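    # Generation runs in a background thread while the streamer yields partial text to the UI.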
|
|
|
    def generate_and_signal_complete():
        """
        Generation function for a single thread.
        """
        ov_model.generate(**generate_kwargs)
        stream_complete.set()
|
|
|
t1 = Thread(target=generate_and_signal_complete) |
|
t1.start() |
|
|
|
|
|
partial_text = "" |
|
for new_text in streamer: |
|
partial_text = text_processor(partial_text, new_text) |
|
history[-1][1] = partial_text |
|
yield history |
|
|
|
|
|
def get_uuid():
    """
    Return a universally unique identifier for the conversation thread.
    """
    return str(uuid4())
|
|
|
|
|
with gr.Blocks( |
|
theme=gr.themes.Soft(), |
|
css=".disclaimer {font-variant-caps: all-small-caps;}", |
|
) as demo: |
|
conversation_id = gr.State(get_uuid) |
|
gr.Markdown( |
|
f"""<h1><center>OpenVINO {model_id} Chatbot</center></h1>""") |
|
chatbot = gr.Chatbot(height=500) |
|
with gr.Row(): |
|
with gr.Column(): |
|
msg = gr.Textbox( |
|
label="Chat Message Box", |
|
placeholder="Chat Message Box", |
|
show_label=False, |
|
container=False, |
|
) |
|
with gr.Column(): |
|
with gr.Row(): |
|
submit = gr.Button("Submit") |
|
stop = gr.Button("Stop") |
|
clear = gr.Button("Clear") |
|
with gr.Row(): |
|
with gr.Accordion("Advanced Options:", open=False): |
|
with gr.Row(): |
|
with gr.Column(): |
|
with gr.Row(): |
|
temperature = gr.Slider( |
|
label="Temperature", |
|
value=0.1, |
|
minimum=0.0, |
|
maximum=1.0, |
|
step=0.1, |
|
interactive=True, |
|
info="Higher values produce more diverse outputs", |
|
) |
|
with gr.Column(): |
|
with gr.Row(): |
|
top_p = gr.Slider( |
|
label="Top-p (nucleus sampling)", |
|
value=1.0, |
|
minimum=0.0, |
|
maximum=1, |
|
step=0.01, |
|
interactive=True, |
|
info=( |
|
"Sample from the smallest possible set of tokens whose cumulative probability " |
|
"exceeds top_p. Set to 1 to disable and sample from all tokens." |
|
), |
|
) |
|
with gr.Column(): |
|
with gr.Row(): |
|
top_k = gr.Slider( |
|
label="Top-k", |
|
value=50, |
|
minimum=0.0, |
|
maximum=200, |
|
step=1, |
|
interactive=True, |
|
info="Sample from a shortlist of top-k tokens — 0 to disable and sample from all tokens.", |
|
) |
|
with gr.Column(): |
|
with gr.Row(): |
|
repetition_penalty = gr.Slider( |
|
label="Repetition Penalty", |
|
value=1.1, |
|
minimum=1.0, |
|
maximum=2.0, |
|
step=0.1, |
|
interactive=True, |
|
info="Penalize repetition — 1.0 to disable.", |
|
) |
|
gr.Examples( |
|
examples, inputs=msg, label="Click on any example and press the 'Submit' button" |
|
) |
|
|
|
submit_event = msg.submit( |
|
fn=user, |
|
inputs=[msg, chatbot], |
|
outputs=[msg, chatbot], |
|
queue=False, |
|
).then( |
|
fn=bot, |
|
inputs=[ |
|
chatbot, |
|
temperature, |
|
top_p, |
|
top_k, |
|
repetition_penalty, |
|
conversation_id, |
|
], |
|
outputs=chatbot, |
|
queue=True, |
|
) |
|
submit_click_event = submit.click( |
|
fn=user, |
|
inputs=[msg, chatbot], |
|
outputs=[msg, chatbot], |
|
queue=False, |
|
).then( |
|
fn=bot, |
|
inputs=[ |
|
chatbot, |
|
temperature, |
|
top_p, |
|
top_k, |
|
repetition_penalty, |
|
conversation_id, |
|
], |
|
outputs=chatbot, |
|
queue=True, |
|
) |
|
stop.click( |
|
fn=None, |
|
inputs=None, |
|
outputs=None, |
|
cancels=[submit_event, submit_click_event], |
|
queue=False, |
|
) |
|
clear.click(lambda: None, None, chatbot, queue=False) |
|
|
|
|
|
|
|
|
|
|
|
|
|
demo.launch() |
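# demo.launch() serves the interface locally; passing share=True would also create a public Gradio link.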
|
|
|
|