import gc
import logging
import os

import torch
from accelerate import dispatch_model, infer_auto_device_map
from accelerate.utils import get_balanced_memory
from h2o_wave import Q
from h2o_wave import data as chat_data
from h2o_wave import ui

from llm_studio.app_utils.utils import get_experiments, get_ui_elements, set_env
from llm_studio.python_configs.base import DefaultConfigProblemBase
from llm_studio.src.datasets.text_utils import get_tokenizer
from llm_studio.src.utils.config_utils import (
    NON_GENERATION_PROBLEM_TYPES,
    load_config_yaml,
)
from llm_studio.src.utils.modeling_utils import load_checkpoint

logger = logging.getLogger(__name__)


async def chat_tab(q: Q, load_model=True):
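    """Renders the chat tab of the experiment view.

    When ``load_model`` is True, the experiment's model and tokenizer are first
    loaded onto the GPU selected in the app settings; otherwise the previously
    loaded model is reused and only the chat history is reset.
    """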
    if not await should_start_chat(q):
        return

    if load_model:
        q.page["experiment/display/chat"] = ui.form_card(
            box="first",
            items=[ui.progress(label="Loading the model...")],
        )

    q.client["experiment/display/chat/messages"] = []
    q.client.delete_cards.add("experiment/display/chat")

    q.page["experiment/display/chat/settings"] = ui.form_card(
        box="second",
        items=[
            ui.expander(
                name="chat_settings",
                label="Chat Settings",
                items=[ui.progress(label="Loading model configuration...")],
                expanded=True,
            )
        ],
    )
    q.client.delete_cards.add("experiment/display/chat/settings")

    await q.page.save()
    logger.info(torch.cuda.memory_allocated())

    if load_model:
        with set_env(HUGGINGFACE_TOKEN=q.client["default_huggingface_api_token"]):
            gpu_id = q.client["gpu_used_for_chat"] - 1
            cfg, model, tokenizer = load_cfg_model_tokenizer(
                q.client["experiment/display/experiment_path"], device=f"cuda:{gpu_id}"
            )
        q.client["experiment/display/chat/cfg"] = cfg
        q.client["experiment/display/chat/model"] = model
        q.client["experiment/display/chat/tokenizer"] = tokenizer
initial_message = "Model successfully loaded, how can I help you?" |
    else:
        cfg = q.client["experiment/display/chat/cfg"]
        assert q.client["experiment/display/chat/model"] is not None
        assert q.client["experiment/display/chat/tokenizer"] is not None
initial_message = "Chat History cleaned. How can I help you?" |
|
|
|
|
|
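    # Settings below are hidden (visibility -1) so they do not show up
    # in the chat settings card built from cfg.prediction.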
    cfg.prediction._visibility["metric"] = -1
    cfg.prediction._visibility["batch_size_inference"] = -1
    cfg.prediction._visibility["min_length_inference"] = -1
    cfg.prediction._visibility["stop_tokens"] = -1

    logger.info(torch.cuda.memory_allocated())
    q.page["experiment/display/chat"] = ui.chatbot_card(
        box="first",
        data=chat_data(fields="content from_user", t="list"),
        name="experiment/display/chat/chatbot",
        events=["stop", "suggestion"],
        suggestions=[
            ui.chat_suggestion(
                "Write a poem about H2O LLM Studio",
                label="Write a poem",
                caption="about H2O LLM Studio",
                icon="Edit",
            ),
            ui.chat_suggestion(
                "Plan a trip to Europe",
                label="Plan a trip",
                caption="to Europe",
                icon="Airplane",
            ),
            ui.chat_suggestion(
                "Give me ideas for a new project",
                label="Give me ideas",
                caption="for a new project",
                icon="Lightbulb",
            ),
            ui.chat_suggestion(
                "Explain CSS preprocessors",
                label="Explain",
                caption="CSS preprocessors",
                icon="Code",
            ),
        ],
    )
    q.page["experiment/display/chat"].data += [initial_message, False]

    option_items = get_ui_elements(
        cfg=q.client["experiment/display/chat/cfg"].prediction,
        q=q,
        pre="chat/cfg_predictions",
    )
    q.page["experiment/display/chat/settings"] = ui.form_card(
        box="second",
        items=[
            ui.buttons(
                [
                    ui.button(
                        name="experiment/display/chat/clear_history",
                        label="Clear History",
                        primary=True,
                    ),
                ]
            ),
            ui.expander(
                name="chat_settings",
                label="Chat Settings",
                items=option_items,
                expanded=True,
            ),
        ],
    )


async def should_start_chat(q: Q):
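    """Checks whether the chatbot can be started for the current experiment.

    Renders an explanatory card and returns False if the problem type does not
    support text generation or if the selected GPU is blocked by a running
    experiment; otherwise returns True.
    """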
    cfg: DefaultConfigProblemBase = load_config_yaml(
        os.path.join(q.client["experiment/display/experiment_path"], "cfg.yaml")
    )

    if cfg.problem_type in NON_GENERATION_PROBLEM_TYPES:
        q.page["experiment/display/chat"] = ui.form_card(
            box="first",
            items=[
                ui.text(
                    "Chatbot is not available for text classification problems. "
                    "Please select a text generation problem."
                )
            ],
            title="",
        )
        q.client.delete_cards.add("experiment/display/chat")
        return False

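    # The GPU selector in the UI is 1-based, while CUDA device ids are 0-based.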
    gpu_id = q.client["gpu_used_for_chat"] - 1
    if gpu_is_blocked(q, gpu_id):
        q.page["experiment/display/chat"] = ui.form_card(
            box="first",
            items=[
                ui.text(
                    f"""Chatbot is not available when GPU{q.client["gpu_used_for_chat"]}
                    is blocked by another experiment.
                    You can change "Gpu used for Chat" in the settings tab
                    to use another GPU for the chatbot. """
                )
            ],
            title="",
        )
        q.client.delete_cards.add("experiment/display/chat")
        return False
    return True


def gpu_is_blocked(q, gpu_id):
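    """Returns True if the given GPU id is used by a currently running experiment."""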
    experiments = get_experiments(q=q)
    running_experiments = experiments[experiments.status.isin(["running"])]
    gpu_blocked = any(
        [
            str(gpu_id) in gpu_list
            for gpu_list in running_experiments["gpu_list"]
            .apply(lambda x: x.split(","))
            .to_list()
        ]
    )
    return gpu_blocked


def load_cfg_model_tokenizer(
    experiment_path: str, merge: bool = False, device: str = "cuda:0"
):
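    """Loads the config, model, and tokenizer of a finished experiment.

    Args:
        experiment_path: Path to the experiment output directory.
        merge: Whether to merge trained LoRA weights into the backbone.
        device: Target device, e.g. "cuda:0", "cpu", or "cpu_shard" to shard
            the model across the available devices via accelerate.

    Illustrative call (the experiment path below is a placeholder):

        cfg, model, tokenizer = load_cfg_model_tokenizer(
            "output/user/my-experiment", merge=True, device="cpu"
        )
    """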
    cfg = load_config_yaml(os.path.join(experiment_path, "cfg.yaml"))
    cfg.architecture.pretrained = False
    cfg.architecture.gradient_checkpointing = False
    cfg.environment._device = device.replace("_shard", "")
    cfg.environment._local_rank = 0
    cfg.prediction._visibility["num_history"] = 1

    tokenizer = get_tokenizer(cfg)

    gc.collect()
    torch.cuda.empty_cache()

    if (
        merge
        and cfg.training.lora
        and cfg.architecture.backbone_dtype in ("int4", "int8")
    ):
        logger.info("Loading backbone in float16 for merging LORA weights.")
        cfg.architecture.backbone_dtype = "float16"
        cfg.architecture.pretrained = True

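    # Build the model with the target device as the default and load the
    # trained weights from the experiment checkpoint.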
    with torch.device(cfg.environment._device):
        model = cfg.architecture.model_class(cfg)
        cfg.architecture.pretrained_weights = os.path.join(
            experiment_path, "checkpoint.pth"
        )
        load_checkpoint(cfg, model, strict=False)

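    # "cpu_shard" splits the model across the available devices using
    # accelerate's balanced device map.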
    if device == "cpu_shard":
        max_memory = get_balanced_memory(
            model,
        )
        device_map = infer_auto_device_map(model, max_memory=max_memory)
        model = dispatch_model(
            model,
            device_map=device_map,
        )

    if merge and cfg.training.lora:
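        # Merge the trained LoRA adapters into the backbone so the model can
        # be used (and exported) as a standalone model.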
        logger.info("Merging LORA layers with base model.")
        if device == "cpu":
            model = model.to(torch.float32)
        model.backbone = model.backbone.merge_and_unload()
        if device == "cpu":
            model = model.to(torch.float16)

    model = model.eval()
    model.backbone.use_cache = True

    return cfg, model, tokenizer