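# wulewule/deploy/lmdeploy_model.py
# LangChain-compatible wrapper around an lmdeploy TurboMind pipeline. The module
# registers a custom chat template ('customized_model', using InternLM2-style
# <|im_start|>/<|im_end|> markers) and provides a Hydra-driven demo entry point
# (test_demo) for quick local testing.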
from lmdeploy.model import MODELS, BaseChatTemplate


@MODELS.register_module(name='customized_model')
class CustomizedModel(BaseChatTemplate):
    """A customized chat template."""

    def __init__(self,
                 system='<|im_start|>system\n',
                 meta_instruction='You are a robot developed by LMDeploy.',
                 user='<|im_start|>user\n',
                 assistant='<|im_start|>assistant\n',
                 eosys='<|im_end|>\n',
                 eoh='<|im_end|>\n',
                 eoa='<|im_end|>',
                 separator='\n',
                 stop_words=['<|im_end|>', '<|action_end|>']):
        super().__init__(system=system,
                         meta_instruction=meta_instruction,
                         eosys=eosys,
                         user=user,
                         eoh=eoh,
                         assistant=assistant,
                         eoa=eoa,
                         separator=separator,
                         stop_words=stop_words)

import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from pathlib import Path
import streamlit as st
from lmdeploy import TurbomindEngineConfig, pipeline, GenerationConfig, ChatTemplateConfig
from lmdeploy.serve.async_engine import AsyncEngine
from modelscope import snapshot_download
import logging
from typing import Any, List, Optional, Iterator
import hydra
from download_models import download_model
from langchain.llms.base import LLM
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel, TextIteratorStreamer
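
# LangChain LLM wrapper: generation is delegated to the lmdeploy pipeline created
# by load_turbomind_model(); _call() returns a complete answer, while stream()
# yields incremental chunks from pipeline.stream_infer().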
class LmdeployLM(LLM):
    llm_system_prompt: str = ""
    model: AsyncEngine = None
    gen_config: GenerationConfig = None

    def __init__(self, model_path: str, llm_system_prompt: str, cache_max_entry_count: float):
        super().__init__()
        self.llm_system_prompt = llm_system_prompt
        self.model = load_turbomind_model(model_path, llm_system_prompt, cache_max_entry_count)
        self.gen_config = GenerationConfig(top_p=0.8,
                                           top_k=40,
                                           temperature=0.8,
                                           max_new_tokens=2048,
                                           repetition_penalty=1.05)

    def _call(self, prompt: str, stop: Optional[List[str]] = None, **kwargs: Any) -> str:
        response = self.model([prompt])
        return response[0].text

    def stream(self, prompt: str) -> Iterator[str]:
        ## OpenAI-style message format
        messages = [{'role': 'user', 'content': f'{prompt}'}]
        for response in self.model.stream_infer(messages, gen_config=self.gen_config):
            yield response.text

    @property
    def _llm_type(self) -> str:
        return "InternLM2"
@st.cache_resource
def load_turbomind_model(model_dir, system_prompt, cache_max_entry_count):  # supports hf and awq (4-bit) formats
    logging.info(f"Loading model from local path: {model_dir} ...")
    model_format = "hf"
    if Path(model_dir).stem.endswith("-4bit"):
        model_format = "awq"
    # model_dir = snapshot_download(model_dir, revision="master", cache_dir="./models")
    backend_config = TurbomindEngineConfig(
        model_format=model_format, session_len=32768, cache_max_entry_count=cache_max_entry_count,
    )
    pipe = pipeline(model_dir, backend_config=backend_config, log_level="ERROR", model_name="internlm2",
                    chat_template_config=ChatTemplateConfig('customized_model', meta_instruction=system_prompt))
    logging.info("Finished loading the local model")
    return pipe
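
# Example (a sketch; the path below is a placeholder): a directory name ending in
# "-4bit" is loaded as an AWQ-quantized model, anything else as a plain HF checkpoint.
#   pipe = load_turbomind_model("./models/internlm2-chat-7b-w4a16-4bit",
#                               system_prompt="You are a helpful assistant.",
#                               cache_max_entry_count=0.2)
#   print(pipe(["你是谁呀"])[0].text)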
@hydra.main(version_base=None, config_path="../configs", config_name="model_cfg")
def test_demo(config):
    model_dir = config.llm_model
    ## download the model from modelscope if it is not present locally
    if not os.path.exists(model_dir):
        download_model(llm_model_path=model_dir)
    system_prompt = config.llm_system_prompt
    cache_max_entry_count = config.cache_max_entry_count  # for the lmdeploy 4-bit model, the k/v cache ratio is lowered to 20% of total GPU memory
    question = """黑神话悟空发售时间和团队?"""
    if config.use_lmdepoly:
        ## lmdeploy inference
        ## OpenAI-style message format
        messages = [{'role': 'user', 'content': f'{question}'}]
        gen_config = GenerationConfig(top_p=0.8,
                                      top_k=40,
                                      temperature=0.8,
                                      max_new_tokens=2048,
                                      repetition_penalty=1.05)
        pipe = load_turbomind_model(model_dir, system_prompt, cache_max_entry_count)
        for response in pipe.stream_infer(messages, gen_config=gen_config):
            print(response.text, end='')
        # response = pipe(['你是谁呀', '介绍下你自己', 'Are you developed by LMDeploy?', '黑神话悟空发售时间和团队?'])
    else:
        ## normal inference
        assert not str(model_dir).endswith("w4a16-4bit"), f"{model_dir} must use lmdeploy inference"
        from rag.simple_rag import InternLM
        base_model = InternLM(model_path=model_dir, llm_system_prompt=system_prompt)
        # stream the response chunk by chunk
        if config.stream_response:
            logging.info("Streaming response:")
            for chunk in base_model.stream(question):
                print(chunk, end='', flush=True)
            print("\n")
        # print the full response at once
        else:
            response = base_model(question)
            logging.info(f"question: {question}\n wulewule answer:\n{response}")
if __name__ == "__main__":
    test_demo()