from lmdeploy.model import MODELS, BaseChatTemplate


@MODELS.register_module(name='customized_model')
class CustomizedModel(BaseChatTemplate):
    """A customized chat template."""

    def __init__(self,
                 system='<|im_start|>system\n',
                 meta_instruction='You are a robot developed by LMDeploy.',
                 user='<|im_start|>user\n',
                 assistant='<|im_start|>assistant\n',
                 eosys='<|im_end|>\n',
                 eoh='<|im_end|>\n',
                 eoa='<|im_end|>',
                 separator='\n',
                 stop_words=['<|im_end|>', '<|action_end|>']):
        super().__init__(system=system,
                         meta_instruction=meta_instruction,
                         eosys=eosys,
                         user=user,
                         eoh=eoh,
                         assistant=assistant,
                         eoa=eoa,
                         separator=separator,
                         stop_words=stop_words)


import os
import sys

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import logging
from pathlib import Path
from typing import Any, Iterator, List, Optional

import hydra
import streamlit as st
from langchain.llms.base import LLM
from lmdeploy import (ChatTemplateConfig, GenerationConfig,
                      TurbomindEngineConfig, pipeline)
from lmdeploy.serve.async_engine import AsyncEngine

from download_models import download_model


class LmdeployLM(LLM):
    """LangChain-compatible wrapper around an lmdeploy TurboMind pipeline."""

    llm_system_prompt: str = ""
    model: AsyncEngine = None
    gen_config: GenerationConfig = None

    def __init__(self, model_path: str, llm_system_prompt: str, cache_max_entry_count: float):
        super().__init__()
        self.llm_system_prompt = llm_system_prompt
        self.model = load_turbomind_model(model_path, llm_system_prompt, cache_max_entry_count)
        self.gen_config = GenerationConfig(top_p=0.8,
                                           top_k=40,
                                           temperature=0.8,
                                           max_new_tokens=2048,
                                           repetition_penalty=1.05)

    def _call(self, prompt: str, stop: Optional[List[str]] = None, **kwargs: Any) -> str:
        # Blocking inference: the pipeline returns one Response per input prompt.
        response = self.model([prompt])
        return response[0].text

    def stream(self, prompt: str) -> Iterator[str]:
        # Yield the response incrementally as tokens are generated.
        messages = [{'role': 'user', 'content': f'{prompt}'}]
        for response in self.model.stream_infer(messages, gen_config=self.gen_config):
            yield response.text

    @property
    def _llm_type(self) -> str:
        return "InternLM2"


@st.cache_resource
def load_turbomind_model(model_dir, system_prompt, cache_max_entry_count):
    """Build a TurboMind pipeline, cached across Streamlit reruns."""
    logging.info(f"Loading the model from local path: {model_dir} ...")

    # Directories whose names end in "-4bit" are treated as AWQ-quantized
    # weights; everything else is loaded as plain HF-format weights.
    model_format = "hf"
    if Path(model_dir).stem.endswith("-4bit"):
        model_format = "awq"

    backend_config = TurbomindEngineConfig(
        model_format=model_format,
        session_len=32768,
        cache_max_entry_count=cache_max_entry_count,
    )

    pipe = pipeline(model_dir,
                    backend_config=backend_config,
                    log_level="ERROR",
                    model_name="internlm2",
                    chat_template_config=ChatTemplateConfig('customized_model',
                                                            meta_instruction=system_prompt))

    logging.info("Finished loading the local model")
    return pipe
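

# The returned pipeline can also be called without streaming; a short sketch
# (the path and prompt text are illustrative):
#
#     pipe = load_turbomind_model("models/internlm2-chat-7b",
#                                 "You are a helpful assistant.", 0.2)
#     print(pipe(["Who are you?"])[0].text)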


@hydra.main(version_base=None, config_path="../configs", config_name="model_cfg")
def test_demo(config):
    model_dir = config.llm_model

    # Fetch the weights first if they are not available locally.
    if not os.path.exists(model_dir):
        download_model(llm_model_path=model_dir)

    system_prompt = config.llm_system_prompt
    cache_max_entry_count = config.cache_max_entry_count
    question = """黑神话悟空发售时间和团队?"""  # "Black Myth: Wukong's release date and development team?"

    if config.use_lmdepoly:
        # Inference through the lmdeploy TurboMind pipeline, streaming tokens.
        messages = [{'role': 'user', 'content': f'{question}'}]
        gen_config = GenerationConfig(top_p=0.8,
                                      top_k=40,
                                      temperature=0.8,
                                      max_new_tokens=2048,
                                      repetition_penalty=1.05)
        pipe = load_turbomind_model(model_dir, system_prompt, cache_max_entry_count)
        for response in pipe.stream_infer(messages, gen_config=gen_config):
            print(response.text, end='')

    else:
        # The plain HF/transformers path cannot load AWQ 4-bit weights.
        assert not str(model_dir).endswith("w4a16-4bit"), f"{model_dir} must use lmdeploy inference"
        from rag.simple_rag import InternLM
        base_model = InternLM(model_path=model_dir, llm_system_prompt=system_prompt)

        if config.stream_response:
            logging.info("Streaming response:")
            for chunk in base_model.stream(question):
                print(chunk, end='', flush=True)
            print("\n")
        else:
            response = base_model(question)
            logging.info(f"question: {question}\n wulewule answer:\n{response}")


if __name__ == "__main__":
    test_demo()
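
# To try this from the command line, Hydra accepts key=value overrides for
# anything defined in ../configs/model_cfg.yaml (the script name below is a
# placeholder; the keys are the ones read in test_demo above):
#
#     python this_script.py use_lmdepoly=true stream_response=true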