from lmdeploy.model import MODELS, BaseChatTemplate


@MODELS.register_module(name='customized_model')
class CustomizedModel(BaseChatTemplate):
    """A customized chat template."""

    def __init__(self,
                 system='<|im_start|>system\n',
                 meta_instruction='You are a robot developed by LMDeploy.',
                 user='<|im_start|>user\n',
                 assistant='<|im_start|>assistant\n',
                 eosys='<|im_end|>\n',
                 eoh='<|im_end|>\n',
                 eoa='<|im_end|>',
                 separator='\n',
                 stop_words=['<|im_end|>', '<|action_end|>']):
        super().__init__(system=system,
                         meta_instruction=meta_instruction,
                         eosys=eosys,
                         user=user,
                         eoh=eoh,
                         assistant=assistant,
                         eoa=eoa,
                         separator=separator,
                         stop_words=stop_words)
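
# Note: registering this template under the name 'customized_model' lets the pipeline
# pick it up by name via ChatTemplateConfig('customized_model', ...); load_turbomind_model
# below only overrides the meta_instruction (system prompt) at load time.
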
import os
import sys
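
# make the project root importable so that sibling modules (download_models, rag) resolve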
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from pathlib import Path
import streamlit as st
from lmdeploy import TurbomindEngineConfig, pipeline, GenerationConfig, ChatTemplateConfig
from lmdeploy.serve.async_engine import AsyncEngine
from modelscope import snapshot_download
import logging
from typing import Any, List, Optional, Iterator
import hydra
from download_models import download_model
from langchain.llms.base import LLM
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel, TextIteratorStreamer
class LmdeployLM(LLM):
    """LangChain LLM wrapper around an LMDeploy turbomind pipeline."""

    llm_system_prompt: str = ""
    model: AsyncEngine = None
    gen_config: GenerationConfig = None

    def __init__(self, model_path: str, llm_system_prompt: str, cache_max_entry_count: float):
        super().__init__()
        self.llm_system_prompt = llm_system_prompt
        self.model = load_turbomind_model(model_path, llm_system_prompt, cache_max_entry_count)
        self.gen_config = GenerationConfig(top_p=0.8,
                                           top_k=40,
                                           temperature=0.8,
                                           max_new_tokens=2048,
                                           repetition_penalty=1.05)

    def _call(self, prompt: str, stop: Optional[List[str]] = None, **kwargs: Any) -> str:
        # blocking, non-streaming inference on a single prompt
        response = self.model([prompt])
        return response[0].text

    def stream(self, prompt: str) -> Iterator[str]:
        # messages in OpenAI chat format
        messages = [{'role': 'user', 'content': f'{prompt}'}]
        for response in self.model.stream_infer(messages, gen_config=self.gen_config):
            yield response.text

    @property
    def _llm_type(self) -> str:
        return "InternLM2"
@st.cache_resource
def load_turbomind_model(model_dir, system_prompt, cache_max_entry_count):  # hf or awq weights
    logging.info(f"Loading model from local path: {model_dir} ...")
    # directories ending in "-4bit" hold AWQ-quantized weights; otherwise plain HF weights
    model_format = "hf"
    if Path(model_dir).stem.endswith("-4bit"):
        model_format = "awq"
    # model_dir = snapshot_download(model_dir, revision="master", cache_dir="./models")
    backend_config = TurbomindEngineConfig(
        model_format=model_format, session_len=32768, cache_max_entry_count=cache_max_entry_count,
    )
    pipe = pipeline(model_dir, backend_config=backend_config, log_level="ERROR", model_name="internlm2",
                    chat_template_config=ChatTemplateConfig('customized_model', meta_instruction=system_prompt))
    logging.info("Finished loading the local model")
    return pipe
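
# Example (sketch, with a placeholder model directory): the cached pipeline can be called
# directly on a batch of prompts,
#   pipe = load_turbomind_model("./models/internlm2-chat-7b-w4a16-4bit", "You are a helpful assistant.", 0.2)
#   responses = pipe(["Who are you?", "Are you developed by LMDeploy?"])
# or consumed token by token via pipe.stream_infer(...), as test_demo below does.
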
@hydra.main(version_base=None, config_path="../configs", config_name="model_cfg")
def test_demo(config):
    model_dir = config.llm_model
    ## download the model from ModelScope if it is not present locally
    if not os.path.exists(model_dir):
        download_model(llm_model_path=model_dir)
    system_prompt = config.llm_system_prompt
    cache_max_entry_count = config.cache_max_entry_count  # for lmdeploy 4-bit, the k/v cache share is set to 20% of total GPU memory
    question = """黑神话悟空发售时间和团队?"""
    if config.use_lmdepoly:  # key name kept as spelled in the config file
        ## lmdeploy inference
        ## messages in OpenAI chat format
        messages = [{'role': 'user', 'content': f'{question}'}]
        gen_config = GenerationConfig(top_p=0.8,
                                      top_k=40,
                                      temperature=0.8,
                                      max_new_tokens=2048,
                                      repetition_penalty=1.05)
        pipe = load_turbomind_model(model_dir, system_prompt, cache_max_entry_count)
        for response in pipe.stream_infer(messages, gen_config=gen_config):
            print(response.text, end='')
        # response = pipe(['你是谁呀', '介绍下你自己', 'Are you developed by LMDeploy?', '黑神话悟空发售时间和团队?'])
    else:
        ## plain HuggingFace inference
        assert not str(model_dir).endswith("w4a16-4bit"), f"{model_dir} is a 4-bit model and must use lmdeploy inference"
        from rag.simple_rag import InternLM
        base_model = InternLM(model_path=model_dir, llm_system_prompt=system_prompt)
        # stream the result chunk by chunk
        if config.stream_response:
            logging.info("Streaming response:")
            for chunk in base_model.stream(question):
                print(chunk, end='', flush=True)
            print("\n")
        # print the full answer at once
        else:
            response = base_model(question)
            logging.info(f"question: {question}\n wulewule answer:\n{response}")


if __name__ == "__main__":
    test_demo()