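# wulewule/deploy/lmdeploy_model.py
# LangChain-compatible wrapper around an lmdeploy TurboMind pipeline. The module
# registers a custom chat template ('customized_model', using InternLM2-style
# <|im_start|>/<|im_end|> markers) and provides a Hydra-driven demo entry point
# (test_demo) for quick local testing.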
from lmdeploy.model import MODELS, BaseChatTemplate


@MODELS.register_module(name='customized_model')
class CustomizedModel(BaseChatTemplate):
    """A customized chat template."""

    def __init__(self,
                 system='<|im_start|>system\n',
                 meta_instruction='You are a robot developed by LMDeploy.',
                 user='<|im_start|>user\n',
                 assistant='<|im_start|>assistant\n',
                 eosys='<|im_end|>\n',
                 eoh='<|im_end|>\n',
                 eoa='<|im_end|>',
                 separator='\n',
                 stop_words=['<|im_end|>', '<|action_end|>']):
        super().__init__(system=system,
                         meta_instruction=meta_instruction,
                         eosys=eosys,
                         user=user,
                         eoh=eoh,
                         assistant=assistant,
                         eoa=eoa,
                         separator=separator,
                         stop_words=stop_words)

import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from pathlib import Path
import streamlit as st
from lmdeploy import TurbomindEngineConfig, pipeline, GenerationConfig, ChatTemplateConfig
from lmdeploy.serve.async_engine import AsyncEngine
from modelscope import snapshot_download
import logging
from typing import Any, List, Optional, Iterator
import hydra
from download_models import download_model
from langchain.llms.base import LLM
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel, TextIteratorStreamer
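
# LangChain LLM wrapper: generation is delegated to the lmdeploy pipeline created
# by load_turbomind_model(); _call() returns a complete answer, while stream()
# yields incremental chunks from pipeline.stream_infer().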
class LmdeployLM(LLM):
    llm_system_prompt: str = ""
    model: AsyncEngine = None
    gen_config: GenerationConfig = None

    def __init__(self, model_path: str, llm_system_prompt: str, cache_max_entry_count: float):
        super().__init__()
        self.llm_system_prompt = llm_system_prompt
        self.model = load_turbomind_model(model_path, llm_system_prompt, cache_max_entry_count)
        self.gen_config = GenerationConfig(top_p=0.8,
                                           top_k=40,
                                           temperature=0.8,
                                           max_new_tokens=2048,
                                           repetition_penalty=1.05)

    def _call(self, prompt: str, stop: Optional[List[str]] = None, **kwargs: Any) -> str:
        response = self.model([prompt])
        return response[0].text

    def stream(self, prompt: str) -> Iterator[str]:
        ## OpenAI-style message format
        messages = [{'role': 'user', 'content': f'{prompt}'}]
        for response in self.model.stream_infer(messages, gen_config=self.gen_config):
            yield response.text

    @property
    def _llm_type(self) -> str:
        return "InternLM2"
@st.cache_resource
def load_turbomind_model(model_dir, system_prompt, cache_max_entry_count):  # supports hf and awq (4-bit) formats
    logging.info(f"Loading model from local path: {model_dir} ...")
    model_format = "hf"
    if Path(model_dir).stem.endswith("-4bit"):
        model_format = "awq"
    # model_dir = snapshot_download(model_dir, revision="master", cache_dir="./models")
    backend_config = TurbomindEngineConfig(
        model_format=model_format, session_len=32768, cache_max_entry_count=cache_max_entry_count,
    )
    pipe = pipeline(model_dir, backend_config=backend_config, log_level="ERROR", model_name="internlm2",
                    chat_template_config=ChatTemplateConfig('customized_model', meta_instruction=system_prompt))
    logging.info("Finished loading the local model")
    return pipe
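
# Example (a sketch; the path below is a placeholder): a directory name ending in
# "-4bit" is loaded as an AWQ-quantized model, anything else as a plain HF checkpoint.
#   pipe = load_turbomind_model("./models/internlm2-chat-7b-w4a16-4bit",
#                               system_prompt="You are a helpful assistant.",
#                               cache_max_entry_count=0.2)
#   print(pipe(["你是谁呀"])[0].text)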
@hydra.main(version_base=None, config_path="../configs", config_name="model_cfg")
def test_demo(config):
    model_dir = config.llm_model
    ## download the model from modelscope if it is not present locally
    if not os.path.exists(model_dir):
        download_model(llm_model_path=model_dir)
    system_prompt = config.llm_system_prompt
    cache_max_entry_count = config.cache_max_entry_count  # for the lmdeploy 4-bit model, the k/v cache ratio is lowered to 20% of total GPU memory
    question = """黑神话悟空发售时间和团队?"""
    if config.use_lmdepoly:
        ## lmdeploy inference
        ## OpenAI-style message format
        messages = [{'role': 'user', 'content': f'{question}'}]
        gen_config = GenerationConfig(top_p=0.8,
                                      top_k=40,
                                      temperature=0.8,
                                      max_new_tokens=2048,
                                      repetition_penalty=1.05)
        pipe = load_turbomind_model(model_dir, system_prompt, cache_max_entry_count)
        for response in pipe.stream_infer(messages, gen_config=gen_config):
            print(response.text, end='')
        # response = pipe(['你是谁呀', '介绍下你自己', 'Are you developed by LMDeploy?', '黑神话悟空发售时间和团队?'])
    else:
        ## normal inference
        assert not str(model_dir).endswith("w4a16-4bit"), f"{model_dir} must use lmdeploy inference"
        from rag.simple_rag import InternLM
        base_model = InternLM(model_path=model_dir, llm_system_prompt=system_prompt)
        # stream the response chunk by chunk
        if config.stream_response:
            logging.info("Streaming response:")
            for chunk in base_model.stream(question):
                print(chunk, end='', flush=True)
            print("\n")
        # print the full response at once
        else:
            response = base_model(question)
            logging.info(f"question: {question}\n wulewule answer:\n{response}")
if __name__ == "__main__":
    test_demo()