from lmdeploy.model import MODELS, BaseChatTemplate


@MODELS.register_module(name='customized_model')
class CustomizedModel(BaseChatTemplate):
    """A customized chat template."""

    def __init__(self,
                 system='<|im_start|>system\n',
                 meta_instruction='You are a robot developed by LMDeploy.',
                 user='<|im_start|>user\n',
                 assistant='<|im_start|>assistant\n',
                 eosys='<|im_end|>\n',
                 eoh='<|im_end|>\n',
                 eoa='<|im_end|>',
                 separator='\n',
                 stop_words=['<|im_end|>', '<|action_end|>']):
        super().__init__(system=system,
                         meta_instruction=meta_instruction,
                         eosys=eosys,
                         user=user,
                         eoh=eoh,
                         assistant=assistant,
                         eoa=eoa,
                         separator=separator,
                         stop_words=stop_words)
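
# Note: registering this template under the name 'customized_model' lets the pipeline
# pick it up by name via ChatTemplateConfig('customized_model', ...); load_turbomind_model
# below only overrides the meta_instruction (system prompt) at load time.
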
import os
import sys
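
# make the project root importable so that sibling modules (download_models, rag) resolve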
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from pathlib import Path
import streamlit as st
from lmdeploy import TurbomindEngineConfig, pipeline, GenerationConfig, ChatTemplateConfig
from lmdeploy.serve.async_engine import AsyncEngine
from modelscope import snapshot_download
import logging
from typing import Any, List, Optional, Iterator
import hydra
from download_models import download_model
from langchain.llms.base import LLM
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel, TextIteratorStreamer
class LmdeployLM(LLM):
    """LangChain LLM wrapper around an LMDeploy turbomind pipeline."""

    llm_system_prompt: str = ""
    model: AsyncEngine = None
    gen_config: GenerationConfig = None

    def __init__(self, model_path: str, llm_system_prompt: str, cache_max_entry_count: float):
        super().__init__()
        self.llm_system_prompt = llm_system_prompt
        self.model = load_turbomind_model(model_path, llm_system_prompt, cache_max_entry_count)
        self.gen_config = GenerationConfig(top_p=0.8,
                                           top_k=40,
                                           temperature=0.8,
                                           max_new_tokens=2048,
                                           repetition_penalty=1.05)

    def _call(self, prompt: str, stop: Optional[List[str]] = None, **kwargs: Any) -> str:
        # blocking, non-streaming inference on a single prompt
        response = self.model([prompt])
        return response[0].text

    def stream(self, prompt: str) -> Iterator[str]:
        # messages in OpenAI chat format
        messages = [{'role': 'user', 'content': f'{prompt}'}]
        for response in self.model.stream_infer(messages, gen_config=self.gen_config):
            yield response.text

    @property
    def _llm_type(self) -> str:
        return "InternLM2"
@st.cache_resource
def load_turbomind_model(model_dir, system_prompt, cache_max_entry_count):  # hf or awq weights
    logging.info(f"Loading model from local path: {model_dir} ...")
    # directories ending in "-4bit" hold AWQ-quantized weights; otherwise plain HF weights
    model_format = "hf"
    if Path(model_dir).stem.endswith("-4bit"):
        model_format = "awq"
    # model_dir = snapshot_download(model_dir, revision="master", cache_dir="./models")
    backend_config = TurbomindEngineConfig(
        model_format=model_format, session_len=32768, cache_max_entry_count=cache_max_entry_count,
    )
    pipe = pipeline(model_dir, backend_config=backend_config, log_level="ERROR", model_name="internlm2",
                    chat_template_config=ChatTemplateConfig('customized_model', meta_instruction=system_prompt))
    logging.info("Finished loading the local model")
    return pipe
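
# Example (sketch, with a placeholder model directory): the cached pipeline can be called
# directly on a batch of prompts,
#   pipe = load_turbomind_model("./models/internlm2-chat-7b-w4a16-4bit", "You are a helpful assistant.", 0.2)
#   responses = pipe(["Who are you?", "Are you developed by LMDeploy?"])
# or consumed token by token via pipe.stream_infer(...), as test_demo below does.
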
@hydra.main(version_base=None, config_path="../configs", config_name="model_cfg")
def test_demo(config):
    model_dir = config.llm_model
    ## download the model from ModelScope if it is not present locally
    if not os.path.exists(model_dir):
        download_model(llm_model_path=model_dir)
    system_prompt = config.llm_system_prompt
    cache_max_entry_count = config.cache_max_entry_count  # for lmdeploy 4-bit, the k/v cache share is set to 20% of total GPU memory
    question = """黑神话悟空发售时间和团队?"""
    if config.use_lmdepoly:  # key name kept as spelled in the config file
        ## lmdeploy inference
        ## messages in OpenAI chat format
        messages = [{'role': 'user', 'content': f'{question}'}]
        gen_config = GenerationConfig(top_p=0.8,
                                      top_k=40,
                                      temperature=0.8,
                                      max_new_tokens=2048,
                                      repetition_penalty=1.05)
        pipe = load_turbomind_model(model_dir, system_prompt, cache_max_entry_count)
        for response in pipe.stream_infer(messages, gen_config=gen_config):
            print(response.text, end='')
        # response = pipe(['你是谁呀', '介绍下你自己', 'Are you developed by LMDeploy?', '黑神话悟空发售时间和团队?'])
    else:
        ## plain HuggingFace inference
        assert not str(model_dir).endswith("w4a16-4bit"), f"{model_dir} is a 4-bit model and must use lmdeploy inference"
        from rag.simple_rag import InternLM
        base_model = InternLM(model_path=model_dir, llm_system_prompt=system_prompt)
        # stream the result chunk by chunk
        if config.stream_response:
            logging.info("Streaming response:")
            for chunk in base_model.stream(question):
                print(chunk, end='', flush=True)
            print("\n")
        # print the full answer at once
        else:
            response = base_model(question)
            logging.info(f"question: {question}\n wulewule answer:\n{response}")


if __name__ == "__main__":
    test_demo()