from lmdeploy.model import MODELS, BaseChatTemplate

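# Register a chat template named 'customized_model' with lmdeploy; it is looked up
# later via ChatTemplateConfig('customized_model', ...) when the pipeline is built.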
@MODELS.register_module(name='customized_model')
class CustomizedModel(BaseChatTemplate):
    """A customized chat template."""

    def __init__(self,
                 system='<|im_start|>system\n',
                 meta_instruction='You are a robot developed by LMDeploy.',
                 user='<|im_start|>user\n',
                 assistant='<|im_start|>assistant\n',
                 eosys='<|im_end|>\n',
                 eoh='<|im_end|>\n',
                 eoa='<|im_end|>',
                 separator='\n',
                 stop_words=['<|im_end|>', '<|action_end|>']):
        super().__init__(system=system,
                         meta_instruction=meta_instruction,
                         eosys=eosys,
                         user=user,
                         eoh=eoh,
                         assistant=assistant,
                         eoa=eoa,
                         separator=separator,
                         stop_words=stop_words)
        

import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from pathlib import Path

import streamlit as st
from lmdeploy import TurbomindEngineConfig, pipeline, GenerationConfig, ChatTemplateConfig
from lmdeploy.serve.async_engine import AsyncEngine
from modelscope import snapshot_download
import logging
from typing import Any, List, Optional, Iterator
import hydra

from download_models import download_model
from langchain.llms.base import LLM
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel, TextIteratorStreamer

class LmdeployLM(LLM):   
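    # LangChain's LLM base class is a pydantic model, so the engine handle and the
    # generation settings are declared as fields here and filled in __init__.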
    llm_system_prompt: str=""
    model: AsyncEngine = None
    gen_config: GenerationConfig = None
    def __init__(self, model_path: str, llm_system_prompt: str, cache_max_entry_count: float):
        super().__init__()
        self.llm_system_prompt = llm_system_prompt
        self.model = load_turbomind_model(model_path, llm_system_prompt, cache_max_entry_count)
        self.gen_config = GenerationConfig(top_p=0.8,
                                top_k=40,
                                temperature=0.8,
                                max_new_tokens=2048,
                                repetition_penalty=1.05)


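    # Blocking call used by LangChain: run the pipeline on a single prompt and
    # return the text of the first (and only) response.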
    def _call(self, prompt : str, stop: Optional[List[str]] = None, **kwargs: Any):
        response = self.model([prompt])
        return response[0].text
    
    def stream(self, prompt: str) -> Iterator[str]:
        ## wrap the prompt in OpenAI-style chat messages
        messages = [{'role': 'user', 'content': f'{prompt}'}]
        for response in self.model.stream_infer(messages, gen_config=self.gen_config):
            yield response.text
        
    @property
    def _llm_type(self) -> str:
        return "InternLM2"

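# Example usage of LmdeployLM (a minimal sketch; the model path, system prompt and
# cache ratio below are placeholders rather than values shipped with this repo):
#
#   llm = LmdeployLM(model_path="./models/internlm2-chat-7b",
#                    llm_system_prompt="You are a helpful assistant.",
#                    cache_max_entry_count=0.2)
#   print(llm("Hello"))                    # blocking, full answer at once
#   for chunk in llm.stream("Hello"):      # streaming, chunk by chunk
#       print(chunk, end="", flush=True)
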
@st.cache_resource
def load_turbomind_model(model_dir, system_prompt, cache_max_entry_count):  # supports both hf and awq weight formats

    logging.info(f"Loading model from local path: {model_dir} ...")

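    # Infer the weight format from the directory name: a "-4bit" suffix is treated
    # as AWQ-quantized weights, everything else as plain HuggingFace weights.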
    model_format = "hf"
    if Path(model_dir).stem.endswith("-4bit"):
        model_format = "awq"

    # model_dir = snapshot_download(model_dir, revision="master", cache_dir="./models")
    

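    # session_len caps the context window; cache_max_entry_count controls the share
    # of GPU memory reserved for the k/v cache.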
    backend_config = TurbomindEngineConfig(
        model_format=model_format, session_len=32768, cache_max_entry_count=cache_max_entry_count, 
    )

    pipe = pipeline(model_dir,
                    backend_config=backend_config,
                    log_level="ERROR",
                    model_name="internlm2",
                    chat_template_config=ChatTemplateConfig('customized_model', meta_instruction=system_prompt))

    logging.info("完成本地模型的加载") 
    return pipe


@hydra.main(version_base=None, config_path="../configs", config_name="model_cfg")
def test_demo(config):
    model_dir = config.llm_model
    ## download model from modelscope
    if not os.path.exists(model_dir):
        download_model(llm_model_path = model_dir)

    system_prompt = config.llm_system_prompt
    cache_max_entry_count = config.cache_max_entry_count  # for lmdeploy 4bit models, the k/v cache share is set to 20% of total GPU memory
    question="""黑神话悟空发售时间和团队?"""
    if config.use_lmdepoly:
        ## lmdeploy inference
        ## wrap the question in OpenAI-style chat messages
        messages = [{'role': 'user', 'content': f'{question}'}]
        gen_config = GenerationConfig(top_p=0.8,
                    top_k=40,
                    temperature=0.8,
                    max_new_tokens=2048,
                    repetition_penalty=1.05)
        pipe = load_turbomind_model(model_dir, system_prompt, cache_max_entry_count)
        for response in pipe.stream_infer(messages, gen_config=gen_config):
            print(response.text, end='')
        # response = pipe(['你是谁呀', '介绍下你自己', 'Are you developed by LMDeploy?', '黑神话悟空发售时间和团队?'])
    else:
        ## normal inference
        assert not str(model_dir).endswith("w4a16-4bit"), f"{model_dir} is w4a16 4-bit quantized and must be run with lmdeploy inference"
        from rag.simple_rag import InternLM
        base_model = InternLM(model_path=model_dir, llm_system_prompt=system_prompt)
        # stream the response chunk by chunk
        if config.stream_response:
            logging.info("Streaming response:")
            for chunk in base_model.stream(question):
                print(chunk, end='', flush=True)
            print("\n")
        # print the full answer at once
        else:
            response = base_model(question)
            logging.info(f"question: {question}\n wulewule answer:\n{response}")

if __name__ == "__main__":
    test_demo()