# coding=utf-8

import os

from transformers import AutoTokenizer
from lyraChatGLM import GLM6B, FasterChatGLM

current_workdir = os.path.dirname(__file__)

MAX_OUT_LEN = 100  # maximum sequence length for generation
chatglm6b_dir = os.path.join(current_workdir, "models")  # local directory with the tokenizer/model files
tokenizer = AutoTokenizer.from_pretrained(chatglm6b_dir, trust_remote_code=True)
input_str = ["为什么我们需要对深度学习模型加速?", ]  # "Why do we need to accelerate deep learning models?"
inputs = tokenizer(input_str, return_tensors="pt", padding=True)
input_ids = inputs.input_ids.to('cuda:0')

plan_path = os.path.join(current_workdir, "models", "glm6b-bs8.ftm")  # pre-built engine plan

# Build the accelerated inference kernel with ChatGLM-6B architecture parameters.
kernel = GLM6B(plan_path=plan_path,
               batch_size=1,
               num_beams=1,
               use_cache=True,
               num_heads=32,             # attention heads in ChatGLM-6B
               emb_size_per_heads=128,   # head dimension (hidden size 4096 / 32 heads)
               decoder_layers=28,        # transformer layers in ChatGLM-6B
               vocab_size=150528,        # ChatGLM-6B vocabulary size
               max_seq_len=MAX_OUT_LEN)
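# Note: the plan filename ("glm6b-bs8.ftm") suggests an engine built for batch
# size 8, while batch_size above is the runtime batch size used by this demo.
# This is an observation about the filename, not verified against the library docs.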

# Wrap the kernel in a generate()-style model, cast to fp16, and move to GPU.
chat = FasterChatGLM(model_dir=chatglm6b_dir, kernel=kernel).half().cuda()

# Generate a response for the prompt.
sample_output = chat.generate(inputs=input_ids, max_length=MAX_OUT_LEN)
# De-tokenize the model output back to text.
res = tokenizer.decode(sample_output[0], skip_special_tokens=True)
print(res)
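
# --- Optional: rough latency measurement (a sketch, not part of the original demo). ---
# Times a single generate() call on the accelerated kernel;
# torch.cuda.synchronize() ensures all GPU work has finished before the timer stops.
import time

import torch

start = time.perf_counter()
_ = chat.generate(inputs=input_ids, max_length=MAX_OUT_LEN)
torch.cuda.synchronize()
print(f"one generate() call took {time.perf_counter() - start:.3f}s")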