"""Minimal inference demo: run one prompt through a FasterChatGLM model.

The script tokenizes a single prompt, builds a ``GLM6B`` kernel from a
serialized plan file, wraps it in ``FasterChatGLM``, generates up to
``MAX_OUT_LEN`` tokens and prints the decoded text.

It needs a CUDA device plus the model files under ``./models``.
"""

from transformers import AutoTokenizer

from faster_chat_glm import GLM6B, FasterChatGLM

# Upper bound used both for the kernel's max_seq_len and generate()'s
# max_length, so the two limits stay in agreement.
MAX_OUT_LEN = 100

# Directory holding the tokenizer files and the model passed to FasterChatGLM.
chatglm6b_dir = './models'

# trust_remote_code=True allows transformers to run tokenizer code shipped
# inside the model directory; only use it with a checkpoint you trust.
tokenizer = AutoTokenizer.from_pretrained(chatglm6b_dir, trust_remote_code=True)

input_str = ["为什么我们需要对深度学习模型加速?", ]

# Tokenize to PyTorch tensors; padding=True pads to the longest prompt in
# the batch (a no-op here, since there is a single prompt).
inputs = tokenizer(input_str, return_tensors="pt", padding=True)

input_ids = inputs.input_ids.to('cuda:0')

# NOTE(review): the file name suggests a plan built for batch size 8 while
# batch_size=1 is passed below -- confirm the kernel accepts that combination.
plan_path = './models/glm6b-bs8.ftm'

# Kernel configuration. The numeric values presumably mirror the ChatGLM-6B
# architecture (heads, per-head size, layers, vocabulary) -- verify against
# the checkpoint's config before changing any of them.
kernel = GLM6B(
    plan_path=plan_path,
    batch_size=1,
    num_beams=1,
    use_cache=True,
    num_heads=32,
    emb_size_per_heads=128,
    decoder_layers=28,
    vocab_size=150528,
    max_seq_len=MAX_OUT_LEN,
)

# Reuse chatglm6b_dir so tokenizer and model are loaded from the same place.
chat = FasterChatGLM(model_dir=chatglm6b_dir, kernel=kernel).half().cuda()

sample_output = chat.generate(inputs=input_ids, max_length=MAX_OUT_LEN)

# Decode the first (and only) sequence of the batch back to text.
res = tokenizer.decode(sample_output[0], skip_special_tokens=True)

print(res)