from transformers import AutoTokenizer

from faster_chat_glm import GLM6B, FasterChatGLM

MAX_OUT_LEN = 50
BATCH_SIZE = 8
USE_CACHE = True

print("Prepare config and inputs....")
chatglm6b_dir = './models'
tokenizer = AutoTokenizer.from_pretrained(chatglm6b_dir, trust_remote_code=True)

# Prompt (Chinese): "What factors should music recommendation take into account?
# Help me write a proposal of at least 800 characters." The same prompt fills the whole batch.
input_str = ["音乐推荐应该考虑哪些因素?帮我写一篇不少于800字的方案。 ", ] * BATCH_SIZE
inputs = tokenizer(input_str, return_tensors="pt", padding=True)
input_ids = inputs.input_ids.to('cuda:0')
print(input_ids.shape)

print('Loading faster model...')
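# Choose the plan file that matches the KV-cache setting and batch size.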
if USE_CACHE:
    plan_path = f'./models/glm6b-kv-cache-dy-bs{BATCH_SIZE}.ftm'
else:
    plan_path = f'./models/glm6b-bs{BATCH_SIZE}.ftm'

# Build the GLM-6B kernel from the plan file; the remaining arguments describe the ChatGLM-6B architecture.
kernel = GLM6B(plan_path=plan_path,
               batch_size=BATCH_SIZE,
               num_beams=1,
               use_cache=USE_CACHE,
               num_heads=32,
               emb_size_per_heads=128,
               decoder_layers=28,
               vocab_size=150528,
               max_seq_len=MAX_OUT_LEN)

print('Creating FasterChatGLM model...')
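# Wrap the kernel in FasterChatGLM, cast the model to fp16, and move it to the GPU.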
chat = FasterChatGLM(model_dir=chatglm6b_dir, kernel=kernel).half().cuda()

# Run batched generation.
sample_output = chat.generate(inputs=input_ids, max_length=MAX_OUT_LEN)

# Decode and print the first and the last response in the batch.
res = tokenizer.decode(sample_output[0], skip_special_tokens=True)
print(res)

res = tokenizer.decode(sample_output[BATCH_SIZE-1], skip_special_tokens=True)
print(res)