# lyraChatGLM / demo.py
from transformers import AutoTokenizer

from faster_chat_glm import GLM6B, FasterChatGLM

# Generation / engine configuration.
MAX_OUT_LEN = 50   # maximum sequence length for generation (also the kernel's max_seq_len)
BATCH_SIZE = 8     # must match the batch size the serialized engine plan was built for
USE_CACHE = True   # select the KV-cache engine plan below
print("Prepare config and inputs....")
chatglm6b_dir = './models'
tokenizer = AutoTokenizer.from_pretrained(chatglm6b_dir, trust_remote_code=True)
input_str = ["音乐推荐应该考虑哪些因素?帮我写一篇不少于800字的方案。 ", ] * BATCH_SIZE
inputs = tokenizer(input_str, return_tensors="pt", padding=True)
input_ids = inputs.input_ids
input_ids = input_ids.to('cuda:0')
print(input_ids.shape)
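# A sketch of what the tokenizer call above returns (the exact sequence
# length is an assumption; it depends on how the prompt tokenizes):
#   inputs.input_ids      -> torch.LongTensor of shape [BATCH_SIZE, seq_len]
#   inputs.attention_mask -> same shape; 1 marks real tokens, 0 marks padding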
print('Loading faster model...')
# Pick the serialized engine plan; the KV-cache variant reuses attention
# key/value states across decoding steps instead of recomputing them.
if USE_CACHE:
    plan_path = f'./models/glm6b-kv-cache-dy-bs{BATCH_SIZE}.ftm'
else:
    plan_path = f'./models/glm6b-bs{BATCH_SIZE}.ftm'
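# Optional sanity check (a sketch, not part of the original demo): fail fast
# with a clear message if the engine plan for this batch size is missing.
import os
if not os.path.exists(plan_path):
    raise FileNotFoundError(
        f'Engine plan not found: {plan_path}. '
        'Build or download the .ftm plan for this batch size first.')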
# Fused decoding kernel configured for the ChatGLM-6B architecture:
# 28 decoder layers, 32 attention heads of size 128, and a 150528-token vocabulary.
kernel = GLM6B(plan_path=plan_path,
               batch_size=BATCH_SIZE,
               num_beams=1,
               use_cache=USE_CACHE,
               num_heads=32,
               emb_size_per_heads=128,
               decoder_layers=28,
               vocab_size=150528,
               max_seq_len=MAX_OUT_LEN)
print("test")
chat = FasterChatGLM(model_dir=chatglm6b_dir, kernel=kernel).half().cuda()
# Generate up to MAX_OUT_LEN tokens for the whole batch.
sample_output = chat.generate(inputs=input_ids, max_length=MAX_OUT_LEN)
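# Optional timing sketch (not in the original demo): repeat the call purely to
# measure per-batch latency, synchronizing so all CUDA work is counted.
import time
import torch
torch.cuda.synchronize()
t0 = time.time()
_ = chat.generate(inputs=input_ids, max_length=MAX_OUT_LEN)
torch.cuda.synchronize()
print(f'batch generate latency: {time.time() - t0:.3f}s')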
# De-tokenize the first and last sequences in the batch back to text.
res = tokenizer.decode(sample_output[0], skip_special_tokens=True)
print(res)
res = tokenizer.decode(sample_output[BATCH_SIZE - 1], skip_special_tokens=True)
print(res)
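# Alternative (a sketch): decode every sequence in the batch at once with
# tokenizer.batch_decode, which is standard on transformers tokenizers.
for i, text in enumerate(tokenizer.batch_decode(sample_output, skip_special_tokens=True)):
    print(f'--- sample {i} ---')
    print(text)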