from transformers import AutoTokenizer

from faster_chat_glm import GLM6B, FasterChatGLM

# Generation and batching settings.
MAX_OUT_LEN = 50
BATCH_SIZE = 8
USE_CACHE = True

print("Prepare config and inputs....")
chatglm6b_dir = './models'
tokenizer = AutoTokenizer.from_pretrained(chatglm6b_dir, trust_remote_code=True)

# Prompt (Chinese): "What factors should a music recommendation take into account?
# Write me a proposal of at least 800 words." Repeated to fill the batch.
input_str = ["音乐推荐应该考虑哪些因素?帮我写一篇不少于800字的方案。 ", ] * BATCH_SIZE

# Tokenize the prompt batch and move the input ids to the GPU.
inputs = tokenizer(input_str, return_tensors="pt", padding=True)
input_ids = inputs.input_ids
input_ids = input_ids.to('cuda:0')
print(input_ids.shape)

print('Loading faster model...')
# Pick the engine plan that matches the batch size and KV-cache setting.
if USE_CACHE:
    plan_path = f'./models/glm6b-kv-cache-dy-bs{BATCH_SIZE}.ftm'
else:
    plan_path = f'./models/glm6b-bs{BATCH_SIZE}.ftm'

# Kernel for the chat model, configured for the ChatGLM-6B architecture.
kernel = GLM6B(plan_path=plan_path,
               batch_size=BATCH_SIZE,
               num_beams=1,
               use_cache=USE_CACHE,
               num_heads=32,
               emb_size_per_heads=128,
               decoder_layers=28,
               vocab_size=150528,
               max_seq_len=MAX_OUT_LEN)

# Wrap the kernel in the chat interface and move it to the GPU in half precision.
chat = FasterChatGLM(model_dir=chatglm6b_dir, kernel=kernel).half().cuda()

# Generate.
sample_output = chat.generate(inputs=input_ids, max_length=MAX_OUT_LEN)

# De-tokenize the model output back to text: print the first and last samples of the batch.
res = tokenizer.decode(sample_output[0], skip_special_tokens=True)
print(res)
res = tokenizer.decode(sample_output[BATCH_SIZE - 1], skip_special_tokens=True)
print(res)