from transformers import AutoTokenizer

from faster_chat_glm import GLM6B, FasterChatGLM

MAX_OUT_LEN = 50
BATCH_SIZE = 8
USE_CACHE = True

print("Prepare config and inputs....")
chatglm6b_dir = './models'
tokenizer = AutoTokenizer.from_pretrained(chatglm6b_dir, trust_remote_code=True)

# Prompt (Chinese): "What factors should music recommendation take into account?
# Help me write a proposal of at least 800 characters." The same prompt fills the whole batch.
input_str = ["音乐推荐应该考虑哪些因素?帮我写一篇不少于800字的方案。 ", ] * BATCH_SIZE
inputs = tokenizer(input_str, return_tensors="pt", padding=True)
input_ids = inputs.input_ids.to('cuda:0')
print(input_ids.shape)

print('Loading faster model...')
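# Choose the plan file that matches the KV-cache setting and batch size.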
if USE_CACHE:
    plan_path = f'./models/glm6b-kv-cache-dy-bs{BATCH_SIZE}.ftm'
else:
    plan_path = f'./models/glm6b-bs{BATCH_SIZE}.ftm'

# Build the GLM-6B kernel from the plan file; the remaining arguments describe the ChatGLM-6B architecture.
kernel = GLM6B(plan_path=plan_path,
               batch_size=BATCH_SIZE,
               num_beams=1,
               use_cache=USE_CACHE,
               num_heads=32,
               emb_size_per_heads=128,
               decoder_layers=28,
               vocab_size=150528,
               max_seq_len=MAX_OUT_LEN)

print('Creating FasterChatGLM model...')
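# Wrap the kernel in FasterChatGLM, cast the model to fp16, and move it to the GPU.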
chat = FasterChatGLM(model_dir=chatglm6b_dir, kernel=kernel).half().cuda()

# Run batched generation.
sample_output = chat.generate(inputs=input_ids, max_length=MAX_OUT_LEN)

# Decode and print the first and the last response in the batch.
res = tokenizer.decode(sample_output[0], skip_special_tokens=True)
print(res)

res = tokenizer.decode(sample_output[BATCH_SIZE-1], skip_special_tokens=True)
print(res)