from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

hf_model = "law-llm/law-glm-10b"
model_cache_dir = "./model_cache"  # local directory for downloaded weights; adjust to your setup
max_question_length = 64
max_generation_length = 490

# Load the GLM tokenizer and model (trust_remote_code is needed for the custom GLM classes).
tokenizer = AutoTokenizer.from_pretrained(
    hf_model,
    cache_dir=model_cache_dir,
    use_fast=True,
    trust_remote_code=True,
)
model = AutoModelForSeq2SeqLM.from_pretrained(
    hf_model,
    cache_dir=model_cache_dir,
    trust_remote_code=True,
)
model = model.to('cuda')
model.eval()

# GLM-style prompt: the model fills in the [gMASK] span.
# "提问: 犯了盗窃罪怎么判刑? 回答:" means "Question: What is the sentence for theft? Answer:"
prompt = "提问: 犯了盗窃罪怎么判刑? 回答: [gMASK]"
model_inputs = tokenizer(
    prompt,
    max_length=max_question_length,
    padding=True,
    truncation=True,
    return_tensors="pt",
)
# Append the generation slots that GLM expects after the [gMASK] token.
model_inputs = tokenizer.build_inputs_for_generation(
    model_inputs,
    targets=None,
    max_gen_length=max_generation_length,
    padding=True,
)
inputs = model_inputs.to('cuda')

outputs = model.generate(
    **inputs,
    max_length=max_generation_length,
    eos_token_id=tokenizer.eop_token_id,
)
prediction = tokenizer.decode(outputs[0].tolist())
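
# The decoded string contains the original prompt followed by the generated
# continuation. As a minimal sketch, assuming this checkpoint uses the standard
# GLM "<|startofpiece|>" / "<|endofpiece|>" markers (verify against the
# tokenizer's special tokens), the answer portion can be isolated like this:
answer = prediction.split("<|startofpiece|>")[-1].replace("<|endofpiece|>", "").strip()
print(answer)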