import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

model_path = "Crystalcareai/Quiet-Star-Custom"

# Thought/talk look-ahead settings; max_thoughts below is derived from them.
n_ahead = 8
n_ahead_talk = 4
merged_talk_heads = True

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    max_thoughts=n_ahead + n_ahead_talk + 1,
    merged_talk_heads=merged_talk_heads,
    merged_lm_and_talk_heads=False,
    merged_lm_and_think_heads=True,
    use_concat_talk_head=True,
    use_shallow_think=True,
    use_shallow_talk=False,
    use_complex_think_head=False,
    use_complex_talk_head=True,
    use_weighted_talk_head=True,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
model.eval()

tokenizer = AutoTokenizer.from_pretrained(model_path)
model.tokenizer = tokenizer  # Set the tokenizer attribute of the model

# Stream generated tokens to stdout as they are produced.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)

# Convert the prompt to tokens using the [INST] instruction template.
prompt_template = "[INST] {prompt} [/INST]"
prompt = "It is not always easy to see who is related to whom -- and in which ways. The following argument pertains to this question: To begin with, Lesley is a close friend of Fernando. Moreover, being a close friend of Fernando or a schoolmate of Lowell is sufficient for being a great-grandfather of Leroy. It follows that Lesley is a great-grandfather of Leroy. Is the argument, given the explicitly stated premises, deductively valid or invalid?"

input_ids = tokenizer(
    prompt_template.format(prompt=prompt),
    return_tensors="pt",
).input_ids.to(model.device)
attention_mask = torch.ones_like(input_ids)

max_length = 256

# The custom generate returns a (sequences, ...) tuple rather than a plain tensor.
output_ids, _ = model.generate(
    input_ids,
    attention_mask=attention_mask,
    max_length=max_length,
    streamer=streamer,
)

print(tokenizer.decode(output_ids[0], skip_special_tokens=False))
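
# For repeated queries it can be convenient to wrap the prompt formatting, generation,
# and decoding in a small helper. This is a minimal sketch built on the objects above;
# the helper name `ask` is illustrative only, and it assumes the custom `generate` keeps
# the tuple return seen above and, like standard `transformers` generation, includes the
# prompt tokens at the start of the returned sequence.
def ask(question: str, max_length: int = 256) -> str:
    """Format `question` with the [INST] template, generate, and return only the continuation."""
    ids = tokenizer(
        prompt_template.format(prompt=question),
        return_tensors="pt",
    ).input_ids.to(model.device)
    mask = torch.ones_like(ids)
    generated, _ = model.generate(ids, attention_mask=mask, max_length=max_length)
    # Drop the prompt tokens; keep special tokens so any thought/talk markers stay visible.
    return tokenizer.decode(generated[0][ids.shape[-1]:], skip_special_tokens=False)

# Example usage with a different reasoning-style question.
print(ask("Every even number greater than 2 is the sum of two primes. Is this claim proven or an open conjecture?"))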