Crystalcareai committed
Commit 2f4f471
1 Parent(s): ce8a2da

Update inference.py

Files changed (1):
  1. inference.py (+40 -49)
inference.py CHANGED
@@ -1,57 +1,48 @@
+import gc
 import torch
-
-from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
-
-# Load the pre-trained model and tokenizer
-model_name = "Crystalcareai/Quiet-Star-Custom"
-model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, ignore_mismatched_sizes=True)
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-
-# Set the tokenizer in the model
-model.tokenizer = tokenizer
-
-
+from tqdm import tqdm
+from transformers import AutoTokenizer, TextStreamer, AutoModelForCausalLM, AutoConfig
+
+model_path = "Crystalcareai/Quiet-Star-Custom"
+
+# Load model
+config = AutoConfig.from_pretrained(model_path, max_position_embeddings=2048, use_cache=False, trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(
+    model_path,
+    config=config,
+    device_map="auto",
+    low_cpu_mem_usage=True,
+    torch_dtype=torch.bfloat16,
+    trust_remote_code=True,
+)
+
+tokenizer = AutoTokenizer.from_pretrained(model_path)
+model.tokenizer = tokenizer  # Assign the tokenizer to the model instance
+streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)
+
+# Convert prompt to tokens
 prompt_template = "[INST] {prompt} [/INST]"
-prompt = "This is a reasoning problem. You're standing on the surface of the Earth. " \
-         "You walk one mile south, one mile west and one mile north. " \
-         "You end up exactly where you started. Where are EXACTLY on earth you?"
-
-input_text = prompt
-input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
-attention_mask = torch.ones_like(input_ids).to(device)
 
-streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+prompt = "You're standing on the surface of the Earth. "\
+         "You walk one mile south, one mile west and one mile north. "\
+         "You end up exactly where you started. Where are you?"
 
-tokens = tokenizer(
+input_ids = tokenizer(
     prompt_template.format(prompt=prompt),
     return_tensors='pt'
 ).input_ids.cuda()
 
-# Generate the output using the generate method
-with torch.no_grad():
-    generated_outputs = model.generate(
-        input_ids=input_ids,
-        attention_mask=attention_mask,
-        max_length=1024,
-        num_return_sequences=1,
-        no_repeat_ngram_size=2,
-        early_stopping=True,
-        use_cache=True,
-        num_beams=1,
-        temperature=0.2,
-        repetition_penalty=1.2,
-        length_penalty=1.0,
-        pad_token_id=tokenizer.eos_token_id,
-        eos_token_id=tokenizer.eos_token_id,
-        output_attentions=False,
-        output_hidden_states=False,
-        return_dict_in_generate=True,
-        streamer=streamer,
-    )
-
-# Decode the generated output
-generated_text = tokenizer.decode(generated_outputs.sequences[0], skip_special_tokens=True)
-
-# Print the generated output
-print("Generated output:")
-print(generated_text)
+# Generate output
+generation_output = model.generate(
+    input_ids,
+    max_length=1024,
+    do_sample=True,
+    top_k=50,
+    top_p=0.95,
+    num_return_sequences=1,
+    streamer=streamer,
+)
+
+# Decode the output
+generated_text = tokenizer.batch_decode(generation_output, skip_special_tokens=True)[0]
+print(generated_text)
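
One detail worth flagging in the new version: the model is loaded with device_map="auto", but the prompt tokens are still moved with a hard-coded .cuda(). That is fine on a single GPU; if accelerate ever shards the model across several devices, the usual pattern is to send inputs to model.device, the device of the first parameter shard. A minimal sketch of that adjustment, not part of this commit, reusing prompt_template, prompt, and tokenizer from the updated script above:

    # Hypothetical tweak, not part of this commit: place the prompt tokens on
    # whatever device the model's first shard lives on. model.device is a
    # standard transformers property, so this works on one GPU or several.
    input_ids = tokenizer(
        prompt_template.format(prompt=prompt),
        return_tensors="pt",
    ).input_ids.to(model.device)

The decoding change is also worth a note: the removed call set temperature=0.2 without do_sample=True, so generation stayed greedy (num_beams=1) and the temperature had no effect, while the new call enables do_sample=True with top_k/top_p, so sampling actually happens. max_length=1024 still counts the prompt tokens toward the limit; max_new_tokens is the usual alternative when only the completion length should be capped. (The newly added gc and tqdm imports are unused in this revision.)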