Crystalcareai committed
Commit 2f4f471
1 Parent(s): ce8a2da

Update inference.py

Files changed (1):
  1. inference.py (+40 -49)
inference.py CHANGED
@@ -1,57 +1,48 @@
+import gc
 import torch
-
-from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
-
-# Load the pre-trained model and tokenizer
-model_name = "Crystalcareai/Quiet-Star-Custom"
-model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, ignore_mismatched_sizes=True)
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-
-# Set the tokenizer in the model
-model.tokenizer = tokenizer
-
-
+from tqdm import tqdm
+from transformers import AutoTokenizer, TextStreamer, AutoModelForCausalLM, AutoConfig
+
+model_path = "Crystalcareai/Quiet-Star-Custom"
+
+# Load model
+config = AutoConfig.from_pretrained(model_path, max_position_embeddings=2048, use_cache=False, trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(
+    model_path,
+    config=config,
+    device_map="auto",
+    low_cpu_mem_usage=True,
+    torch_dtype=torch.bfloat16,
+    trust_remote_code=True,
+)
+
+tokenizer = AutoTokenizer.from_pretrained(model_path)
+model.tokenizer = tokenizer  # Assign the tokenizer to the model instance
+streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)
+
+# Convert prompt to tokens
 prompt_template = "[INST] {prompt} [/INST]"
-prompt = "This is a reasoning problem. You're standing on the surface of the Earth. " \
-         "You walk one mile south, one mile west and one mile north. " \
-         "You end up exactly where you started. Where are EXACTLY on earth you?"
-
-input_text = prompt
-input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
-attention_mask = torch.ones_like(input_ids).to(device)
 
-streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+prompt = "You're standing on the surface of the Earth. "\
+         "You walk one mile south, one mile west and one mile north. "\
+         "You end up exactly where you started. Where are you?"
 
-tokens = tokenizer(
+input_ids = tokenizer(
     prompt_template.format(prompt=prompt),
     return_tensors='pt'
 ).input_ids.cuda()
 
-# Generate the output using the generate method
-with torch.no_grad():
-    generated_outputs = model.generate(
-        input_ids=input_ids,
-        attention_mask=attention_mask,
-        max_length=1024,
-        num_return_sequences=1,
-        no_repeat_ngram_size=2,
-        early_stopping=True,
-        use_cache=True,
-        num_beams=1,
-        temperature=0.2,
-        repetition_penalty=1.2,
-        length_penalty=1.0,
-        pad_token_id=tokenizer.eos_token_id,
-        eos_token_id=tokenizer.eos_token_id,
-        output_attentions=False,
-        output_hidden_states=False,
-        return_dict_in_generate=True,
-        streamer=streamer,
-    )
-
-# Decode the generated output
-generated_text = tokenizer.decode(generated_outputs.sequences[0], skip_special_tokens=True)
-
-# Print the generated output
-print("Generated output:")
-print(generated_text)
+# Generate output
+generation_output = model.generate(
+    input_ids,
+    max_length=1024,
+    do_sample=True,
+    top_k=50,
+    top_p=0.95,
+    num_return_sequences=1,
+    streamer=streamer,
+)
+
+# Decode the output
+generated_text = tokenizer.batch_decode(generation_output, skip_special_tokens=True)[0]
+print(generated_text)
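
One detail worth flagging in the new version: the model is loaded with device_map="auto", but the prompt tokens are still moved with a hard-coded .cuda(). That is fine on a single GPU; if accelerate ever shards the model across several devices, the usual pattern is to send inputs to model.device, the device of the first parameter shard. A minimal sketch of that adjustment, not part of this commit, reusing prompt_template, prompt, and tokenizer from the updated script above:

    # Hypothetical tweak, not part of this commit: place the prompt tokens on
    # whatever device the model's first shard lives on. model.device is a
    # standard transformers property, so this works on one GPU or several.
    input_ids = tokenizer(
        prompt_template.format(prompt=prompt),
        return_tensors="pt",
    ).input_ids.to(model.device)

The decoding change is also worth a note: the removed call set temperature=0.2 without do_sample=True, so generation stayed greedy (num_beams=1) and the temperature had no effect, while the new call enables do_sample=True with top_k/top_p, so sampling actually happens. max_length=1024 still counts the prompt tokens toward the limit; max_new_tokens is the usual alternative when only the completion length should be capped. (The newly added gc and tqdm imports are unused in this revision.)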