not run
colab t4
model_id = 'mobiuslabsgmbh/Llama-2-7b-chat-hf_1bitgs8_hqq'
from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = HQQModelForCausalLM.from_quantized(model_id)
##########################################################################################################
import transformers
from threading import Thread
from sys import stdout
def print_flush(data):
stdout.write("\r" + data)
stdout.flush()
#Adapted from https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat/blob/main/app.py
def process_conversation(chat):
system_prompt = chat['system_prompt']
chat_history = chat['chat_history']
message = chat['message']
conversation = []
if system_prompt:
conversation.append({"role": "system", "content": system_prompt})
for user, assistant in chat_history:
conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
conversation.append({"role": "user", "content": message})
return tokenizer.apply_chat_template(conversation, return_tensors="pt").to('cuda')
def chat_processor(chat, max_new_tokens=100, do_sample=True):
tokenizer.use_default_system_prompt = False
streamer = transformers.TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
generate_params = dict(
{"input_ids": process_conversation(chat)},
streamer=streamer,
max_new_tokens=max_new_tokens,
do_sample=do_sample,
top_p=0.90,
top_k=50,
temperature= 0.6,
num_beams=1,
repetition_penalty=1.2,
)
t = Thread(target=model.generate, kwargs=generate_params)
t.start()
outputs = []
for text in streamer:
outputs.append(text)
print_flush("".join(outputs))
return outputs
###################################################################################################
outputs = chat_processor({'system_prompt':"You are a helpful assistant.",
'chat_history':[],
'message':"How can I build a car?"
},
max_new_tokens=1000, do_sample=False)
Fetchingβ9βfiles:β100%
β9/9β[00:00<00:00,β256.19it/s]
100%|ββββββββββ| 32/32 [00:00<00:00, 2681.67it/s]
100%|ββββββββββ| 32/32 [00:00<00:00, 46.53it/s]
/usr/local/lib/python3.10/dist-packages/transformers/generation/configuration_utils.py:590: UserWarning: do_sample
is set to False
. However, temperature
is set to 0.6
-- this flag is only used in sample-based generation modes. You should set do_sample=True
or unset temperature
.
warnings.warn(
/usr/local/lib/python3.10/dist-packages/transformers/generation/configuration_utils.py:595: UserWarning: do_sample
is set to False
. However, top_p
is set to 0.9
-- this flag is only used in sample-based generation modes. You should set do_sample=True
or unset top_p
.
warnings.warn(
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's attention_mask
to obtain reliable results.
Setting pad_token_id
to eos_token_id
:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's attention_mask
to obtain reliable results.
Exception in thread Thread-13 (generate):
Traceback (most recent call last):
File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
self.run()
File "/usr/lib/python3.10/threading.py", line 953, in run
self._target(*self._args, **self._kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py", line 2215, in generate
result = self._sample(
File "/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py", line 3206, in _sample
outputs = self(**model_inputs, return_dict=True)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 1190, in forward
outputs = self.model(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 921, in forward
position_embeddings = self.rotary_emb(hidden_states, position_ids)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 158, in forward
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_bmm)
Empty Traceback (most recent call last)
in <cell line: 59>()
57 ###################################################################################################
58
---> 59 outputs = chat_processor({'system_prompt':"You are a helpful assistant.",
60 'chat_history':[],
61 'message':"How can I build a car?"
2 frames
/usr/lib/python3.10/queue.py in get(self, block, timeout)
177 remaining = endtime - time()
178 if remaining <= 0.0:
--> 179 raise Empty
180 self.not_empty.wait(remaining)
181 item = self._get()
Empty:
model_id = 'mobiuslabsgmbh/Llama-2-7b-chat-hf-4bit_g64-HQQ'
from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = HQQModelForCausalLM.from_quantized(model_id)
##########################################################################################################
import transformers
from threading import Thread
from sys import stdout
def print_flush(data):
stdout.write("\r" + data)
stdout.flush()
#Adapted from https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat/blob/main/app.py
def process_conversation(chat):
system_prompt = chat['system_prompt']
chat_history = chat['chat_history']
message = chat['message']
conversation = []
if system_prompt:
conversation.append({"role": "system", "content": system_prompt})
for user, assistant in chat_history:
conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
conversation.append({"role": "user", "content": message})
return tokenizer.apply_chat_template(conversation, return_tensors="pt").to('cuda')
def chat_processor(chat, max_new_tokens=100, do_sample=True):
tokenizer.use_default_system_prompt = False
streamer = transformers.TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
generate_params = dict(
{"input_ids": process_conversation(chat)},
streamer=streamer,
max_new_tokens=max_new_tokens,
do_sample=do_sample,
top_p=0.90,
top_k=50,
temperature= 0.6,
num_beams=1,
repetition_penalty=1.2,
)
t = Thread(target=model.generate, kwargs=generate_params)
t.start()
outputs = []
for text in streamer:
outputs.append(text)
print_flush("".join(outputs))
return outputs
###################################################################################################
outputs = chat_processor({'system_prompt':"You are a helpful assistant.",
'chat_history':[],
'message':"How can I build a car?"
},
max_new_tokens=1000, do_sample=False)
Fetchingβ9βfiles:β100%
β9/9β[00:00<00:00,β209.51it/s]
100%|ββββββββββ| 32/32 [00:00<00:00, 3212.56it/s]
100%|ββββββββββ| 32/32 [00:00<00:00, 451.46it/s]
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's attention_mask
to obtain reliable results.
Setting pad_token_id
to eos_token_id
:None for open-end generation.
Exception in thread Thread-14 (generate):
Traceback (most recent call last):
File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
self.run()
File "/usr/lib/python3.10/threading.py", line 953, in run
self._target(*self._args, **self._kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py", line 2215, in generate
result = self._sample(
File "/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py", line 3206, in _sample
outputs = self(**model_inputs, return_dict=True)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 1190, in forward
outputs = self.model(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 921, in forward
position_embeddings = self.rotary_emb(hidden_states, position_ids)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 158, in forward
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_bmm)
Empty Traceback (most recent call last)
in <cell line: 59>()
57 ###################################################################################################
58
---> 59 outputs = chat_processor({'system_prompt':"You are a helpful assistant.",
60 'chat_history':[],
61 'message':"How can I build a car?"
2 frames
/usr/lib/python3.10/queue.py in get(self, block, timeout)
177 remaining = endtime - time()
178 if remaining <= 0.0:
--> 179 raise Empty
180 self.not_empty.wait(remaining)
181 item = self._get()
Empty:
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_bmm)