Flash attention 2 is not working
#9
by
nalf3in2
- opened
Flash Attention 2 does not work when using the code from the README:
Code used:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
tokenizer = AutoTokenizer.from_pretrained("models/google_gemma-2-9b-it")
model = AutoModelForCausalLM.from_pretrained(
"models/google_gemma-2-9b-it",
torch_dtype=torch.float16,
attn_implementation="flash_attention_2",
quantization_config=quantization_config)
input_text = "Write me a poem about Machine Learning."
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
outputs = model.generate(**input_ids)
print(tokenizer.decode(outputs[0]))
Error:
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/transformers/models/gemma2/modeling_gemma2.py", line 370, in forward
attn_output = self._flash_attention_forward(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/transformers/models/gemma2/modeling_gemma2.py", line 425, in _flash_attention_forward
_flash_supports_window_size and self.sliding_window is not None and cache_position > self.sliding_window
Full Output:
text-generation-webui]$ cd /home/joe/ai/text-generation-webui ; /usr/bin/env /home/joe/ai/text-generation-webui/installer_files/env/bin/python /home/joe/.vscode-oss/extensions/ms-python.debugpy-2024.6.0-linux-x64/bundled/libs/debugpy/adapter/../../debugpy/launcher 44543 -- /home/joe/ai/text-generation-webui/temp/test.py
`low_cpu_mem_usage` was None, now set to True since model is quantized.
It is strongly recommended to train Gemma2 models with the `eager` attention implementation instead of `flash_attention_2`. Use `eager` with `AutoModel....from_pretrained('...', attn_implementation='eager')`.
Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:04<00:00, 1.19s/it]
/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/transformers/generation/utils.py:1249: UserWarning: Using the model-agnostic default `max_length` (=20) to control the generation length. We recommend setting `max_new_tokens` to control the maximum length of the generation.
warnings.warn(
Traceback (most recent call last):
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/runpy.py", line 198, in _run_module_as_main
return _run_code(code, main_globals, None,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/runpy.py", line 88, in _run_code
exec(code, run_globals)
File "/home/joe/.vscode-oss/extensions/ms-python.debugpy-2024.6.0-linux-x64/bundled/libs/debugpy/adapter/../../debugpy/launcher/../../debugpy/__main__.py", line 39, in <module>
cli.main()
File "/home/joe/.vscode-oss/extensions/ms-python.debugpy-2024.6.0-linux-x64/bundled/libs/debugpy/adapter/../../debugpy/launcher/../../debugpy/../debugpy/server/cli.py", line 430, in main
run()
File "/home/joe/.vscode-oss/extensions/ms-python.debugpy-2024.6.0-linux-x64/bundled/libs/debugpy/adapter/../../debugpy/launcher/../../debugpy/../debugpy/server/cli.py", line 284, in run_file
runpy.run_path(target, run_name="__main__")
File "/home/joe/.vscode-oss/extensions/ms-python.debugpy-2024.6.0-linux-x64/bundled/libs/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_runpy.py", line 321, in run_path
return _run_module_code(code, init_globals, run_name,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joe/.vscode-oss/extensions/ms-python.debugpy-2024.6.0-linux-x64/bundled/libs/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_runpy.py", line 135, in _run_module_code
_run_code(code, mod_globals, init_globals,
File "/home/joe/.vscode-oss/extensions/ms-python.debugpy-2024.6.0-linux-x64/bundled/libs/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_runpy.py", line 124, in _run_code
exec(code, run_globals)
File "/home/joe/ai/text-generation-webui/temp/test.py", line 17, in <module>
outputs = model.generate(**input_ids)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/transformers/generation/utils.py", line 1912, in generate
result = self._sample(
^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/transformers/generation/utils.py", line 2649, in _sample
outputs = self(
^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/accelerate/hooks.py", line 166, in new_forward
output = module._old_forward(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/transformers/models/gemma2/modeling_gemma2.py", line 1045, in forward
outputs = self.model(
^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/accelerate/hooks.py", line 166, in new_forward
output = module._old_forward(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/transformers/models/gemma2/modeling_gemma2.py", line 894, in forward
layer_outputs = decoder_layer(
^^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/accelerate/hooks.py", line 166, in new_forward
output = module._old_forward(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/transformers/models/gemma2/modeling_gemma2.py", line 636, in forward
hidden_states, self_attn_weights, present_key_value = self.self_attn(
^^^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/accelerate/hooks.py", line 166, in new_forward
output = module._old_forward(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/transformers/models/gemma2/modeling_gemma2.py", line 370, in forward
attn_output = self._flash_attention_forward(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/transformers/models/gemma2/modeling_gemma2.py", line 425, in _flash_attention_forward
_flash_supports_window_size and self.sliding_window is not None and cache_position > self.sliding_window
^^^^^^^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1688, in __getattr__
raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
AttributeError: 'Gemma2FlashAttention2' object has no attribute 'sliding_window'
[joe@JoeDesktop text-generation-webui]$
nalf3in2
changed discussion title from
Flash attention 2 not working
to Flash attention 2 is not working
Hello, Surya from the Gemma team here -- @ArthurZ and I are aware of this; right now the attention logit softcapping we do is incompatible with FlashAttention, so the user has to disable this in order to run FlashAttention for inference (and you should only see minimal performance differences).
Can you provide sample code showing how to do that? I just tried modifying the config, but unfortunately it didn't work:
Code used:
import torch
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, BitsAndBytesConfig
tokenizer = AutoTokenizer.from_pretrained("models/google_gemma-2-9b-it")
config = AutoConfig.from_pretrained("models/google_gemma-2-9b-it")
config.final_logit_softcapping = None # Disable soft-capping
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
model = AutoModelForCausalLM.from_pretrained(
"models/google_gemma-2-9b-it",
config=config,
torch_dtype=torch.float16,
attn_implementation="flash_attention_2",
quantization_config=quantization_config
)
input_text = "Write me a poem about Machine Learning."
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
outputs = model.generate(**input_ids)
print(tokenizer.decode(outputs[0]))
Error:
[joe@JoeDesktop text-generation-webui]$ cd /home/joe/ai/text-generation-webui ; /usr/bin/env /home/joe/ai/text-generation-webui/installer_files/env/bin/python /home/joe/.vscode-oss/extensions/ms-python.debugpy-2024.6.0-linux-x64/bundled/libs/debugpy/adapter/../../debugpy/launcher 32941 -- /home/joe/ai/text-generation-webui/temp/test.py
`low_cpu_mem_usage` was None, now set to True since model is quantized.
It is strongly recommended to train Gemma2 models with the `eager` attention implementation instead of `flash_attention_2`. Use `eager` with `AutoModel....from_pretrained('...', attn_implementation='eager')`.
Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:04<00:00, 1.18s/it]
/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/transformers/generation/utils.py:1249: UserWarning: Using the model-agnostic default `max_length` (=20) to control the generation length. We recommend setting `max_new_tokens` to control the maximum length of the generation.
warnings.warn(
/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/bitsandbytes/nn/modules.py:426: UserWarning: Input type into Linear4bit is torch.float16, but bnb_4bit_compute_dtype=torch.float32 (default). This will lead to slow inference or training speed.
warnings.warn(
../aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block: [215,0,0], thread: [96,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block: [215,0,0], thread: [97,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block: [215,0,0], thread: [98,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block: [215,0,0], thread: [99,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block: [215,0,0], thread: [100,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block: [215,0,0], thread: [101,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block: [215,0,0], thread: [102,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block: [215,0,0], thread: [103,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block: [215,0,0], thread: [104,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block: [215,0,0], thread: [105,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block: [215,0,0], thread: [106,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block: [215,0,0], thread: [107,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block: [215,0,0], thread: [108,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block: [215,0,0], thread: [109,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block: [215,0,0], thread: [110,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block: [215,0,0], thread: [111,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block: [215,0,0], thread: [112,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block: [215,0,0], thread: [113,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block: [215,0,0], thread: [114,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block: [215,0,0], thread: [115,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block: [215,0,0], thread: [116,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block: [215,0,0], thread: [117,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block: [215,0,0], thread: [118,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block: [215,0,0], thread: [119,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block: [215,0,0], thread: [120,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block: [215,0,0], thread: [121,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block: [215,0,0], thread: [122,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block: [215,0,0], thread: [123,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block: [215,0,0], thread: [124,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block: [215,0,0], thread: [125,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block: [215,0,0], thread: [126,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block: [215,0,0], thread: [127,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
Traceback (most recent call last):
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/runpy.py", line 198, in _run_module_as_main
return _run_code(code, main_globals, None,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/runpy.py", line 88, in _run_code
exec(code, run_globals)
File "/home/joe/.vscode-oss/extensions/ms-python.debugpy-2024.6.0-linux-x64/bundled/libs/debugpy/adapter/../../debugpy/launcher/../../debugpy/__main__.py", line 39, in <module>
cli.main()
File "/home/joe/.vscode-oss/extensions/ms-python.debugpy-2024.6.0-linux-x64/bundled/libs/debugpy/adapter/../../debugpy/launcher/../../debugpy/../debugpy/server/cli.py", line 430, in main
run()
File "/home/joe/.vscode-oss/extensions/ms-python.debugpy-2024.6.0-linux-x64/bundled/libs/debugpy/adapter/../../debugpy/launcher/../../debugpy/../debugpy/server/cli.py", line 284, in run_file
runpy.run_path(target, run_name="__main__")
File "/home/joe/.vscode-oss/extensions/ms-python.debugpy-2024.6.0-linux-x64/bundled/libs/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_runpy.py", line 321, in run_path
return _run_module_code(code, init_globals, run_name,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joe/.vscode-oss/extensions/ms-python.debugpy-2024.6.0-linux-x64/bundled/libs/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_runpy.py", line 135, in _run_module_code
_run_code(code, mod_globals, init_globals,
File "/home/joe/.vscode-oss/extensions/ms-python.debugpy-2024.6.0-linux-x64/bundled/libs/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_runpy.py", line 124, in _run_code
exec(code, run_globals)
File "/home/joe/ai/text-generation-webui/temp/test.py", line 26, in <module>
outputs = model.generate(**input_ids)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/transformers/generation/utils.py", line 1912, in generate
result = self._sample(
^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/transformers/generation/utils.py", line 2649, in _sample
outputs = self(
^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/accelerate/hooks.py", line 166, in new_forward
output = module._old_forward(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/transformers/models/gemma2/modeling_gemma2.py", line 1046, in forward
outputs = self.model(
^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/accelerate/hooks.py", line 166, in new_forward
output = module._old_forward(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/transformers/models/gemma2/modeling_gemma2.py", line 895, in forward
layer_outputs = decoder_layer(
^^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/accelerate/hooks.py", line 166, in new_forward
output = module._old_forward(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/transformers/models/gemma2/modeling_gemma2.py", line 637, in forward
hidden_states, self_attn_weights, present_key_value = self.self_attn(
^^^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/accelerate/hooks.py", line 166, in new_forward
output = module._old_forward(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/transformers/models/gemma2/modeling_gemma2.py", line 371, in forward
attn_output = self._flash_attention_forward(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/transformers/models/gemma2/modeling_gemma2.py", line 432, in _flash_attention_forward
query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
^^^^^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/transformers/models/gemma2/modeling_gemma2.py", line 488, in _upad_input
query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joe/ai/text-generation-webui/installer_files/env/lib/python3.11/site-packages/flash_attn/bert_padding.py", line 110, in unpad_input
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
Use FlashAttention-2 (the `flash-attn` package) >= 2.6.0 and upgrade transformers to the latest version.
nalf3in2
changed discussion status to
closed