`probability tensor contains either inf, nan or element < 0` when using multiple GPUs
#7
by
jannis-hpi
- opened
Hello! Using the example from the transformers section of the docs works for me when I have only a single GPU or when I set `device_map='cuda'`. However, if I have two GPUs and set `device_map='auto'` as given in the docs, I run into an assertion error when trying to generate text:
0%| | 0/1 [00:00<?, ?it/s]../aten/src/ATen/native/cuda/TensorCompare.cu:110: _assert_async_cuda_kernel: block: [0,0,0], thread: [0,0,0] Assertion probability tensor contains either inf, nan or element < 0 failed.
0%| | 0/1 [00:36<?, ?it/s]
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[6], line 1
----> 1 model.generate('hehe')
File ~/.../src/llama.py:53, in Llama3Model.generate(self, inputs, max_new_tokens, batch_size, show_progress)
51 batch_inputs = inputs[i:i+batch_size]
52 with torch.no_grad():
---> 53 batch_outputs = self.model.generate(batch_inputs, max_new_tokens=max_new_tokens)
54 outputs.extend(batch_outputs)
56 return [self.tokenizer.decode(o[len(i):], skip_special_tokens=True) for i, o in zip(inputs, outputs)]
File ~/conda3/envs/.../lib/python3.11/site-packages/torch/utils/_contextlib.py:116, in context_decorator.<locals>.decorate_context(*args, **kwargs)
113 @functools.wraps(func)
114 def decorate_context(*args, **kwargs):
115 with ctx_factory():
--> 116 return func(*args, **kwargs)
File ~/conda3/envs/.../lib/python3.11/site-packages/transformers/generation/utils.py:2215, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, **kwargs)
2207 input_ids, model_kwargs = self._expand_inputs_for_generation(
2208 input_ids=input_ids,
2209 expand_size=generation_config.num_return_sequences,
2210 is_encoder_decoder=self.config.is_encoder_decoder,
2211 **model_kwargs,
2212 )
2214 # 12. run sample (it degenerates to greedy search when generation_config.do_sample=False)
-> 2215 result = self._sample(
2216 input_ids,
2217 logits_processor=prepared_logits_processor,
2218 stopping_criteria=prepared_stopping_criteria,
2219 generation_config=generation_config,
2220 synced_gpus=synced_gpus,
2221 streamer=streamer,
2222 **model_kwargs,
2223 )
2225 elif generation_mode in (GenerationMode.BEAM_SAMPLE, GenerationMode.BEAM_SEARCH):
2226 # 11. prepare beam search scorer
2227 beam_scorer = BeamSearchScorer(
2228 batch_size=batch_size,
2229 num_beams=generation_config.num_beams,
(...)
2234 max_length=generation_config.max_length,
2235 )
File ~/conda3/envs/.../lib/python3.11/site-packages/transformers/generation/utils.py:3249, in GenerationMixin._sample(self, input_ids, logits_processor, stopping_criteria, generation_config, synced_gpus, streamer, **model_kwargs)
3247 probs = nn.functional.softmax(next_token_scores, dim=-1)
3248 # TODO (joao): this OP throws "skipping cudagraphs due to ['incompatible ops']", find solution
-> 3249 next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
3250 else:
3251 next_tokens = torch.argmax(next_token_scores, dim=-1)
RuntimeError: CUDA error: device-side assert triggered
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.
Is splitting the model across multiple GPUs not supported, or is there something I am doing wrong? Thanks!
This might help:
https://github.com/meta-llama/llama/issues/380