The Hugging Face version has SAT/DeepSpeed compatibility issues during training; could you provide a training demo that uses the Hugging Face version?
#4 by scall - opened
Epoch_0: 0%| | 0/16 [00:04<?, ?it/s]
╭───────────────────── Traceback (most recent call last) ──────────────────────╮
│ /export/App/training_platform/PinoModel/applications/VisualGLM/visual_chatgl │
│ m_instructing_mergeclose_v1.py:229 in <module> │
│ │
│ 226 │ parser.add_argument('--lr', type=float, default=5e-6) │
│ 227 │ parser.add_argument('--accimulation_steps', type=int, default=4) │
│ 228 │ args = parser.parse_args() │
│ ❱ 229 │ train(args) │
│ 230 │
│ │
│ /export/App/training_platform/PinoModel/applications/VisualGLM/visual_chatgl │
│ m_instructing_mergeclose_v1.py:203 in train │
│ │
│ 200 │ │ │ │ │ │ model_save_path='/media/cfs/zhanglezhong/LLMS │
│ 201 │ │ │ │ │ │ tensorboard_writer=tensorboard_writer) │
│ 202 │ │
│ ❱ 203 │ trainer.fit(logger=logger, log_interval=args.log_interval) │
│ 204 │
│ 205 # # save model checkpoint after fitting on only rank0 │
│ 206 # trainer.save_model(path=args.save_path, only_rank0=True, tokeniz │
│ │
│ /export/App/training_platform/PinoModel/applications/VisualGLM/coati/trainer │
│ /visual_sft_glm.py:134 in fit │
│ │
│ 131 │ │ │ │ labels = batch["labels"].to(torch.cuda.current_device( │
│ 132 │ │ │ │ image = batch["img"].to(torch.cuda.current_device()) │
│ 133 │ │ │ │ pre_image = batch["pre_image"] │
│ ❱ 134 │ │ │ │ outputs = self.model(input_ids=prompt_ids, images=imag │
│ 135 │ │ │ │ │
│ 136 │ │ │ │ loss = outputs.loss │
│ 137 # if loss >= 2.5 and is_rank_0() : │
│ │
│ /usr/local/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py: │
│ 1110 in _call_impl │
│ │
│ 1107 │ │ # this function, and just call forward. │
│ 1108 │ │ if not (self._backward_hooks or self._forward_hooks or self._ │
│ 1109 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks │
│ ❱ 1110 │ │ │ return forward_call(*input, **kwargs) │
│ 1111 │ │ # Do not call functions when jit is used │
│ 1112 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1113 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /root/.cache/huggingface/modules/transformers_modules/visualglm/modeling_cha │
│ tglm.py:1462 in forward │
│ │
│ 1459 │ │ │ return_dict: Optional[bool] = None, │
│ 1460 │ ): │
│ 1461 │ │ if inputs_embeds is None and past_key_values is None and imag │
│ ❱ 1462 │ │ │ image_embeds = self.image_encoder(images) │
│ 1463 │ │ │ pre_id, pads, post_id = torch.tensor_split(input_ids, │
│ 1464 │ │ │ │ │ │ │ │ │ │ │ │ │ [pre_image_len │
│ 1465 │ │ │ │ │ │ │ │ │ │ │ │ │ dim=1) # imag │
│ │
│ /usr/local/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py: │
│ 1110 in _call_impl │
│ │
│ 1107 │ │ # this function, and just call forward. │
│ 1108 │ │ if not (self._backward_hooks or self._forward_hooks or self._ │
│ 1109 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks │
│ ❱ 1110 │ │ │ return forward_call(*input, **kwargs) │
│ 1111 │ │ # Do not call functions when jit is used │
│ 1112 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1113 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /root/.cache/huggingface/modules/transformers_modules/visualglm/visual.py:69 │
│ in forward │
│ │
│ 66 │ │ │ self.qformer.parameters().__next__().dtype) │
│ 67 │ │
│ 68 │ def forward(self, image, **kwargs): │
│ ❱ 69 │ │ enc = self.vit(image)[0] │
│ 70 │ │ out = self.qformer(enc)[0] │
│ 71 │ │ return self.glm_proj(out) │
│ 72 │
│ │
│ /usr/local/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py: │
│ 1110 in _call_impl │
│ │
│ 1107 │ │ # this function, and just call forward. │
│ 1108 │ │ if not (self._backward_hooks or self._forward_hooks or self._ │
│ 1109 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks │
│ ❱ 1110 │ │ │ return forward_call(*input, **kwargs) │
│ 1111 │ │ # Do not call functions when jit is used │
│ 1112 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1113 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /root/.cache/huggingface/modules/transformers_modules/visualglm/visual.py:28 │
│ in forward │
│ │
│ 25 │ │ batch_size = image.size(0) │
│ 26 │ │ input_ids = torch.zeros(batch_size, 1, dtype=torch.long, devic │
│ 27 │ │ attention_mask = torch.tensor([[1.]], dtype=image.dtype, devic │
│ ❱ 28 │ │ return super().forward(input_ids=input_ids, position_ids=None, │
│ 29 │
│ 30 │
│ 31 class QFormer(BaseModel): │
│ │
│ /usr/local/anaconda3/lib/python3.8/site-packages/sat/model/base_model.py:144 │
│ in forward │
│ │
│ 141 │ │ # Attention! the transformer might be shared by multiple model │
│ 142 │ │ self.transformer.hooks.clear() │
│ 143 │ │ self.transformer.hooks.update(self.hooks) │
│ ❱ 144 │ │ return self.transformer(*args, **kwargs) │
│ 145 │ │
│ 146 │ def collect_hooks_(self): │
│ 147 │ │ names = list(HOOKS_DEFAULT.keys()) │
│ │
│ /usr/local/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py: │
│ 1110 in _call_impl │
│ │
│ 1107 │ │ # this function, and just call forward. │
│ 1108 │ │ if not (self._backward_hooks or self._forward_hooks or self._ │
│ 1109 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks │
│ ❱ 1110 │ │ │ return forward_call(*input, **kwargs) │
│ 1111 │ │ # Do not call functions when jit is used │
│ 1112 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1113 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /usr/local/anaconda3/lib/python3.8/site-packages/sat/model/transformer.py:56 │
│ 9 in forward │
│ │
│ 566 │ │ │ │ │ │ output_this_layer=output_this_layer_obj, outpu │
│ 567 │ │ │ │ │ ) │
│ 568 │ │ │ │ else: │
│ ❱ 569 │ │ │ │ │ layer_ret = layer(*args, layer_id=torch.tensor(i), │
│ 570 │ │ │ │ │ │ output_this_layer=output_this_layer_obj, outpu │
│ 571 │ │ │ │ if isinstance(layer_ret, tuple): │
│ 572 │ │ │ │ │ layer_ret = layer_ret[0] # for legacy API │
│ │
│ /usr/local/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py: │
│ 1110 in _call_impl │
│ │
│ 1107 │ │ # this function, and just call forward. │
│ 1108 │ │ if not (self._backward_hooks or self._forward_hooks or self._ │
│ 1109 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks │
│ ❱ 1110 │ │ │ return forward_call(*input, **kwargs) │
│ 1111 │ │ # Do not call functions when jit is used │
│ 1112 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1113 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /usr/local/anaconda3/lib/python3.8/site-packages/sat/model/transformer.py:33 │
│ 0 in forward │
│ │
│ 327 │ │ ) │
│ 328 │ │
│ 329 │ def forward(self, hidden_states, mask, *args, **kw_args): │
│ ❱ 330 │ │ return HOOKS_DEFAULT['layer_forward'](self, hidden_states, mas │
│ 331 │
│ 332 │
│ 333 class BaseTransformer(torch.nn.Module): │
│ │
│ /usr/local/anaconda3/lib/python3.8/site-packages/sat/transformer_defaults.py │
│ :127 in layer_forward_default │
│ │
│ 124 │ # Layer norm at the begining of the transformer layer. │
│ 125 │ attention_input = self.input_layernorm(hidden_states) │
│ 126 │ # Self attention. │
│ ❱ 127 │ attention_output = self.attention(attention_input, mask, **kw_args │
│ 128 │ │
│ 129 │ # Third LayerNorm │
│ 130 │ if self.layernorm_order == 'sandwich': │
│ │
│ /usr/local/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py: │
│ 1110 in _call_impl │
│ │
│ 1107 │ │ # this function, and just call forward. │
│ 1108 │ │ if not (self._backward_hooks or self._forward_hooks or self._ │
│ 1109 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks │
│ ❱ 1110 │ │ │ return forward_call(*input, **kwargs) │
│ 1111 │ │ # Do not call functions when jit is used │
│ 1112 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1113 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /usr/local/anaconda3/lib/python3.8/site-packages/sat/model/transformer.py:10 │
│ 3 in forward │
│ │
│ 100 │ │ if 'attention_forward' in self.hooks: │
│ 101 │ │ │ return self.hooks['attention_forward'](hidden_states, mask │
│ 102 │ │ else: │
│ ❱ 103 │ │ │ return HOOKS_DEFAULT['attention_forward'](self, hidden_sta │
│ 104 │
│ 105 │
│ 106 class CrossAttention(torch.nn.Module): │
│ │
│ /usr/local/anaconda3/lib/python3.8/site-packages/sat/transformer_defaults.py │
│ :63 in attention_forward_default │
│ │
│ 60 │ key_layer = self._transpose_for_scores(mixed_key_layer) │
│ 61 │ value_layer = self._transpose_for_scores(mixed_value_layer) │
│ 62 │ │
│ ❱ 63 │ context_layer = attention_fn(query_layer, key_layer, value_layer, │
│ 64 │ │
│ 65 │ context_layer = context_layer.permute(0, 2, 1, 3).contiguous() │
│ 66 │ new_context_layer_shape = context_layer.size()[:-2] + (self.hidden │
│ │
│ /usr/local/anaconda3/lib/python3.8/site-packages/sat/transformer_defaults.py │
│ :38 in standard_attention │
│ │
│ 35 │ │
│ 36 │ if attention_dropout is not None: │
│ 37 │ │ if mpu.get_cuda_rng_tracker is not None: │
│ ❱ 38 │ │ │ with mpu.get_cuda_rng_tracker().fork(): │
│ 39 │ │ │ │ attention_probs = attention_dropout(attention_probs) │
│ 40 │ │ else: │
│ 41 │ │ │ attention_probs = attention_dropout(attention_probs) │
│ │
│ /usr/local/anaconda3/lib/python3.8/contextlib.py:113 in __enter__ │
│ │
│ 110 │ │ # they are only needed for recreation, which is not possible a │
│ 111 │ │ del self.args, self.kwds, self.func │
│ 112 │ │ try: │
│ ❱ 113 │ │ │ return next(self.gen) │
│ 114 │ │ except StopIteration: │
│ 115 │ │ │ raise RuntimeError("generator didn't yield") from None │
│ 116 │
│ │
│ /usr/local/anaconda3/lib/python3.8/site-packages/deepspeed/runtime/activatio │
│ n_checkpointing/checkpointing.py:174 in fork │
│ │
│ 171 │ │ the original state.""" │
│ 172 │ │ # Check if we have added the state │
│ 173 │ │ if name not in self.states_: │
│ ❱ 174 │ │ │ raise Exception('cuda rng state {} is not added'.format(na │
│ 175 │ │ # Store current rng state. │
│ 176 │ │ orig_cuda_rng_state = get_accelerator().get_rng_state() │
│ 177 │ │ # Set rng state to the desired one │
╰──────────────────────────────────────────────────────────────────────────────╯
Exception: cuda rng state model-parallel-rng is not added
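For context, the exception is raised by DeepSpeed's activation-checkpointing RNG tracker: sat's attention dropout forks the 'model-parallel-rng' CUDA RNG state, but nothing ever registered that state, because the model was loaded through the Hugging Face path instead of sat's own training entry point (which normally seeds it during initialization). Below is a minimal workaround sketch, not an official fix; it assumes the tracker sat forks is the module-level singleton in deepspeed.runtime.activation_checkpointing.checkpointing, which is what the traceback points at, and the helper name and default seed are my own.

```python
# Hypothetical workaround sketch, not an official fix: register the missing
# 'model-parallel-rng' CUDA RNG state before calling trainer.fit().
import torch
from deepspeed.runtime.activation_checkpointing import checkpointing as ds_checkpointing


def add_model_parallel_rng_state(seed: int = 1234) -> None:
    """Add the 'model-parallel-rng' state if sat's init never registered it."""
    tracker = ds_checkpointing.get_cuda_rng_tracker()
    if 'model-parallel-rng' not in tracker.get_states():
        # sat normally derives a per-rank offset during its own initialization;
        # the local device index is used here only as a stand-in.
        tracker.add('model-parallel-rng', seed + torch.cuda.current_device())


add_model_parallel_rng_state()
# ... then build the model and run trainer.fit(...) as before.
```

Alternatively, calling deepspeed's model_parallel_cuda_manual_seed(seed) (what sat's own launcher does) or turning off attention dropout in the config should avoid the fork entirely; both are untested guesses on my side.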