The config class and config.json use DeepseekConfig, not DeepseekV2Config
modeling_deepseek.py  +7 -7
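Context for the rename, as a hedged sketch (the repo id below is a placeholder and the loading flow is assumed, not shown in this commit): with trust_remote_code checkpoints, config.json's auto_map decides which configuration class gets instantiated, so the import, the type hints, and config_class in modeling_deepseek.py all have to name the class that configuration_deepseek.py actually defines.

# Illustrative check only; not part of this commit.
from transformers import AutoConfig, AutoModelForCausalLM

repo_id = "your-org/your-deepseek-checkpoint"  # hypothetical placeholder

# Resolved through config.json's auto_map entry for AutoConfig.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
print(type(config).__name__)  # expected: DeepseekConfig

# Loading the model imports modeling_deepseek.py; before this commit that file
# referenced a V2-named config class which, per the commit message, is not what
# configuration_deepseek.py defines, so this step would fail at import time.
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)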
modeling_deepseek.py
CHANGED
@@ -54,7 +54,7 @@ from transformers.utils import (
     replace_return_docstrings,
 )
 from transformers.utils.import_utils import is_torch_fx_available
-from .configuration_deepseek import DeepseekV2Config
+from .configuration_deepseek import DeepseekConfig
 import torch.distributed as dist
 import numpy as np
 
@@ -681,7 +681,7 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
 class DeepseekV2Attention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
-    def __init__(self, config: DeepseekV2Config, layer_idx: Optional[int] = None):
+    def __init__(self, config: DeepseekConfig, layer_idx: Optional[int] = None):
         super().__init__()
         self.config = config
         self.layer_idx = layer_idx
@@ -1190,7 +1190,7 @@ ATTENTION_CLASSES = {
 
 
 class DeepseekV2DecoderLayer(nn.Module):
-    def __init__(self, config: DeepseekV2Config, layer_idx: int):
+    def __init__(self, config: DeepseekConfig, layer_idx: int):
         super().__init__()
         self.hidden_size = config.hidden_size
 
@@ -1287,7 +1287,7 @@ DeepseekV2_START_DOCSTRING = r"""
     and behavior.
 
     Parameters:
-        config ([`DeepseekV2Config`]):
+        config ([`DeepseekConfig`]):
             Model configuration class with all the parameters of the model. Initializing with a config file does not
             load the weights associated with the model, only the configuration. Check out the
             [`~PreTrainedModel.from_pretrained`] method to load the model weights.
@@ -1299,7 +1299,7 @@ DeepseekV2_START_DOCSTRING = r"""
     DeepseekV2_START_DOCSTRING,
 )
 class DeepseekV2PreTrainedModel(PreTrainedModel):
-    config_class = DeepseekV2Config
+    config_class = DeepseekConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
     _no_split_modules = ["DeepseekV2DecoderLayer"]
@@ -1398,10 +1398,10 @@ class DeepseekV2Model(DeepseekV2PreTrainedModel):
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DeepseekV2DecoderLayer`]
 
     Args:
-        config: DeepseekV2Config
+        config: DeepseekConfig
     """
 
-    def __init__(self, config: DeepseekV2Config):
+    def __init__(self, config: DeepseekConfig):
         super().__init__(config)
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size