zhibinlu committed
Commit b5a178d · 1 Parent(s): 4546d52

Upload model

config.json ADDED
@@ -0,0 +1,33 @@
1
+ {
2
+ "_name_or_path": "zhibinlu/vgcn-distilbert-base-uncased",
3
+ "activation": "gelu",
4
+ "architectures": [
5
+ "VGCNBertModel"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "auto_map": {
9
+ "AutoConfig": "configuration_vgcn_bert.VGCNBertConfig",
10
+ "AutoModel": "modeling_vgcn_bert.VGCNBertModel"
11
+ },
12
+ "dim": 768,
13
+ "dropout": 0.1,
14
+ "hidden_dim": 3072,
15
+ "initializer_range": 0.02,
16
+ "max_position_embeddings": 512,
17
+ "model_type": "vgcn-bert",
18
+ "n_heads": 12,
19
+ "n_layers": 6,
20
+ "pad_token_id": 0,
21
+ "qa_dropout": 0.1,
22
+ "seq_classif_dropout": 0.2,
23
+ "sinusoidal_pos_embds": false,
24
+ "tie_weights_": true,
25
+ "torch_dtype": "float32",
26
+ "transformers_version": "4.31.0.dev0",
27
+ "vgcn_activation": null,
28
+ "vgcn_dropout": 0.1,
29
+ "vgcn_graph_embds_dim": 16,
30
+ "vgcn_hidden_dim": 128,
31
+ "vgcn_weight_init_mode": "transparent",
32
+ "vocab_size": 30522
33
+ }
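Since `auto_map` registers custom classes, this checkpoint is meant to be loaded through the Auto classes with `trust_remote_code=True`. Below is a minimal, illustrative sketch (not part of the commit): the vocabulary-graph values and tokenizer ids are made-up placeholders, and the graph can also come from a checkpoint instead of `set_wgraphs`.

```python
import torch
from transformers import AutoConfig, AutoModel

model_id = "zhibinlu/vgcn-distilbert-base-uncased"

# trust_remote_code lets AutoConfig/AutoModel resolve the classes listed in auto_map:
# configuration_vgcn_bert.VGCNBertConfig and modeling_vgcn_bert.VGCNBertModel.
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
model = AutoModel.from_pretrained(model_id, trust_remote_code=True)

# The model needs vocabulary graphs; if none are stored in the checkpoint they can be
# supplied afterwards via set_wgraphs (defined in modeling_vgcn_bert.py below).
# Illustrative 4-node graph: row/column 0 must stay empty (reserved as padding).
indices = torch.tensor([[1, 2], [2, 1]])
values = torch.tensor([1.0, 1.0])
wgraph = torch.sparse_coo_tensor(indices, values, size=(4, 4)).coalesce()

# Maps wgraph vocabulary ids (contiguous, starting at 0) to tokenizer ids; ids are made up.
wgraph_id_to_tokenizer_id_map = {0: 0, 1: 2054, 2: 2154, 3: 3793}

model.set_wgraphs([wgraph], [wgraph_id_to_tokenizer_id_map])
```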
configuration_vgcn_bert.py ADDED
@@ -0,0 +1,162 @@
1
+ # coding=utf-8
2
+ # Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ VGCN-BERT model configuration"""
16
+ from collections import OrderedDict
17
+ from typing import Mapping
18
+
19
+ from transformers.configuration_utils import PretrainedConfig
20
+ from transformers.onnx import OnnxConfig
21
+ from transformers.utils import logging
22
+
23
+
24
+ logger = logging.get_logger(__name__)
25
+
26
+ VGCNBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
27
+ "zhibinlu/vgcn-distilbert-base-uncased": "https://huggingface.co/zhibinlu/vgcn-distilbert-base-uncased/resolve/main/config.json",
28
+ }
29
+
30
+
31
+ class VGCNBertConfig(PretrainedConfig):
32
+ r"""
33
+ This is the configuration class to store the configuration of a [`VGCNBertModel`] or a [`TFVGCNBertModel`]. It
34
+ is used to instantiate a VGCN-BERT model according to the specified arguments, defining the model architecture.
35
+ Instantiating a configuration with the defaults will yield a similar configuration to that of the VGCN-BERT
36
+ [zhibinlu/vgcn-distilbert-base-uncased](https://huggingface.co/zhibinlu/vgcn-distilbert-base-uncased) architecture.
37
+
38
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
39
+ documentation from [`PretrainedConfig`] for more information.
40
+
41
+ Args:
42
+ vgcn_graph_embds_dim (`int`, *optional*, defaults to 16):
43
+ Dimensionality of the output graph embeddings from the VGCN graph embedding module.
44
+ vgcn_hidden_dim (`int`, *optional*, defaults to 128):
45
+ Dimensionality of the graph convolutional hidden layer in VGCN.
46
+ vgcn_activation (`str` or `Callable`, *optional*, defaults to `None`):
47
+ The non-linear activation function (function or string) for graph convolutional layer in VGCN.
48
+ If string, `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
49
+ vgcn_dropout (`float`, *optional*, defaults to 0.1):
50
+ The dropout probability for VGCN graph embedding module.
51
+ vgcn_weight_init_mode (`str`, *optional*, defaults to `"transparent"`):
52
+ The weight initialization mode for VGCN graph embedding module,
53
+ `"transparent"`, `"normal"`, `"uniform"` are supported.
54
+ vocab_size (`int`, *optional*, defaults to 30522):
55
+ Vocabulary size of the VGCN-BERT model. Defines the number of different tokens that can be represented by
56
+ the `inputs_ids` passed when calling [`VGCNBertModel`] or [`TFVGCNBertModel`].
57
+ max_position_embeddings (`int`, *optional*, defaults to 512):
58
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
59
+ just in case (e.g., 512 or 1024 or 2048).
60
+ sinusoidal_pos_embds (`boolean`, *optional*, defaults to `False`):
61
+ Whether to use sinusoidal positional embeddings.
62
+ n_layers (`int`, *optional*, defaults to 6):
63
+ Number of hidden layers in the Transformer encoder.
64
+ n_heads (`int`, *optional*, defaults to 12):
65
+ Number of attention heads for each attention layer in the Transformer encoder.
66
+ dim (`int`, *optional*, defaults to 768):
67
+ Dimensionality of the encoder layers and the pooler layer.
68
+ hidden_dim (`int`, *optional*, defaults to 3072):
69
+ The size of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
70
+ dropout (`float`, *optional*, defaults to 0.1):
71
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
72
+ attention_dropout (`float`, *optional*, defaults to 0.1):
73
+ The dropout ratio for the attention probabilities.
74
+ activation (`str` or `Callable`, *optional*, defaults to `"gelu"`):
75
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
76
+ `"relu"`, `"silu"` and `"gelu_new"` are supported.
77
+ initializer_range (`float`, *optional*, defaults to 0.02):
78
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
79
+ qa_dropout (`float`, *optional*, defaults to 0.1):
80
+ The dropout probabilities used in the question answering model [`VGCNBertForQuestionAnswering`].
81
+ seq_classif_dropout (`float`, *optional*, defaults to 0.2):
82
+ The dropout probabilities used in the sequence classification and the multiple choice model
83
+ [`VGCNBertForSequenceClassification`].
84
+
85
+ Examples:
86
+
87
+ ```python
88
+ >>> from transformers import VGCNBertConfig, VGCNBertModel
89
+
90
+ >>> # Initializing a VGCN-BERT configuration
91
+ >>> configuration = VGCNBertConfig()
92
+
93
+ >>> # Initializing a model (with random weights) from the configuration
94
+ >>> model = VGCNBertModel(configuration)
95
+
96
+ >>> # Accessing the model configuration
97
+ >>> configuration = model.config
98
+ ```"""
99
+ model_type = "vgcn-bert"
100
+ attribute_map = {
101
+ "hidden_size": "dim",
102
+ "num_attention_heads": "n_heads",
103
+ "num_hidden_layers": "n_layers",
104
+ }
105
+
106
+ def __init__(
107
+ self,
108
+ vgcn_graph_embds_dim=16,
109
+ vgcn_hidden_dim=128,
110
+ vgcn_activation=None,
111
+ vgcn_dropout=0.1,
112
+ vgcn_weight_init_mode="transparent",
113
+ vocab_size=30522,
114
+ max_position_embeddings=512,
115
+ sinusoidal_pos_embds=False,
116
+ n_layers=6,
117
+ n_heads=12,
118
+ dim=768,
119
+ hidden_dim=4 * 768,
120
+ dropout=0.1,
121
+ attention_dropout=0.1,
122
+ activation="gelu",
123
+ initializer_range=0.02,
124
+ qa_dropout=0.1,
125
+ seq_classif_dropout=0.2,
126
+ pad_token_id=0,
127
+ **kwargs,
128
+ ):
129
+ self.vgcn_graph_embds_dim = vgcn_graph_embds_dim
130
+ self.vgcn_hidden_dim = vgcn_hidden_dim
131
+ self.vgcn_activation = vgcn_activation
132
+ self.vgcn_dropout = vgcn_dropout
133
+ self.vgcn_weight_init_mode = vgcn_weight_init_mode
134
+ self.vocab_size = vocab_size
135
+ self.max_position_embeddings = max_position_embeddings
136
+ self.sinusoidal_pos_embds = sinusoidal_pos_embds
137
+ self.n_layers = n_layers
138
+ self.n_heads = n_heads
139
+ self.dim = dim
140
+ self.hidden_dim = hidden_dim
141
+ self.dropout = dropout
142
+ self.attention_dropout = attention_dropout
143
+ self.activation = activation
144
+ self.initializer_range = initializer_range
145
+ self.qa_dropout = qa_dropout
146
+ self.seq_classif_dropout = seq_classif_dropout
147
+ super().__init__(**kwargs, pad_token_id=pad_token_id)
148
+
149
+
150
+ class VGCNBertOnnxConfig(OnnxConfig):
151
+ @property
152
+ def inputs(self) -> Mapping[str, Mapping[int, str]]:
153
+ if self.task == "multiple-choice":
154
+ dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
155
+ else:
156
+ dynamic_axis = {0: "batch", 1: "sequence"}
157
+ return OrderedDict(
158
+ [
159
+ ("input_ids", dynamic_axis),
160
+ ("attention_mask", dynamic_axis),
161
+ ]
162
+ )
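A short usage sketch for the configuration class defined above (the local import path is an assumption; on the Hub the class is normally resolved through `auto_map`):

```python
from configuration_vgcn_bert import VGCNBertConfig  # assumption: file is importable locally

# Non-default VGCN settings; omitted arguments keep the defaults listed in __init__ above.
config = VGCNBertConfig(
    vgcn_graph_embds_dim=16,
    vgcn_hidden_dim=128,
    vgcn_activation="relu",               # None by default
    vgcn_weight_init_mode="transparent",  # "transparent", "normal" or "uniform"
)

# attribute_map exposes the standard names on top of the DistilBERT-style fields.
assert config.hidden_size == config.dim               # 768
assert config.num_hidden_layers == config.n_layers    # 6
assert config.num_attention_heads == config.n_heads   # 12
```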
modeling_vgcn_bert.py ADDED
@@ -0,0 +1,1507 @@
1
+ # coding=utf-8
2
+ # Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """
17
+ PyTorch VGCN-BERT model adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM) and in
18
+ part from HuggingFace PyTorch version of Google AI Bert model (https://github.com/google-research/bert)
19
+ """
20
+
21
+
22
+ import math
23
+ from typing import Dict, List, Optional, Set, Tuple, Union
24
+
25
+ import numpy as np
26
+ import torch
27
+ from torch import nn
28
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
29
+ from transformers.configuration_utils import PretrainedConfig
30
+
31
+ from transformers.activations import get_activation
32
+ from transformers.deepspeed import is_deepspeed_zero3_enabled
33
+ from transformers.modeling_outputs import (
34
+ BaseModelOutput,
35
+ MaskedLMOutput,
36
+ MultipleChoiceModelOutput,
37
+ QuestionAnsweringModelOutput,
38
+ SequenceClassifierOutput,
39
+ TokenClassifierOutput,
40
+ )
41
+ from transformers.modeling_utils import PreTrainedModel
42
+ from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
43
+ from transformers.utils import (
44
+ add_code_sample_docstrings,
45
+ add_start_docstrings,
46
+ add_start_docstrings_to_model_forward,
47
+ logging,
48
+ replace_return_docstrings,
49
+ )
50
+ from .configuration_vgcn_bert import VGCNBertConfig
51
+
52
+
53
+ logger = logging.get_logger(__name__)
54
+ _CHECKPOINT_FOR_DOC = "zhibinlu/vgcn-distilbert-base-uncased"
55
+ _CONFIG_FOR_DOC = "VGCNBertConfig"
56
+
57
+ VGCNBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
58
+ "zhibinlu/vgcn-distilbert-base-uncased",
59
+ # See all VGCN-BERT models at https://huggingface.co/models?filter=VGCNBert
60
+ ]
61
+
62
+
63
+ # UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE #
64
+
65
+
66
+ def create_sinusoidal_embeddings(n_pos: int, dim: int, out: torch.Tensor):
67
+ if is_deepspeed_zero3_enabled():
68
+ import deepspeed
69
+
70
+ with deepspeed.zero.GatheredParameters(out, modifier_rank=0):
71
+ if torch.distributed.get_rank() == 0:
72
+ _create_sinusoidal_embeddings(n_pos=n_pos, dim=dim, out=out)
73
+ else:
74
+ _create_sinusoidal_embeddings(n_pos=n_pos, dim=dim, out=out)
75
+
76
+
77
+ def _create_sinusoidal_embeddings(n_pos: int, dim: int, out: torch.Tensor):
78
+ position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)])
79
+ out.requires_grad = False
80
+ out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
81
+ out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
82
+ out.detach_()
83
+
84
+
85
+ class VgcnParameterList(nn.ParameterList):
86
+ def __init__(self, values=None, requires_grad=True) -> None:
87
+ super().__init__(values)
88
+ self.requires_grad = requires_grad
89
+
90
+ def _load_from_state_dict(
91
+ self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
92
+ ):
93
+ keys = filter(lambda x: x.startswith(prefix), state_dict.keys())
94
+ for k in keys:
95
+ self.append(nn.Parameter(state_dict[k], requires_grad=self.requires_grad))
96
+ super()._load_from_state_dict(
97
+ state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
98
+ )
99
+ for i in range(len(self)):
100
+ if self[i].layout is torch.sparse_coo and not self[i].is_coalesced():
101
+ self[i] = self[i].coalesce()
102
+ self[i].requires_grad = self.requires_grad
103
+
104
+
105
+ class VocabGraphConvolution(nn.Module):
106
+ """Vocabulary GCN module.
107
+
108
+ Params:
109
+ `wgraphs`: List of vocabulary graphs, normally adjacency matrices
110
+ `wgraph_id_to_tokenizer_id_maps`: List of id mappings from wgraph vocabulary to tokenizer vocabulary
111
+ `hid_dim`: The hidden dimension after `GCN=XAW` (GCN layer)
112
+ `out_dim`: The output dimension after `out=Relu(XAW)W` (GCN output)
113
+ `activation`: The activation function in `out=act(XAW)W`
114
+ `dropout_rate`: The dropout probability in `out=dropout(act(XAW))W`.
115
+
116
+ Inputs:
117
+ `X_dv`: the features of the mini-batch documents, either TF-IDF (batch, vocab) or word embeddings (batch, word_embedding_dim, vocab)
118
+
119
+ Outputs:
120
+ The graph embedding representation, of dimension (batch, `out_dim`) or (batch, `out_dim`, word_embedding_dim)
121
+
122
+ """
123
+
124
+ def __init__(
125
+ self,
126
+ hid_dim: int,
127
+ out_dim: int,
128
+ wgraphs: Optional[list] = None,
129
+ wgraph_id_to_tokenizer_id_maps: Optional[List[dict]] = None,
130
+ activation=None,
131
+ dropout_rate=0.1,
132
+ ):
133
+ super().__init__()
134
+ self.hid_dim = hid_dim
135
+ self.out_dim = out_dim
136
+ self.fc_hg = nn.Linear(hid_dim, out_dim)
137
+ self.fc_hg._is_vgcn_linear = True
138
+ self.activation = get_activation(activation) if activation else None
139
+ self.dropout = nn.Dropout(dropout_rate) if dropout_rate > 0 else None
140
+ # TODO: add a Linear layer for vgcn finetune/pretrain task
141
+
142
+ # after __init__ calls set_wgraphs, _init_weights will set the mode again (transparent, normal, uniform),
143
+ # but if wgraph parameters are loaded from a checkpoint/pretrained model, the weights are overwritten by the checkpoint values;
144
+ # call set_parameters again to change the mode
145
+ self.set_wgraphs(wgraphs, wgraph_id_to_tokenizer_id_maps)
146
+
147
+ def set_parameters(self, mode="transparent"):
148
+ """Set the parameters of the model (transparent, uniform, normal)."""
149
+ assert mode in ["transparent", "uniform", "normal"]
150
+ for n, p in self.named_parameters():
151
+ if n.startswith("W"):
152
+ nn.init.constant_(p, 1.0) if mode == "transparent" else nn.init.normal_(
153
+ p, mean=0.0, std=0.02
154
+ ) if mode == "normal" else nn.init.kaiming_uniform_(p, a=math.sqrt(5))
155
+ self.fc_hg.weight.data.fill_(1.0) if mode == "transparent" else self.fc_hg.weight.data.normal_(
156
+ mean=0.0, std=0.02
157
+ ) if mode == "normal" else nn.init.kaiming_uniform_(self.fc_hg.weight, a=math.sqrt(5))
158
+ self.fc_hg.bias.data.zero_()
159
+
160
+ def set_wgraphs(
161
+ self,
162
+ wgraphs: Optional[list] = None,
163
+ wgraph_id_to_tokenizer_id_maps: Optional[List[dict]] = None,
164
+ mode="transparent",
165
+ ):
166
+ assert (
167
+ wgraphs is None
168
+ and wgraph_id_to_tokenizer_id_maps is None
169
+ or wgraphs is not None
170
+ and wgraph_id_to_tokenizer_id_maps is not None
171
+ )
172
+ self.wgraphs: VgcnParameterList = (
173
+ self._prepare_wgraphs(wgraphs) if wgraphs else VgcnParameterList(requires_grad=False)
174
+ )
175
+ self.gvoc_ordered_tokenizer_id_arrays, self.tokenizer_id_to_wgraph_id_arrays = VgcnParameterList(
176
+ requires_grad=False
177
+ ), VgcnParameterList(requires_grad=False)
178
+ if wgraph_id_to_tokenizer_id_maps:
179
+ (
180
+ self.gvoc_ordered_tokenizer_id_arrays,
181
+ self.tokenizer_id_to_wgraph_id_arrays,
182
+ ) = self._prepare_inverted_arrays(wgraph_id_to_tokenizer_id_maps)
183
+ self.W_vh_list = VgcnParameterList(requires_grad=True)
184
+ self.W_vh_list._is_vgcn_weights = True
185
+ for g in self.wgraphs:
186
+ self.W_vh_list.append(nn.Parameter(torch.randn(g.shape[0], self.hid_dim)))
187
+ # self.W_vh_list.append(nn.Parameter(torch.ones(g.shape[0], self.hid_dim)))
188
+ self.set_parameters(mode=mode)
189
+
190
+ def _prepare_wgraphs(self, wgraphs: list) -> VgcnParameterList:
191
+ # def _zero_padding_graph(adj_matrix: torch.Tensor):
192
+ # if adj_matrix.layout is not torch.sparse_coo:
193
+ # adj_matrix=adj_matrix.to_sparse_coo()
194
+ # indices=adj_matrix.indices()+1
195
+ # padded_adj= torch.sparse_coo_tensor(indices=indices, values=adj_matrix.values(), size=(adj_matrix.shape[0]+1,adj_matrix.shape[1]+1))
196
+ # return padded_adj.coalesce()
197
+ glist = VgcnParameterList(requires_grad=False)
198
+ for g in wgraphs:
199
+ assert g.layout is torch.sparse_coo
200
+ # g[0,:] and g[:,0] should be 0
201
+ assert 0 not in g.indices()
202
+ glist.append(nn.Parameter(g.coalesce(), requires_grad=False))
203
+ return glist
204
+
205
+ def _prepare_inverted_arrays(self, wgraph_id_to_tokenizer_id_maps: List[dict]):
206
+ wgraph_id_to_tokenizer_id_maps = [dict(sorted(m.items())) for m in wgraph_id_to_tokenizer_id_maps]
207
+ assert all([list(m.keys())[-1] == len(m) - 1 for m in wgraph_id_to_tokenizer_id_maps])
208
+ gvoc_ordered_tokenizer_id_arrays = VgcnParameterList(
209
+ [
210
+ nn.Parameter(torch.LongTensor(list(m.values())), requires_grad=False)
211
+ for m in wgraph_id_to_tokenizer_id_maps
212
+ ],
213
+ requires_grad=False,
214
+ )
215
+
216
+ tokenizer_id_to_wgraph_id_arrays = VgcnParameterList(
217
+ [
218
+ nn.Parameter(torch.zeros(max(m.values()) + 1, dtype=torch.long), requires_grad=False)
219
+ for m in wgraph_id_to_tokenizer_id_maps
220
+ ],
221
+ requires_grad=False,
222
+ )
223
+ for m, t in zip(wgraph_id_to_tokenizer_id_maps, tokenizer_id_to_wgraph_id_arrays):
224
+ for graph_id, tok_id in m.items():
225
+ t[tok_id] = graph_id
226
+
227
+ return gvoc_ordered_tokenizer_id_arrays, tokenizer_id_to_wgraph_id_arrays
228
+
229
+ def get_subgraphs(self, adj_matrix: torch.Tensor, gx_ids: torch.LongTensor):
230
+ device = gx_ids.device
231
+ batch_size = gx_ids.shape[0]
232
+ batch_masks = torch.any(
233
+ torch.any(
234
+ (adj_matrix.indices().view(-1) == gx_ids.unsqueeze(-1)).view(batch_size, gx_ids.shape[1], 2, -1), dim=1
235
+ ),
236
+ dim=1,
237
+ )
238
+ nnz_len = len(adj_matrix.values())
239
+
240
+ batch_values = adj_matrix.values().unsqueeze(0).repeat(batch_size, 1)
241
+ batch_values = batch_values.view(-1)[batch_masks.view(-1)]
242
+
243
+ batch_positions = torch.arange(batch_size, device=device).unsqueeze(1).repeat(1, nnz_len)
244
+ indices = torch.cat([batch_positions.view(1, -1), adj_matrix.indices().repeat(1, batch_size)], dim=0)
245
+ indices = indices[batch_masks.view(-1).expand(3, -1)].view(3, -1)
246
+
247
+ batch_sub_adj_matrix = torch.sparse_coo_tensor(
248
+ indices=indices,
249
+ values=batch_values.view(-1),
250
+ size=(batch_size, adj_matrix.size(0), adj_matrix.size(1)),
251
+ dtype=adj_matrix.dtype,
252
+ device=device,
253
+ )
254
+
255
+ return batch_sub_adj_matrix.coalesce()
256
+
257
+ def forward(self, word_embeddings: nn.Embedding, input_ids: torch.Tensor): # , position_ids: torch.Tensor = None):
258
+ if not self.wgraphs:
259
+ raise ValueError(
260
+ "No wgraphs is provided. There are 3 ways to initalize wgraphs:"
261
+ " instantiate VGCN_BERT with wgraphs, or call model.vgcn_bert.set_wgraphs(),"
262
+ " or load from_pretrained/checkpoint (make sure there is wgraphs in checkpoint"
263
+ " or you should call set_wgraphs)."
264
+ )
265
+ device = input_ids.device
266
+ batch_size = input_ids.shape[0]
267
+ word_emb_dim = word_embeddings.weight.shape[1]
268
+
269
+ gx_ids_list = []
270
+ # positon_embeddings_in_gvocab_order_list=[]
271
+ for m in self.tokenizer_id_to_wgraph_id_arrays:
272
+ # tmp_ids is still in sentence order, but value is graph id, e.g. [0, 5, 2, 2, 0, 10,0]
273
+ # 0 means no corresponding graph id (like padding in the graph); out-of-map token ids are mapped to 0
274
+ tmp_ids = input_ids.clone()
275
+ tmp_ids[tmp_ids > len(m) - 1] = 0
276
+ tmp_ids = m[tmp_ids]
277
+
278
+ # # position in graph is meaningless and computationally expensive
279
+ # if position_ids:
280
+ # position_ids_in_g=torch.zeros(g.shape[0], dtype=torch.LongTensor)
281
+ # # maybe gcn_swop_eye in original vgcn_bert preprocess is more efficient?
282
+ # for p_id, g_id in zip(position_ids, tmp_ids):
283
+ # position_ids_in_g[g_id]=p_id
284
+ # position_embeddings_in_g=self.position_embeddings(position_ids_in_g)
285
+ # position_embeddings_in_g*=position_ids_in_g>0
286
+ # positon_embeddings_in_gvocab_order_list.append(position_embeddings_in_g)
287
+
288
+ gx_ids_list.append(torch.unique(tmp_ids, dim=1))
289
+
290
+ # G_embedding=(act(V1*A1_sub*W1_vh)+act(V2*A2_sub*W2_vh))*W_hg
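+ # Shapes: W_vh is (graph_vocab, hid_dim); sub_wgraphs is (bs, graph_vocab, graph_vocab);
+ # gvocab_ev = word_embeddings(gv_ids).t() is (word_emb_dim, graph_vocab), so each term
+ # V*A_sub*W_vh below has shape (bs, word_emb_dim, hid_dim); fc_hg then maps hid_dim to
+ # out_dim and the final transpose yields (bs, out_dim, word_emb_dim).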
291
+ fused_H = torch.zeros((batch_size, word_emb_dim, self.hid_dim), device=device)
292
+ for gv_ids, g, gx_ids, W_vh in zip( # , position_in_gvocab_ev
293
+ self.gvoc_ordered_tokenizer_id_arrays,
294
+ self.wgraphs,
295
+ gx_ids_list,
296
+ self.W_vh_list,
297
+ # positon_embeddings_in_gvocab_order_list,
298
+ ):
299
+ # batch_A1_sub*W1_vh, batch_A2_sub*W2_vh, ...
300
+ sub_wgraphs = self.get_subgraphs(g, gx_ids)
301
+ H_vh = torch.bmm(sub_wgraphs, W_vh.unsqueeze(0).expand(batch_size, *W_vh.shape))
302
+
303
+ # V1*batch_A1_sub*W1_vh, V2*batch_A2_sub*W2_vh, ...
304
+ gvocab_ev = word_embeddings(gv_ids).t()
305
+ # if position_ids:
306
+ # gvocab_ev += position_in_gvocab_ev
307
+ H_eh = gvocab_ev.matmul(H_vh)
308
+
309
+ # fc -> act -> dropout
310
+ if self.activation:
311
+ H_eh = self.activation(H_eh)
312
+ if self.dropout:
313
+ H_eh = self.dropout(H_eh)
314
+
315
+ fused_H += H_eh
316
+
317
+ # fused_H=LayerNorm(fused_H) # embedding assemble layer will do LayerNorm
318
+ out_ge = self.fc_hg(fused_H).transpose(1, 2)
319
+ # self.dropout(out_ge) # embedding assemble layer will do dropout
320
+ return out_ge
321
+
322
+
323
+ class VGCNEmbeddings(nn.Module):
324
+ """Construct the embeddings from word, VGCN graph, position and token_type embeddings."""
325
+
326
+ def __init__(
327
+ self,
328
+ config: PretrainedConfig,
329
+ wgraphs: Optional[list] = None,
330
+ wgraph_id_to_tokenizer_id_maps: Optional[List[dict]] = None,
331
+ ):
332
+ super().__init__()
333
+
334
+ self.word_embeddings = nn.Embedding(config.vocab_size, config.dim, padding_idx=config.pad_token_id)
335
+ self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.dim)
336
+
337
+ self.vgcn_graph_embds_dim = config.vgcn_graph_embds_dim
338
+ self.vgcn = VocabGraphConvolution(
339
+ hid_dim=config.vgcn_hidden_dim,
340
+ out_dim=config.vgcn_graph_embds_dim,
341
+ wgraphs=wgraphs,
342
+ wgraph_id_to_tokenizer_id_maps=wgraph_id_to_tokenizer_id_maps,
343
+ activation=config.vgcn_activation,
344
+ dropout_rate=config.vgcn_dropout,
345
+ )
346
+
347
+ if config.sinusoidal_pos_embds:
348
+ create_sinusoidal_embeddings(
349
+ n_pos=config.max_position_embeddings, dim=config.dim, out=self.position_embeddings.weight
350
+ )
351
+
352
+ self.LayerNorm = nn.LayerNorm(config.dim, eps=1e-12)
353
+ self.dropout = nn.Dropout(config.dropout)
354
+ self.register_buffer(
355
+ "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
356
+ )
357
+
358
+ def forward(self, input_ids: torch.Tensor, input_embeds: Optional[torch.Tensor] = None) -> torch.Tensor:
359
+ """
360
+ Parameters:
361
+ input_ids (torch.Tensor):
362
+ torch.tensor(bs, max_seq_length) The token ids to embed.
363
+ input_ids is mandatory in vgcn-bert.
364
+
365
+ Returns: torch.tensor(bs, max_seq_length + vgcn_graph_embds_dim, dim) The embedded tokens plus graph and
366
+ position embeddings (no token_type embeddings)
367
+ """
368
+
369
+ # input_ids is mandatory in vgcn-bert
370
+ input_embeds = self.word_embeddings(input_ids) # (bs, max_seq_length, dim)
371
+
372
+ # device = input_embeds.device
373
+ # input_lengths = (
374
+ # (input_ids > 0).sum(-1)
375
+ # if input_ids is not None
376
+ # else torch.ones(input_embeds.size(0), device=device, dtype=torch.int64) * input_embeds.size(1)
377
+ # )
378
+
379
+ seq_length = input_embeds.size(1)
380
+
381
+ # Setting the position-ids to the buffer registered in the constructor helps
382
+ # when tracing the model without passing position-ids; it solves
383
+ # issues similar to issue #5664
384
+ if hasattr(self, "position_ids"):
385
+ position_ids = self.position_ids[:, :seq_length]
386
+ else:
387
+ position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) # (max_seq_length)
388
+ position_ids = position_ids.unsqueeze(0).expand_as(input_ids) # (bs, max_seq_length)
389
+
390
+ position_embeddings = self.position_embeddings(position_ids) # (bs, max_seq_length, dim)
391
+
392
+ embeddings = input_embeds + position_embeddings # (bs, max_seq_length, dim)
393
+
394
+ if self.vgcn_graph_embds_dim > 0:
395
+ # TODO: check that input_ids/position_ids do not include [CLS], [SEP][SEP]
396
+ graph_embeds = self.vgcn(self.word_embeddings, input_ids) # , position_ids)
397
+
398
+ # vgcn_words_embeddings = input_embeds.clone()
399
+ # for i in range(self.vgcn_graph_embds_dim):
400
+ # tmp_pos = (input_lengths - 2 - self.vgcn_graph_embds_dim + 1 + i) + torch.arange(
401
+ # 0, input_embeds.shape[0]
402
+ # ).to(device) * input_embeds.shape[1]
403
+ # vgcn_words_embeddings.flatten(start_dim=0, end_dim=1)[tmp_pos, :] = graph_embeds[:, :, i]
404
+
405
+ embeddings = torch.cat([embeddings, graph_embeds], dim=1) # (bs, max_seq_length+graph_emb_dim_size, dim)
406
+
407
+ embeddings = self.LayerNorm(embeddings) # (bs, max_seq_length, dim)
408
+ embeddings = self.dropout(embeddings) # (bs, max_seq_length, dim)
409
+ return embeddings
410
+
411
+
412
+ class MultiHeadSelfAttention(nn.Module):
413
+ def __init__(self, config: PretrainedConfig):
414
+ super().__init__()
415
+
416
+ self.n_heads = config.n_heads
417
+ self.dim = config.dim
418
+ self.dropout = nn.Dropout(p=config.attention_dropout)
419
+
420
+ # The number of attention heads must evenly divide the hidden dimension
421
+ if self.dim % self.n_heads != 0:
422
+ # Raise a ValueError if the heads do not evenly divide the dimension
423
+ raise ValueError(f"self.n_heads: {self.n_heads} must divide self.dim: {self.dim} evenly")
424
+
425
+ self.q_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
426
+ self.k_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
427
+ self.v_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
428
+ self.out_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
429
+
430
+ self.pruned_heads: Set[int] = set()
431
+ self.attention_head_size = self.dim // self.n_heads
432
+
433
+ def prune_heads(self, heads: List[int]):
434
+ if len(heads) == 0:
435
+ return
436
+ heads, index = find_pruneable_heads_and_indices(
437
+ heads, self.n_heads, self.attention_head_size, self.pruned_heads
438
+ )
439
+ # Prune linear layers
440
+ self.q_lin = prune_linear_layer(self.q_lin, index)
441
+ self.k_lin = prune_linear_layer(self.k_lin, index)
442
+ self.v_lin = prune_linear_layer(self.v_lin, index)
443
+ self.out_lin = prune_linear_layer(self.out_lin, index, dim=1)
444
+ # Update hyper params
445
+ self.n_heads = self.n_heads - len(heads)
446
+ self.dim = self.attention_head_size * self.n_heads
447
+ self.pruned_heads = self.pruned_heads.union(heads)
448
+
449
+ def forward(
450
+ self,
451
+ query: torch.Tensor,
452
+ key: torch.Tensor,
453
+ value: torch.Tensor,
454
+ mask: torch.Tensor,
455
+ head_mask: Optional[torch.Tensor] = None,
456
+ output_attentions: bool = False,
457
+ ) -> Tuple[torch.Tensor, ...]:
458
+ """
459
+ Parameters:
460
+ query: torch.tensor(bs, seq_length, dim)
461
+ key: torch.tensor(bs, seq_length, dim)
462
+ value: torch.tensor(bs, seq_length, dim)
463
+ mask: torch.tensor(bs, seq_length)
464
+
465
+ Returns:
466
+ weights: torch.tensor(bs, n_heads, seq_length, seq_length) Attention weights context: torch.tensor(bs,
467
+ seq_length, dim) Contextualized layer. Optional: only if `output_attentions=True`
468
+ """
469
+ bs, q_length, dim = query.size()
470
+ k_length = key.size(1)
471
+ # assert dim == self.dim, f'Dimensions do not match: {dim} input vs {self.dim} configured'
472
+ # assert key.size() == value.size()
473
+
474
+ dim_per_head = self.dim // self.n_heads
475
+
476
+ mask_reshp = (bs, 1, 1, k_length)
477
+
478
+ def shape(x: torch.Tensor) -> torch.Tensor:
479
+ """separate heads"""
480
+ return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2)
481
+
482
+ def unshape(x: torch.Tensor) -> torch.Tensor:
483
+ """group heads"""
484
+ return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head)
485
+
486
+ q = shape(self.q_lin(query)) # (bs, n_heads, q_length, dim_per_head)
487
+ k = shape(self.k_lin(key)) # (bs, n_heads, k_length, dim_per_head)
488
+ v = shape(self.v_lin(value)) # (bs, n_heads, k_length, dim_per_head)
489
+
490
+ q = q / math.sqrt(dim_per_head) # (bs, n_heads, q_length, dim_per_head)
491
+ scores = torch.matmul(q, k.transpose(2, 3)) # (bs, n_heads, q_length, k_length)
492
+ mask = (mask == 0).view(mask_reshp).expand_as(scores) # (bs, n_heads, q_length, k_length)
493
+ scores = scores.masked_fill(
494
+ mask, torch.tensor(torch.finfo(scores.dtype).min)
495
+ ) # (bs, n_heads, q_length, k_length)
496
+
497
+ weights = nn.functional.softmax(scores, dim=-1) # (bs, n_heads, q_length, k_length)
498
+ weights = self.dropout(weights) # (bs, n_heads, q_length, k_length)
499
+
500
+ # Mask heads if we want to
501
+ if head_mask is not None:
502
+ weights = weights * head_mask
503
+
504
+ context = torch.matmul(weights, v) # (bs, n_heads, q_length, dim_per_head)
505
+ context = unshape(context) # (bs, q_length, dim)
506
+ context = self.out_lin(context) # (bs, q_length, dim)
507
+
508
+ if output_attentions:
509
+ return (context, weights)
510
+ else:
511
+ return (context,)
512
+
513
+
514
+ class FFN(nn.Module):
515
+ def __init__(self, config: PretrainedConfig):
516
+ super().__init__()
517
+ self.dropout = nn.Dropout(p=config.dropout)
518
+ self.chunk_size_feed_forward = config.chunk_size_feed_forward
519
+ self.seq_len_dim = 1
520
+ self.lin1 = nn.Linear(in_features=config.dim, out_features=config.hidden_dim)
521
+ self.lin2 = nn.Linear(in_features=config.hidden_dim, out_features=config.dim)
522
+ self.activation = get_activation(config.activation)
523
+
524
+ def forward(self, input: torch.Tensor) -> torch.Tensor:
525
+ return apply_chunking_to_forward(self.ff_chunk, self.chunk_size_feed_forward, self.seq_len_dim, input)
526
+
527
+ def ff_chunk(self, input: torch.Tensor) -> torch.Tensor:
528
+ x = self.lin1(input)
529
+ x = self.activation(x)
530
+ x = self.lin2(x)
531
+ x = self.dropout(x)
532
+ return x
533
+
534
+
535
+ class TransformerBlock(nn.Module):
536
+ def __init__(self, config: PretrainedConfig):
537
+ super().__init__()
538
+
539
+ # The number of attention heads must evenly divide the hidden dimension
540
+ if config.dim % config.n_heads != 0:
541
+ raise ValueError(f"config.n_heads {config.n_heads} must divide config.dim {config.dim} evenly")
542
+
543
+ self.attention = MultiHeadSelfAttention(config)
544
+ self.sa_layer_norm = nn.LayerNorm(normalized_shape=config.dim, eps=1e-12)
545
+
546
+ self.ffn = FFN(config)
547
+ self.output_layer_norm = nn.LayerNorm(normalized_shape=config.dim, eps=1e-12)
548
+
549
+ def forward(
550
+ self,
551
+ x: torch.Tensor,
552
+ attn_mask: Optional[torch.Tensor] = None,
553
+ head_mask: Optional[torch.Tensor] = None,
554
+ output_attentions: bool = False,
555
+ ) -> Tuple[torch.Tensor, ...]:
556
+ """
557
+ Parameters:
558
+ x: torch.tensor(bs, seq_length, dim)
559
+ attn_mask: torch.tensor(bs, seq_length)
560
+
561
+ Returns:
562
+ sa_weights: torch.tensor(bs, n_heads, seq_length, seq_length) The attention weights ffn_output:
563
+ torch.tensor(bs, seq_length, dim) The output of the transformer block contextualization.
564
+ """
565
+ # Self-Attention
566
+ sa_output = self.attention(
567
+ query=x,
568
+ key=x,
569
+ value=x,
570
+ mask=attn_mask,
571
+ head_mask=head_mask,
572
+ output_attentions=output_attentions,
573
+ )
574
+ if output_attentions:
575
+ sa_output, sa_weights = sa_output # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length)
576
+ else: # To handle these `output_attentions` or `output_hidden_states` cases returning tuples
577
+ if type(sa_output) != tuple:
578
+ raise TypeError(f"sa_output must be a tuple but it is {type(sa_output)} type")
579
+
580
+ sa_output = sa_output[0]
581
+ sa_output = self.sa_layer_norm(sa_output + x) # (bs, seq_length, dim)
582
+
583
+ # Feed Forward Network
584
+ ffn_output = self.ffn(sa_output) # (bs, seq_length, dim)
585
+ ffn_output: torch.Tensor = self.output_layer_norm(ffn_output + sa_output) # (bs, seq_length, dim)
586
+
587
+ output = (ffn_output,)
588
+ if output_attentions:
589
+ output = (sa_weights,) + output
590
+ return output
591
+
592
+
593
+ class Transformer(nn.Module):
594
+ def __init__(self, config: PretrainedConfig):
595
+ super().__init__()
596
+ self.n_layers = config.n_layers
597
+ self.layer = nn.ModuleList([TransformerBlock(config) for _ in range(config.n_layers)])
598
+
599
+ def forward(
600
+ self,
601
+ x: torch.Tensor,
602
+ attn_mask: Optional[torch.Tensor] = None,
603
+ head_mask: Optional[torch.Tensor] = None,
604
+ output_attentions: bool = False,
605
+ output_hidden_states: bool = False,
606
+ return_dict: Optional[bool] = None,
607
+ ) -> Union[BaseModelOutput, Tuple[torch.Tensor, ...]]: # docstyle-ignore
608
+ """
609
+ Parameters:
610
+ x: torch.tensor(bs, seq_length, dim) Input sequence embedded.
611
+ attn_mask: torch.tensor(bs, seq_length) Attention mask on the sequence.
612
+
613
+ Returns:
614
+ hidden_state: torch.tensor(bs, seq_length, dim) Sequence of hidden states in the last (top)
615
+ layer all_hidden_states: Tuple[torch.tensor(bs, seq_length, dim)]
616
+ Tuple of length n_layers with the hidden states from each layer.
617
+ Optional: only if output_hidden_states=True
618
+ all_attentions: Tuple[torch.tensor(bs, n_heads, seq_length, seq_length)]
619
+ Tuple of length n_layers with the attention weights from each layer
620
+ Optional: only if output_attentions=True
621
+ """
622
+ all_hidden_states = () if output_hidden_states else None
623
+ all_attentions = () if output_attentions else None
624
+
625
+ hidden_state = x
626
+ for i, layer_module in enumerate(self.layer):
627
+ if output_hidden_states:
628
+ all_hidden_states = all_hidden_states + (hidden_state,)
629
+
630
+ layer_outputs = layer_module(
631
+ x=hidden_state, attn_mask=attn_mask, head_mask=head_mask[i], output_attentions=output_attentions
632
+ )
633
+ hidden_state = layer_outputs[-1]
634
+
635
+ if output_attentions:
636
+ if len(layer_outputs) != 2:
637
+ raise ValueError(f"The length of the layer_outputs should be 2, but it is {len(layer_outputs)}")
638
+
639
+ attentions = layer_outputs[0]
640
+ all_attentions = all_attentions + (attentions,)
641
+ else:
642
+ if len(layer_outputs) != 1:
643
+ raise ValueError(f"The length of the layer_outputs should be 1, but it is {len(layer_outputs)}")
644
+
645
+ # Add last layer
646
+ if output_hidden_states:
647
+ all_hidden_states = all_hidden_states + (hidden_state,)
648
+
649
+ if not return_dict:
650
+ return tuple(v for v in [hidden_state, all_hidden_states, all_attentions] if v is not None)
651
+ return BaseModelOutput(
652
+ last_hidden_state=hidden_state, hidden_states=all_hidden_states, attentions=all_attentions
653
+ )
654
+
655
+
656
+ # INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL #
657
+ # Copied from transformers.models.distilbert.modeling_distilbert.DistilBertPreTrainedModel with DistilBert->VGCNBert,distilbert->vgcn_bert
658
+ class VGCNBertPreTrainedModel(PreTrainedModel):
659
+ """
660
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
661
+ models.
662
+ """
663
+
664
+ config_class = VGCNBertConfig
665
+ load_tf_weights = None
666
+ base_model_prefix = "vgcn_bert"
667
+
668
+ def _init_weights(self, module: nn.Module):
669
+ """Initialize the weights."""
670
+ if isinstance(module, nn.Linear):
671
+ # Slightly different from the TF version which uses truncated_normal for initialization
672
+ # cf https://github.com/pytorch/pytorch/pull/5617
673
+ if getattr(module, "_is_vgcn_linear", False):
674
+ if self.config.vgcn_weight_init_mode == "transparent":
675
+ module.weight.data.fill_(1.0)
676
+ elif self.config.vgcn_weight_init_mode == "normal":
677
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
678
+ elif self.config.vgcn_weight_init_mode == "uniform":
679
+ nn.init.kaiming_uniform_(module.weight, a=math.sqrt(5))
680
+ else:
681
+ raise ValueError(f"Unknown VGCN-BERT weight init mode: {self.config.vgcn_weight_init_mode}.")
682
+ if module.bias is not None:
683
+ module.bias.data.zero_()
684
+ else:
685
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
686
+ if module.bias is not None:
687
+ module.bias.data.zero_()
688
+ elif isinstance(module, nn.Embedding):
689
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
690
+ if module.padding_idx is not None:
691
+ module.weight.data[module.padding_idx].zero_()
692
+ elif isinstance(module, nn.LayerNorm):
693
+ module.bias.data.zero_()
694
+ module.weight.data.fill_(1.0)
695
+ elif isinstance(module, nn.ParameterList):
696
+ if getattr(module, "_is_vgcn_weights", False):
697
+ for p in module:
698
+ if self.config.vgcn_weight_init_mode == "transparent":
699
+ nn.init.constant_(p, 1.0)
700
+ elif self.config.vgcn_weight_init_mode == "normal":
701
+ nn.init.normal_(p, mean=0.0, std=self.config.initializer_range)
702
+ elif self.config.vgcn_weight_init_mode == "uniform":
703
+ nn.init.kaiming_uniform_(p, a=math.sqrt(5))
704
+ else:
705
+ raise ValueError(f"Unknown VGCN-BERT weight init mode: {self.config.vgcn_weight_init_mode}.")
706
+
707
+
708
+ VGCNBERT_START_DOCSTRING = r"""
709
+
710
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
711
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
712
+ etc.)
713
+
714
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
715
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
716
+ and behavior.
717
+
718
+ Parameters:
719
+ config ([`VGCNBertConfig`]): Model configuration class with all the parameters of the model.
720
+ Initializing with a config file does not load the weights associated with the model, only the
721
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
722
+ """
723
+
724
+ VGCNBERT_INPUTS_DOCSTRING = r"""
725
+ Args:
726
+ input_ids (`torch.LongTensor` of shape `({0})`):
727
+ Indices of input sequence tokens in the vocabulary.
728
+
729
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
730
+ [`PreTrainedTokenizer.__call__`] for details.
731
+
732
+ [What are input IDs?](../glossary#input-ids)
733
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
734
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
735
+
736
+ - 1 for tokens that are **not masked**,
737
+ - 0 for tokens that are **masked**.
738
+
739
+ [What are attention masks?](../glossary#attention-mask)
740
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
741
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
742
+
743
+ - 1 indicates the head is **not masked**,
744
+ - 0 indicates the head is **masked**.
745
+
746
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
747
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
748
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
749
+ model's internal embedding lookup matrix.
750
+ output_attentions (`bool`, *optional*):
751
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
752
+ tensors for more detail.
753
+ output_hidden_states (`bool`, *optional*):
754
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
755
+ more detail.
756
+ return_dict (`bool`, *optional*):
757
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
758
+ """
759
+
760
+
761
+ @add_start_docstrings(
762
+ "The bare VGCN-BERT encoder/transformer outputting raw hidden-states without any specific head on top.",
763
+ VGCNBERT_START_DOCSTRING,
764
+ )
765
+ # Copied from transformers.models.distilbert.modeling_distilbert.DistilBertModel with DISTILBERT->VGCNBERT,DistilBert->VGCNBert
766
+ class VGCNBertModel(VGCNBertPreTrainedModel):
767
+ def __init__(
768
+ self,
769
+ config: PretrainedConfig,
770
+ wgraphs: Optional[list] = None,
771
+ wgraph_id_to_tokenizer_id_maps: Optional[List[dict]] = None,
772
+ ):
773
+ super().__init__(config)
774
+
775
+ self.embeddings = VGCNEmbeddings(config, wgraphs, wgraph_id_to_tokenizer_id_maps) # Graph Embeddings
776
+ self.transformer = Transformer(config) # Encoder
777
+
778
+ # Initialize weights and apply final processing
779
+ self.post_init()
780
+
781
+ def set_wgraphs(
782
+ self,
783
+ wgraphs: Optional[list] = None,
784
+ wgraph_id_to_tokenizer_id_maps: Optional[List[dict]] = None,
785
+ mode="transparent",
786
+ ):
787
+ self.embeddings.vgcn.set_wgraphs(wgraphs, wgraph_id_to_tokenizer_id_maps, mode)
788
+
789
+ def get_position_embeddings(self) -> nn.Embedding:
790
+ """
791
+ Returns the position embeddings
792
+ """
793
+ return self.embeddings.position_embeddings
794
+
795
+ def resize_position_embeddings(self, new_num_position_embeddings: int):
796
+ """
797
+ Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`.
798
+
799
+ Arguments:
800
+ new_num_position_embeddings (`int`):
801
+ The size of the new position embedding matrix. If position embeddings are learned, increasing the size
802
+ will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the
803
+ end. If position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the
804
+ size will add correct vectors at the end following the position encoding algorithm, whereas reducing
805
+ the size will remove vectors from the end.
806
+ """
807
+ num_position_embeds_diff = new_num_position_embeddings - self.config.max_position_embeddings
808
+
809
+ # no resizing needs to be done if the length stays the same
810
+ if num_position_embeds_diff == 0:
811
+ return
812
+
813
+ logger.info(f"Setting `config.max_position_embeddings={new_num_position_embeddings}`...")
814
+ self.config.max_position_embeddings = new_num_position_embeddings
815
+
816
+ old_position_embeddings_weight = self.embeddings.position_embeddings.weight.clone()
817
+
818
+ self.embeddings.position_embeddings = nn.Embedding(self.config.max_position_embeddings, self.config.dim)
819
+
820
+ if self.config.sinusoidal_pos_embds:
821
+ create_sinusoidal_embeddings(
822
+ n_pos=self.config.max_position_embeddings, dim=self.config.dim, out=self.position_embeddings.weight
823
+ )
824
+ else:
825
+ with torch.no_grad():
826
+ if num_position_embeds_diff > 0:
827
+ self.embeddings.position_embeddings.weight[:-num_position_embeds_diff] = nn.Parameter(
828
+ old_position_embeddings_weight
829
+ )
830
+ else:
831
+ self.embeddings.position_embeddings.weight = nn.Parameter(
832
+ old_position_embeddings_weight[:num_position_embeds_diff]
833
+ )
834
+ # move position_embeddings to correct device
835
+ self.embeddings.position_embeddings.to(self.device)
836
+
837
+ def get_input_embeddings(self) -> nn.Embedding:
838
+ return self.embeddings.word_embeddings
839
+
840
+ def set_input_embeddings(self, new_embeddings: nn.Embedding):
841
+ self.embeddings.word_embeddings = new_embeddings
842
+
843
+ def _prune_heads(self, heads_to_prune: Dict[int, List[List[int]]]):
844
+ """
845
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
846
+ class PreTrainedModel
847
+ """
848
+ for layer, heads in heads_to_prune.items():
849
+ self.transformer.layer[layer].attention.prune_heads(heads)
850
+
851
+ @add_start_docstrings_to_model_forward(VGCNBERT_INPUTS_DOCSTRING.format("batch_size, num_choices"))
852
+ @add_code_sample_docstrings(
853
+ checkpoint=_CHECKPOINT_FOR_DOC,
854
+ output_type=BaseModelOutput,
855
+ config_class=_CONFIG_FOR_DOC,
856
+ )
857
+ def forward(
858
+ self,
859
+ input_ids: Optional[torch.Tensor] = None,
860
+ attention_mask: Optional[torch.Tensor] = None,
861
+ head_mask: Optional[torch.Tensor] = None,
862
+ inputs_embeds: Optional[torch.Tensor] = None,
863
+ output_attentions: Optional[bool] = None,
864
+ output_hidden_states: Optional[bool] = None,
865
+ return_dict: Optional[bool] = None,
866
+ ) -> Union[BaseModelOutput, Tuple[torch.Tensor, ...]]:
867
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
868
+ output_hidden_states = (
869
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
870
+ )
871
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
872
+
873
+ if input_ids is not None and inputs_embeds is not None:
874
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
875
+ elif input_ids is not None:
876
+ input_shape = input_ids.size()
877
+ elif inputs_embeds is not None:
878
+ input_shape = inputs_embeds.size()[:-1]
879
+ else:
880
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
881
+
882
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
883
+
884
+ if attention_mask is None:
885
+ attention_mask = torch.ones(input_shape, device=device) # (bs, seq_length)
886
+
887
+ # Prepare head mask if needed
888
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
889
+
890
+ embeddings = self.embeddings(input_ids, inputs_embeds) # (bs, seq_length, dim)
891
+
892
+ if self.embeddings.vgcn_graph_embds_dim > 0:
893
+ attention_mask = torch.cat(
894
+ [attention_mask, torch.ones((input_shape[0], self.embeddings.vgcn_graph_embds_dim), device=device)],
895
+ dim=1,
896
+ )
897
+
898
+ return self.transformer(
899
+ x=embeddings,
900
+ attn_mask=attention_mask,
901
+ head_mask=head_mask,
902
+ output_attentions=output_attentions,
903
+ output_hidden_states=output_hidden_states,
904
+ return_dict=return_dict,
905
+ )
906
+
907
+
908
+ @add_start_docstrings(
909
+ """VGCNBert Model with a `masked language modeling` head on top.""",
910
+ VGCNBERT_START_DOCSTRING,
911
+ )
912
+ # Copied from transformers.models.distilbert.modeling_distilbert.DistilBertForMaskedLM with DISTILBERT->VGCNBERT,DistilBert->VGCNBert,distilbert->vgcn_bert
913
+ class VGCNBertForMaskedLM(VGCNBertPreTrainedModel):
914
+ _keys_to_ignore_on_load_missing = ["vocab_projector.weight"]
915
+
916
+ def __init__(
917
+ self,
918
+ config: PretrainedConfig,
919
+ wgraphs: Optional[list] = None,
920
+ wgraph_id_to_tokenizer_id_maps: Optional[List[dict]] = None,
921
+ ):
922
+ super().__init__(config)
923
+
924
+ self.activation = get_activation(config.activation)
925
+
926
+ self.vgcn_bert = VGCNBertModel(config, wgraphs, wgraph_id_to_tokenizer_id_maps)
927
+ self.vocab_transform = nn.Linear(config.dim, config.dim)
928
+ self.vocab_layer_norm = nn.LayerNorm(config.dim, eps=1e-12)
929
+ self.vocab_projector = nn.Linear(config.dim, config.vocab_size)
930
+
931
+ # Initialize weights and apply final processing
932
+ self.post_init()
933
+
934
+ self.mlm_loss_fct = nn.CrossEntropyLoss()
935
+
936
+ def get_position_embeddings(self) -> nn.Embedding:
937
+ """
938
+ Returns the position embeddings
939
+ """
940
+ return self.vgcn_bert.get_position_embeddings()
941
+
942
+ def resize_position_embeddings(self, new_num_position_embeddings: int):
943
+ """
944
+ Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`.
945
+
946
+ Arguments:
947
+ new_num_position_embeddings (`int`):
948
+ The size of the new position embedding matrix. If position embeddings are learned, increasing the size
949
+ will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the
950
+ end. If position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the
951
+ size will add correct vectors at the end following the position encoding algorithm, whereas reducing
952
+ the size will remove vectors from the end.
953
+ """
954
+ self.vgcn_bert.resize_position_embeddings(new_num_position_embeddings)
955
+
956
+ def get_output_embeddings(self) -> nn.Module:
957
+ return self.vocab_projector
958
+
959
+ def set_output_embeddings(self, new_embeddings: nn.Module):
960
+ self.vocab_projector = new_embeddings
961
+
962
+ @add_start_docstrings_to_model_forward(VGCNBERT_INPUTS_DOCSTRING.format("batch_size, num_choices"))
963
+ @add_code_sample_docstrings(
964
+ checkpoint=_CHECKPOINT_FOR_DOC,
965
+ output_type=MaskedLMOutput,
966
+ config_class=_CONFIG_FOR_DOC,
967
+ )
968
+ def forward(
969
+ self,
970
+ input_ids: Optional[torch.Tensor] = None,
971
+ attention_mask: Optional[torch.Tensor] = None,
972
+ head_mask: Optional[torch.Tensor] = None,
973
+ inputs_embeds: Optional[torch.Tensor] = None,
974
+ labels: Optional[torch.LongTensor] = None,
975
+ output_attentions: Optional[bool] = None,
976
+ output_hidden_states: Optional[bool] = None,
977
+ return_dict: Optional[bool] = None,
978
+ ) -> Union[MaskedLMOutput, Tuple[torch.Tensor, ...]]:
979
+ r"""
980
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
981
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
982
+ config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
983
+ loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
984
+ """
985
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
986
+
987
+ dlbrt_output = self.vgcn_bert(
988
+ input_ids=input_ids,
989
+ attention_mask=attention_mask,
990
+ head_mask=head_mask,
991
+ inputs_embeds=inputs_embeds,
992
+ output_attentions=output_attentions,
993
+ output_hidden_states=output_hidden_states,
994
+ return_dict=return_dict,
995
+ )
996
+ hidden_states = dlbrt_output[0] # (bs, seq_length, dim)
997
+ prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim)
998
+ prediction_logits = self.activation(prediction_logits) # (bs, seq_length, dim)
999
+ prediction_logits = self.vocab_layer_norm(prediction_logits) # (bs, seq_length, dim)
1000
+ prediction_logits = self.vocab_projector(prediction_logits) # (bs, seq_length, vocab_size)
1001
+
1002
+ # remove graph embedding outputs
1003
+ prediction_logits = prediction_logits[:, : input_ids.size(1), :]
1004
+
1005
+ mlm_loss = None
1006
+ if labels is not None:
1007
+ mlm_loss = self.mlm_loss_fct(prediction_logits.reshape(-1, prediction_logits.size(-1)), labels.view(-1))
1008
+
1009
+ if not return_dict:
1010
+ output = (prediction_logits,) + dlbrt_output[1:]
1011
+ return ((mlm_loss,) + output) if mlm_loss is not None else output
1012
+
1013
+ return MaskedLMOutput(
1014
+ loss=mlm_loss,
1015
+ logits=prediction_logits,
1016
+ hidden_states=dlbrt_output.hidden_states,
1017
+ attentions=dlbrt_output.attentions,
1018
+ )
1019
+
1020
+
1021
+ @add_start_docstrings(
+ """
+ VGCNBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the
+ pooled output) e.g. for GLUE tasks.
+ """,
+ VGCNBERT_START_DOCSTRING,
+ )
+ # Copied from transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification with DISTILBERT->VGCNBERT,DistilBert->VGCNBert,distilbert->vgcn_bert
+ class VGCNBertForSequenceClassification(VGCNBertPreTrainedModel):
+ def __init__(
+ self,
+ config: PretrainedConfig,
+ wgraphs: Optional[list] = None,
+ wgraph_id_to_tokenizer_id_maps: Optional[List[dict]] = None,
+ ):
+ super().__init__(config)
+ self.num_labels = config.num_labels
+ self.config = config
+
+ self.vgcn_bert = VGCNBertModel(config, wgraphs, wgraph_id_to_tokenizer_id_maps)
+ self.pre_classifier = nn.Linear(config.dim, config.dim)
+ self.classifier = nn.Linear(config.dim, config.num_labels)
+ self.dropout = nn.Dropout(config.seq_classif_dropout)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_position_embeddings(self) -> nn.Embedding:
+ """
+ Returns the position embeddings
+ """
+ return self.vgcn_bert.get_position_embeddings()
+
+ def resize_position_embeddings(self, new_num_position_embeddings: int):
+ """
+ Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`.
+
+ Arguments:
+ new_num_position_embeddings (`int`):
+ The number of new position embeddings. If position embeddings are learned, increasing the size
+ will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the
+ end. If position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the
+ size will add correct vectors at the end following the position encoding algorithm, whereas reducing
+ the size will remove vectors from the end.
+ """
+ self.vgcn_bert.resize_position_embeddings(new_num_position_embeddings)
+
+ @add_start_docstrings_to_model_forward(VGCNBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=SequenceClassifierOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[SequenceClassifierOutput, Tuple[torch.Tensor, ...]]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ vgcn_bert_output = self.vgcn_bert(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ hidden_state = vgcn_bert_output[0] # (bs, seq_len, dim)
+ pooled_output = hidden_state[:, 0] # (bs, dim)
+ pooled_output = self.pre_classifier(pooled_output) # (bs, dim)
+ pooled_output = nn.ReLU()(pooled_output) # (bs, dim)
+ pooled_output = self.dropout(pooled_output) # (bs, dim)
+ logits = self.classifier(pooled_output) # (bs, num_labels)
+
+ loss = None
+ if labels is not None:
+ if self.config.problem_type is None:
+ if self.num_labels == 1:
+ self.config.problem_type = "regression"
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+ self.config.problem_type = "single_label_classification"
+ else:
+ self.config.problem_type = "multi_label_classification"
+
+ if self.config.problem_type == "regression":
+ loss_fct = MSELoss()
+ if self.num_labels == 1:
+ loss = loss_fct(logits.squeeze(), labels.squeeze())
+ else:
+ loss = loss_fct(logits, labels)
+ elif self.config.problem_type == "single_label_classification":
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+ elif self.config.problem_type == "multi_label_classification":
+ loss_fct = BCEWithLogitsLoss()
+ loss = loss_fct(logits, labels)
+
+ if not return_dict:
+ output = (logits,) + vgcn_bert_output[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return SequenceClassifierOutput(
+ loss=loss,
+ logits=logits,
+ hidden_states=vgcn_bert_output.hidden_states,
+ attentions=vgcn_bert_output.attentions,
+ )
+
+
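+ # A minimal usage sketch for VGCNBertForSequenceClassification above (illustrative only; it assumes
+ # this module is importable as `modeling_vgcn_bert` and that the
+ # "zhibinlu/vgcn-distilbert-base-uncased" checkpoint provides the backbone weights, so the
+ # classification head itself is newly initialized and still needs fine-tuning):
+ #
+ #     import torch
+ #     from transformers import AutoTokenizer
+ #     from modeling_vgcn_bert import VGCNBertForSequenceClassification
+ #
+ #     tokenizer = AutoTokenizer.from_pretrained("zhibinlu/vgcn-distilbert-base-uncased")
+ #     model = VGCNBertForSequenceClassification.from_pretrained(
+ #         "zhibinlu/vgcn-distilbert-base-uncased", num_labels=2
+ #     )
+ #
+ #     inputs = tokenizer("A readable, well-paced story.", return_tensors="pt")
+ #     labels = torch.tensor([1])  # long dtype with num_labels > 1 -> single_label_classification (cross-entropy)
+ #     outputs = model(**inputs, labels=labels)
+ #     print(outputs.loss, outputs.logits.shape)  # logits: (1, 2)
+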
+ @add_start_docstrings(
+ """
+ VGCNBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a
+ linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
+ """,
+ VGCNBERT_START_DOCSTRING,
+ )
+ # Copied from transformers.models.distilbert.modeling_distilbert.DistilBertForQuestionAnswering with DISTILBERT->VGCNBERT,DistilBert->VGCNBert,distilbert->vgcn_bert
+ class VGCNBertForQuestionAnswering(VGCNBertPreTrainedModel):
+ def __init__(
+ self,
+ config: PretrainedConfig,
+ wgraphs: Optional[list] = None,
+ wgraph_id_to_tokenizer_id_maps: Optional[List[dict]] = None,
+ ):
+ super().__init__(config)
+
+ self.vgcn_bert = VGCNBertModel(config, wgraphs, wgraph_id_to_tokenizer_id_maps)
+ self.qa_outputs = nn.Linear(config.dim, config.num_labels)
+ if config.num_labels != 2:
+ raise ValueError(f"config.num_labels should be 2, but it is {config.num_labels}")
+
+ self.dropout = nn.Dropout(config.qa_dropout)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_position_embeddings(self) -> nn.Embedding:
+ """
+ Returns the position embeddings
+ """
+ return self.vgcn_bert.get_position_embeddings()
+
+ def resize_position_embeddings(self, new_num_position_embeddings: int):
+ """
+ Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`.
+
+ Arguments:
+ new_num_position_embeddings (`int`):
+ The number of new position embeddings. If position embeddings are learned, increasing the size
+ will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the
+ end. If position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the
+ size will add correct vectors at the end following the position encoding algorithm, whereas reducing
+ the size will remove vectors from the end.
+ """
+ self.vgcn_bert.resize_position_embeddings(new_num_position_embeddings)
+
+ @add_start_docstrings_to_model_forward(VGCNBERT_INPUTS_DOCSTRING.format("batch_size, num_choices"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=QuestionAnsweringModelOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ start_positions: Optional[torch.Tensor] = None,
+ end_positions: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[QuestionAnsweringModelOutput, Tuple[torch.Tensor, ...]]:
+ r"""
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for position (index) of the start of the labelled span for computing the token classification loss.
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
+ are not taken into account for computing the loss.
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for position (index) of the end of the labelled span for computing the token classification loss.
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
+ are not taken into account for computing the loss.
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ vgcn_bert_output = self.vgcn_bert(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ hidden_states = vgcn_bert_output[0] # (bs, max_query_len, dim)
+
+ hidden_states = self.dropout(hidden_states) # (bs, max_query_len, dim)
+ logits = self.qa_outputs(hidden_states) # (bs, max_query_len, 2)
+ # remove graph embedding outputs
+ logits = logits[:, : input_ids.size(1), :]
+
+ start_logits, end_logits = logits.split(1, dim=-1)
+ start_logits = start_logits.squeeze(-1).contiguous() # (bs, max_query_len)
+ end_logits = end_logits.squeeze(-1).contiguous() # (bs, max_query_len)
+
+ total_loss = None
+ if start_positions is not None and end_positions is not None:
+ # If we are on multi-GPU, split adds a dimension
+ if len(start_positions.size()) > 1:
+ start_positions = start_positions.squeeze(-1)
+ if len(end_positions.size()) > 1:
+ end_positions = end_positions.squeeze(-1)
+ # sometimes the start/end positions are outside our model inputs, we ignore these terms
+ ignored_index = start_logits.size(1)
+ start_positions = start_positions.clamp(0, ignored_index)
+ end_positions = end_positions.clamp(0, ignored_index)
+
+ loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
+ start_loss = loss_fct(start_logits, start_positions)
+ end_loss = loss_fct(end_logits, end_positions)
+ total_loss = (start_loss + end_loss) / 2
+
+ if not return_dict:
+ output = (start_logits, end_logits) + vgcn_bert_output[1:]
+ return ((total_loss,) + output) if total_loss is not None else output
+
+ return QuestionAnsweringModelOutput(
+ loss=total_loss,
+ start_logits=start_logits,
+ end_logits=end_logits,
+ hidden_states=vgcn_bert_output.hidden_states,
+ attentions=vgcn_bert_output.attentions,
+ )
+
+
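+ # A minimal usage sketch for VGCNBertForQuestionAnswering above (illustrative only; it assumes this
+ # module is importable as `modeling_vgcn_bert` and uses the "zhibinlu/vgcn-distilbert-base-uncased"
+ # backbone, so the span head is untrained and returns an arbitrary span until it is fine-tuned):
+ #
+ #     from transformers import AutoTokenizer
+ #     from modeling_vgcn_bert import VGCNBertForQuestionAnswering
+ #
+ #     tokenizer = AutoTokenizer.from_pretrained("zhibinlu/vgcn-distilbert-base-uncased")
+ #     model = VGCNBertForQuestionAnswering.from_pretrained("zhibinlu/vgcn-distilbert-base-uncased")
+ #
+ #     question = "What does VGCN-BERT combine?"
+ #     context = "VGCN-BERT combines a vocabulary graph convolutional network with BERT embeddings."
+ #     inputs = tokenizer(question, context, return_tensors="pt")
+ #     outputs = model(**inputs)
+ #
+ #     # Pick the most likely start/end tokens and decode the span between them.
+ #     start = int(outputs.start_logits.argmax())
+ #     end = int(outputs.end_logits.argmax())
+ #     print(tokenizer.decode(inputs["input_ids"][0, start : end + 1]))
+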
+ @add_start_docstrings(
+ """
+ VGCNBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
+ for Named-Entity-Recognition (NER) tasks.
+ """,
+ VGCNBERT_START_DOCSTRING,
+ )
+ # Copied from transformers.models.distilbert.modeling_distilbert.DistilBertForTokenClassification with DISTILBERT->VGCNBERT,DistilBert->VGCNBert,distilbert->vgcn_bert
+ class VGCNBertForTokenClassification(VGCNBertPreTrainedModel):
+ def __init__(
+ self,
+ config: PretrainedConfig,
+ wgraphs: Optional[list] = None,
+ wgraph_id_to_tokenizer_id_maps: Optional[List[dict]] = None,
+ ):
+ super().__init__(config)
+ self.num_labels = config.num_labels
+
+ self.vgcn_bert = VGCNBertModel(config, wgraphs, wgraph_id_to_tokenizer_id_maps)
+ self.dropout = nn.Dropout(config.dropout)
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_position_embeddings(self) -> nn.Embedding:
+ """
+ Returns the position embeddings
+ """
+ return self.vgcn_bert.get_position_embeddings()
+
+ def resize_position_embeddings(self, new_num_position_embeddings: int):
+ """
+ Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`.
+
+ Arguments:
+ new_num_position_embeddings (`int`):
+ The number of new position embeddings. If position embeddings are learned, increasing the size
+ will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the
+ end. If position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the
+ size will add correct vectors at the end following the position encoding algorithm, whereas reducing
+ the size will remove vectors from the end.
+ """
+ self.vgcn_bert.resize_position_embeddings(new_num_position_embeddings)
+
+ @add_start_docstrings_to_model_forward(VGCNBERT_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=TokenClassifierOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[TokenClassifierOutput, Tuple[torch.Tensor, ...]]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.vgcn_bert(
+ input_ids,
+ attention_mask=attention_mask,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ sequence_output = outputs[0]
+
+ sequence_output = self.dropout(sequence_output)
+ logits = self.classifier(sequence_output)
+
+ # remove graph embedding outputs
+ logits = logits[:, : input_ids.size(1), :]
+
+ loss = None
+ if labels is not None:
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(logits.reshape(-1, self.num_labels), labels.view(-1))
+
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return TokenClassifierOutput(
+ loss=loss,
+ logits=logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
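+ # A minimal usage sketch for VGCNBertForTokenClassification above (illustrative only; the tag set
+ # and checkpoint are assumptions, and the token-classification head starts untrained):
+ #
+ #     from transformers import AutoTokenizer
+ #     from modeling_vgcn_bert import VGCNBertForTokenClassification
+ #
+ #     tokenizer = AutoTokenizer.from_pretrained("zhibinlu/vgcn-distilbert-base-uncased")
+ #     model = VGCNBertForTokenClassification.from_pretrained(
+ #         "zhibinlu/vgcn-distilbert-base-uncased",
+ #         num_labels=3,
+ #         id2label={0: "O", 1: "B-ENT", 2: "I-ENT"},  # hypothetical NER tag set
+ #     )
+ #
+ #     inputs = tokenizer("montreal is in quebec", return_tensors="pt")
+ #     logits = model(**inputs).logits  # (1, seq_len, 3); graph positions are already sliced off
+ #     predictions = logits.argmax(-1)[0].tolist()
+ #     print([model.config.id2label[i] for i in predictions])
+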
+ @add_start_docstrings(
+ """
+ VGCNBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and
+ a softmax) e.g. for RocStories/SWAG tasks.
+ """,
+ VGCNBERT_START_DOCSTRING,
+ )
+ # Copied from transformers.models.distilbert.modeling_distilbert.DistilBertForMultipleChoice with DISTILBERT->VGCNBERT,DistilBert->VGCNBert,distilbert->vgcn_bert
+ class VGCNBertForMultipleChoice(VGCNBertPreTrainedModel):
+ def __init__(
+ self,
+ config: PretrainedConfig,
+ wgraphs: Optional[list] = None,
+ wgraph_id_to_tokenizer_id_maps: Optional[List[dict]] = None,
+ ):
+ super().__init__(config)
+
+ self.vgcn_bert = VGCNBertModel(config, wgraphs, wgraph_id_to_tokenizer_id_maps)
+ self.pre_classifier = nn.Linear(config.dim, config.dim)
+ self.classifier = nn.Linear(config.dim, 1)
+ self.dropout = nn.Dropout(config.seq_classif_dropout)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_position_embeddings(self) -> nn.Embedding:
+ """
+ Returns the position embeddings
+ """
+ return self.vgcn_bert.get_position_embeddings()
+
+ def resize_position_embeddings(self, new_num_position_embeddings: int):
+ """
+ Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`.
+
+ Arguments:
+ new_num_position_embeddings (`int`):
+ The number of new position embeddings. If position embeddings are learned, increasing the size will add
+ newly initialized vectors at the end, whereas reducing the size will remove vectors from the end. If
+ position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the size will
+ add correct vectors at the end following the position encoding algorithm, whereas reducing the size
+ will remove vectors from the end.
+ """
+ self.vgcn_bert.resize_position_embeddings(new_num_position_embeddings)
+
+ @add_start_docstrings_to_model_forward(
+ VGCNBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
+ )
+ @replace_return_docstrings(output_type=MultipleChoiceModelOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[MultipleChoiceModelOutput, Tuple[torch.Tensor, ...]]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
+ num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+ `input_ids` above)
+
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoTokenizer, VGCNBertForMultipleChoice
+ >>> import torch
+
+ >>> tokenizer = AutoTokenizer.from_pretrained("zhibinlu/vgcn-distilbert-base-uncased")
+ >>> model = VGCNBertForMultipleChoice.from_pretrained("zhibinlu/vgcn-distilbert-base-uncased")
+
+ >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+ >>> choice0 = "It is eaten with a fork and a knife."
+ >>> choice1 = "It is eaten while held in the hand."
+ >>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
+
+ >>> encoding = tokenizer([[prompt, choice0], [prompt, choice1]], return_tensors="pt", padding=True)
+ >>> outputs = model(**{k: v.unsqueeze(0) for k, v in encoding.items()}, labels=labels) # batch size is 1
+
+ >>> # the linear classifier still needs to be trained
+ >>> loss = outputs.loss
+ >>> logits = outputs.logits
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+ num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
+
+ input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
+ attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
+ inputs_embeds = (
+ inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
+ if inputs_embeds is not None
+ else None
+ )
+
+ outputs = self.vgcn_bert(
+ input_ids,
+ attention_mask=attention_mask,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ hidden_state = outputs[0] # (bs * num_choices, seq_len, dim)
+ pooled_output = hidden_state[:, 0] # (bs * num_choices, dim)
+ pooled_output = self.pre_classifier(pooled_output) # (bs * num_choices, dim)
+ pooled_output = nn.ReLU()(pooled_output) # (bs * num_choices, dim)
+ pooled_output = self.dropout(pooled_output) # (bs * num_choices, dim)
+ logits = self.classifier(pooled_output) # (bs * num_choices, 1)
+
+ reshaped_logits = logits.view(-1, num_choices) # (bs, num_choices)
+
+ loss = None
+ if labels is not None:
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(reshaped_logits, labels)
+
+ if not return_dict:
+ output = (reshaped_logits,) + outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return MultipleChoiceModelOutput(
+ loss=loss,
+ logits=reshaped_logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
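+
+
+ # A minimal end-to-end sketch for the base VGCNBertModel defined in this file (illustrative only;
+ # it assumes the hub repository wires the class into `AutoModel` via `auto_map`, which is why
+ # `trust_remote_code=True` is passed):
+ #
+ #     import torch
+ #     from transformers import AutoTokenizer, AutoModel
+ #
+ #     tokenizer = AutoTokenizer.from_pretrained("zhibinlu/vgcn-distilbert-base-uncased")
+ #     model = AutoModel.from_pretrained("zhibinlu/vgcn-distilbert-base-uncased", trust_remote_code=True)
+ #
+ #     inputs = tokenizer("VGCN-BERT fuses a vocabulary graph with DistilBERT.", return_tensors="pt")
+ #     with torch.no_grad():
+ #         hidden = model(**inputs).last_hidden_state
+ #     # The sequence dimension may be longer than the input because graph-embedding positions are
+ #     # appended; the task heads above slice them off before computing logits.
+ #     print(hidden.shape)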
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3693a466be5c0473824d22c9ec4bb25ed0df7fdf4057f2859e835fdd80840948
+ size 265492133