Upload AbLang
- config.json +25 -0
- config.py +33 -0
- encoderblocks.py +112 -0
- extra_fns.py +26 -0
- model.py +54 -0
- pytorch_model.bin +3 -0
config.json
ADDED
@@ -0,0 +1,25 @@
{
  "_name_or_path": "ablang-test",
  "architectures": [
    "AbLang"
  ],
  "attention_probs_dropout_prob": 0.1,
  "auto_map": {
    "AutoConfig": "config.AbLangConfig",
    "AutoModel": "model.AbLang"
  },
  "chain": "heavy",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 160,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "ptid": 21,
  "torch_dtype": "float32",
  "transformers_version": "4.26.1",
  "vocab_size": 24
}
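The auto_map block above is what lets the transformers Auto classes resolve to the custom code shipped alongside this config. A minimal sketch of that resolution, assuming a local clone of this repository (the path below is a placeholder) and that remote code is explicitly trusted:

from transformers import AutoConfig

# Placeholder path: point this at a local clone of the repository (or its Hub id).
cfg = AutoConfig.from_pretrained("./ablang-test", trust_remote_code=True)  # resolves to config.AbLangConfig
print(type(cfg).__name__)         # AbLangConfig
print(cfg.chain, cfg.vocab_size)  # heavy 24

AutoModel.from_pretrained on the same path resolves to model.AbLang through the same mechanism; see the inference sketch after model.py below.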
config.py
ADDED
@@ -0,0 +1,33 @@
from transformers import PretrainedConfig
from typing import List

class AbLangConfig(PretrainedConfig):
    def __init__(
        self,
        max_position_embeddings: int = 160,
        hidden_size: int = 768,
        num_hidden_layers: int = 12,
        num_attention_heads: int = 12,
        attention_probs_dropout_prob: float = 0.1,
        intermediate_size: int = 3072,
        hidden_act: str = "gelu",
        hidden_dropout_prob: float = 0.1,
        initializer_range: float = 0.02,
        layer_norm_eps: float = 1e-12,
        chain: str = "heavy",
        **kwargs,
    ):
        self.ptid = 21       # padding token id, used as padding_idx by AbEmbeddings
        self.vocab_size = 24
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.chain = chain
        super().__init__(**kwargs)
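A quick sketch of instantiating the config class directly, assuming config.py is importable on its own (it has no package-relative imports). The first call reproduces the defaults in config.json; the second is a hypothetical reduced variant for quick tests:

from config import AbLangConfig

cfg = AbLangConfig()  # heavy chain, 12 layers, 12 heads, hidden_size 768, max length 160
tiny = AbLangConfig(num_hidden_layers=2, hidden_size=64, num_attention_heads=4, intermediate_size=128)
print(cfg.ptid, cfg.vocab_size)  # 21 24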
encoderblocks.py
ADDED
@@ -0,0 +1,112 @@
import math
from transformers import PreTrainedModel
from typing import List, Optional, Tuple
from dataclasses import dataclass
import torch
import torch.nn as nn
from fairseq.modules.multihead_attention import MultiheadAttention
from .extra_fns import ACT2FN


@dataclass
class AbRepOutput():
    """
    Dataclass used to store AbRep output.
    """
    last_hidden_states: torch.FloatTensor
    all_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


class EncoderBlocks(PreTrainedModel):
    """
    Wrapper for one or more EncoderBlocks.
    """
    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.Layers = nn.ModuleList([EncoderBlock(config) for _ in range(config.num_hidden_layers)])

    def forward(self, hidden_states, attention_mask=None, output_attentions=False, output_hidden_states=False):
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        for num_block, a_EncoderBlock in enumerate(self.Layers):
            hidden_states, attentions = a_EncoderBlock(hidden_states, attention_mask, output_attentions)
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)  # collect the hidden states after each EncoderBlock
            if output_attentions:
                all_self_attentions = all_self_attentions + (attentions,)  # collect the attention weights for analysis
        return AbRepOutput(last_hidden_states=hidden_states, all_hidden_states=all_hidden_states, attentions=all_self_attentions)


class EncoderBlock(PreTrainedModel):
    """
    Single EncoderBlock.
    An EncoderBlock consists of a MultiHeadAttention and an IntermediateLayer.
    """
    def __init__(self, config):
        super().__init__(config)
        self.MultiHeadAttention = ThirdMultiHeadAttention(config)
        self.MHADropout = nn.Dropout(config.hidden_dropout_prob)
        self.MHALayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.IntermediateLayer = IntermediateLayer(config)

    def forward(self, hidden_states, attention_mask=None, output_attentions=False):
        MHAoutput, attentions = self.MultiHeadAttention(hidden_states, attention_mask, output_attentions=output_attentions)
        output = self.MHADropout(MHAoutput)
        output = self.MHALayerNorm(output + hidden_states)  # hidden_states are added back for the residual connection
        output = self.IntermediateLayer(output)  # IntermediateLayer applies its own residual connection internally
        return output, attentions


class ThirdMultiHeadAttention(PreTrainedModel):
    """
    New MultiHeadAttention which can return the weights of the individual heads.
    """
    def __init__(self, config):
        super().__init__(config)
        self.Attention = MultiheadAttention(config.hidden_size, config.num_attention_heads, dropout=config.attention_probs_dropout_prob, self_attention=True)

    def forward(self, hidden_states, attention_mask=None, output_attentions=False):
        hidden_states = torch.transpose(hidden_states, 0, 1)
        # static_kv is only True because of a current bug: the head weights are not returned unaveraged unless it is True
        attn_output, attn_weights = self.Attention(hidden_states, hidden_states, hidden_states, key_padding_mask=attention_mask, static_kv=True,
                                                   need_weights=output_attentions, need_head_weights=output_attentions)
        return torch.transpose(attn_output, 0, 1), attn_weights


class OldMultiHeadAttention(PreTrainedModel):
    """
    MultiHeadAttention contains a Scaled Dot Product Attention and a Linear Layer.
    """
    def __init__(self, config):
        super().__init__(config)
        self.Attention = torch.nn.MultiheadAttention(config.hidden_size, config.num_attention_heads, config.attention_probs_dropout_prob)

    def forward(self, hidden_states, attention_mask=None, output_attentions=False):
        hidden_states = torch.transpose(hidden_states, 0, 1)
        output, attentions = self.Attention(hidden_states, hidden_states, hidden_states, key_padding_mask=attention_mask, need_weights=output_attentions)
        attention_output = torch.transpose(output, 0, 1)
        return attention_output, attentions


class IntermediateLayer(PreTrainedModel):
    """
    Contains an expanding layer, while also functioning as a residual block ending with a drop-norm layer.
    """
    def __init__(self, config):
        super().__init__(config)
        self.expand_dense = nn.Linear(config.hidden_size, config.intermediate_size)
        self.intermediate_act_fn = ACT2FN[config.hidden_act]

        self.dense_dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states):
        output = self.expand_dense(hidden_states)
        output = self.intermediate_act_fn(output)
        output = self.dense_dense(output)
        output = self.dropout(output)
        output = self.LayerNorm(output + hidden_states)
        return output
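A rough shape check for the encoder stack, assuming fairseq and transformers are installed. The encoder is reused from a loaded model rather than imported directly, since encoderblocks.py uses package-relative imports; "./ablang-test" is a placeholder path:

import torch
from transformers import AutoModel

model = AutoModel.from_pretrained("./ablang-test", trust_remote_code=True)  # placeholder path
encoder = model.EncoderBlocks

batch, seq_len = 2, 20
hidden_states = torch.randn(batch, seq_len, model.config.hidden_size)
padding_mask = torch.zeros(batch, seq_len, dtype=torch.int)  # 1 would mark a padded position

out = encoder(hidden_states, attention_mask=padding_mask, output_hidden_states=True)
print(out.last_hidden_states.shape)  # torch.Size([2, 20, 768])
print(len(out.all_hidden_states))    # 12, one entry per EncoderBlock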
extra_fns.py
ADDED
@@ -0,0 +1,26 @@
import torch
import math


def gelu_new(x):
    """
    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see
    the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
    """
    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))

def gelu_fast(x):
    return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * (1.0 + 0.044715 * x * x)))

def mish(x):
    return x * torch.tanh(torch.nn.functional.softplus(x))

ACT2FN = {
    "relu": torch.nn.functional.relu,
    "gelu": torch.nn.functional.gelu,
    "tanh": torch.tanh,
    "gelu_new": gelu_new,
    "gelu_fast": gelu_fast,
    "mish": mish,
    "sigmoid": torch.sigmoid,
}
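A small sketch of the activation lookup used by IntermediateLayer (config.json ships with hidden_act set to "gelu"), assuming extra_fns.py is importable on its own:

import torch
from extra_fns import ACT2FN

x = torch.linspace(-2.0, 2.0, 5)
print(ACT2FN["gelu"](x))      # exact GELU from torch
print(ACT2FN["gelu_new"](x))  # tanh approximation, close but not identical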
model.py
ADDED
@@ -0,0 +1,54 @@
import torch
from transformers import PreTrainedModel
from .extra_fns import ACT2FN
from .encoderblocks import EncoderBlocks
from .config import AbLangConfig

class AbEmbeddings(PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.pad_token_id = config.ptid
        self.AAEmbeddings = torch.nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.pad_token_id)
        self.PositionEmbeddings = torch.nn.Embedding(config.max_position_embeddings, config.hidden_size, padding_idx=0)  # here padding_idx is always 0
        self.LayerNorm = torch.nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.Dropout = torch.nn.Dropout(config.hidden_dropout_prob)

    def forward(self, src):
        inputs_embeds = self.AAEmbeddings(src)
        position_ids = self.create_position_ids_from_input_ids(src, self.pad_token_id)
        position_embeddings = self.PositionEmbeddings(position_ids)
        embeddings = inputs_embeds + position_embeddings
        return self.Dropout(self.LayerNorm(embeddings))

    def create_position_ids_from_input_ids(self, input_ids, padding_idx):
        """
        Replace non-padding symbols with their position numbers. Padding idx will get position 0, which will be ignored later on.
        """
        mask = input_ids.ne(padding_idx).int()
        return torch.cumsum(mask, dim=1).long() * mask


class AbLang(PreTrainedModel):
    config_class = AbLangConfig
    def __init__(self, config):
        super().__init__(config)
        self.AbEmbeddings = AbEmbeddings(config)
        self.EncoderBlocks = EncoderBlocks(config)

    def forward(self, inputs):
        src = self.AbEmbeddings(inputs['input_ids'])
        # invert the mask: the encoder's key_padding_mask marks padded positions with 1
        outputs = self.EncoderBlocks(src, attention_mask=1-inputs['attention_mask'], output_attentions=False)
        return apply_cls_embeddings(inputs, outputs)

def apply_cls_embeddings(inputs, outputs):
    mask = inputs['attention_mask'].float()
    d = {k: v for k, v in torch.nonzero(mask).cpu().numpy()}  # per sequence: position of its last attended token (the sep token)
    # make sep token invisible
    for i in d:
        mask[i, d[i]] = 0
    mask[:, 0] = 0.0  # make cls token invisible
    mask = mask.unsqueeze(-1).expand(outputs.last_hidden_states.size())
    sum_embeddings = torch.sum(outputs.last_hidden_states * mask, 1)
    sum_mask = torch.clamp(mask.sum(1), min=1e-9)
    outputs.last_hidden_states[:, 0, :] = sum_embeddings / sum_mask
    return outputs
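A minimal end-to-end sketch, assuming fairseq and transformers are installed, the repository sits in a local placeholder directory, and the token ids below are made-up stand-ins (no tokenizer ships with this upload; real inputs need AbLang's 24-token vocabulary, with 21 as the padding id):

import torch
from transformers import AutoModel

model = AutoModel.from_pretrained("./ablang-test", trust_remote_code=True)  # resolves to model.AbLang
model.eval()

# Dummy batch: ids are placeholders, only required to be < vocab_size (24); 21 pads the tail.
batch = {
    "input_ids": torch.tensor([[0, 4, 7, 9, 3, 2, 21, 21]]),
    "attention_mask": torch.tensor([[1, 1, 1, 1, 1, 1, 0, 0]]),
}

with torch.no_grad():
    out = model(batch)  # forward takes the whole dict as a single argument

print(out.last_hidden_states.shape)  # torch.Size([1, 8, 768])
# apply_cls_embeddings has overwritten position 0 with the mean of the attended embeddings,
# excluding the first and the last attended position of each sequence.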
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:dec3268da263e5c21085a7e736c81b521da2662822c5c86d4024c7e558a1b669
size 340855773