Upload 18 files
- .gitattributes +2 -0
- config.py +26 -0
- config_small.yaml +23 -0
- datasets/dev_clean.dat +0 -0
- datasets/dev_other.dat +0 -0
- datasets/train_clean.dat +3 -0
- datasets/train_other.dat +3 -0
- layers/decoder_layer.py +57 -0
- layers/encoder_layer.py +42 -0
- layers/projection_layer.py +16 -0
- model/encoder.py +38 -0
- modules/dot_product_attention.py +55 -0
- modules/multi_head_attention.py +79 -0
- modules/positional_encoding.py +34 -0
- modules/positionwise_feed_forward.py +28 -0
- modules/transformer_embedding.py +23 -0
- modules/wrapper.py +28 -0
- transformer.py +161 -0
- translate.py +69 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+datasets/train_clean.dat filter=lfs diff=lfs merge=lfs -text
+datasets/train_other.dat filter=lfs diff=lfs merge=lfs -text
config.py
ADDED
@@ -0,0 +1,26 @@
+from typing import Dict, Any
+
+import yaml
+
+from pathlib import Path
+
+from utils import get_full_file_path
+
+def load_config(file_name: str = 'config.yaml') -> Dict[str, Any]:
+    """
+    Loads a YAML config from a relative file path
+    """
+    file_path = get_full_file_path(file_name=file_name)
+    with open(file_path, 'r') as file:
+        config = yaml.safe_load(file)
+
+    return config
+
+# print(list(load_config().items()))
+
+def get_weights_file_path(config, epoch: str) -> str:
+    model_folder = config['model']['model_folder']
+    model_basename = config['model']['model_basename']
+    model_filename = f'{model_basename}{epoch}.pt'
+    return str(Path('.') / model_folder / model_filename)
+
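For reference, a minimal usage sketch of the two helpers above. It assumes `utils.get_full_file_path` (not part of this upload) resolves the file name relative to the project root; the epoch value is only an example.

from config import load_config, get_weights_file_path

config = load_config(file_name='config_small.yaml')
print(config['model']['d_model'])                 # e.g. 256 with config_small.yaml
print(get_weights_file_path(config, epoch='19'))  # e.g. weights/transformer_19.pt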
config_small.yaml
ADDED
@@ -0,0 +1,23 @@
+experiment_name: "runs/transformer"
+
+dataset:
+  src_lang: 'lo'
+  src_tokenizer: 'BPE'
+  src_max_seq_len: 400
+  tgt_lang: 'vi'
+  tgt_tokenizer: 'WordLevel'
+  tgt_max_seq_len: 350
+  train_dataset: 'train_clean.dat'
+  validate_dataset: 'dev_clean.dat'
+  tokenizer_file: "tokenizer_{0}.json"
+
+model: # 16629775 parameters
+  d_model: 256
+  num_heads: 8
+  d_ff: 1024
+  dropout_p: 0.3
+  num_encoder_layers: 4
+  num_decoder_layers: 2
+  model_folder: "weights"
+  model_basename: "transformer_"
+  preload: "small"
datasets/dev_clean.dat
ADDED
The diff for this file is too large to render.
See raw diff
datasets/dev_other.dat
ADDED
The diff for this file is too large to render.
See raw diff
datasets/train_clean.dat
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3d56aa43dba2e12da818bf80e7b42d0908637ee454ced035ae56f02022e13083
+size 18582873
datasets/train_other.dat
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:081d7f90d8098f81a8c37fd50a0d8e2d193a2e0e2cc5048b08ca4ef218100d18
+size 18008839
layers/decoder_layer.py
ADDED
@@ -0,0 +1,64 @@
+from typing import Tuple
+
+import torch.nn as nn
+from torch import Tensor
+
+from modules.multi_head_attention import MultiHeadAttention
+from modules.positionwise_feed_forward import PositionwiseFeedForwardNetwork
+
+
+class DecoderLayer(nn.Module):
+    """
+    A Decoder layer.
+
+    Args:
+        - d_model (int): dimension of model
+        - num_heads (int): number of heads
+        - d_ff (int): dimension of hidden feed forward layer
+        - dropout_p (float): probability of dropout
+    """
+    def __init__(
+        self,
+        d_model: int,
+        num_heads: int,
+        d_ff: int,
+        dropout_p: float,
+    ) -> None:
+        super(DecoderLayer, self).__init__()
+        self.self_attn_prenorm = nn.LayerNorm(d_model)
+        self.self_attn = MultiHeadAttention(d_model=d_model, num_heads=num_heads, dropout_p=dropout_p)
+        self.self_attn_dropout = nn.Dropout(p=dropout_p)
+
+        self.cross_attn_prenorm = nn.LayerNorm(d_model)
+        self.cross_attn = MultiHeadAttention(d_model=d_model, num_heads=num_heads, dropout_p=dropout_p)
+        self.cross_attn_dropout = nn.Dropout(p=dropout_p)
+
+        self.feed_forward_prenorm = nn.LayerNorm(d_model)
+        self.feed_forward = PositionwiseFeedForwardNetwork(d_model=d_model, d_ff=d_ff, dropout_p=dropout_p)
+
+    def forward(
+        self,
+        decoder_inputs: Tensor,
+        encoder_outputs: Tensor,
+        src_mask: Tensor,
+        tgt_mask: Tensor,
+    ) -> Tuple[Tensor, Tensor]:
+        # Masked self-attention: normalize -> sublayer -> dropout -> add residual
+        residual = decoder_inputs
+        outputs = self.self_attn_prenorm(decoder_inputs)
+        outputs, attn = self.self_attn(outputs, outputs, outputs, tgt_mask)
+        outputs = self.self_attn_dropout(outputs) + residual
+
+        # Cross-attention over the encoder outputs, using the cross_attn sublayers
+        residual = outputs
+        outputs = self.cross_attn_prenorm(outputs)
+        outputs, attn = self.cross_attn(outputs, encoder_outputs, encoder_outputs, src_mask)
+        outputs = self.cross_attn_dropout(outputs) + residual
+
+        # Position-wise feed forward: normalize -> sublayer -> add residual
+        residual = outputs
+        outputs = self.feed_forward_prenorm(outputs)
+        outputs = self.feed_forward(outputs)
+        outputs += residual
+
+        return outputs, attn
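A small sketch of how a DecoderLayer could be exercised with random tensors; the mask shapes (a causal target mask and an all-ones source mask) are assumptions consistent with how ScaledDotProductAttention broadcasts its mask.

import torch
from layers.decoder_layer import DecoderLayer

batch, src_len, tgt_len, d_model = 2, 7, 5, 256
layer = DecoderLayer(d_model=d_model, num_heads=8, d_ff=1024, dropout_p=0.3)

decoder_inputs = torch.randn(batch, tgt_len, d_model)
encoder_outputs = torch.randn(batch, src_len, d_model)

# Causal mask: position i may only attend to positions <= i (0 marks blocked positions).
tgt_mask = torch.tril(torch.ones(1, 1, tgt_len, tgt_len, dtype=torch.int64))
# No source padding in this toy example, so the cross-attention mask is all ones.
src_mask = torch.ones(1, 1, 1, src_len, dtype=torch.int64)

outputs, attn = layer(decoder_inputs, encoder_outputs, src_mask, tgt_mask)
print(outputs.shape)  # torch.Size([2, 5, 256])
print(attn.shape)     # torch.Size([2, 8, 5, 7]), cross-attention over source positions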
layers/encoder_layer.py
ADDED
@@ -0,0 +1,46 @@
+from typing import Optional, Tuple
+
+import torch.nn as nn
+from torch import Tensor
+
+from modules.multi_head_attention import MultiHeadAttention
+from modules.positionwise_feed_forward import PositionwiseFeedForwardNetwork
+
+class EncoderLayer(nn.Module):
+    """
+    An Encoder layer.
+
+    Args:
+        - d_model (int): dimension of model
+        - num_heads (int): number of heads
+        - d_ff (int): dimension of hidden feed forward layer
+        - dropout_p (float): probability of dropout
+    """
+    def __init__(
+        self,
+        d_model: int,
+        num_heads: int,
+        d_ff: int,
+        dropout_p: float,
+    ) -> None:
+        super(EncoderLayer, self).__init__()
+        self.self_attn_prenorm = nn.LayerNorm(d_model)
+        self.self_attn = MultiHeadAttention(d_model=d_model, num_heads=num_heads, dropout_p=dropout_p)
+        self.self_attn_dropout = nn.Dropout(p=dropout_p)
+
+        self.feed_forward_prenorm = nn.LayerNorm(d_model)
+        self.feed_forward = PositionwiseFeedForwardNetwork(d_model=d_model, d_ff=d_ff, dropout_p=dropout_p)
+
+    def forward(self, inputs: Tensor, src_mask: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]:
+        # Normalize -> sublayer -> dropout -> add residual
+        residual = inputs
+        inputs = self.self_attn_prenorm(inputs)
+        outputs, attn = self.self_attn(inputs, inputs, inputs, src_mask)
+        outputs = self.self_attn_dropout(outputs) + residual
+
+        residual = outputs
+        outputs = self.feed_forward_prenorm(outputs)
+        outputs = self.feed_forward(outputs)
+        outputs += residual
+
+        return outputs, attn
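Similarly, a quick shape check for one EncoderLayer call with a padding mask; the mask shape is again an assumption that matches the broadcasting in ScaledDotProductAttention.

import torch
from layers.encoder_layer import EncoderLayer

batch, seq_len, d_model = 2, 6, 256
layer = EncoderLayer(d_model=d_model, num_heads=8, d_ff=1024, dropout_p=0.3)

inputs = torch.randn(batch, seq_len, d_model)
# Pretend the last two positions of the second sequence are padding.
src_mask = torch.ones(batch, 1, 1, seq_len, dtype=torch.int64)
src_mask[1, :, :, -2:] = 0

outputs, attn = layer(inputs, src_mask)
print(outputs.shape)  # torch.Size([2, 6, 256])
print(attn.shape)     # torch.Size([2, 8, 6, 6])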
layers/projection_layer.py
ADDED
@@ -0,0 +1,16 @@
+from typing import Tuple
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+
+from modules.wrapper import Linear
+
+class ProjectionLayer(nn.Module):
+    def __init__(self, d_model: int, vocab_size: int) -> None:
+        super(ProjectionLayer, self).__init__()
+        self.linear = Linear(d_model, vocab_size)
+
+    def forward(self, x: Tensor) -> Tensor:
+        # (batch, seq_len, d_model) -> (batch, seq_len, vocab_size)
+        return torch.log_softmax(self.linear(x), dim=-1)
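Because ProjectionLayer already applies log_softmax, it pairs naturally with nn.NLLLoss during training rather than nn.CrossEntropyLoss, which would apply log_softmax a second time. A minimal sketch with made-up shapes:

import torch
import torch.nn as nn
from layers.projection_layer import ProjectionLayer

batch, seq_len, d_model, vocab_size = 2, 5, 256, 1000
proj = ProjectionLayer(d_model=d_model, vocab_size=vocab_size)

decoder_output = torch.randn(batch, seq_len, d_model)
log_probs = proj(decoder_output)  # (2, 5, 1000), log-probabilities
labels = torch.randint(0, vocab_size, (batch, seq_len))

loss = nn.NLLLoss()(log_probs.reshape(-1, vocab_size), labels.reshape(-1))
print(loss.item())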
model/encoder.py
ADDED
@@ -0,0 +1,43 @@
+from typing import Tuple
+
+import torch.nn as nn
+from torch import Tensor
+
+from layers.encoder_layer import EncoderLayer
+
+class Encoder(nn.Module):
+    """
+    A transformer Encoder stack (without embeddings or positional encodings).
+
+    Args:
+        - d_model (int): dimension of model
+        - num_heads (int): number of heads
+        - d_ff (int): dimension of hidden feed forward layer
+        - dropout_p (float): probability of dropout
+        - num_layers (int): number of encoder layers
+    """
+    def __init__(
+        self,
+        d_model: int,
+        num_heads: int,
+        d_ff: int,
+        dropout_p: float,
+        num_layers: int,
+    ) -> None:
+        super(Encoder, self).__init__()
+        self.layers = nn.ModuleList(
+            [
+                EncoderLayer(
+                    d_model=d_model,
+                    num_heads=num_heads,
+                    d_ff=d_ff,
+                    dropout_p=dropout_p,
+                )
+                for _ in range(num_layers)
+            ]
+        )
+
+    def forward(self, x: Tensor, src_mask: Tensor) -> Tensor:
+        for layer in self.layers:
+            x, attn = layer(x, src_mask)
+        return x
modules/dot_product_attention.py
ADDED
@@ -0,0 +1,55 @@
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+
+class ScaledDotProductAttention(nn.Module):
+    """
+    Scaled Dot-Product Attention (section 3.2.1)
+
+    Args:
+        - dim (int): dimension of d_k or d_head
+        - dropout_p (float): probability of dropout
+
+    Input:
+        - query (batch, num_heads, seq_len, d_head)
+        - key (batch, num_heads, seq_len, d_head)
+        - value (batch, num_heads, seq_len, d_head)
+        - mask (optional): positions where mask == 0 are masked out; broadcastable to (batch, num_heads, seq_len, seq_len)
+
+    Output:
+        - context (batch, num_heads, seq_len, d_head): Context matrix.
+        - attn (batch, num_heads, seq_len, seq_len): Attention matrix for visualization.
+    """
+    def __init__(self, dim: int, dropout_p: float) -> None:
+        super(ScaledDotProductAttention, self).__init__()
+        self.sqrt_dim = np.sqrt(dim)
+        self.dropout = nn.Dropout(p=dropout_p)
+
+    def forward(
+        self,
+        query: Tensor,
+        key: Tensor,
+        value: Tensor,
+        mask: Optional[Tensor] = None,
+    ) -> Tuple[Tensor, Tensor]:
+
+        # (batch, num_heads, seq_len, d_head) @ (batch, num_heads, d_head, seq_len)
+        # ==> score: (batch, num_heads, seq_len, seq_len)
+        score = torch.matmul(query, key.transpose(-2, -1)) / self.sqrt_dim
+
+        if mask is not None:
+            score.masked_fill_(mask == 0, -1e4)
+
+        attn = F.softmax(score, dim=-1)
+        # (batch, num_heads, seq_len, seq_len)
+        attn = self.dropout(attn)
+
+        context = torch.matmul(attn, value)
+        # (batch, num_heads, seq_len, d_head)
+
+        return context, attn
+
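A quick shape check of ScaledDotProductAttention on random tensors; the head count and lengths are arbitrary, and dropout is disabled so attention rows sum to one.

import torch
from modules.dot_product_attention import ScaledDotProductAttention

batch, num_heads, seq_len, d_head = 2, 8, 10, 32
attention = ScaledDotProductAttention(dim=d_head, dropout_p=0.0)

q = torch.randn(batch, num_heads, seq_len, d_head)
k = torch.randn(batch, num_heads, seq_len, d_head)
v = torch.randn(batch, num_heads, seq_len, d_head)

context, attn = attention(q, k, v)
print(context.shape)  # torch.Size([2, 8, 10, 32])
print(attn.shape)     # torch.Size([2, 8, 10, 10])
print(attn.sum(-1).allclose(torch.ones(batch, num_heads, seq_len)))  # True: rows sum to 1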
modules/multi_head_attention.py
ADDED
@@ -0,0 +1,79 @@
+from typing import Optional, Tuple
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+
+from modules.wrapper import Linear
+from modules.dot_product_attention import ScaledDotProductAttention
+
+
+class MultiHeadAttention(nn.Module):
+    """
+    Multi-Head Attention (section 3.2.2)
+
+    Args:
+        - d_model (int): dimension of model
+        - num_heads (int): number of heads
+        - dropout_p (float): probability of dropout
+
+    Inputs:
+        - query (batch, seq_len, d_model)
+        - key (batch, seq_len, d_model)
+        - value (batch, seq_len, d_model)
+        - mask: optional mask, broadcastable to (batch, num_heads, seq_len, seq_len)
+
+    Output: (Tensor, Tensor):
+        - context (batch, seq_len, d_model)
+        - attn (batch, num_heads, seq_len, seq_len): Attention matrix for visualization.
+    """
+    def __init__(
+        self,
+        d_model: int,
+        num_heads: int,
+        dropout_p: float,
+    ) -> None:
+        super(MultiHeadAttention, self).__init__()
+
+        assert d_model % num_heads == 0, "d_model % num_heads should be 0"
+
+        self.d_model = d_model
+        self.num_heads = num_heads
+        self.d_head = d_model // num_heads
+
+        self.W_query = Linear(d_model, d_model)
+        self.W_key = Linear(d_model, d_model)
+        self.W_value = Linear(d_model, d_model)
+        # self.W_output = Linear(d_model, d_model)
+
+        self.scaled_dot_attn = ScaledDotProductAttention(self.d_head, dropout_p)  # scale scores by sqrt(d_head)
+
+
+    def forward(
+        self,
+        query: Tensor,
+        key: Tensor,
+        value: Tensor,
+        mask: Optional[Tensor] = None,
+    ) -> Tuple[Tensor, Tensor]:
+        batch_size = query.shape[0]
+
+        # original: (batch, seq_len, d_model)
+        # --forward--> (batch, seq_len, d_model)
+        # --view--> (batch, seq_len, num_heads, d_head)
+        # --transpose--> (batch, num_heads, seq_len, d_head)
+        query = self.W_query(query).view(batch_size, -1, self.num_heads, self.d_head).transpose(1, 2)
+        key = self.W_key(key).view(batch_size, -1, self.num_heads, self.d_head).transpose(1, 2)
+        value = self.W_value(value).view(batch_size, -1, self.num_heads, self.d_head).transpose(1, 2)
+
+        context, attn = self.scaled_dot_attn(query, key, value, mask)
+
+        # (batch, num_heads, seq_len, d_head)
+        # --transpose--> (batch, seq_len, num_heads, d_head)
+        # --view--> (batch, seq_len, d_model)
+
+        context = context.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
+        # context = self.W_output(context)
+
+        return context, attn
+
modules/positional_encoding.py
ADDED
@@ -0,0 +1,34 @@
+import math
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+
+
+class PositionalEncoding(nn.Module):
+    r"""
+    Positional Encoding in "Attention Is All You Need" (section 3.5).
+
+    "Attention Is All You Need" uses sine and cosine functions of different frequencies:
+        PE_(pos, 2i) = sin(pos / power(10000, 2i / d_model))
+        PE_(pos, 2i+1) = cos(pos / power(10000, 2i / d_model))
+    The only change is that the divisor is computed in log space, via exp(-(2i / d_model) * log(10000)).
+
+    Uses OpenSpeech's PositionalEncoding, as I don't see the point in coding this from scratch.
+    """
+    def __init__(self, d_model: int, dropout_p: float, max_length: int = 5000) -> None:
+        super(PositionalEncoding, self).__init__()
+
+        self.dropout = nn.Dropout(p=dropout_p)
+
+        pe = torch.zeros(max_length, d_model, requires_grad=False)
+        position = torch.arange(0, max_length, dtype=torch.float).unsqueeze(1)
+        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
+        pe[:, 0::2] = torch.sin(position * div_term)
+        pe[:, 1::2] = torch.cos(position * div_term)
+        pe = pe.unsqueeze(0)
+        self.register_buffer("pe", pe)
+
+    def forward(self, x: Tensor) -> Tensor:
+        x = x + (self.pe[:, :x.shape[1], :]).requires_grad_(False)
+        return self.dropout(x)
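Since pe is registered as a buffer, it is stored in the state dict and follows .to(device) without being a trainable parameter, and forward only slices it to the input length. A small check:

import torch
from modules.positional_encoding import PositionalEncoding

pos_enc = PositionalEncoding(d_model=256, dropout_p=0.0, max_length=400)
x = torch.zeros(2, 50, 256)             # (batch, seq_len, d_model)

out = pos_enc(x)
print(out.shape)                        # torch.Size([2, 50, 256])
print(len(list(pos_enc.parameters())))  # 0: the encoding is a buffer, not a parameter
print(pos_enc.pe.shape)                 # torch.Size([1, 400, 256])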
modules/positionwise_feed_forward.py
ADDED
@@ -0,0 +1,28 @@
+import torch.nn as nn
+from torch import Tensor
+
+from modules.wrapper import Linear
+
+class PositionwiseFeedForwardNetwork(nn.Module):
+    """
+    Position-wise Feed-Forward Network (section 3.3)
+
+    Args:
+        - d_model (int): dimension of input and output
+        - d_ff (int): dimension of inner-layer
+        - dropout_p (float): dropout probability
+    """
+
+    def __init__(self, d_model: int, d_ff: int, dropout_p: float) -> None:
+        super(PositionwiseFeedForwardNetwork, self).__init__()
+
+        self.feed_forward = nn.Sequential(
+            Linear(d_model, d_ff),
+            nn.Dropout(dropout_p),
+            nn.ReLU(),
+            Linear(d_ff, d_model),
+            nn.Dropout(dropout_p)
+        )
+
+    def forward(self, x: Tensor) -> Tensor:
+        return self.feed_forward(x)
modules/transformer_embedding.py
ADDED
@@ -0,0 +1,23 @@
+import torch.nn as nn
+from torch import Tensor
+import numpy as np
+
+
+class TransformerEmbedding(nn.Module):
+    """
+    Input Embeddings (section 3.4)
+
+    Embeds tokens into vectors of size d_model, scaled by sqrt(d_model).
+
+    Args:
+        - d_model (int): dimension of model
+        - num_embeddings (int): size of the dictionary
+    """
+    def __init__(self, d_model: int, num_embeddings: int) -> None:
+        super(TransformerEmbedding, self).__init__()
+        self.sqrt_d_model = np.sqrt(d_model)
+        self.embedding = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=d_model)
+
+    def forward(self, x: Tensor) -> Tensor:
+        return self.embedding(x) * self.sqrt_d_model
+
modules/wrapper.py
ADDED
@@ -0,0 +1,28 @@
+import torch.nn as nn
+from torch import Tensor
+
+
+class Linear(nn.Module):
+    """
+    A wrapper class for nn.Linear.
+    Initializes the weights with Xavier uniform and the bias with zeros.
+    """
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        bias: bool = True,
+        device=None,
+        dtype=None,
+    ):
+        super(Linear, self).__init__()
+        self.linear = nn.Linear(in_features, out_features, bias, device, dtype)
+        nn.init.xavier_uniform_(self.linear.weight)
+        if bias:
+            nn.init.zeros_(self.linear.bias)
+
+    def forward(self, x: Tensor) -> Tensor:
+        """
+        Forward pass through the linear layer.
+        """
+        return self.linear(x)
transformer.py
ADDED
@@ -0,0 +1,161 @@
+from typing import Tuple
+
+import torch.nn as nn
+from torch import Tensor
+
+from modules.transformer_embedding import TransformerEmbedding
+from modules.positional_encoding import PositionalEncoding
+
+from model.encoder import Encoder
+from model.decoder import Decoder
+from layers.projection_layer import ProjectionLayer
+
+class Transformer(nn.Module):
+    """
+    Transformer.
+
+    Args:
+        - src_vocab_size (int): source vocabulary size
+        - tgt_vocab_size (int): target vocabulary size
+        - src_max_seq_len (int): source max sequence length
+        - tgt_max_seq_len (int): target max sequence length
+        - d_model (int): dimension of model
+        - num_heads (int): number of heads
+        - d_ff (int): dimension of hidden feed forward layer
+        - dropout_p (float): probability of dropout
+        - num_encoder_layers (int): number of encoder layers
+        - num_decoder_layers (int): number of decoder layers
+    """
+    def __init__(
+        self,
+        src_vocab_size: int,
+        tgt_vocab_size: int,
+        src_max_seq_len: int,
+        tgt_max_seq_len: int,
+        d_model: int = 512,
+        num_heads: int = 8,
+        d_ff: int = 2048,
+        dropout_p: float = 0.1,
+        num_encoder_layers: int = 6,
+        num_decoder_layers: int = 6,
+    ) -> None:
+        super(Transformer, self).__init__()
+
+        # Embedding layers
+        self.src_embedding = TransformerEmbedding(
+            d_model=d_model,
+            num_embeddings=src_vocab_size
+        )
+        self.tgt_embedding = TransformerEmbedding(
+            d_model=d_model,
+            num_embeddings=tgt_vocab_size
+        )
+
+        # Positional Encoding layers
+        self.src_positional_encoding = PositionalEncoding(
+            d_model=d_model,
+            dropout_p=dropout_p,
+            max_length=src_max_seq_len
+        )
+        self.tgt_positional_encoding = PositionalEncoding(
+            d_model=d_model,
+            dropout_p=dropout_p,
+            max_length=tgt_max_seq_len
+        )
+
+        # Encoder
+        self.encoder = Encoder(
+            d_model=d_model,
+            num_heads=num_heads,
+            d_ff=d_ff,
+            dropout_p=dropout_p,
+            num_layers=num_encoder_layers
+        )
+        # Decoder
+        self.decoder = Decoder(
+            d_model=d_model,
+            num_heads=num_heads,
+            d_ff=d_ff,
+            dropout_p=dropout_p,
+            num_layers=num_decoder_layers
+        )
+        # projecting decoder's output to the target language.
+        self.projection_layer = ProjectionLayer(
+            d_model=d_model,
+            vocab_size=tgt_vocab_size
+        )
+
+    def encode(
+        self,
+        src: Tensor,
+        src_mask: Tensor
+    ) -> Tensor:
+        """
+        Get encoder outputs.
+        """
+        src = self.src_embedding(src)
+        src = self.src_positional_encoding(src)
+        return self.encoder(src, src_mask)
+
+    def decode(
+        self,
+        encoder_output: Tensor,
+        src_mask: Tensor,
+        tgt: Tensor,
+        tgt_mask: Tensor
+    ) -> Tuple[Tensor, Tensor]:
+        """
+        Get decoder outputs for a set of target inputs.
+        """
+        tgt = self.tgt_embedding(tgt)
+        tgt = self.tgt_positional_encoding(tgt)
+        return self.decoder(
+            x=tgt,
+            encoder_output=encoder_output,
+            src_mask=src_mask,
+            tgt_mask=tgt_mask
+        )
+
+    def project(self, decoder_output: Tensor) -> Tensor:
+        """
+        Project decoder outputs to target vocabulary.
+        """
+        return self.projection_layer(decoder_output)
+
+    def forward(
+        self,
+        src: Tensor,
+        src_mask: Tensor,
+        tgt: Tensor,
+        tgt_mask: Tensor
+    ) -> Tuple[Tensor, Tensor]:
+        # src_mask = self.make_src_mask(src)
+        # tgt_mask = self.make_tgt_mask(tgt)
+
+        encoder_output = self.encode(src, src_mask)
+        decoder_output, attn = self.decode(
+            encoder_output, src_mask, tgt, tgt_mask
+        )
+        output = self.project(decoder_output)
+        return output, attn
+
+    def count_parameters(self):
+        return sum(p.numel() for p in self.parameters() if p.requires_grad)
+
+
+def get_model(config, src_vocab_size: int, tgt_vocab_size: int) -> Transformer:
+    """
+    returns a `Transformer` model for a given config.
+    """
+    return Transformer(
+        src_vocab_size=src_vocab_size,
+        tgt_vocab_size=tgt_vocab_size,
+        src_max_seq_len=config['dataset']['src_max_seq_len'],
+        tgt_max_seq_len=config['dataset']['tgt_max_seq_len'],
+        d_model=config['model']['d_model'],
+        num_heads=config['model']['num_heads'],
+        d_ff=config['model']['d_ff'],
+        dropout_p=config['model']['dropout_p'],
+        num_encoder_layers=config['model']['num_encoder_layers'],
+        num_decoder_layers=config['model']['num_decoder_layers'],
+    )
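A sketch of wiring get_model to config_small.yaml; it assumes model/decoder.py (imported above but not among these 18 files) is available, and the vocabulary sizes below are placeholders, since the real ones come from the trained tokenizers.

from config import load_config
from transformer import get_model

config = load_config(file_name='config_small.yaml')

# Placeholder vocabulary sizes; in training they come from the fitted tokenizers.
model = get_model(config, src_vocab_size=8000, tgt_vocab_size=8000)
# The count depends on the vocabulary sizes; config_small.yaml reports 16629775 with its tokenizers.
print(model.count_parameters())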
translate.py
ADDED
@@ -0,0 +1,69 @@
+from typing import Tuple
+
+import torch
+from torch import Tensor
+
+from tokenizers import Tokenizer
+
+from transformer import Transformer
+from decode_method import greedy_decode
+
+def translate(
+    model: Transformer,
+    src_tokenizer: Tokenizer,
+    tgt_tokenizer: Tokenizer,
+    text: str,
+    decode_method: str = 'greedy',
+    device=torch.device('cpu')
+) -> Tuple[str, Tensor]:
+    """
+    Translation function.
+
+    Output:
+        - translation (str): the translated string.
+        - attn (Tensor): The decoder's attention (for visualization).
+    """
+    sos_token = torch.tensor([src_tokenizer.token_to_id('<sos>')], dtype=torch.int64)
+    eos_token = torch.tensor([src_tokenizer.token_to_id('<eos>')], dtype=torch.int64)
+    pad_token = torch.tensor([src_tokenizer.token_to_id('<pad>')], dtype=torch.int64)
+
+    encoder_input_tokens = src_tokenizer.encode(text).ids
+    # <sos> + source_text + <eos> = encoder_input
+    encoder_input = torch.cat(
+        [
+            sos_token,
+            torch.tensor(encoder_input_tokens, dtype=torch.int64),
+            eos_token,
+        ]
+    )
+    encoder_mask = (encoder_input != pad_token).unsqueeze(0).unsqueeze(0).unsqueeze(0).int()  # (1, 1, 1, seq_len)
+
+    encoder_input = encoder_input.unsqueeze(0)
+    # encoder_mask = torch.tensor(encoder_mask)
+
+    assert encoder_input.size(0) == 1
+
+    if decode_method == 'greedy':
+        model_out, attn = greedy_decode(
+            model, encoder_input, encoder_mask, src_tokenizer, tgt_tokenizer, 400, device,
+            give_attn=True,
+        )
+    elif decode_method == 'beam-search':
+        raise NotImplementedError
+    else:
+        raise ValueError("Unsupported decode method")
+
+    model_out_text = tgt_tokenizer.decode(model_out.detach().cpu().numpy())
+    return model_out_text, attn
+
+
+from config import load_config
+from load_and_save_model import load_model_tokenizer
+if __name__ == '__main__':
+    config = load_config(file_name='config_small.yaml')
+    model, src_tokenizer, tgt_tokenizer = load_model_tokenizer(config)
+    text = "ສະບາຍດີ"  # Hello.
+    translation, attn = translate(
+        model, src_tokenizer, tgt_tokenizer, text
+    )
+    print(translation)
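decode_method.greedy_decode is imported above but not included in this upload. Purely as an illustration, a greedy decoder compatible with the call in translate() might look roughly like the sketch below; every detail (signature, mask construction, stopping rule) is an assumption, not the project's actual implementation.

import torch

def greedy_decode(model, encoder_input, encoder_mask, src_tokenizer, tgt_tokenizer,
                  max_len, device, give_attn=False):
    """Hypothetical greedy decoder matching the call in translate() above."""
    sos_id = tgt_tokenizer.token_to_id('<sos>')
    eos_id = tgt_tokenizer.token_to_id('<eos>')

    encoder_output = model.encode(encoder_input.to(device), encoder_mask.to(device))

    decoder_input = torch.tensor([[sos_id]], dtype=torch.int64, device=device)
    attn = None
    while decoder_input.size(1) < max_len:
        tgt_len = decoder_input.size(1)
        # Causal mask so each position only attends to earlier target positions.
        tgt_mask = torch.tril(torch.ones(1, 1, tgt_len, tgt_len, dtype=torch.int64, device=device))

        decoder_output, attn = model.decode(encoder_output, encoder_mask.to(device), decoder_input, tgt_mask)
        next_token = model.project(decoder_output[:, -1]).argmax(dim=-1)

        decoder_input = torch.cat([decoder_input, next_token.unsqueeze(0)], dim=1)
        if next_token.item() == eos_id:
            break

    tokens = decoder_input.squeeze(0)
    return (tokens, attn) if give_attn else tokens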