moiduy04 committed
Commit bc1ada8
Parent: ab5dc94

Upload 18 files

.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ datasets/train_clean.dat filter=lfs diff=lfs merge=lfs -text
+ datasets/train_other.dat filter=lfs diff=lfs merge=lfs -text
config.py ADDED
@@ -0,0 +1,26 @@
+ from typing import Dict, Any
+ 
+ import yaml
+ 
+ from pathlib import Path
+ 
+ from utils import get_full_file_path
+ 
+ def load_config(file_name: str = 'config.yaml') -> Dict[str, Any]:
+     """
+     Loads a YAML config from a relative file path.
+     """
+     file_path = get_full_file_path(file_name=file_name)
+     with open(file_path, 'r') as file:
+         config = yaml.safe_load(file)
+ 
+     return config
+ 
+ # print(list(load_config().items()))
+ 
+ def get_weights_file_path(config, epoch: str) -> str:
+     model_folder = config['model']['model_folder']
+     model_basename = config['model']['model_basename']
+     model_filename = f'{model_basename}{epoch}.pt'
+     return str(Path('.') / model_folder / model_filename)
+ 
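A minimal usage sketch of these two helpers (assuming a config_small.yaml on disk that utils.get_full_file_path can resolve; the epoch label is illustrative):

# Usage sketch for config.py
from config import load_config, get_weights_file_path

config = load_config(file_name='config_small.yaml')
print(config['model']['d_model'])                 # 256 for the small config
print(get_weights_file_path(config, epoch='19'))  # weights/transformer_19.pt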
config_small.yaml ADDED
@@ -0,0 +1,23 @@
+ experiment_name: "runs/transformer"
+ 
+ dataset:
+   src_lang: 'lo'
+   src_tokenizer: 'BPE'
+   src_max_seq_len: 400
+   tgt_lang: 'vi'
+   tgt_tokenizer: 'WordLevel'
+   tgt_max_seq_len: 350
+   train_dataset: 'train_clean.dat'
+   validate_dataset: 'dev_clean.dat'
+   tokenizer_file: "tokenizer_{0}.json"
+ 
+ model: # 16629775 parameters
+   d_model: 256
+   num_heads: 8
+   d_ff: 1024
+   dropout_p: 0.3
+   num_encoder_layers: 4
+   num_decoder_layers: 2
+   model_folder: "weights"
+   model_basename: "transformer_"
+   preload: "small"
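As a sketch of how this config drives model construction (assuming the other modules in this upload are importable; the vocab sizes below are placeholders, in the repo they come from the trained tokenizers):

# Sketch: build the "small" transformer from this config via get_model.
from config import load_config
from transformer import get_model

config = load_config(file_name='config_small.yaml')
model = get_model(config, src_vocab_size=8000, tgt_vocab_size=8000)
print(f'{model.count_parameters():,} parameters')  # ~16.6M with the real vocab sizes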
datasets/dev_clean.dat ADDED
The diff for this file is too large to render. See raw diff
 
datasets/dev_other.dat ADDED
The diff for this file is too large to render. See raw diff
 
datasets/train_clean.dat ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3d56aa43dba2e12da818bf80e7b42d0908637ee454ced035ae56f02022e13083
+ size 18582873
datasets/train_other.dat ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:081d7f90d8098f81a8c37fd50a0d8e2d193a2e0e2cc5048b08ca4ef218100d18
+ size 18008839
layers/decoder_layer.py ADDED
@@ -0,0 +1,57 @@
+ from typing import Tuple
+ 
+ import torch.nn as nn
+ from torch import Tensor
+ 
+ from modules.multi_head_attention import MultiHeadAttention
+ from modules.positionwise_feed_forward import PositionwiseFeedForwardNetwork
+ 
+ 
+ class DecoderLayer(nn.Module):
+     """
+     A Decoder layer (pre-norm): masked self-attention, cross-attention over the
+     encoder outputs, and a position-wise feed-forward network, each followed by
+     dropout and a residual connection.
+ 
+     Args:
+         - d_model (int): dimension of model
+         - num_heads (int): number of heads
+         - d_ff (int): dimension of the inner feed-forward layer
+         - dropout_p (float): probability of dropout
+     """
+     def __init__(
+         self,
+         d_model: int,
+         num_heads: int,
+         d_ff: int,
+         dropout_p: float,
+     ) -> None:
+         super(DecoderLayer, self).__init__()
+         self.self_attn_prenorm = nn.LayerNorm(d_model)
+         self.self_attn = MultiHeadAttention(d_model=d_model, num_heads=num_heads, dropout_p=dropout_p)
+         self.self_attn_dropout = nn.Dropout(p=dropout_p)
+ 
+         self.cross_attn_prenorm = nn.LayerNorm(d_model)
+         self.cross_attn = MultiHeadAttention(d_model=d_model, num_heads=num_heads, dropout_p=dropout_p)
+         self.cross_attn_dropout = nn.Dropout(p=dropout_p)
+ 
+         self.feed_forward_prenorm = nn.LayerNorm(d_model)
+         self.feed_forward = PositionwiseFeedForwardNetwork(d_model=d_model, d_ff=d_ff, dropout_p=dropout_p)
+ 
+     def forward(
+         self,
+         decoder_inputs: Tensor,
+         encoder_outputs: Tensor,
+         src_mask: Tensor,
+         tgt_mask: Tensor,
+     ) -> Tuple[Tensor, Tensor]:
+         # Normalize -> sublayer -> dropout -> add residual
+         residual = decoder_inputs
+         outputs = self.self_attn_prenorm(decoder_inputs)
+         outputs, attn = self.self_attn(outputs, outputs, outputs, tgt_mask)
+         outputs = self.self_attn_dropout(outputs) + residual
+ 
+         # Cross-attention: queries from the decoder, keys/values from the encoder
+         residual = outputs
+         outputs = self.cross_attn_prenorm(outputs)
+         outputs, attn = self.cross_attn(outputs, encoder_outputs, encoder_outputs, src_mask)
+         outputs = self.cross_attn_dropout(outputs) + residual
+ 
+         residual = outputs
+         outputs = self.feed_forward_prenorm(outputs)
+         outputs = self.feed_forward(outputs)
+         outputs += residual
+ 
+         return outputs, attn
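A quick shape check for this layer (a sketch; assumes the modules from this upload are importable, dimensions follow config_small.yaml):

# Minimal shape check for DecoderLayer.
import torch
from layers.decoder_layer import DecoderLayer

layer = DecoderLayer(d_model=256, num_heads=8, d_ff=1024, dropout_p=0.3)
dec_in = torch.randn(2, 10, 256)    # (batch, tgt_seq_len, d_model)
enc_out = torch.randn(2, 15, 256)   # (batch, src_seq_len, d_model)
out, attn = layer(dec_in, enc_out, src_mask=None, tgt_mask=None)
print(out.shape, attn.shape)        # torch.Size([2, 10, 256]) torch.Size([2, 8, 10, 15])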
layers/encoder_layer.py ADDED
@@ -0,0 +1,42 @@
+ from typing import Tuple
+ 
+ import torch.nn as nn
+ from torch import Tensor
+ 
+ from modules.multi_head_attention import MultiHeadAttention
+ from modules.positionwise_feed_forward import PositionwiseFeedForwardNetwork
+ 
+ class EncoderLayer(nn.Module):
+     """
+     An Encoder layer (pre-norm): self-attention followed by a position-wise
+     feed-forward network, each with a residual connection.
+ 
+     Args:
+         - d_model (int): dimension of model
+         - num_heads (int): number of heads
+         - d_ff (int): dimension of the inner feed-forward layer
+         - dropout_p (float): probability of dropout
+     """
+     def __init__(
+         self,
+         d_model: int,
+         num_heads: int,
+         d_ff: int,
+         dropout_p: float,
+     ) -> None:
+         super(EncoderLayer, self).__init__()
+         self.self_attn_prenorm = nn.LayerNorm(d_model)
+         self.self_attn = MultiHeadAttention(d_model=d_model, num_heads=num_heads, dropout_p=dropout_p)
+         self.self_attn_dropout = nn.Dropout(p=dropout_p)
+ 
+         self.feed_forward_prenorm = nn.LayerNorm(d_model)
+         self.feed_forward = PositionwiseFeedForwardNetwork(d_model=d_model, d_ff=d_ff, dropout_p=dropout_p)
+ 
+     def forward(self, inputs: Tensor, src_mask: Tensor = None) -> Tuple[Tensor, Tensor]:
+         # Normalize -> sublayer -> dropout -> add residual
+         residual = inputs
+         inputs = self.self_attn_prenorm(inputs)
+         outputs, attn = self.self_attn(inputs, inputs, inputs, src_mask)
+         outputs = self.self_attn_dropout(outputs) + residual
+ 
+         residual = outputs
+         outputs = self.feed_forward_prenorm(outputs)
+         outputs = self.feed_forward(outputs)
+         outputs += residual
+ 
+         return outputs, attn
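The analogous shape check for the encoder side (same assumptions as above):

# Minimal shape check for EncoderLayer.
import torch
from layers.encoder_layer import EncoderLayer

layer = EncoderLayer(d_model=256, num_heads=8, d_ff=1024, dropout_p=0.3)
x = torch.randn(2, 15, 256)         # (batch, src_seq_len, d_model)
out, attn = layer(x)                # src_mask defaults to None
print(out.shape, attn.shape)        # torch.Size([2, 15, 256]) torch.Size([2, 8, 15, 15])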
layers/projection_layer.py ADDED
@@ -0,0 +1,16 @@
+ import torch
+ import torch.nn as nn
+ from torch import Tensor
+ 
+ from modules.wrapper import Linear
+ 
+ class ProjectionLayer(nn.Module):
+     """
+     Projects decoder outputs to log-probabilities over the target vocabulary.
+     """
+     def __init__(self, d_model: int, vocab_size: int) -> None:
+         super(ProjectionLayer, self).__init__()
+         self.linear = Linear(d_model, vocab_size)
+ 
+     def forward(self, x: Tensor) -> Tensor:
+         # (batch, seq_len, d_model) -> (batch, seq_len, vocab_size)
+         return torch.log_softmax(self.linear(x), dim=-1)
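A small sketch showing that the output is a log-probability distribution per position (vocab size is a placeholder):

# Sketch: project a dummy decoder output to log-probabilities over the vocab.
import torch
from layers.projection_layer import ProjectionLayer

proj = ProjectionLayer(d_model=256, vocab_size=8000)
logp = proj(torch.randn(2, 10, 256))
print(logp.shape)                    # torch.Size([2, 10, 8000])
print(logp.exp().sum(dim=-1)[0, 0])  # ~1.0, since outputs are log-softmax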
model/encoder.py ADDED
@@ -0,0 +1,38 @@
+ import torch.nn as nn
+ from torch import Tensor
+ 
+ from layers.encoder_layer import EncoderLayer
+ 
+ class Encoder(nn.Module):
+     """
+     A transformer Encoder stack (no token embeddings or positional encodings).
+ 
+     Args:
+         - d_model (int): dimension of model
+         - num_heads (int): number of heads
+         - d_ff (int): dimension of the inner feed-forward layer
+         - dropout_p (float): probability of dropout
+         - num_layers (int): number of stacked encoder layers
+     """
+     def __init__(
+         self,
+         d_model: int,
+         num_heads: int,
+         d_ff: int,
+         dropout_p: float,
+         num_layers: int,
+     ) -> None:
+         super(Encoder, self).__init__()
+         self.layers = nn.ModuleList(
+             [
+                 EncoderLayer(
+                     d_model=d_model,
+                     num_heads=num_heads,
+                     d_ff=d_ff,
+                     dropout_p=dropout_p,
+                 )
+                 for _ in range(num_layers)
+             ]
+         )
+ 
+     def forward(self, x: Tensor, src_mask: Tensor) -> Tensor:
+         for layer in self.layers:
+             x, attn = layer(x, src_mask)
+         return x
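A sketch of the stacked encoder; note it returns only the final hidden states and discards the per-layer attention maps:

# Sketch: a 4-layer Encoder keeps the (batch, seq_len, d_model) shape.
import torch
from model.encoder import Encoder

encoder = Encoder(d_model=256, num_heads=8, d_ff=1024, dropout_p=0.3, num_layers=4)
x = torch.randn(2, 15, 256)
print(encoder(x, src_mask=None).shape)   # torch.Size([2, 15, 256])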
modules/dot_product_attention.py ADDED
@@ -0,0 +1,55 @@
+ from typing import Optional, Tuple
+ 
+ import numpy as np
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from torch import Tensor
+ 
+ class ScaledDotProductAttention(nn.Module):
+     """
+     Scaled Dot-Product Attention (section 3.2.1)
+ 
+     Args:
+         - dim (int): dimension of d_k or d_head
+         - dropout_p (float): probability of dropout
+ 
+     Inputs:
+         - query (batch, num_heads, seq_len, d_head)
+         - key (batch, num_heads, seq_len, d_head)
+         - value (batch, num_heads, seq_len, d_head)
+         - mask (optional): broadcastable to (batch, num_heads, seq_len, seq_len);
+           positions where mask == 0 are suppressed before the softmax
+ 
+     Outputs:
+         - context (batch, num_heads, seq_len, d_head): Context matrix.
+         - attn (batch, num_heads, seq_len, seq_len): Attention matrix for visualization.
+     """
+     def __init__(self, dim: int, dropout_p: float) -> None:
+         super(ScaledDotProductAttention, self).__init__()
+         self.sqrt_dim = np.sqrt(dim)
+         self.dropout = nn.Dropout(p=dropout_p)
+ 
+     def forward(
+         self,
+         query: Tensor,
+         key: Tensor,
+         value: Tensor,
+         mask: Optional[Tensor] = None,
+     ) -> Tuple[Tensor, Tensor]:
+         # (batch, num_heads, seq_len, d_head) @ (batch, num_heads, d_head, seq_len)
+         # ==> score: (batch, num_heads, seq_len, seq_len)
+         score = torch.matmul(query, key.transpose(-2, -1)) / self.sqrt_dim
+ 
+         if mask is not None:
+             score.masked_fill_(mask == 0, -1e4)
+ 
+         # attn: (batch, num_heads, seq_len, seq_len)
+         attn = F.softmax(score, dim=-1)
+         attn = self.dropout(attn)
+ 
+         # context: (batch, num_heads, seq_len, d_head)
+         context = torch.matmul(attn, value)
+ 
+         return context, attn
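A toy-tensor sketch of the sublayer (dropout disabled so attention rows visibly sum to 1):

# Sketch: scaled dot-product attention on toy tensors (section 3.2.1).
import torch
from modules.dot_product_attention import ScaledDotProductAttention

attn_fn = ScaledDotProductAttention(dim=32, dropout_p=0.0)
q = torch.randn(2, 8, 10, 32)        # (batch, num_heads, seq_len, d_head)
k = torch.randn(2, 8, 10, 32)
v = torch.randn(2, 8, 10, 32)
mask = torch.ones(2, 1, 1, 10)       # 1 = attend, 0 = masked out
context, attn = attn_fn(q, k, v, mask)
print(context.shape, attn.shape)     # (2, 8, 10, 32) (2, 8, 10, 10)
print(attn.sum(-1)[0, 0, 0])         # ~1.0: each row is a softmax distribution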
modules/multi_head_attention.py ADDED
@@ -0,0 +1,79 @@
+ from typing import Optional, Tuple
+ 
+ import torch
+ import torch.nn as nn
+ from torch import Tensor
+ 
+ from modules.wrapper import Linear
+ from modules.dot_product_attention import ScaledDotProductAttention
+ 
+ 
+ class MultiHeadAttention(nn.Module):
+     """
+     Multi-Head Attention (section 3.2.2)
+ 
+     Args:
+         - d_model (int): dimension of model
+         - num_heads (int): number of heads
+         - dropout_p (float): probability of dropout
+ 
+     Inputs:
+         - query (batch, seq_len, d_model)
+         - key (batch, seq_len, d_model)
+         - value (batch, seq_len, d_model)
+         - mask (optional): broadcastable to (batch, num_heads, seq_len, seq_len)
+ 
+     Outputs: (Tensor, Tensor):
+         - context (batch, seq_len, d_model)
+         - attn (batch, num_heads, seq_len, seq_len): Attention matrix for visualization.
+     """
+     def __init__(
+         self,
+         d_model: int,
+         num_heads: int,
+         dropout_p: float,
+     ) -> None:
+         super(MultiHeadAttention, self).__init__()
+ 
+         assert d_model % num_heads == 0, "d_model % num_heads should be 0"
+ 
+         self.d_model = d_model
+         self.num_heads = num_heads
+         self.d_head = d_model // num_heads
+ 
+         self.W_query = Linear(d_model, d_model)
+         self.W_key = Linear(d_model, d_model)
+         self.W_value = Linear(d_model, d_model)
+         # self.W_output = Linear(d_model, d_model)
+ 
+         self.scaled_dot_attn = ScaledDotProductAttention(d_model, dropout_p)
+ 
+     def forward(
+         self,
+         query: Tensor,
+         key: Tensor,
+         value: Tensor,
+         mask: Optional[Tensor] = None,
+     ) -> Tuple[Tensor, Tensor]:
+         batch_size = query.shape[0]
+ 
+         # original: (batch, seq_len, d_model)
+         # --forward--> (batch, seq_len, d_model)
+         # --view--> (batch, seq_len, num_heads, d_head)
+         # --transpose--> (batch, num_heads, seq_len, d_head)
+         query = self.W_query(query).view(batch_size, -1, self.num_heads, self.d_head).transpose(1, 2)
+         key = self.W_key(key).view(batch_size, -1, self.num_heads, self.d_head).transpose(1, 2)
+         value = self.W_value(value).view(batch_size, -1, self.num_heads, self.d_head).transpose(1, 2)
+ 
+         context, attn = self.scaled_dot_attn(query, key, value, mask)
+ 
+         # (batch, num_heads, seq_len, d_head)
+         # --transpose--> (batch, seq_len, num_heads, d_head)
+         # --view--> (batch, seq_len, d_model)
+         context = context.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
+         # context = self.W_output(context)
+ 
+         return context, attn
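A self-attention sketch over (batch, seq_len, d_model) inputs, showing the split into heads and the merged output:

# Sketch: multi-head self-attention (dropout disabled for determinism of shapes).
import torch
from modules.multi_head_attention import MultiHeadAttention

mha = MultiHeadAttention(d_model=256, num_heads=8, dropout_p=0.0)
x = torch.randn(2, 10, 256)
context, attn = mha(x, x, x)         # self-attention, no mask
print(context.shape, attn.shape)     # (2, 10, 256) (2, 8, 10, 10)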
modules/positional_encoding.py ADDED
@@ -0,0 +1,34 @@
+ import math
+ 
+ import torch
+ import torch.nn as nn
+ from torch import Tensor
+ 
+ 
+ class PositionalEncoding(nn.Module):
+     r"""
+     Positional Encoding from "Attention Is All You Need" (section 3.5).
+ 
+     The paper uses sine and cosine functions of different frequencies:
+         PE_(pos, 2i)   = sin(pos / power(10000, 2i / d_model))
+         PE_(pos, 2i+1) = cos(pos / power(10000, 2i / d_model))
+     The only change here is that the division term is computed in log space,
+     as exp(-(2i / d_model) * log(10000)), for numerical stability.
+ 
+     Follows OpenSpeech's PositionalEncoding, as I don't see the point in coding this from scratch.
+     """
+     def __init__(self, d_model: int, dropout_p: float, max_length: int = 5000) -> None:
+         super(PositionalEncoding, self).__init__()
+ 
+         self.dropout = nn.Dropout(p=dropout_p)
+ 
+         pe = torch.zeros(max_length, d_model, requires_grad=False)
+         position = torch.arange(0, max_length, dtype=torch.float).unsqueeze(1)
+         div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
+         pe[:, 0::2] = torch.sin(position * div_term)
+         pe[:, 1::2] = torch.cos(position * div_term)
+         pe = pe.unsqueeze(0)
+         self.register_buffer("pe", pe)
+ 
+     def forward(self, x: Tensor) -> Tensor:
+         x = x + (self.pe[:, :x.shape[1], :]).requires_grad_(False)
+         return self.dropout(x)
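To see that the log-space div_term matches the paper's formula, a small numerical check:

# Sketch: exp(-2i * log(10000) / d_model) equals 1 / 10000**(2i / d_model).
import math
import torch

d_model = 256
i2 = torch.arange(0, d_model, 2).float()                   # the 2i indices
log_space = torch.exp(i2 * -(math.log(10000.0) / d_model))
direct = 1.0 / torch.pow(torch.tensor(10000.0), i2 / d_model)
print(torch.allclose(log_space, direct))                    # True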
modules/positionwise_feed_forward.py ADDED
@@ -0,0 +1,28 @@
+ import torch.nn as nn
+ from torch import Tensor
+ 
+ from modules.wrapper import Linear
+ 
+ class PositionwiseFeedForwardNetwork(nn.Module):
+     """
+     Position-wise Feed-Forward Network (section 3.3)
+ 
+     Args:
+         - d_model (int): dimension of input and output
+         - d_ff (int): dimension of inner-layer
+         - dropout_p (float): dropout probability
+     """
+ 
+     def __init__(self, d_model: int, d_ff: int, dropout_p: float) -> None:
+         super(PositionwiseFeedForwardNetwork, self).__init__()
+ 
+         self.feed_forward = nn.Sequential(
+             Linear(d_model, d_ff),
+             nn.Dropout(dropout_p),
+             nn.ReLU(),
+             Linear(d_ff, d_model),
+             nn.Dropout(dropout_p),
+         )
+ 
+     def forward(self, x: Tensor) -> Tensor:
+         return self.feed_forward(x)
modules/transformer_embedding.py ADDED
@@ -0,0 +1,23 @@
+ import torch.nn as nn
+ from torch import Tensor
+ import numpy as np
+ 
+ 
+ class TransformerEmbedding(nn.Module):
+     """
+     Input Embeddings (section 3.4)
+ 
+     Embeds tokens into vectors of size d_model and scales them by sqrt(d_model).
+ 
+     Args:
+         - d_model (int): dimension of model
+         - num_embeddings (int): size of the dictionary
+     """
+     def __init__(self, d_model: int, num_embeddings: int) -> None:
+         super(TransformerEmbedding, self).__init__()
+         self.sqrt_d_model = np.sqrt(d_model)
+         self.embedding = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=d_model)
+ 
+     def forward(self, x: Tensor) -> Tensor:
+         return self.embedding(x) * self.sqrt_d_model
modules/wrapper.py ADDED
@@ -0,0 +1,28 @@
+ import torch.nn as nn
+ from torch import Tensor
+ 
+ 
+ class Linear(nn.Module):
+     """
+     A wrapper class for nn.Linear.
+     Initializes weights with Xavier uniform and biases with zeros.
+     """
+     def __init__(
+         self,
+         in_features: int,
+         out_features: int,
+         bias: bool = True,
+         device=None,
+         dtype=None,
+     ):
+         super(Linear, self).__init__()
+         self.linear = nn.Linear(in_features, out_features, bias, device, dtype)
+         nn.init.xavier_uniform_(self.linear.weight)
+         if bias:
+             nn.init.zeros_(self.linear.bias)
+ 
+     def forward(self, x: Tensor) -> Tensor:
+         """
+         Forward pass through the linear layer.
+         """
+         return self.linear(x)
transformer.py ADDED
@@ -0,0 +1,161 @@
+ from typing import Tuple
+ 
+ import torch.nn as nn
+ from torch import Tensor
+ 
+ from modules.transformer_embedding import TransformerEmbedding
+ from modules.positional_encoding import PositionalEncoding
+ 
+ from model.encoder import Encoder
+ from model.decoder import Decoder
+ from layers.projection_layer import ProjectionLayer
+ 
+ class Transformer(nn.Module):
+     """
+     Transformer.
+ 
+     Args:
+         - src_vocab_size (int): source vocabulary size
+         - tgt_vocab_size (int): target vocabulary size
+         - src_max_seq_len (int): source max sequence length
+         - tgt_max_seq_len (int): target max sequence length
+         - d_model (int): dimension of model
+         - num_heads (int): number of heads
+         - d_ff (int): dimension of hidden feed-forward layer
+         - dropout_p (float): probability of dropout
+         - num_encoder_layers (int): number of encoder layers
+         - num_decoder_layers (int): number of decoder layers
+     """
+     def __init__(
+         self,
+         src_vocab_size: int,
+         tgt_vocab_size: int,
+         src_max_seq_len: int,
+         tgt_max_seq_len: int,
+         d_model: int = 512,
+         num_heads: int = 8,
+         d_ff: int = 2048,
+         dropout_p: float = 0.1,
+         num_encoder_layers: int = 6,
+         num_decoder_layers: int = 6,
+     ) -> None:
+         super(Transformer, self).__init__()
+ 
+         # Embedding layers
+         self.src_embedding = TransformerEmbedding(
+             d_model=d_model,
+             num_embeddings=src_vocab_size
+         )
+         self.tgt_embedding = TransformerEmbedding(
+             d_model=d_model,
+             num_embeddings=tgt_vocab_size
+         )
+ 
+         # Positional Encoding layers
+         self.src_positional_encoding = PositionalEncoding(
+             d_model=d_model,
+             dropout_p=dropout_p,
+             max_length=src_max_seq_len
+         )
+         self.tgt_positional_encoding = PositionalEncoding(
+             d_model=d_model,
+             dropout_p=dropout_p,
+             max_length=tgt_max_seq_len
+         )
+ 
+         # Encoder
+         self.encoder = Encoder(
+             d_model=d_model,
+             num_heads=num_heads,
+             d_ff=d_ff,
+             dropout_p=dropout_p,
+             num_layers=num_encoder_layers
+         )
+         # Decoder
+         self.decoder = Decoder(
+             d_model=d_model,
+             num_heads=num_heads,
+             d_ff=d_ff,
+             dropout_p=dropout_p,
+             num_layers=num_decoder_layers
+         )
+         # Projects the decoder's output to the target vocabulary.
+         self.projection_layer = ProjectionLayer(
+             d_model=d_model,
+             vocab_size=tgt_vocab_size
+         )
+ 
+     def encode(
+         self,
+         src: Tensor,
+         src_mask: Tensor
+     ) -> Tensor:
+         """
+         Get encoder outputs.
+         """
+         src = self.src_embedding(src)
+         src = self.src_positional_encoding(src)
+         return self.encoder(src, src_mask)
+ 
+     def decode(
+         self,
+         encoder_output: Tensor,
+         src_mask: Tensor,
+         tgt: Tensor,
+         tgt_mask: Tensor
+     ) -> Tuple[Tensor, Tensor]:
+         """
+         Get decoder outputs for a set of target inputs.
+         """
+         tgt = self.tgt_embedding(tgt)
+         tgt = self.tgt_positional_encoding(tgt)
+         return self.decoder(
+             x=tgt,
+             encoder_output=encoder_output,
+             src_mask=src_mask,
+             tgt_mask=tgt_mask
+         )
+ 
+     def project(self, decoder_output: Tensor) -> Tensor:
+         """
+         Project decoder outputs to target vocabulary log-probabilities.
+         """
+         return self.projection_layer(decoder_output)
+ 
+     def forward(
+         self,
+         src: Tensor,
+         src_mask: Tensor,
+         tgt: Tensor,
+         tgt_mask: Tensor
+     ) -> Tuple[Tensor, Tensor]:
+         # src_mask = self.make_src_mask(src)
+         # tgt_mask = self.make_tgt_mask(tgt)
+ 
+         encoder_output = self.encode(src, src_mask)
+         decoder_output, attn = self.decode(
+             encoder_output, src_mask, tgt, tgt_mask
+         )
+         output = self.project(decoder_output)
+         return output, attn
+ 
+     def count_parameters(self):
+         return sum(p.numel() for p in self.parameters() if p.requires_grad)
+ 
+ 
+ def get_model(config, src_vocab_size: int, tgt_vocab_size: int) -> Transformer:
+     """
+     Returns a `Transformer` model for a given config.
+     """
+     return Transformer(
+         src_vocab_size=src_vocab_size,
+         tgt_vocab_size=tgt_vocab_size,
+         src_max_seq_len=config['dataset']['src_max_seq_len'],
+         tgt_max_seq_len=config['dataset']['tgt_max_seq_len'],
+         d_model=config['model']['d_model'],
+         num_heads=config['model']['num_heads'],
+         d_ff=config['model']['d_ff'],
+         dropout_p=config['model']['dropout_p'],
+         num_encoder_layers=config['model']['num_encoder_layers'],
+         num_decoder_layers=config['model']['num_decoder_layers'],
+     )
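An end-to-end forward-pass sketch on dummy token ids (vocab sizes are placeholders; this assumes model/decoder.py, which is imported above but not rendered in this diff, returns (outputs, attn) as decode()'s signature states):

# Sketch: full encode -> decode -> project pass of the Transformer.
import torch
from transformer import Transformer

model = Transformer(
    src_vocab_size=100, tgt_vocab_size=120,
    src_max_seq_len=400, tgt_max_seq_len=350,
    d_model=256, num_heads=8, d_ff=1024, dropout_p=0.3,
    num_encoder_layers=4, num_decoder_layers=2,
)
src = torch.randint(0, 100, (2, 12))     # (batch, src_seq_len)
tgt = torch.randint(0, 120, (2, 9))      # (batch, tgt_seq_len)
out, attn = model(src, src_mask=None, tgt=tgt, tgt_mask=None)
print(out.shape)                          # torch.Size([2, 9, 120]) log-probabilities
print(f'{model.count_parameters():,} trainable parameters')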
translate.py ADDED
@@ -0,0 +1,69 @@
+ from typing import Tuple
+ 
+ import torch
+ from torch import Tensor
+ 
+ from tokenizers import Tokenizer
+ 
+ from transformer import Transformer
+ from decode_method import greedy_decode
+ from config import load_config
+ from load_and_save_model import load_model_tokenizer
+ 
+ def translate(
+     model: Transformer,
+     src_tokenizer: Tokenizer,
+     tgt_tokenizer: Tokenizer,
+     text: str,
+     decode_method: str = 'greedy',
+     device = torch.device('cpu')
+ ) -> Tuple[str, Tensor]:
+     """
+     Translation function.
+ 
+     Outputs:
+         - translation (str): the translated string.
+         - attn (Tensor): the decoder's attention (for visualization).
+     """
+     sos_token = torch.tensor([src_tokenizer.token_to_id('<sos>')], dtype=torch.int64)
+     eos_token = torch.tensor([src_tokenizer.token_to_id('<eos>')], dtype=torch.int64)
+     pad_token = torch.tensor([src_tokenizer.token_to_id('<pad>')], dtype=torch.int64)
+ 
+     encoder_input_tokens = src_tokenizer.encode(text).ids
+     # <sos> + source_text + <eos> = encoder_input
+     encoder_input = torch.cat(
+         [
+             sos_token,
+             torch.tensor(encoder_input_tokens, dtype=torch.int64),
+             eos_token,
+         ]
+     )
+     # (1, 1, 1, seq_len) so it broadcasts against the attention scores
+     encoder_mask = (encoder_input != pad_token).unsqueeze(0).unsqueeze(0).unsqueeze(0).int()
+ 
+     encoder_input = encoder_input.unsqueeze(0)
+ 
+     assert encoder_input.size(0) == 1
+ 
+     if decode_method == 'greedy':
+         model_out, attn = greedy_decode(
+             model, encoder_input, encoder_mask, src_tokenizer, tgt_tokenizer, 400, device,
+             give_attn=True,
+         )
+     elif decode_method == 'beam-search':
+         raise NotImplementedError
+     else:
+         raise ValueError("Unsupported decode method")
+ 
+     model_out_text = tgt_tokenizer.decode(model_out.detach().cpu().numpy())
+     return model_out_text, attn
+ 
+ 
+ if __name__ == '__main__':
+     config = load_config(file_name='config_small.yaml')
+     model, src_tokenizer, tgt_tokenizer = load_model_tokenizer(config)
+     text = "ສະບາຍດີ"  # Hello.
+     translation, attn = translate(
+         model, src_tokenizer, tgt_tokenizer, text
+     )
+     print(translation)