from typing import * import torch import torch.nn as nn import torch.nn.functional as F Tensor = torch.Tensor class Attention(nn.Module): """Container for applying an attention scoring function.""""" def __init__(self, score: nn.Module, dropout: nn.Module = None): super().__init__() self.score = score self.dropout = dropout def forward(self, decoder_state: Tensor, encoder_state: Tensor, source_mask: Tensor = None) -> Tuple[Tensor, Tensor]: """Return context and attention weights. Accepts a boolean mask indicating padding in the source sequence.""""" (B, L, D), (B, T, _) = decoder_state.shape, encoder_state.shape scores = self.score(decoder_state, encoder_state) # (B, L, T) if source_mask is not None: # (B, T) scores.masked_fill_(source_mask.view(B, 1, T), -1e4) weights = F.softmax(scores, dim=-1) # (B, L, T) if self.dropout is not None: weights = self.dropout(weights) context = weights @ encoder_state # (B, L, _) return context, weights # (B, L, _), (B, L, T) class ConcatScore(nn.Module): """A two layer network as an attention scoring function. Expects bidirectional encoder.""""" def __init__(self, d: int): super().__init__() self.w = nn.Linear(3*d, d) self.v = nn.Linear(d, 1, bias=False) self.initialize_parameters() def forward(self, decoder_state: Tensor, encoder_state: Tensor) -> Tensor: """Return attention scores.""""" (B, L, D), (B, T, _) = decoder_state.shape, encoder_state.shape # (B, L, D), (B, T, 2*D) decoder_state = decoder_state.repeat_interleave(T, dim=1) # (B, L*T, D) encoder_state = encoder_state.repeat(1, L, 1) # (B, L*T, 2*D) concatenated = torch.cat((decoder_state, encoder_state), dim=-1) # (B, L*T, 3*D) scores = self.v(torch.tanh(self.w(concatenated))) # (B, L*T, 1) return scores.view(B, L, T) # (B, L, T) @torch.no_grad() def initialize_parameters(self): nn.init.xavier_uniform_(self.w.weight) nn.init.xavier_uniform_(self.v.weight, gain=nn.init.calculate_gain("tanh")) nn.init.zeros_(self.w.bias)