Source code for mlbench_core.models.pytorch.transformer.modules.layers

import torch.nn.functional as F
from torch import nn

from .multihead_attention import MultiheadAttention


def Linear(in_features, out_features, bias=True):
    """Linear layer with Xavier-uniform weight init and zero-initialized bias."""
    m = nn.Linear(in_features, out_features, bias)
    nn.init.xavier_uniform_(m.weight)
    if bias:
        nn.init.constant_(m.bias, 0.0)
    return m


def _dropout_add(x, residual, prob, is_training):
    # Apply dropout to x, then add the residual connection.
    out = F.dropout(x, p=prob, training=is_training)
    out = residual + out
    return out


def _relu_dropout(x, prob, is_training):
    # ReLU (implemented via threshold) followed by dropout.
    out = F.threshold(x, 0.0, 0.0)
    out = F.dropout(out, p=prob, training=is_training)
    return out


class TransformerEncoderLayer(nn.Module):
    """Encoder layer block.

    In the original paper each operation (multi-head attention or FFN) is
    postprocessed with: `dropout -> add residual -> layernorm`. In the
    tensor2tensor code they suggest that learning is more robust when
    preprocessing each layer with layernorm and postprocessing with:
    `dropout -> add residual`. We default to the approach in the paper, but
    the tensor2tensor approach can be enabled by setting
    *args.encoder_normalize_before* to ``True``.

    Args:
        args (argparse.Namespace): parsed command-line arguments
    """

    def __init__(self, args):
        super().__init__()
        self.embed_dim = args.encoder_embed_dim
        self.self_attn = MultiheadAttention(
            embed_dim=self.embed_dim,
            num_heads=args.encoder_attention_heads,
            dropout=args.attention_dropout,
            bias=False,
            softmax_type=args.softmax_type,
        )
        self.dropout = args.dropout
        self.relu_dropout = args.relu_dropout
        self.normalize_before = args.encoder_normalize_before
        self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim)
        self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim)
        self.layer_norms = nn.ModuleList(
            [nn.LayerNorm(self.embed_dim) for _ in range(2)]
        )

    def forward(self, x, encoder_padding_mask):
        residual = x
        x = self.maybe_layer_norm(0, x, before=True)
        x, _ = self.self_attn(
            query=x,
            key=x,
            value=x,
            key_padding_mask=encoder_padding_mask,
            need_weights=False,
        )
        x = _dropout_add(x, residual, self.dropout, self.training)
        x = self.maybe_layer_norm(0, x, after=True)

        residual = x
        x = self.maybe_layer_norm(1, x, before=True)
        # Fuse bias into the GEMMs by making the tensors 2D
        sent_len, sents, hid_dim = x.size()
        x = x.view(sent_len * sents, hid_dim)
        x = _relu_dropout(self.fc1(x), self.relu_dropout, self.training)
        x = self.fc2(x)
        x = x.view(sent_len, sents, hid_dim)
        x = _dropout_add(x, residual, self.dropout, self.training)
        x = self.maybe_layer_norm(1, x, after=True)
        return x

    def maybe_layer_norm(self, i, x, before=False, after=False):
        assert before ^ after
        if after ^ self.normalize_before:
            return self.layer_norms[i](x)
        else:
            return x
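
# Minimal usage sketch (not part of the original module). It assumes an
# argparse.Namespace carrying the hyperparameter names read in __init__ above;
# the concrete values are illustrative, and the softmax_type string is a
# placeholder whose accepted values depend on MultiheadAttention.
def _example_encoder_layer():
    import argparse

    import torch

    args = argparse.Namespace(
        encoder_embed_dim=512,
        encoder_attention_heads=8,
        encoder_ffn_embed_dim=2048,
        attention_dropout=0.1,
        dropout=0.1,
        relu_dropout=0.1,
        encoder_normalize_before=False,  # True enables the tensor2tensor pre-norm variant
        softmax_type="default",  # placeholder value, see MultiheadAttention
    )
    layer = TransformerEncoderLayer(args)
    x = torch.randn(20, 4, args.encoder_embed_dim)  # (seq_len, batch, embed_dim)
    out = layer(x, encoder_padding_mask=None)
    return out.shape  # expected: torch.Size([20, 4, 512])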
class TransformerDecoderLayer(nn.Module):
    """Decoder layer block.

    In the original paper each operation (multi-head attention, encoder
    attention or FFN) is postprocessed with: `dropout -> add residual ->
    layernorm`. In the tensor2tensor code they suggest that learning is more
    robust when preprocessing each layer with layernorm and postprocessing
    with: `dropout -> add residual`. We default to the approach in the paper,
    but the tensor2tensor approach can be enabled by setting
    *args.decoder_normalize_before* to ``True``.

    Args:
        args (argparse.Namespace): parsed command-line arguments
        no_encoder_attn (bool, optional): whether to attend to encoder outputs
            (default: False).
    """

    def __init__(self, args, no_encoder_attn=False):
        super().__init__()
        self.embed_dim = args.decoder_embed_dim
        self.self_attn = MultiheadAttention(
            embed_dim=self.embed_dim,
            num_heads=args.decoder_attention_heads,
            dropout=args.attention_dropout,
            bias=False,
            softmax_type=args.softmax_type,
        )
        self.dropout = args.dropout
        self.relu_dropout = args.relu_dropout
        self.normalize_before = args.decoder_normalize_before
        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)

        if no_encoder_attn:
            self.encoder_attn = None
            self.encoder_attn_layer_norm = None
        else:
            self.encoder_attn = MultiheadAttention(
                embed_dim=self.embed_dim,
                num_heads=args.decoder_attention_heads,
                dropout=args.attention_dropout,
                softmax_type=args.softmax_type,
                bias=False,
            )
            self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)

        self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
        self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
        self.need_attn = True

    def forward(self, x, encoder_out, encoder_padding_mask, incremental_state=None):
        residual = x
        x = self.maybe_layer_norm(self.self_attn_layer_norm, x, before=True)
        x, _ = self.self_attn(
            query=x,
            key=x,
            value=x,
            mask_future_timesteps=True,
            key_padding_mask=None,
            incremental_state=incremental_state,
            need_weights=False,
        )
        x = _dropout_add(x, residual, self.dropout, self.training)
        x = self.maybe_layer_norm(self.self_attn_layer_norm, x, after=True)

        attn = None
        if self.encoder_attn is not None:
            residual = x
            x = self.maybe_layer_norm(self.encoder_attn_layer_norm, x, before=True)
            x, attn = self.encoder_attn(
                query=x,
                key=encoder_out,
                value=encoder_out,
                key_padding_mask=encoder_padding_mask,
                incremental_state=incremental_state,
                static_kv=True,
                need_weights=(not self.training and self.need_attn),
            )
            x = _dropout_add(x, residual, self.dropout, self.training)
            x = self.maybe_layer_norm(self.encoder_attn_layer_norm, x, after=True)

        residual = x
        x = self.maybe_layer_norm(self.final_layer_norm, x, before=True)
        # Fuse bias into the GEMMs by making the tensors 2D
        sent_len, sents, hid_dim = x.size()
        x = x.view(sent_len * sents, hid_dim)
        x = _relu_dropout(self.fc1(x), self.relu_dropout, self.training)
        x = self.fc2(x)
        x = x.view(sent_len, sents, hid_dim)
        x = _dropout_add(x, residual, self.dropout, self.training)
        x = self.maybe_layer_norm(self.final_layer_norm, x, after=True)
        return x, attn

    def maybe_layer_norm(self, layer_norm, x, before=False, after=False):
        assert before ^ after
        if after ^ self.normalize_before:
            return layer_norm(x)
        else:
            return x
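
# Minimal usage sketch for the decoder layer (not part of the original module),
# under the same assumptions as the encoder example above. encoder_out is any
# (src_len, batch, embed_dim) tensor, e.g. the output of an encoder layer.
def _example_decoder_layer():
    import argparse

    import torch

    args = argparse.Namespace(
        decoder_embed_dim=512,
        decoder_attention_heads=8,
        decoder_ffn_embed_dim=2048,
        attention_dropout=0.1,
        dropout=0.1,
        relu_dropout=0.1,
        decoder_normalize_before=False,  # True enables the tensor2tensor pre-norm variant
        softmax_type="default",  # placeholder value, see the encoder example
    )
    layer = TransformerDecoderLayer(args)
    tgt = torch.randn(15, 4, args.decoder_embed_dim)  # (tgt_len, batch, embed_dim)
    enc = torch.randn(20, 4, args.decoder_embed_dim)  # (src_len, batch, embed_dim)
    out, attn = layer(tgt, encoder_out=enc, encoder_padding_mask=None)
    return out.shape  # expected: torch.Size([15, 4, 512])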