Source code for RADAR.time_series.algorithms.modelsTransformersTS.vanillaTransformer.attn

import torch
import torch.nn as nn
import torch.nn.functional as F
from math import sqrt

class ScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention.

    Computes ``softmax(Q @ K^T / sqrt(d_k) + mask * -1e9)`` over the last
    dimension, applies dropout to the resulting weights and uses them to
    combine ``values``.

    Args:
        d_keys: Value used as ``d_k`` in the ``1 / sqrt(d_k)`` scaling. When
            ``None``, the size of the last dimension of ``keys`` is used at
            call time.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, d_keys=None, dropout=0.1):
        super().__init__()
        self.d_k = d_keys
        self.dropout = nn.Dropout(dropout)

    def forward(self, queries, keys, values, mask=None):
        """Return the attended values and the attention weights.

        Args:
            queries: Query tensor; its last dimension is contracted against
                the last dimension of ``keys``.
            keys: Key tensor; its last two dimensions are swapped before the
                product with ``queries``.
            values: Value tensor, multiplied by the attention weights.
            mask: Optional mask. It is multiplied by ``-1e9`` and added in
                place to the scaled scores, so non-zero entries suppress the
                corresponding positions. It must be broadcastable onto the
                score tensor without enlarging it.

        Returns:
            A tuple ``(out, weights)``: the weighted combination of ``values``
            and the (post-dropout) attention weights.
        """
        # Use the configured scaling dimension, else infer it from the keys.
        d_k = self.d_k if self.d_k is not None else keys.size(-1)

        scores = torch.matmul(queries, keys.transpose(-2, -1)) / sqrt(d_k)

        if mask is not None:
            # In-place on purpose: keeps the score tensor's shape and dtype.
            scores.add_(mask * -1e9)

        weights = self.dropout(F.softmax(scores, dim=-1))
        return torch.matmul(weights, values), weights
class MultiHeadAttention(nn.Module):
    """Multi-head attention block with residual connection and layer norm.

    The inputs are linearly projected into ``n_heads`` query/key/value heads,
    attention is computed per head, the heads are concatenated, projected
    back to ``d_model``, passed through dropout, added to the original
    queries (residual) and layer-normalised.

    Args:
        d_model: Number of units in the model (dimensionality of the feature
            vectors). Must be divisible by ``n_heads``.
        n_heads: Number of attention heads.
        d_keys: Per-head dimension of the query/key projections. Defaults to
            ``d_model // n_heads`` when ``None``.
        d_values: Per-head dimension of the value projection. Defaults to
            ``d_model // n_heads`` when ``None``.
        dropout_rate: Dropout probability applied to the output projection.
    """

    def __init__(self, d_model, n_heads, d_keys=None, d_values=None,
                 dropout_rate=0.1):
        super().__init__()
        self.d_model = d_model
        self.d_keys = d_keys
        self.d_values = d_values
        self.n_heads = n_heads
        # NOTE: despite its name this attribute holds the Dropout module,
        # not the probability; the name is kept for compatibility.
        self.dropout_rate = nn.Dropout(dropout_rate)
        self.build_model()

    def build_model(self):
        """Create the projection layers, layer norm and attention module."""
        assert self.d_model % self.n_heads == 0, (
            "d_model must be divisible by n_heads"
        )
        self.d_head = self.d_model // self.n_heads  # dimension of every head

        # The constructor advertises ``None`` as the default for both sizes;
        # resolve it to the per-head dimension instead of failing on
        # ``n_heads * None`` below.
        if self.d_keys is None:
            self.d_keys = self.d_head
        if self.d_values is None:
            self.d_values = self.d_head

        # Weight matrices for Q, K, V and the output projection W0.
        self.qw = nn.Linear(self.d_model, self.n_heads * self.d_keys, bias=False)
        self.kw = nn.Linear(self.d_model, self.n_heads * self.d_keys, bias=False)
        self.vw = nn.Linear(self.d_model, self.n_heads * self.d_values, bias=False)
        self.fw = nn.Linear(self.n_heads * self.d_values, self.d_model, bias=False)

        self.norm = nn.LayerNorm(self.d_model, eps=1e-6)
        self.attn = ScaledDotProductAttention(d_keys=self.d_keys)

    def forward(self, queries, keys, values, mask=None):
        """Apply multi-head attention followed by dropout, residual and norm.

        Args:
            queries: Tensor indexed as ``(batch, query_len, d_model)``; it is
                also used as the residual added to the output.
            keys: Tensor indexed as ``(batch, key_len, d_model)``.
            values: Tensor indexed as ``(batch, value_len, d_model)``.
            mask: Optional mask forwarded unchanged to the attention module,
                where it is applied to scores laid out as
                ``(batch, n_heads, query_len, key_len)``.

        Returns:
            A tuple ``(outputs, attention)``: the normalised output with the
            same layout as ``queries`` and the per-head attention weights.
        """
        residual = queries
        n_heads = self.n_heads
        batch_size = queries.size(0)
        q_len = queries.size(1)

        # Project, then split the last dimension into heads and move the
        # head axis in front of the sequence axis.
        queries = self.qw(queries).view(
            batch_size, q_len, n_heads, self.d_keys).transpose(1, 2)
        keys = self.kw(keys).view(
            batch_size, keys.size(1), n_heads, self.d_keys).transpose(1, 2)
        values = self.vw(values).view(
            batch_size, values.size(1), n_heads, self.d_values).transpose(1, 2)

        # Per-head attention, then concatenate the heads again.
        output, attention = self.attn(queries, keys, values, mask)
        output = output.transpose(2, 1).contiguous().view(batch_size, q_len, -1)

        # Output projection with dropout, then residual + layer norm.
        outputs = self.dropout_rate(self.fw(output))
        outputs = self.norm(outputs + residual)
        return outputs, attention