__all__ = ["SelfAttention", "AttentionLSTM"]

import tensorflow as tf

Layer = tf.keras.layers.Layer
Dense = tf.keras.layers.Dense
Lambda = tf.keras.layers.Lambda
Activation = tf.keras.layers.Activation
dot = tf.keras.layers.dot
concatenate = tf.keras.layers.concatenate
class SelfAttention(Layer):
"""
SelfAttention is originally proposed by Cheng et al., 2016 [1]_
Here using the implementation of Philipperemy from
[2]_ with modification
that `attn_units` and `attn_activation` attributes can be changed.
The default values of these attributes are same as used by the auther.
However, there is another implementation of SelfAttention at [3]_
but the author have cited a different paper i.e. Zheng et al., 2018 [4]_ and
named it as additive attention.
A useful discussion about this (in this class) implementation can be found at [5]_
Examples
--------
>>> from atten_lstm import SelfAttention
>>> from tensorflow.keras.layers import Input, LSTM, Dense
>>> from tensorflow.keras.models import Model
>>> import numpy as np
>>> inp = Input(shape=(10, 1))
>>> lstm = LSTM(2, return_sequences=True)(inp)
>>> sa, _ = SelfAttention()(lstm)
>>> out = Dense(1)(sa)
...
>>> model = Model(inputs=inp, outputs=out)
>>> model.compile(loss="mse")
...
>>> print(model.summary())
...
>>> x = np.random.random((100, 10, 1))
>>> y = np.random.random((100, 1))
>>> h = model.fit(x=x, y=y)
References
----------
.. [1] https://arxiv.org/pdf/1601.06733.pdf
.. [2] https://github.com/philipperemy/keras-attention-mechanism/blob/master/attention/attention.py
.. [3] https://github.com/CyberZHG/keras-self-attention/blob/master/keras_self_attention/seq_self_attention.py
.. [4] https://arxiv.org/pdf/1806.01264.pdf
.. [5] https://github.com/philipperemy/keras-attention-mechanism/issues/14
"""
    def __init__(
            self,
            units: int = 128,
            activation: str = 'tanh',
            return_attention_weights: bool = True,
            **kwargs
    ):
"""
Parameters
----------
units : int, optional (default=128)
number of units for attention mechanism
activation : str, optional (default="tanh")
activation function to use in attention mechanism
return_attention_weights : bool, optional (default=True)
if True, then it returns two outputs, first is attention vector
of shape (batch_size, units) and second is of shape (batch_size, time_steps)
If False, then returns only attention vector.
**kwargs :
any additional keyword arguments for keras Layer.
"""
        super().__init__(**kwargs)
        self.units = units
        self.attn_activation = activation
        self.return_attention_weights = return_attention_weights
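
    # Note: get_config was not part of the original layer; this is a minimal
    # sketch (an assumption, not the author's code) so that models using this
    # layer can be saved and reloaded with Keras serialization.
    def get_config(self) -> dict:
        config = super().get_config()
        config.update({
            "units": self.units,
            "activation": self.attn_activation,
            "return_attention_weights": self.return_attention_weights,
        })
        return config
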
    def build(self, input_shape):
        hidden_size = int(input_shape[-1])
        # W, the trainable weight matrix of Luong's multiplicative score
        self.d1 = Dense(hidden_size, use_bias=False)
        # softmax over the time axis turns scores into attention weights
        self.act = Activation('softmax')
        # produces the final attention vector from [context_vector; h_t]
        self.d2 = Dense(self.units, use_bias=False, activation=self.attn_activation)
        super().build(input_shape)
    def call(self, hidden_states, *args, **kwargs):
        """
        Many-to-one attention mechanism for Keras.

        Parameters
        ----------
        hidden_states :
            3D tensor with shape (batch_size, time_steps, input_dim)

        Returns
        -------
        2D tensor with shape (batch_size, units) and, if
        ``return_attention_weights``, the attention weights of shape
        (batch_size, time_steps).

        The original code by felixhao28, which has been modified here,
        was released under the Apache Licence 2.0.
        """
        hidden_size = int(hidden_states.shape[2])
        # Inside the dense layer:
        #   hidden_states dot W => score_first_part
        #   (batch_size, time_steps, hidden_size) dot (hidden_size, hidden_size) => (batch_size, time_steps, hidden_size)
        # W is the trainable weight matrix of Luong's multiplicative-style attention score.
        score_first_part = self.d1(hidden_states)
        # score_first_part dot last_hidden_state => attention_weights
        # (batch_size, time_steps, hidden_size) dot (batch_size, hidden_size) => (batch_size, time_steps)
        h_t = Lambda(lambda x: x[:, -1, :], output_shape=(hidden_size,))(hidden_states)
        score = dot([score_first_part, h_t], [2, 1])
        attention_weights = self.act(score)
        # (batch_size, time_steps, hidden_size) dot (batch_size, time_steps) => (batch_size, hidden_size)
        context_vector = dot([hidden_states, attention_weights], [1, 1])
        # [context_vector; h_t] => (batch_size, 2 * hidden_size)
        pre_activation = concatenate([context_vector, h_t])
        attention_vector = self.d2(pre_activation)
if self.return_attention_weights:
return attention_vector, attention_weights
return attention_vector
class AttentionLSTM(Layer):
"""
This layer combines Self Attention [7]_ mechanism with LSTM [8]_. It uses one
separate LSTM+SelfAttention block for each input feature. The output from each
LSTM+SelfAttention block is concatenated and returned. The layer expects
same input dimension as by LSTM i.e. (batch_size, time_steps, input_features).
For usage see example [9]_.
References
----------
.. [7] https://ai4water.readthedocs.io/en/dev/models/layers.html#selfattention
.. [8] https://www.tensorflow.org/api_docs/python/tf/keras/layers/LSTM
.. [9] https://ai4water.readthedocs.io/en/dev/auto_examples/attention_lstm.html#
"""
    def __init__(
            self,
            num_inputs: int,
            lstm_units: int,
            attn_units: int = 128,
            attn_activation: str = "tanh",
            lstm_kwargs: dict = None,
            **kwargs
    ):
"""
Parameters
----------
num_inputs: int
number of inputs
lstm_units : int
number of units in LSTM layers
attn_units : int, optional (default=128)
number of units in SelfAttention layers
attn_activation : str, optional (default="tanh")
activation function in SelfAttention layers
lstm_kwargs : dict, optional (default=None)
any keyword arguments for LSTM layer.
Example
-------
>>> import numpy as np
>>> from tensorflow.keras.models import Model
>>> from tensorflow.keras.layers import Input, Dense
>>> from atten_lstm import AttentionLSTM
>>> seq_len = 20
>>> num_inputs = 2
>>> inp = Input(shape=(seq_len, num_inputs))
>>> outs = AttentionLSTM(num_inputs, 16)(inp)
>>> outs = Dense(1)(outs)
...
>>> model = Model(inputs=inp, outputs=outs)
>>> model.compile(loss="mse")
...
        >>> print(model.summary())
        ...
        >>> # define input
>>> x = np.random.random((100, seq_len, num_inputs))
>>> y = np.random.random((100, 1))
>>> h = model.fit(x=x, y=y)
"""
        super().__init__(**kwargs)
        self.num_inputs = num_inputs
        self.lstm_units = lstm_units
        self.attn_units = attn_units
        self.attn_activation = attn_activation
        if lstm_kwargs is None:
            lstm_kwargs = {}
        assert isinstance(lstm_kwargs, dict)
        self.lstm_kwargs = lstm_kwargs
        # one LSTM+SelfAttention block per input feature
        self.lstms = []
        self.sas = []
        for _ in range(self.num_inputs):
            self.lstms.append(tf.keras.layers.LSTM(
                self.lstm_units, return_sequences=True, **self.lstm_kwargs))
            self.sas.append(SelfAttention(self.attn_units, self.attn_activation))
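
    # Note: get_config was not part of the original layer; this is a minimal
    # sketch (an assumption, not the author's code) so that the layer's
    # constructor arguments survive Keras model serialization.
    def get_config(self) -> dict:
        config = super().get_config()
        config.update({
            "num_inputs": self.num_inputs,
            "lstm_units": self.lstm_units,
            "attn_units": self.attn_units,
            "attn_activation": self.attn_activation,
            "lstm_kwargs": self.lstm_kwargs,
        })
        return config
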
    def call(self, inputs, *args, **kwargs):
        # defining ``call`` (rather than overriding ``__call__``) lets Keras
        # handle building, naming and masking for this layer
        assert self.num_inputs == inputs.shape[-1], f"""
        num_inputs {self.num_inputs} does not match the number of input features.
        Inputs are of shape {inputs.shape}"""

        outs = []
        for i in range(inputs.shape[-1]):
            # feed each feature as (batch_size, time_steps, 1) through its
            # own LSTM+SelfAttention block
            lstm = self.lstms[i](tf.expand_dims(inputs[..., i], axis=-1))
            out, _ = self.sas[i](lstm)
            outs.append(out)
        # (batch_size, num_inputs * attn_units)
        return tf.concat(outs, axis=-1)
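

# A minimal smoke test (not part of the original module) exercising both
# layers on random data; the shapes and hyper-parameters below are
# illustrative assumptions, not values prescribed by the original author.
if __name__ == "__main__":
    import numpy as np

    seq_len, num_feats = 10, 3
    inp = tf.keras.layers.Input(shape=(seq_len, num_feats))
    # one attention vector of size attn_units per input feature
    attn_out = AttentionLSTM(num_inputs=num_feats, lstm_units=8, attn_units=16)(inp)
    out = tf.keras.layers.Dense(1)(attn_out)

    model = tf.keras.models.Model(inputs=inp, outputs=out)
    model.compile(loss="mse")

    x = np.random.random((32, seq_len, num_feats))
    y = np.random.random((32, 1))
    model.fit(x, y, epochs=1, verbose=0)
    assert model.output_shape == (None, 1)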