1# Licensed to the Apache Software Foundation (ASF) under one
2# or more contributor license agreements.  See the NOTICE file
3# distributed with this work for additional information
4# regarding copyright ownership.  The ASF licenses this file
5# to you under the Apache License, Version 2.0 (the
6# "License"); you may not use this file except in compliance
7# with the License.  You may obtain a copy of the License at
8#
9#   http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing,
12# software distributed under the License is distributed on an
13# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14# KIND, either express or implied.  See the License for the
15# specific language governing permissions and limitations
16# under the License.
17"""LSTM projection cell with cell clip and projection clip."""
18__all__ = ['LSTMPCellWithClip']
19
20from mxnet.gluon.contrib.rnn import LSTMPCell
21
22
class LSTMPCellWithClip(LSTMPCell):
    r"""Long-Short Term Memory Projected (LSTMP) network cell with cell clip and projection clip.
    Each call computes the following function:

    .. math::

        \DeclareMathOperator{\sigmoid}{sigmoid}
        \begin{array}{ll}
        i_t = \sigmoid(W_{ii} x_t + b_{ii} + W_{ri} r_{(t-1)} + b_{ri}) \\
        f_t = \sigmoid(W_{if} x_t + b_{if} + W_{rf} r_{(t-1)} + b_{rf}) \\
        g_t = \tanh(W_{ig} x_t + b_{ig} + W_{rc} r_{(t-1)} + b_{rg}) \\
        o_t = \sigmoid(W_{io} x_t + b_{io} + W_{ro} r_{(t-1)} + b_{ro}) \\
        c_t = c_{\text{clip}}(f_t * c_{(t-1)} + i_t * g_t) \\
        h_t = o_t * \tanh(c_t) \\
        r_t = p_{\text{clip}}(W_{hr} h_t)
        \end{array}

    where :math:`c_{\text{clip}}` is the cell clip applied on the next cell;
    :math:`r_t` is the projected recurrent activation at time `t`,
    :math:`p_{\text{clip}}` means apply projection clip on the projected output.
    :math:`h_t` is the hidden state at time `t`, :math:`c_t` is the
    cell state at time `t`, :math:`x_t` is the input at time `t`, and :math:`i_t`,
    :math:`f_t`, :math:`g_t`, :math:`o_t` are the input, forget, cell, and
    out gates, respectively.

    Parameters
    ----------
    hidden_size : int
        Number of units in cell state symbol.
    projection_size : int
        Number of units in output symbol.
    i2h_weight_initializer : str or Initializer
        Initializer for the input weights matrix, used for the linear
        transformation of the inputs.
    h2h_weight_initializer : str or Initializer
        Initializer for the recurrent weights matrix, used for the linear
        transformation of the hidden state.
    h2r_weight_initializer : str or Initializer
        Initializer for the projection weights matrix, used for the linear
        transformation of the recurrent state.
    i2h_bias_initializer : str or Initializer, default 'lstmbias'
        Initializer for the bias vector. By default, bias for the forget
        gate is initialized to 1 while all other biases are initialized
        to zero.
    h2h_bias_initializer : str or Initializer
        Initializer for the bias vector.
    input_size : int, default 0
        Number of units in the input symbol. Zero means it will be
        inferred from the input on the first forward pass.
    prefix : str
        Prefix for name of `Block`s
        (and name of weight if params is `None`).
    params : Parameter or None
        Container for weight sharing between cells.
        Created if `None`.
    cell_clip : float
        Clip cell state between `[-cell_clip, cell_clip]` in LSTMPCellWithClip cell.
        `None` (default) disables cell clipping.
    projection_clip : float
        Clip projection between `[-projection_clip, projection_clip]` in LSTMPCellWithClip cell.
        `None` (default) disables projection clipping.
    """
    def __init__(self, hidden_size, projection_size,
                 i2h_weight_initializer=None, h2h_weight_initializer=None,
                 h2r_weight_initializer=None,
                 i2h_bias_initializer='zeros', h2h_bias_initializer='zeros',
                 input_size=0, cell_clip=None, projection_clip=None, prefix=None, params=None):
        # Delegate all parameter creation (i2h/h2h/h2r weights and biases)
        # to the base LSTMPCell; this subclass only adds the two clip values.
        super(LSTMPCellWithClip, self).__init__(hidden_size,
                                                projection_size,
                                                i2h_weight_initializer,
                                                h2h_weight_initializer,
                                                h2r_weight_initializer,
                                                i2h_bias_initializer,
                                                h2h_bias_initializer,
                                                input_size,
                                                prefix=prefix,
                                                params=params)

        # Clip thresholds; `None` means the corresponding clip is skipped.
        self._cell_clip = cell_clip
        self._projection_clip = projection_clip

    # pylint: disable= arguments-differ
    def hybrid_forward(self, F, inputs, states, i2h_weight,
                       h2h_weight, h2r_weight, i2h_bias, h2h_bias):
        r"""Hybrid forward computation for Long-Short Term Memory Projected network cell
        with cell clip and projection clip.

        Parameters
        ----------
        inputs : input tensor with shape `(batch_size, input_size)`.
        states : a list of two initial recurrent state tensors, with shape
            `(batch_size, projection_size)` and `(batch_size, hidden_size)` respectively.

        Returns
        --------
        out : output tensor with shape `(batch_size, num_hidden)`.
        next_states : a list of two output recurrent state tensors. Each has
            the same shape as `states`.
        """
        # Per-timestep name prefix so symbols from each unrolled step are unique.
        prefix = 't%d_'%self._counter
        # Fused linear transforms: all four gates are computed in one
        # FullyConnected each for the input and the (projected) recurrent state,
        # hence num_hidden = 4 * hidden_size.
        i2h = F.FullyConnected(data=inputs, weight=i2h_weight, bias=i2h_bias,
                               num_hidden=self._hidden_size*4, name=prefix+'i2h')
        h2h = F.FullyConnected(data=states[0], weight=h2h_weight, bias=h2h_bias,
                               num_hidden=self._hidden_size*4, name=prefix+'h2h')
        gates = i2h + h2h
        # Split the fused pre-activations into the four gates, in the fixed
        # order: input, forget, cell (candidate), output.
        slice_gates = F.SliceChannel(gates, num_outputs=4, name=prefix+'slice')
        in_gate = F.Activation(slice_gates[0], act_type='sigmoid', name=prefix+'i')
        forget_gate = F.Activation(slice_gates[1], act_type='sigmoid', name=prefix+'f')
        in_transform = F.Activation(slice_gates[2], act_type='tanh', name=prefix+'c')
        out_gate = F.Activation(slice_gates[3], act_type='sigmoid', name=prefix+'o')
        # c_t = f_t * c_{t-1} + i_t * g_t (named symbol for checkpoint/graph clarity).
        next_c = F._internal._plus(forget_gate * states[1], in_gate * in_transform,
                                   name=prefix+'state')
        # Optional cell-state clipping to [-cell_clip, cell_clip].
        if self._cell_clip is not None:
            next_c = next_c.clip(-self._cell_clip, self._cell_clip)
        # h_t = o_t * tanh(c_t)
        hidden = F._internal._mul(out_gate, F.Activation(next_c, act_type='tanh'),
                                  name=prefix+'hidden')
        # r_t = W_hr h_t — project hidden state down to projection_size (no bias).
        next_r = F.FullyConnected(data=hidden, num_hidden=self._projection_size,
                                  weight=h2r_weight, no_bias=True, name=prefix+'out')
        # Optional projection clipping to [-projection_clip, projection_clip].
        if self._projection_clip is not None:
            next_r = next_r.clip(-self._projection_clip, self._projection_clip)

        # The projected output doubles as the first recurrent state.
        return next_r, [next_r, next_c]
140