1"""
2Provides neural-network specific Ops.
3
4Notes
5-----
6TODO: factor this out into a neural-network toolbox.
7
We register all optimizations with the gpu tag because we do not
implement all the intermediate cases on the GPU (in particular
AdvancedSubtensor). So, to make sure graphs run well on the GPU with
fast_compile, we register the optimizations as needed for the GPU. This
can be revisited later, once all the intermediate parts run on the GPU.
13
14"""
15from __future__ import absolute_import, print_function, division
16import logging
17import warnings
18import numpy as np
19from six.moves import xrange
20
21import theano
22from theano import gof
23from theano import scalar
24from theano.tensor import extra_ops, as_tensor_variable
25from theano.gof.opt import copy_stack_trace
26from theano.tensor import basic as tensor, subtensor, opt, elemwise
27from theano.tensor.type import (values_eq_approx_remove_inf,
28                                values_eq_approx_remove_nan)
29from theano.compile import optdb
30from theano.gof import Apply
31
32from theano.tensor.nnet.sigm import sigmoid, softplus
33from theano.gradient import DisconnectedType
34from theano.gradient import grad_not_implemented
35from theano.tensor.nnet.blocksparse import sparse_block_dot
36
37############
38#
39# TENSOR OPS
40#
41
42
43class SoftmaxWithBias(gof.Op):
44    """
45    An L{Op} for the output of neural-net multiclass classifiers.
46
    Parameters
    ----------
    x : a matrix of floats (32 or 64)
    b : a [row] vector of floats (32 or 64), length is the number of columns in x

    This L{Op}'s output is softmax(x + b).
    softmax(x[i]) is the i'th distribution over len(x[i]) options.
54
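    Examples
    --------
    A minimal doctest-style sketch (it assumes ``softmax_with_bias`` is
    re-exported from ``theano.tensor.nnet``; the variable names are
    illustrative only):

    >>> import theano
    >>> import theano.tensor as T
    >>> from theano.tensor.nnet import softmax_with_bias
    >>> x = T.matrix('x')
    >>> b = T.vector('b')
    >>> f = theano.function([x, b], softmax_with_bias(x, b))
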
55    """
56
57    nin = 2
58    nout = 1
59    __props__ = ()
60
61    def make_node(self, x, b):
62        x = tensor.as_tensor_variable(x)
63        b = tensor.as_tensor_variable(b)
64        if x.type.ndim != 2 \
65                or x.type.dtype not in tensor.float_dtypes:
66            raise ValueError('x must be 2-d tensor of floats')
67        if b.type.ndim != 1 \
68                or b.type.dtype not in tensor.float_dtypes:
69            raise ValueError('b must be 1-d tensor of floats')
70
71        sm = x.type()
72        return Apply(self, [x, b], [sm])
73
74    def perform(self, node, input_storage, output_storage):
75        x, b = input_storage
76        if b.shape[0] != x.shape[1]:
            raise ValueError('b must have the same length as the number of columns in x')
78
79        # sm = numpy.zeros_like(x)
80        # for i in xrange(sm.shape[0]):
81            # row = x[i] + b
82            # sm[i] = numpy.exp(row - numpy.max(row))
83            # sm[i] *= 1.0 / numpy.sum(sm[i])
84        # output_storage[0][0] = sm
85
86        if x.size == 0:
87            # Numpy doesn't like the max of a zero-sized object.
88            output_storage[0][0] = np.zeros(x.shape, dtype=x.dtype)
89            return
90
91        x_dtype = x.dtype
92        # Perform computations in float32 otherwise the result is too imprecise
93        if x.dtype == 'float16':
94            x = x.astype('float32')
95
96        x_plus_b = x + b[None, :]
97        e_x = np.exp(x_plus_b - x_plus_b.max(axis=1)[:, None])
98        e_x *= 1.0 / e_x.sum(axis=1)[:, None]
99        # default for copy is True and we don't need a copy if the
100        # data type matches.
101        output_storage[0][0] = e_x.astype(x_dtype, copy=False)
102
103    def L_op(self, inp, outputs, grads):
104        x, b = inp
105        g_sm, = grads
106
107        if isinstance(g_sm.type, DisconnectedType):
108            return [DisconnectedType()(), DisconnectedType()()]
109
110        dx = softmax_grad(g_sm, outputs[0])
111        db = tensor.sum(dx, axis=0)
112        return dx, db
113
114    def infer_shape(self, node, shape):
115        return [shape[0]]
116
117    def c_headers(self):
118        return ['<iostream>', '<cmath>']
119
120    @staticmethod
121    def c_code_template(dtype):
122        # this implementation was lifted from
123        # /u/bergstrj/cvs/bergstrj/src/feb07/nn.cxx
124
125        # TODO: put this into a templated function, in the support code
126        # TODO: declare the max of each row as an Op output
127
128        # TODO: set error messages for failures in this code
129
130        # TODO: use this to accept float32 and int32:
131        # node.inputs[0].type.dtype_specs()[1]
132        init_decl = """
133        npy_intp* Nx = PyArray_DIMS(%(x)s);
134        npy_intp Sx = 0;
135        npy_intp Sb = 0;
136        npy_intp Ssm = 0;
137
138
139        if (PyArray_NDIM(%(x)s) != 2)
140        {
141            PyErr_SetString(PyExc_ValueError, "not a 2d tensor");
142            %(fail)s;
143        }
144        if (PyArray_NDIM(%(b)s) != 1)
145        {
146            PyErr_SetString(PyExc_ValueError, "b not 1d tensor");
147            %(fail)s;
148        }
149        if ((PyArray_TYPE(%(x)s) != NPY_DOUBLE) &&
150            (PyArray_TYPE(%(x)s) != NPY_FLOAT))
151        {
152            PyErr_SetString(PyExc_TypeError, "not a float");
153            %(fail)s;
154        }
155        if ((PyArray_TYPE(%(b)s) != NPY_DOUBLE) &&
156            (PyArray_TYPE(%(b)s) != NPY_FLOAT))
157        {
158            PyErr_SetString(PyExc_TypeError, "b not float");
159            %(fail)s;
160        }
161        if ((PyArray_DIMS(%(x)s)[1] != PyArray_DIMS(%(b)s)[0]))
162        {
163            PyErr_Format(PyExc_ValueError,
164                         "number of columns in x (%%ld) does not match length of b (%%ld)",
165                (long int)PyArray_DIMS(%(x)s)[1], (long int)PyArray_DIMS(%(b)s)[0]);
166            %(fail)s;
167        }
168
169        if ((NULL == %(sm)s)
170            || (PyArray_DIMS(%(sm)s)[0] != PyArray_DIMS(%(x)s)[0])
171            || (PyArray_DIMS(%(sm)s)[1] != PyArray_DIMS(%(x)s)[1]))
172        {
173            if (NULL != %(sm)s) Py_XDECREF(%(sm)s);
174            %(sm)s = (PyArrayObject*)PyArray_SimpleNew(2, PyArray_DIMS(%(x)s),
175                                                       PyArray_TYPE(%(x)s));
176            if(!%(sm)s) {
177                PyErr_SetString(PyExc_MemoryError,
178                     "failed to alloc sm output");
179                %(fail)s
180            }
181        }
182        Sx = PyArray_STRIDES(%(x)s)[1]/sizeof(dtype_%(x)s);
183        Sb = PyArray_STRIDES(%(b)s)[0]/sizeof(dtype_%(b)s);
184        Ssm = PyArray_STRIDES(%(sm)s)[1]/sizeof(dtype_%(sm)s);
185
186        """
187
188        begin_row_loop = """
189        for (size_t i = 0; i < Nx[0]; ++i)
190        {
191            size_t j;
192            double sum = 0.0;
193
194            const dtype_%(x)s* __restrict__ x_i = (dtype_%(x)s*)(PyArray_BYTES(%(x)s) + PyArray_STRIDES(%(x)s)[0] * i);
195            const dtype_%(b)s* __restrict__ b_i = (dtype_%(b)s*)(PyArray_BYTES(%(b)s));
            dtype_%(sm)s* __restrict__ sm_i = (dtype_%(sm)s*)(PyArray_BYTES(%(sm)s) + PyArray_STRIDES(%(sm)s)[0] * i);
197
198            npy_intp Sx = PyArray_STRIDES(%(x)s)[1]/sizeof(dtype_%(x)s);
199            npy_intp Sb = PyArray_STRIDES(%(b)s)[0]/sizeof(dtype_%(b)s);
200            npy_intp Ssm = PyArray_STRIDES(%(sm)s)[1]/sizeof(dtype_%(sm)s);
201
202            size_t row_max_j=0;
203            dtype_%(sm)s row_max = x_i[0] + b_i[0];
204            //std::cout << "0 " << row_max << "\\n";
205            // Get the maximum value of the row
206            for (j = 1; j < Nx[1]; ++j)
207            {
208                dtype_%(sm)s row_ij = x_i[j * Sx] +  b_i[j * Sb];
209                //std::cout << "1 " << row_ij << "\\n";
210                row_max_j = (row_ij > row_max) ? j : row_max_j;
211                row_max   = (row_ij > row_max) ? row_ij : row_max;
212            }
213
214        """
215
216        inside_row_loop = """
217            for (j = 0; j < Nx[1]; ++j)
218            {
219                dtype_%(sm)s row_ij = x_i[j * Sx] +  b_i[j * Sb];
220                //std::cout << "2 " << j << " " << row_ij << " " << row_max << "\\n";
221                dtype_%(sm)s sm_ij = exp(row_ij - row_max);
222                //std::cout << "3 " << j << " " << sm_ij << "\\n";
223                sum += sm_ij;
224                sm_i[j * Ssm] = sm_ij;
225            }
226
227            //cblas_dscal(x.N, 1.0 / sum, &mat_at(s,i,0), s.n);
228            double sum_inv = 1.0 / sum;
229            for (j = 0; j < Nx[1]; ++j)
230            {
231                sm_i[j * Ssm] *= sum_inv;
232            }
233
234        """
235
        # Get the vectorized version of exp if it exists
237        try:
238            vec_exp = theano.scalar.exp.c_code_contiguous_raw(dtype,
239                                                              "Nx[1]", "sm_i", "sm_i")
240            inside_row_loop_contig = """
241            for (j = 0; j < Nx[1]; ++j)
242            {
243                dtype_%%(sm)s row_ij = x_i[j * Sx] +  b_i[j * Sb];
244                //std::cout << "2 " << j << " " << row_ij << " " << row_max << "\\n";
245                dtype_%%(sm)s sm_ij = row_ij - row_max;
246                //std::cout << "3 " << j << " " << sm_ij << "\\n";
247                sm_i[j * Ssm] = sm_ij;
248            }
249            %(vec_exp)s;
250            for (j = 0; j < Nx[1]; ++j)
251            {
252                sum += sm_i[j * Ssm];
253            }
254
255            //cblas_dscal(x.N, 1.0 / sum, &mat_at(s,i,0), s.n);
256            double sum_inv = 1.0 / sum;
257            for (j = 0; j < Nx[1]; ++j)
258            {
259                sm_i[j * Ssm] *= sum_inv;
260            }
261
262        """ % locals()
263            inside_row_loop = """
264            if(Ssm == 1){
265                %(inside_row_loop_contig)s
266            }else{
267                %(inside_row_loop)s
268            }
269            """ % locals()
270        except theano.gof.utils.MethodNotDefined:
271            pass
272        end_row_loop = """
273        }
274        """
275
276        return (init_decl, begin_row_loop, inside_row_loop, end_row_loop)
277
278    def c_code(self, node, name, inp, out, sub):
279        x, b = inp
280        sm, = out
281        code_template = ''.join(self.c_code_template(
282            node.inputs[0].type.dtype_specs()[1]))
283        return code_template % dict(locals(), **sub)
284
285    @staticmethod
286    def c_code_cache_version():
287        return (8,)
288
289softmax_with_bias = SoftmaxWithBias()
290
291
292class SoftmaxGrad(gof.Op):
293    """
294    Gradient wrt x of the Softmax Op.
295
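    For row i and column j, given the gradient ``dy`` on the softmax output
    ``sm``, this Op computes (see the comment in ``perform``):

        dx[i, j] = dy[i, j] * sm[i, j] - (sum_k dy[i, k] * sm[i, k]) * sm[i, j]
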
296    """
297
298    nin = 2
299    nout = 1
300    __props__ = ()
301
302    def make_node(self, dy, sm):
303        dy = tensor.as_tensor_variable(dy)
304        sm = tensor.as_tensor_variable(sm)
305        if dy.type.ndim not in (1, 2) \
306                or dy.type.dtype not in tensor.float_dtypes:
307            raise ValueError('dy must be 1-d or 2-d tensor of floats. Got ',
308                             dy.type)
309        if dy.ndim == 1:
310            dy = tensor.shape_padleft(dy, n_ones=1)
311        if sm.ndim == 1:
312            sm = tensor.shape_padleft(sm, n_ones=1)
313        return Apply(self, [dy, sm], [sm.type()])
314
315    def perform(self, node, input_storage, output_storage):
316        dy, sm = input_storage
317        dx = np.zeros_like(sm)
318        # dx[i,j] = - (\sum_k dy[i,k] sm[i,k]) sm[i,j] + dy[i,j] sm[i,j]
319        for i in xrange(sm.shape[0]):
320            dy_times_sm_i = dy[i] * sm[i]
321            dx[i] = dy_times_sm_i - sum(dy_times_sm_i) * sm[i]
322        output_storage[0][0] = dx
323
324    def grad(self, inp, grads):
325        dy, sm = inp
326        g, = grads
327
328        tmp = g + tensor.neg(tensor.sum(g * sm, axis=1).dimshuffle((0, 'x')))
329        g_dy = tmp * sm
330
331        tmp2 = tensor.sum(dy * sm, axis=1).dimshuffle((0, 'x'))
332        g_sm = tmp * dy - g * tmp2
333
334        return g_dy, g_sm
335
336    def infer_shape(self, node, shape):
337        return [shape[1]]
338
339    def c_code_cache_version(self):
340        return (3,)
341
342    def c_code(self, node, name, inp, out, sub):
343        dy, sm = inp
344        dx, = out
345        return '''
346        if ((PyArray_TYPE(%(dy)s) != NPY_DOUBLE) &&
347            (PyArray_TYPE(%(dy)s) != NPY_FLOAT))
348        {
349            PyErr_SetString(PyExc_TypeError,
350                 "types should be float or float64");
351            %(fail)s;
352        }
353        if ((PyArray_TYPE(%(sm)s) != NPY_DOUBLE) &&
354            (PyArray_TYPE(%(sm)s) != NPY_FLOAT))
355        {
356            PyErr_SetString(PyExc_TypeError,
357                 "types should be float or float64");
358            %(fail)s;
359        }
360        if ((PyArray_NDIM(%(dy)s) != 2)
361            || (PyArray_NDIM(%(sm)s) != 2))
362        {
363            PyErr_SetString(PyExc_ValueError, "rank error");
364            %(fail)s;
365        }
366        if (PyArray_DIMS(%(dy)s)[0] != PyArray_DIMS(%(sm)s)[0])
367        {
368            PyErr_SetString(PyExc_ValueError, "dy.shape[0] != sm.shape[0]");
369            %(fail)s;
370        }
371        if ((NULL == %(dx)s)
372            || (PyArray_DIMS(%(dx)s)[0] != PyArray_DIMS(%(sm)s)[0])
373            || (PyArray_DIMS(%(dx)s)[1] != PyArray_DIMS(%(sm)s)[1]))
374        {
375            Py_XDECREF(%(dx)s);
376            %(dx)s = (PyArrayObject*) PyArray_SimpleNew(2,
377                                                        PyArray_DIMS(%(sm)s),
378                                                        PyArray_TYPE(%(sm)s));
379            if (!%(dx)s)
380            {
381                PyErr_SetString(PyExc_MemoryError,
382                     "failed to alloc dx output");
383                %(fail)s;
384            }
385        }
386
387        for (size_t i = 0; i < PyArray_DIMS(%(dx)s)[0]; ++i)
388        {
389            const dtype_%(dy)s* __restrict__ dy_i = (dtype_%(dy)s*) (PyArray_BYTES(%(dy)s) + PyArray_STRIDES(%(dy)s)[0] * i);
390            npy_intp Sdy = PyArray_STRIDES(%(dy)s)[1]/sizeof(dtype_%(dy)s);
391            const dtype_%(sm)s* __restrict__ sm_i = (dtype_%(sm)s*) (PyArray_BYTES(%(sm)s) + PyArray_STRIDES(%(sm)s)[0] * i);
392            npy_intp Ssm = PyArray_STRIDES(%(sm)s)[1]/sizeof(dtype_%(sm)s);
            dtype_%(dx)s* __restrict__ dx_i = (dtype_%(dx)s*) (PyArray_BYTES(%(dx)s) + PyArray_STRIDES(%(dx)s)[0] * i);
394            npy_intp Sdx = PyArray_STRIDES(%(dx)s)[1]/sizeof(dtype_%(dx)s);
395
396            double sum_dy_times_sm = 0.;
397            for (size_t j = 0; j < PyArray_DIMS(%(dx)s)[1]; ++j)
398            {
399                dx_i[j * Sdx] = dy_i[j * Sdy] * sm_i[j * Ssm];
400                sum_dy_times_sm += dx_i[j * Sdx];
401            }
402            for (size_t j = 0; j < PyArray_DIMS(%(dx)s)[1]; ++j)
403            {
404                dx_i[j * Sdx] -= sum_dy_times_sm * sm_i[j * Ssm];
405            }
406        }
407        ''' % dict(locals(), **sub)
408softmax_grad = SoftmaxGrad()
409
410
411class Softmax(gof.Op):
412    r"""
413    Softmax activation function
414    :math:`\\varphi(\\mathbf{x})_j =
415    \\frac{e^{\mathbf{x}_j}}{\sum_{k=1}^K e^{\mathbf{x}_k}}`
416    where :math:`K` is the total number of neurons in the layer. This
417    activation function gets applied row-wise.
418
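    Examples
    --------
    A minimal doctest-style sketch using the module-level ``softmax``
    wrapper defined below (the variable names are illustrative only):

    >>> import theano
    >>> import theano.tensor as T
    >>> from theano.tensor.nnet import softmax
    >>> x = T.matrix('x')
    >>> f = theano.function([x], softmax(x))
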
419    """
420
421    nin = 1
422    nout = 1
423    __props__ = ()
424
425    def make_node(self, x):
426        x = tensor.as_tensor_variable(x)
427        if x.type.ndim not in (1, 2) \
428                or x.type.dtype not in tensor.float_dtypes:
429            raise ValueError('x must be 1-d or 2-d tensor of floats. Got %s' %
430                             x.type)
431        if x.ndim == 1:
432            warnings.warn("DEPRECATION: If x is a vector, Softmax will not automatically pad x "
433                          "anymore in next releases. If you need it, please do it manually. The "
434                          "vector case is gonna be supported soon and the output will be a vector.",
435                          stacklevel=4)
436            x = tensor.shape_padleft(x, n_ones=1)
437
438        return Apply(self, [x], [x.type()])
439
440    def perform(self, node, input_storage, output_storage):
441        x, = input_storage
442        e_x = np.exp(x - x.max(axis=1)[:, None])
443        sm = e_x / e_x.sum(axis=1)[:, None]
444        output_storage[0][0] = sm
445
446    def L_op(self, inp, outputs, grads):
447        x, = inp
448        g_sm, = grads
449        return [softmax_grad(g_sm, outputs[0])]
450
451    def R_op(self, inputs, eval_points):
452        # I think the Jacobian is symmetric so the R_op
453        # is the same as the grad
454        if None in eval_points:
455            return [None]
456        return self.L_op(inputs, [self(*inputs)], eval_points)
457
458    def infer_shape(self, node, shape):
459        return shape
460
461    def c_headers(self):
462        return ['<iostream>', '<cmath>']
463
464    @staticmethod
465    def c_code_template(dtype):
466        # this implementation was lifted from
467        # /u/bergstrj/cvs/bergstrj/src/feb07/nn.cxx
468
469        # TODO: put this into a templated function, in the support code
470        # TODO: declare the max of each row as an Op output
471
472        # TODO: set error messages for failures in this code
473
474        # TODO: use this to accept float32 and int32: node.inputs[0].type.dtype_specs()[1]
475        init_decl = """
476        npy_intp* Nx = PyArray_DIMS(%(x)s);
477        npy_intp Sx1 = 0;
478        npy_intp Ssm1 = 0;
479
480        if (PyArray_NDIM(%(x)s) != 2)
481        {
482            PyErr_SetString(PyExc_ValueError, "not a 2d tensor");
483            %(fail)s;
484        }
485        if ((PyArray_TYPE(%(x)s) != NPY_DOUBLE) &&
486            (PyArray_TYPE(%(x)s) != NPY_FLOAT))
487        {
488            PyErr_SetString(PyExc_TypeError, "not a float");
489            %(fail)s;
490        }
491
492        if ((NULL == %(sm)s)
493            || (PyArray_DIMS(%(sm)s)[0] != PyArray_DIMS(%(x)s)[0])
494            || (PyArray_DIMS(%(sm)s)[1] != PyArray_DIMS(%(x)s)[1]))
495        {
496            Py_XDECREF(%(sm)s);
497            %(sm)s = (PyArrayObject*)PyArray_SimpleNew(2, PyArray_DIMS(%(x)s),
498                                                       PyArray_TYPE(%(x)s));
499            if(!%(sm)s) {
500                PyErr_SetString(PyExc_MemoryError,
501                     "failed to alloc sm output");
502                %(fail)s
503            }
504        }
505        Sx1 = PyArray_STRIDES(%(x)s)[1]/sizeof(dtype_%(x)s);
506        Ssm1 = PyArray_STRIDES(%(sm)s)[1]/sizeof(dtype_%(sm)s);
507        """
508
509        begin_row_loop = """
510        for (size_t i = 0; i < Nx[0]; ++i)
511        {
512            size_t j;
513            double sum = 0.0;
514
515            const dtype_%(x)s* __restrict__ x_i = (dtype_%(x)s*)(PyArray_BYTES(%(x)s) + PyArray_STRIDES(%(x)s)[0] * i);
            dtype_%(sm)s* __restrict__ sm_i = (dtype_%(sm)s*)(PyArray_BYTES(%(sm)s) + PyArray_STRIDES(%(sm)s)[0] * i);
517
518            dtype_%(sm)s row_max = x_i[0];
519            //std::cout << "0 " << row_max << "\\n";
520            // Get the maximum value of the row
521            for (j = 1; j < Nx[1]; ++j)
522            {
523                dtype_%(sm)s row_ij = x_i[j * Sx1] ;
524                //std::cout << "1 " << row_ij << "\\n";
525                row_max   = (row_ij > row_max) ? row_ij : row_max;
526            }
527
528        """
529
530        inside_row_loop = """
531            for (j = 0; j < Nx[1]; ++j)
532            {
533                dtype_%(sm)s row_ij = x_i[j * Sx1] ;
534                //std::cout << "2 " << j << " " << row_ij << " " << row_max << "\\n";
535                dtype_%(sm)s sm_ij = exp(row_ij - row_max);
536                //std::cout << "3 " << j << " " << sm_ij << "\\n";
537                sum += sm_ij;
538                sm_i[j * Ssm1] = sm_ij;
539            }
540
541            //cblas_dscal(x.N, 1.0 / sum, &mat_at(s,i,0), s.n);
542            double sum_inv = 1.0 / sum;
543            for (j = 0; j < Nx[1]; ++j)
544            {
545                sm_i[j * Ssm1] *= sum_inv;
546            }
547
548        """
        # Get the vectorized version of exp if it exists
550        try:
551            vec_exp = theano.scalar.exp.c_code_contiguous_raw(dtype,
552                                                              "Nx[1]", "sm_i", "sm_i")
553            inside_row_loop_contig = """
554            for (j = 0; j < Nx[1]; ++j)
555            {
556                sm_i[j * Ssm1] = x_i[j * Sx1] - row_max;
557            }
558            %(vec_exp)s;
559            for (j = 0; j < Nx[1]; ++j)
560            {
561                sum += sm_i[j * Ssm1];
562            }
563
564            //cblas_dscal(x.N, 1.0 / sum, &mat_at(s,i,0), s.n);
565            double sum_inv = 1.0 / sum;
566            for (j = 0; j < Nx[1]; ++j)
567            {
568                sm_i[j * Ssm1] *= sum_inv;
569            }
570
571            """ % locals()
572
573            inside_row_loop = """
574            if(Ssm1 == 1){
575                %(inside_row_loop_contig)s
576            }else{
577                %(inside_row_loop)s
578            }
579            """ % locals()
580        except theano.gof.utils.MethodNotDefined:
581            pass
582
583        end_row_loop = """
584        }
585        """
586        return (init_decl, begin_row_loop, inside_row_loop, end_row_loop)
587
588    def c_code(self, node, name, inp, out, sub):
589        x, = inp
590        sm, = out
591        code_template = ''.join(self.c_code_template(
592            node.inputs[0].type.dtype_specs()[1]))
593        return code_template % dict(locals(), **sub)
594
595    @staticmethod
596    def c_code_cache_version():
597        return (3,)
598
599softmax_op = Softmax()
600
601
602class LogSoftmax(gof.Op):
603    r"""
604    LogSoftmax activation function
605    :math:`\\varphi(\\mathbf{x})_j =
606    \\e^{(\mathbf{x}_j - log{\sum_{k=1}^K e^{\mathbf{x}_k})}}
607    where :math:`K` is the total number of neurons in the layer. This
608    activation function gets applied row-wise.
609
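    Examples
    --------
    A minimal doctest-style sketch (it assumes ``logsoftmax`` is re-exported
    from ``theano.tensor.nnet``; the variable names are illustrative only):

    >>> import theano
    >>> import theano.tensor as T
    >>> from theano.tensor.nnet import logsoftmax
    >>> x = T.matrix('x')
    >>> f = theano.function([x], logsoftmax(x))
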
610    """
611    __props__ = ()
612
613    def make_node(self, x):
614        x = tensor.as_tensor_variable(x)
615        if x.type.ndim not in (1, 2) \
616                or x.type.dtype not in tensor.float_dtypes:
617            raise ValueError('x must be 1-d or 2-d tensor of floats. Got %s' %
618                             x.type)
619        if x.ndim == 1:
620            warnings.warn("DEPRECATION: If x is a vector, LogSoftmax will not automatically pad x "
621                          "anymore in next releases. If you need it, please do it manually. The "
622                          "vector case is gonna be supported soon and the output will be a vector.",
623                          stacklevel=4)
624            x = tensor.shape_padleft(x, n_ones=1)
625
626        return Apply(self, [x], [x.type()])
627
628    def perform(self, node, input_storage, output_storage):
629        x, = input_storage
630        xdev = x - x.max(axis=1)[:, None]
631        lsm = xdev - np.log(np.sum(np.exp(xdev), axis=1,
632                            keepdims=True))
633        output_storage[0][0] = lsm
634
635    def grad(self, inp, grads):
636        x, = inp
637        sm = softmax_op(x)
638        return [grads[0] - tensor.sum(grads[0], axis=1, keepdims=True) * sm]
639
640    def R_op(self, inputs, eval_points):
641        # I think the Jacobian is symmetric so the R_op
642        # is the same as the grad
643        if None in eval_points:
644            return [None]
645        return self.grad(inputs, eval_points)
646
647    def infer_shape(self, node, shape):
648        return shape
649
650    def c_headers(self):
651        return ['<cmath>']
652
653    @staticmethod
654    def c_code_template(dtype):
655        init_decl = """
656          npy_intp* Nx = PyArray_DIMS(%(x)s);
657          npy_intp Sx1 = 0;
658          npy_intp Ssm1 = 0;
659
660          if (PyArray_NDIM(%(x)s) != 2)
661          {
662              PyErr_SetString(PyExc_ValueError, "not a 2d tensor");
663              %(fail)s;
664          }
665          if ((PyArray_TYPE(%(x)s) != NPY_DOUBLE) &&
666              (PyArray_TYPE(%(x)s) != NPY_FLOAT))
667          {
668              PyErr_SetString(PyExc_TypeError, "not a float");
669              %(fail)s;
670          }
671
672          if ((NULL == %(sm)s)
673              || (PyArray_DIMS(%(sm)s)[0] != PyArray_DIMS(%(x)s)[0])
674              || (PyArray_DIMS(%(sm)s)[1] != PyArray_DIMS(%(x)s)[1]))
675          {
676              Py_XDECREF(%(sm)s);
677              %(sm)s = (PyArrayObject*)PyArray_SimpleNew(
678                  2, PyArray_DIMS(%(x)s),
679                  PyArray_TYPE(%(x)s));
680              if(!%(sm)s) {
681                  PyErr_SetString(PyExc_MemoryError,
682                       "failed to alloc sm output");
683                  %(fail)s
684              }
685          }
686          Sx1 = PyArray_STRIDES(%(x)s)[1]/sizeof(dtype_%(x)s);
687          Ssm1 = PyArray_STRIDES(%(sm)s)[1]/sizeof(dtype_%(sm)s);
688          """
689
690        begin_row_loop = """
691          // minibatch loop
692          for (size_t i = 0; i < Nx[0]; ++i)
693          {
694              size_t j;
695              double sum = 0.0;
696
697              const dtype_%(x)s* __restrict__ x_i = (dtype_%(x)s*)(
698                  PyArray_BYTES(%(x)s) + PyArray_STRIDES(%(x)s)[0] * i);
699              dtype_%(sm)s* __restrict__ sm_i = (dtype_%(sm)s*)(
700                  PyArray_BYTES(%(sm)s) + PyArray_STRIDES(%(sm)s)[0] * i);
701
702              dtype_%(sm)s row_max = x_i[0];
703              // Get the maximum value of the row
704              for (j = 1; j < Nx[1]; ++j)
705              {
706                  dtype_%(sm)s x_ij = x_i[j * Sx1] ;
707                  row_max = (x_ij > row_max) ? x_ij : row_max;
708              }
709              """
710
711        inside_row_loop = """
712              // Compute xdev and sum(exp(xdev), axis=1)
713              double xdev_exp_row_sum = 0.0;
714              for (j = 0; j < Nx[1]; j++)
715              {
716                  // use sm_i to temporary store xdev
717                  sm_i[j * Ssm1] = (dtype_%(sm)s) (x_i[j * Sx1] - row_max);
718                  xdev_exp_row_sum += exp(sm_i[j * Ssm1]);
719              }
720
721              // Write sm = xdev - log(sum(exp(xdev), axis=1))
722              xdev_exp_row_sum = log(xdev_exp_row_sum);
723              for (j = 0; j < Nx[1]; ++j)
724              {
725                  sm_i[j * Ssm1] -= (dtype_%(sm)s) xdev_exp_row_sum;
726              }
727              """
728        end_row_loop = """
729          }
730          """
731        return (init_decl, begin_row_loop, inside_row_loop, end_row_loop)
732
733    def c_code(self, node, name, inp, out, sub):
734        x, = inp
735        sm, = out
736        code_template = ''.join(self.c_code_template(
737            node.inputs[0].type.dtype_specs()[1]))
738        return code_template % dict(locals(), **sub)
739
740    @staticmethod
741    def c_code_cache_version():
742        return (0,)
743
744logsoftmax_op = LogSoftmax()
745
746
# This is not registered in stabilize, as it causes some crossentropy
# optimizations to not be inserted.
749@opt.register_specialize('stabilize', 'fast_compile')
750@gof.local_optimizer([tensor.Elemwise])
751def local_logsoftmax(node):
752    """
753    Detect Log(Softmax(x)) and replace it with LogSoftmax(x)
754
755    Note: only forward pass is affected
756    """
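    # Illustrative sketch of the rewrite performed here: a (sub)graph such as
    #     y = tensor.log(softmax_op(x))
    # is replaced by
    #     y = LogSoftmax()(x)
    # which is the numerically more stable form of the same expression.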
757    if (isinstance(node.op, tensor.Elemwise) and
758            isinstance(node.op.scalar_op, scalar.basic.Log) and
759            len(node.inputs) == 1 and
760            node.inputs[0].owner is not None and
761            isinstance(node.inputs[0].owner.op, Softmax)):
762        inVars = node.inputs[0].owner.inputs[0]
763        new_op = LogSoftmax()
764        ret = new_op(inVars)
        ret.tag.values_eq_approx = values_eq_approx_remove_inf
766        copy_stack_trace([node.inputs[0], node.outputs[0]], ret)
767        return [ret]
768
769
# This is not registered in stabilize, as it causes some crossentropy
# optimizations to not be inserted.
772@opt.register_specialize('stabilize', 'fast_compile')
773@gof.local_optimizer([SoftmaxGrad])
774def local_logsoftmax_grad(node):
775    """
776    Detect Log(Softmax(x))'s grad and replace it with LogSoftmax(x)'s grad
777
778    Note: only grad is affected
779    """
780    if (isinstance(node.op, SoftmaxGrad) and
781        len(node.inputs) == 2 and
782        node.inputs[0].owner is not None and
783        node.inputs[0].owner.op == tensor.true_div and
784        len(node.inputs[0].owner.inputs) >= 2 and
785        node.inputs[0].owner.inputs[1].owner is not None and
786        node.inputs[0].owner.inputs[1].owner.op == softmax_op and
787        node.inputs[1] == node.inputs[0].owner.inputs[1] and
788        not (
789            # skip if it will be optimized by
790            # local_advanced_indexing_crossentropy_onehot_grad
791            node.inputs[0].owner.op == tensor.true_div and
792            node.inputs[0].owner.inputs[0].owner is not None and
793            isinstance(node.inputs[0].owner.inputs[0].owner.op,
794                       subtensor.AdvancedIncSubtensor))):
795        # get parameters from unoptimized op
796        sm = node.inputs[0].owner.inputs[1]
797        # sm_input = node.inputs[1].owner.inputs[0]
798        grads = node.inputs[0].owner.inputs[0]
799        if grads.broadcastable[1] and not sm.broadcastable[1]:
800            grads = tensor.alloc(grads, grads.shape[0], sm.shape[1])
801        ret = grads - tensor.sum(grads, axis=1, keepdims=True) * sm
802        ret.tag.values_eq_approx = values_eq_approx_remove_nan
803        copy_stack_trace(node.outputs[0], ret)
804        return [ret]
805
806
807def softmax_graph(c):
808    return tensor.exp(c) / tensor.exp(c).sum(axis=-1, keepdims=True)
809
810
811def softmax(c):
812    c = as_tensor_variable(c)
813    if c.broadcastable[-1]:
814        warnings.warn("The softmax is applied on a dimension of shape 1, which does not have a semantic meaning.")
815    return softmax_op(c)
816
817
818def logsoftmax(c):
819    return logsoftmax_op(c)
820
821
822@opt.register_specialize('fast_compile_gpu')
823@gof.local_optimizer([softmax_op])
824def local_softmax_with_bias(node):
825    """
826    Try to turn softmax(sum_of_stuff) -> softmax_w_bias(matrix, bias).
827
828    """
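    # Illustrative sketch of the rewrite: a (sub)graph such as
    #     softmax(tensor.dot(h, W) + b)          # b broadcast to a row
    # becomes
    #     softmax_with_bias(tensor.dot(h, W), b)
    # so the bias addition is fused into the SoftmaxWithBias Op.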
829    if node.op == softmax_op:
830        x, = node.inputs
831        if x.owner and x.owner.op == tensor.add:
832            vectors = []
833            non_vectors = []
834            for x_in in x.owner.inputs:
835                if list(x_in.type.broadcastable) == [True, False]:
                    # Since specialization comes relatively late in
                    # optimization, we don't want to put in extra
                    # DimShuffles unnecessarily.
840                    if (x_in.owner and
841                            isinstance(x_in.owner.op, tensor.DimShuffle) and
842                            list(x_in.owner.inputs[0].type.broadcastable) == [False]):
843                        # cut out the DimShuffle that was broadcasting a vector
844                        vectors.append(x_in.owner.inputs[0])
845                    else:
846                        # insert an extra DimShuffle to correct the old one
847                        vectors.append(tensor.
848                                       DimShuffle((True, False), (1,))(x_in))
849                else:
850                    non_vectors.append(x_in)
851
852            # If all the inputs were vectors or broadcasted vectors,
853            # we broadcast one of them to be used as a matrix
854            if len(non_vectors) == 0:
855                assert len(vectors) > 0  # we should have at least 1 input...
856                promoted_vector = vectors.pop()
857                non_vectors.append(tensor.shape_padleft(promoted_vector))
858            assert non_vectors  # not empty
859
860            if vectors:
861                # we're in business...
862                if len(vectors) > 1:
863                    vector_sum = tensor.add(*vectors)
864                    copy_stack_trace(x_in, vector_sum)
865                else:
866                    vector_sum = vectors[0]
867
868                if len(non_vectors) > 1:
869                    non_vector_sum = tensor.add(*non_vectors)
870                    copy_stack_trace(x_in, non_vector_sum)
871                else:
872                    non_vector_sum = non_vectors[0]
873
874                try:
875                    sm_bias = softmax_with_bias(non_vector_sum, vector_sum)
876                    copy_stack_trace(node.outputs[0], sm_bias)
877                except Exception:
878                    # if our arguments have the wrong types, then
879                    # forget about it
880                    return
881
882                if sm_bias.type == node.outputs[0].type:
883                    # This condition is not always true. See the test
884                    # nnet/tests/test_nnet.py:T_SoftmaxWithBias.test_broadcast
885                    return [sm_bias]
886
887
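# The simplifier below is plugged into the mul canonicalizer (see the call to
# ``add_simplifier`` after the function).  Illustrative sketch of the pattern
# it matches:
#     numerator   = tensor.exp(x)                             # x: 2-d, float
#     denominator = tensor.exp(x).sum(axis=1).dimshuffle(0, 'x')
# The (numerator, denominator) pair is then replaced by softmax_op(x).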
888def softmax_simplifier(numerators, denominators):
889    for numerator in list(numerators):
890        # TODO: a single softmax'd vector??
891        if not numerator.type.dtype.startswith('float'):
892            continue
893
894        if numerator.ndim != 2:
895            continue
896        if numerator.owner and numerator.owner.op == tensor.exp:
897            x = numerator.owner.inputs[0]
898        else:
899            continue
900
901        matching_denom = None
902
903        for denominator in denominators:
904            if denominator.owner and isinstance(denominator.owner.op,
905                                                tensor.DimShuffle):
906                if denominator.owner.op.new_order == (0, 'x'):
907                    z = denominator.owner.inputs[0]
908                    # thing getting dimshuffled
909                    if z.owner and isinstance(z.owner.op, tensor.Sum):
912                        if z.owner.op.axis == (1,):
913                            # print "almost there.. softmax", x, z.owner.inputs[0]
914                            if z.owner.inputs[0] is numerator:
915                                matching_denom = denominator
916                                break
917        if matching_denom:
918            softmax = softmax_op(x)
919            copy_stack_trace(numerator, softmax)
920            numerators.remove(numerator)
921            denominators.remove(matching_denom)
922            numerators.append(softmax)
923
924    return numerators, denominators
925opt.local_mul_canonizer.add_simplifier(softmax_simplifier, 'softmax_simplifier')
926
927
928class CrossentropySoftmaxArgmax1HotWithBias(gof.Op):
929    """
930    A special compound L{Op} for the output of neural-net classifiers.
931
932    Parameters
933    ----------
934    x : a matrix of floats (32 or 64)
935    b : a [row] vector of floats (32 or 64), length is number of cols in x
936    y_idx : a [column] vector of int (32 or 64), length is number of rows in x
937
938    Returns
939    -------
940    object
941        row-wise NLL, softmax(x+b), row-wise argmax of (x+b).
942
943    @precondition: every entry in y_idx is a valid (non-negative)
944                   column index into x
945
946    This L{Op} has three outputs:
947     - KL(softmax(x+b), y)
948     - softmax(x+b)
949     - argmax(x+b)
950
951    softmax(x[i]) is the i'th distribution over len(x[i]) options
952    argmax(x) is the index of x's greatest element
953    y_idx[i] is an integer index, encoding a 1-hot distribution.
954
    In practice, when we are trying to do classification, we have one row in x
    and one entry in y_idx per example, and y_idx[i] is the index of the
    (correct) class of the i'th example.
958
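    Examples
    --------
    A minimal doctest-style sketch (it assumes the name is re-exported from
    ``theano.tensor.nnet``; the variable names are illustrative only):

    >>> import theano.tensor as T
    >>> from theano.tensor.nnet import crossentropy_softmax_argmax_1hot_with_bias
    >>> x = T.matrix('x')
    >>> b = T.vector('b')
    >>> y_idx = T.lvector('y_idx')
    >>> nll, sm, am = crossentropy_softmax_argmax_1hot_with_bias(x, b, y_idx)
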
959    """
960
961    nin = 3
962    nout = 3
963    __props__ = ()
964
965    def __init__(self, **kwargs):
966        gof.Op.__init__(self, **kwargs)
967
968    def make_node(self, x, b, y_idx):
969        x = tensor.as_tensor_variable(x)
970        b = tensor.as_tensor_variable(b)
971        y_idx = tensor.as_tensor_variable(y_idx)
972        if x.type.ndim != 2 \
973                or x.type.dtype not in tensor.float_dtypes:
974            raise ValueError('x must be 2-d tensor of floats', x.type)
975        if b.type.ndim != 1 \
                or b.type.dtype not in tensor.float_dtypes:
977            raise ValueError('b must be 1-d tensor of floats', b.type)
978        if y_idx.type.ndim != 1 \
979                or y_idx.type.dtype not in tensor.discrete_dtypes:
980            raise ValueError('y_idx must be 1-d tensor of [u]ints', y_idx.type)
981
982#       TODO: Is this correct? It used to be y, not y_idx
983        nll = tensor.TensorType(x.type.dtype,
984                                y_idx.type.broadcastable).make_variable()
985#        nll = TensorType(x.dtype, y.broadcastable)
986        sm = x.type()
987        am = y_idx.type()
988        return Apply(self, [x, b, y_idx], [nll, sm, am])
989
990    def perform(self, node, input_storage, output_storage):
991        """
992        The math, where x is an input vector, and t is a target index:
993
994            softmax(x)[i] = exp(x[i]) / sum_j(exp(x[j]))
995            nll(x,t) = -log(softmax(x)[t])
996
997        We compute this by subtracting off the max of x. This avoids
998        numerical instability.
999
1000            m = max_j x[j]
1001            softmax(x)[i] = exp(x[i] -m) / sum_j(exp(x[j] - m))
1002
1003            nll = -log(exp(x[t] -m) / sum_j(exp(x[j] - m)))
1004                = -x[t] + m + log( sum_j(exp(x[j] - m)))
1005
1006        """
1007        x, b, y_idx = input_storage
1008        if b.shape[0] != x.shape[1]:
1009            raise ValueError('b must have same number of columns as x')
1010        if y_idx.shape[0] != x.shape[0]:
1011            raise ValueError('y_idx must have same number of rows as x')
1012        if any(y_idx < 0):
1013            raise ValueError("y_i value out of bounds")
1014        sm = np.zeros_like(x)  # softmax
1015        nll = np.zeros(x.shape[0], dtype=node.outputs[0].type.dtype)  # nll(y | softmax(x))
1016        am = np.zeros_like(y_idx)
1017        for i in xrange(sm.shape[0]):
1018            # add the bias vector to the i'th row of x
1019            row = x[i] + b
1020
1021            # get the maximum value of i'th row for numerically safe
1022            # softmax / nll
1023            am[i] = np.argmax(row)
1024            m = row[am[i]]
1025
1026            # compute the unnormalized softmax, and normalization constant
1027            sm[i] = np.exp(row - m)
1028            sum_j = np.sum(sm[i])  # sum_j(exp(x[j] - m))
1029
            # normalize the softmax
1031            sm[i] *= 1.0 / sum_j
1032
1033            # store the nll
1034            nll[i] = -row[y_idx[i]] + m + np.log(sum_j)
1035
1036        output_storage[0][0] = nll
1037        output_storage[1][0] = sm
1038        output_storage[2][0] = am
1039
1040    def infer_shape(self, node, shapes):
1041        x_shp, b_shp, idx_shp = shapes
1042        nll_shp = (x_shp[0],)
1043        sm_shp = x_shp
1044        am_shp = idx_shp
1045        return [nll_shp, sm_shp, am_shp]
1046
1047    def connection_pattern(self, node):
1048
1049        return [[True, True, True],  # x
1050                [True, True, True],  # b
1051                [False, False, True]]  # y_idx
1052
1053    def grad(self, inp, grads):
1054        x, b, y_idx = inp
1055        g_nll, g_sm, g_am = grads
1056
1057        dx_terms = []
1058        db_terms = []
1059        d_idx_terms = []
1060
1061        if not isinstance(g_nll.type, DisconnectedType):
1062            nll, sm = crossentropy_softmax_1hot_with_bias(x, b, y_idx)
1063            dx = crossentropy_softmax_1hot_with_bias_dx(g_nll, sm, y_idx)
1064            db = tensor.sum(dx, axis=[0])
1065            dx_terms.append(dx)
1066            db_terms.append(db)
1067
1068        if not isinstance(g_sm.type, DisconnectedType):
1069            dx, db = softmax_with_bias.L_op((x, b), [softmax_with_bias(x, b)], (g_sm, ))
1070            dx_terms.append(dx)
1071            db_terms.append(db)
1072
1073        if not isinstance(g_am.type, DisconnectedType):
1074            dx_terms.append(x.zeros_like())
1075            db_terms.append(b.zeros_like())
1076            d_idx_terms.append(y_idx.zeros_like())
1077
1078        def fancy_sum(terms):
1079            if len(terms) == 0:
1080                return DisconnectedType()()
1081            rval = terms[0]
1082            for term in terms[1:]:
1083                rval = rval + term
1084            return rval
1085
1086        return [fancy_sum(terms) for terms in
1087                [dx_terms, db_terms, d_idx_terms]]
1088
1089    def c_headers(self):
1090        return ['<iostream>', '<cmath>']
1091
1092    @staticmethod
1093    def c_code_template(dtype):
1094        # this implementation was lifted from
1095        # /u/bergstrj/cvs/bergstrj/src/feb07/nn.cxx
1096
1097        # TODO: put this into a templated function, in the support code
1098        # TODO: declare the max of each row as an Op output
1099
1100        # TODO: set error messages for failures in this code
1101
1102        # TODO: use this to accept float32 and int32: node.inputs[0].type.dtype_specs()[1]
1103        (init_decl, begin_row_loop, inside_row_loop, end_row_loop) = \
1104            SoftmaxWithBias.c_code_template(dtype)
1105        return (init_decl,
1106                """
1107        if (PyArray_NDIM(%(y_idx)s) != 1)
1108        {
1109            PyErr_SetString(PyExc_ValueError, "y_idx not 1d tensor");
1110            %(fail)s;
1111        }
1112        if (PyArray_DIMS(%(x)s)[0] != PyArray_DIMS(%(y_idx)s)[0])
1113        {
1114            PyErr_Format(PyExc_ValueError,
1115                "number of rows in x (%%ld) does not match length of y (%%ld)",
1116                (long int)PyArray_DIMS(%(x)s)[0],
1117                (long int)PyArray_DIMS(%(y_idx)s)[0]);
1118            %(fail)s;
1119        }
1120
1121        if ((NULL == %(nll)s) //initial condition
1122            || (PyArray_DIMS(%(nll)s)[0] != PyArray_DIMS(%(y_idx)s)[0]))
1123        {
1124            if (NULL != %(nll)s) Py_XDECREF(%(nll)s);
1125            %(nll)s = (PyArrayObject*)PyArray_SimpleNew(1,
1126                PyArray_DIMS(%(y_idx)s), PyArray_TYPE(%(x)s));
1127            if(!%(nll)s)
1128            {
1129                PyErr_SetString(PyExc_MemoryError,
1130                     "failed to alloc nll output");
1131                %(fail)s;
1132            }
1133        }
1134        if ((NULL == %(am)s)
1135            || (PyArray_DIMS(%(am)s)[0] != PyArray_DIMS(%(y_idx)s)[0]))
1136        {
1137            Py_XDECREF(%(am)s);
1138            %(am)s = (PyArrayObject*) PyArray_SimpleNew(1,
1139                PyArray_DIMS(%(y_idx)s), PyArray_TYPE(%(y_idx)s));
1140            if(!%(am)s)
1141            {
1142                PyErr_SetString(PyExc_MemoryError,
1143                     "failed to alloc am output");
1144                %(fail)s;
1145            }
1146        }
1147                """,
1148                begin_row_loop,
1149                """
            const %(y_idx_type)s y_i = ((%(y_idx_type)s*)(PyArray_BYTES(%(y_idx)s) + PyArray_STRIDES(%(y_idx)s)[0] * i))[0];
            dtype_%(nll)s* __restrict__ nll_i = (dtype_%(nll)s*)(PyArray_BYTES(%(nll)s) + PyArray_STRIDES(%(nll)s)[0] * i);
1152            %(am_type)s* __restrict__ am_i = (%(am_type)s*) (PyArray_BYTES(%(am)s) + PyArray_STRIDES(%(am)s)[0] * i);
1153                """,
1154                inside_row_loop,
1155                """
1156            if ((y_i >= PyArray_DIMS(%(x)s)[1]) || (y_i < 0))
1157            {
1158                PyErr_SetString(PyExc_ValueError, "y_i value out of bounds");
1159                %(fail)s;
1160            }
1161            nll_i[0] = - x_i[y_i*Sx]
1162                       - b_i[y_i*Sb]
1163                       + row_max
1164                       + log(sum);
1165            am_i[0] = row_max_j;
1166                """,
1167                end_row_loop)
1168
1169    def c_code_cache_version(self):
1170        return (5,) + SoftmaxWithBias.c_code_cache_version()
1171
1172    def c_code(self, node, name, inp, out, sub):
1173        x, b, y_idx = inp
1174        nll, sm, am = out
1175        y_idx_type = node.inputs[2].type.dtype_specs()[1]
1176        am_type = y_idx_type
1177        dtype = node.inputs[0].type.dtype_specs()[1]
1178        code_template = ''.join(self.c_code_template(dtype))
1179        return code_template % dict(locals(), **sub)
1180
1181
1182class CrossentropySoftmax1HotWithBiasDx(gof.Op):
1183    """
1184    Gradient wrt x of the CrossentropySoftmaxArgmax1HotWithBias Op.
1185
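    For row i and column j, given the incoming gradient ``dy`` on the NLL and
    the softmax output ``sm``, this Op computes (see ``perform``):

        dx[i, j] = dy[i] * sm[i, j] - dy[i] * (1 if j == y_idx[i] else 0)
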
1186    """
1187
1188    nin = 3
1189    nout = 1
1190    __props__ = ()
1191
1192    def make_node(self, dy, sm, y_idx, **kwargs):
1193        dy = tensor.as_tensor_variable(dy)
1194        sm = tensor.as_tensor_variable(sm)
1195        y_idx = tensor.as_tensor_variable(y_idx)
1196        if (dy.type.ndim > 1 or
1197                dy.type.dtype not in tensor.float_dtypes):
1198            raise ValueError('dy must be {0,1}-d tensor of floats', dy.type)
1199        if (sm.type.ndim != 2 or
1200                sm.type.dtype not in tensor.float_dtypes):
1201            raise ValueError('sm must be 2-d tensor of floats', sm.type)
1202        if (y_idx.type.ndim != 1 or
1203                y_idx.type.dtype not in tensor.discrete_dtypes):
1204            raise ValueError('y_idx must be 1-d tensor of [u]ints', y_idx.type)
1205        return Apply(self, [dy, sm, y_idx], [sm.type()])
1206
1207    def perform(self, node, input_storage, output_storage):
1208        dy, sm, y_idx = input_storage
1209        if any(y_idx < 0):
1210            raise ValueError("y_i value out of bounds")
1211        dx = np.zeros_like(sm)
1212        if dy.ndim == 0:
1213            dy = dy[None]
1214        incr = int(dy.shape[0] > 1)
1215        for i in xrange(sm.shape[0]):
1216            dy_i = dy[i * incr]
1217            dx[i] = dy_i * sm[i]  # vector scale
1218            dx[i, y_idx[i]] -= dy_i  # scalar decrement
1219        output_storage[0][0] = dx
1220
1221    def infer_shape(self, node, shapes):
1222        return [shapes[1]]
1223
1224    def grad(self, inp, grads):
1225        dy, sm, y_idx = inp
1226        g_dx, = grads
1227        # TODO: currently we do not compute the gradient w.r.t. dy, because
1228        # advanced indexing is not working yet. When it works, do it to avoid
1229        # potentially misleading behavior in gradient computations! (although
1230        # typically we should not need the gradient w.r.t. dy).
1231        y_idx_range = tensor.arange(y_idx.shape[0])
1232        g_dy = tensor.sum(
1233            g_dx * subtensor.AdvancedIncSubtensor()(
1234                sm, tensor.fill(dy, -1), y_idx_range, y_idx), axis=1)
1235        g_sm = dy.dimshuffle(0, 'x') * g_dx
1236        g_y_idx = grad_not_implemented(self, 2, y_idx)
1237        return [g_dy, g_sm, g_y_idx]
1238
1239    def c_code_cache_version(self):
1240        return (6,)
1241
1242    def c_code(self, node, name, inp, out, sub):
1243        dnll, sm, y_idx = inp
1244        dx, = out
1245        y_idx_type = node.inputs[2].type.dtype_specs()[1]
1246        return """
1247        if ((PyArray_TYPE(%(dnll)s) != NPY_DOUBLE) &&
1248            (PyArray_TYPE(%(dnll)s) != NPY_FLOAT))
1249        {
1250            PyErr_SetString(PyExc_TypeError,
1251                 "dnll type should be float32 or float64");
1252            %(fail)s;
1253        }
1254        if ((PyArray_TYPE(%(sm)s) != NPY_DOUBLE) &&
1255            (PyArray_TYPE(%(sm)s) != NPY_FLOAT))
1256        {
1257            PyErr_SetString(PyExc_TypeError,
1258                 "sm type should be float32 or float64");
1259            %(fail)s;
1260        }
1261
1262        // new scope because of variable declaration
1263        // TODO: proper indentation, but the diff will get messy
1264        {
1265        // Get `dnll.shape[0]` or set it to zero if `dnll` is a scalar.
1266        const npy_intp %(dnll)s_dims0 = (PyArray_NDIM(%(dnll)s) > 0 ?
1267                                         PyArray_DIMS(%(dnll)s)[0] :
1268                                         (npy_intp) 0);
1269
1270        // Get `dnll.strides[0]` and set it to zero if `dnll` is a scalar
1271        // or a vector with just one element.
1272        const npy_intp %(dnll)s_strides0 = (%(dnll)s_dims0 > 1 ?
1273                                            PyArray_STRIDES(%(dnll)s)[0] :
1274                                            (npy_intp) 0);
1275
1276        if ((PyArray_NDIM(%(dnll)s) > 1)
1277            || (PyArray_NDIM(%(sm)s) != 2)
1278            || (PyArray_NDIM(%(y_idx)s) != 1))
1279        {
1280            PyErr_SetString(PyExc_ValueError, "rank error");
1281            %(fail)s;
1282        }
1283        if (%(dnll)s_dims0 != PyArray_DIMS(%(sm)s)[0] && %(dnll)s_dims0 > 1)
1284        {
1285            PyErr_Format(PyExc_ValueError,
1286                         "dnll.shape[0] (%%ld) != sm.shape[0] (%%ld)",
1287                         (long int)%(dnll)s_dims0,
1288                         (long int)PyArray_DIMS(%(sm)s)[0]);
1289            %(fail)s;
1290        }
1291        if (%(dnll)s_dims0 != PyArray_DIMS(%(y_idx)s)[0] && %(dnll)s_dims0 > 1)
1292        {
1293            PyErr_Format(PyExc_ValueError,
1294                         "dnll.shape[0] (%%ld) != y_idx.shape[0] (%%ld)",
1295                         (long int)%(dnll)s_dims0,
1296                         (long int)PyArray_DIMS(%(y_idx)s)[0]);
1297            %(fail)s;
1298        }
1299        if (PyArray_DIMS(%(sm)s)[0] !=
1300            PyArray_DIMS(%(y_idx)s)[0])
1301        {
1302            PyErr_SetString(PyExc_ValueError,
1303                            "sm.shape[0] != y_idx.shape[0]");
1304            %(fail)s;
1305        }
1306        if ((NULL == %(dx)s)
1307            || (PyArray_DIMS(%(dx)s)[0] != PyArray_DIMS(%(sm)s)[0])
1308            || (PyArray_DIMS(%(dx)s)[1] != PyArray_DIMS(%(sm)s)[1]))
1309        {
1310            if (NULL != %(dx)s) Py_XDECREF(%(dx)s);
1311            %(dx)s = (PyArrayObject*) PyArray_SimpleNew(2,
1312                                                        PyArray_DIMS(%(sm)s),
1313                                                        PyArray_TYPE(%(sm)s));
1314            if(!%(dx)s) {
1315                PyErr_SetString(PyExc_MemoryError,
1316                     "failed to alloc dx output");
1317                %(fail)s
1318            }
1319        }
1320
1321        for (size_t i = 0; i < PyArray_DIMS(%(dx)s)[0]; ++i)
1322        {
1323            const dtype_%(dnll)s dnll_i = ((dtype_%(dnll)s*)(PyArray_BYTES(%(dnll)s) + %(dnll)s_strides0 * i))[0];
1324
            const %(y_idx_type)s y_i = ((%(y_idx_type)s*)(PyArray_BYTES(%(y_idx)s) + PyArray_STRIDES(%(y_idx)s)[0] * i))[0];
1326
1327            const dtype_%(sm)s* __restrict__ sm_i = (dtype_%(sm)s*)(PyArray_BYTES(%(sm)s) + PyArray_STRIDES(%(sm)s)[0] * i);
1328            npy_intp Ssm = PyArray_STRIDES(%(sm)s)[1]/sizeof(dtype_%(sm)s);
1329
            dtype_%(dx)s* __restrict__ dx_i = (dtype_%(dx)s*)(PyArray_BYTES(%(dx)s) + PyArray_STRIDES(%(dx)s)[0] * i);
1331            npy_intp Sdx = PyArray_STRIDES(%(dx)s)[1]/sizeof(dtype_%(dx)s);
1332
1333            for (size_t j = 0; j < PyArray_DIMS(%(dx)s)[1]; ++j)
1334            {
1335                dx_i[j * Sdx] = dnll_i * sm_i[j * Ssm];
1336            }
1337            if (y_i >= PyArray_DIMS(%(dx)s)[1] || (y_i < 0))
1338            {
1339                PyErr_SetString(PyExc_ValueError, "y_i >= dx dimensions[1] or y_i < 0.");
1340                %(fail)s;
1341            }
1342            dx_i[y_i * Sdx] -= dnll_i;
1343        }
1344        }
1345        """ % dict(locals(), **sub)
1346
1347crossentropy_softmax_argmax_1hot_with_bias = \
1348    CrossentropySoftmaxArgmax1HotWithBias()
1349
1350crossentropy_softmax_1hot_with_bias_dx = \
1351    CrossentropySoftmax1HotWithBiasDx()
1352
1353
1354def crossentropy_softmax_1hot_with_bias(x, b, y_idx, **kwargs):
1355    return crossentropy_softmax_argmax_1hot_with_bias(x, b, y_idx,
1356                                                      **kwargs)[0:2]
1357
1358
1359def crossentropy_softmax_1hot(x, y_idx, **kwargs):
1360    b = tensor.zeros_like(x[0, :])
1361    return crossentropy_softmax_1hot_with_bias(x, b, y_idx, **kwargs)
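# Illustrative usage sketch of the helper above: the mean negative
# log-likelihood of a softmax classifier can be built directly as
#     nll, sm = crossentropy_softmax_1hot(tensor.dot(h, W) + b, y_idx)
#     cost = nll.mean()
# where h, W, b and y_idx are hypothetical symbolic variables.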
1362
1363
1364def crossentropy_softmax_max_and_argmax_1hot_with_bias(x, b, y_idx, **kwargs):
1365    """
1366    Returns
1367    -------
1368    object
1369        The cross-entropy, the softmax output, the max probability,
1370        and the argmax index.
1371
    TODO: Since we are recomputing the argmax,
          we might as well assert that it is correct.

    TODO: Make this entire function unnecessary? E.g. should
    CrossentropySoftmaxArgmax1HotWithBias return the appropriate
    information (i.e. the max probability) itself?
1378
1379    """
1380    (xent, softmax) = crossentropy_softmax_1hot_with_bias(x, b, y_idx, **kwargs)
1381    (max_pr, argmax) = tensor.max_and_argmax(softmax, axis=-1)
1382    return (xent, softmax, max_pr, argmax)
1383
1384
1385def crossentropy_softmax_max_and_argmax_1hot(x, y_idx, **kwargs):
1386    b = tensor.zeros_like(x[0, :])
1387    return crossentropy_softmax_max_and_argmax_1hot_with_bias(x, b, y_idx,
1388                                                              **kwargs)
1389
1390
1391class CrossentropyCategorical1HotGrad(gof.Op):
1392
1393    __props__ = ()
1394
1395    def make_node(self, g_y, coding_dist, true_one_of_n):
1396        return Apply(self, [g_y, coding_dist, true_one_of_n],
1397                     [coding_dist.type()])
1398
1399    def perform(self, node, inp, out):
1400        g_y, coding_dist, true_one_of_n = inp
1401        g_coding_strg, = out
1402        g_coding = np.zeros_like(coding_dist)
1403        for i in xrange(len(g_y)):
1404            g_coding[i, true_one_of_n[i]] = (-g_y[i] /
1405                                             coding_dist[i, true_one_of_n[i]])
1406        g_coding_strg[0] = g_coding
1407
1408    def infer_shape(self, node, in_shapes):
1409        return [in_shapes[1]]
1410
1411crossentropy_categorical_1hot_grad = CrossentropyCategorical1HotGrad()
1412
1413
1414class CrossentropyCategorical1Hot(gof.Op):
1415    r"""
1416    Compute the cross entropy between a coding distribution and
1417    a true distribution of the form [0, 0, ... 0, 1, 0, ..., 0].
1418
1419    .. math::
1420
        y[i] = - \log(coding_dist[i, one_of_n[i]])
1422
1423    Notes
1424    -----
1425    In the case that the coding distribution is the output of a
1426    softmax, an application of this Op will probably be optimized
1427    away in favour of one with a C implementation.
1428
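    Examples
    --------
    A minimal doctest-style sketch (it assumes ``crossentropy_categorical_1hot``
    is re-exported from ``theano.tensor.nnet``; the variable names are
    illustrative only):

    >>> import theano.tensor as T
    >>> from theano.tensor.nnet import crossentropy_categorical_1hot
    >>> coding_dist = T.matrix('coding_dist')
    >>> true_one_of_n = T.lvector('true_one_of_n')
    >>> xent = crossentropy_categorical_1hot(coding_dist, true_one_of_n)
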
1429    """
1430    __props__ = ()
1431
1432    def make_node(self, coding_dist, true_one_of_n):
1433        """
1434        Parameters
1435        ----------
1436        coding_dist : dense matrix
1437        true_one_of_n : lvector
1438
1439        Returns
1440        -------
1441        dvector
1442
1443        """
1444        _coding_dist = tensor.as_tensor_variable(coding_dist)
1445        _true_one_of_n = tensor.as_tensor_variable(true_one_of_n)
1446        if _coding_dist.type.ndim != 2:
1447            raise TypeError('matrix required for argument: coding_dist')
1448        if _true_one_of_n.type not in (tensor.lvector, tensor.ivector):
1449            raise TypeError(
                'integer vector required for argument: true_one_of_n '
1451                '(got type: %s instead of: %s)' % (_true_one_of_n.type,
1452                                                   tensor.lvector))
1453
1454        return Apply(self, [_coding_dist, _true_one_of_n],
1455                     [tensor.Tensor(dtype=_coding_dist.dtype,
1456                      broadcastable=[False])()])
1457
1458    def perform(self, node, inp, out):
1459        coding, one_of_n = inp
1460        y_out, = out
1461        y = np.zeros_like(coding[:, 0])
1462        for i in xrange(len(y)):
1463            y[i] = -np.log(coding[i, one_of_n[i]])
1464        y_out[0] = y
1465
1466    def infer_shape(self, node, in_shapes):
1467        return [(in_shapes[0][0],)]
1468
1469    def grad(self, inp, grads):
1470        coding, one_of_n = inp
1471        g_y, = grads
1472        return [crossentropy_categorical_1hot_grad(g_y, coding, one_of_n),
1473                grad_not_implemented(self, 1, one_of_n)]
1474
1475crossentropy_categorical_1hot = CrossentropyCategorical1Hot()
1476
1477
1478@opt.register_stabilize('fast_compile_gpu')
1479@opt.register_specialize('fast_compile_gpu')
1480@gof.optimizer
1481def crossentropy_to_crossentropy_with_softmax_with_bias(fgraph):
1482    """
1483    This is a stabilization optimization.
1484
1485    Notes
1486    -----
1487    Not a local optimization because we are replacing outputs
1488    from several nodes at once.
1489
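    Examples
    --------
    A sketch of the graph pattern this rewrite looks for (the variable names
    are illustrative only; both callables are defined in this module):

    >>> import theano.tensor as T
    >>> from theano.tensor.nnet.nnet import (crossentropy_categorical_1hot,
    ...                                      softmax_with_bias)
    >>> x = T.matrix('x')
    >>> b = T.vector('b')
    >>> y = T.lvector('y')
    >>> nll = crossentropy_categorical_1hot(softmax_with_bias(x, b), y)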
1490    """
1491
1492    def search_make_one_sub():
1493        for node in fgraph.toposort():
1494            if node.op == crossentropy_categorical_1hot:
1495                nll, = node.outputs
1496                sm, one_of_n = node.inputs
1497                if sm.owner and sm.owner.op == softmax_with_bias:
1498                    x, b = sm.owner.inputs
1499                    new_nll, new_sm, new_am = crossentropy_softmax_argmax_1hot_with_bias(
1500                        x, b, one_of_n)
1501                    fgraph.replace_all_validate(
1502                        [(nll, new_nll), (sm, new_sm)],
1503                        reason="crossentropy_to_crossentropy_with_softmax_with_bias")
1504                    return True
1505
1506        return False
1507
1508    while search_make_one_sub():
1509        pass
1510    return
1511
1512
1513@gof.optimizer
1514def crossentropy_to_crossentropy_with_softmax(fgraph):
1515    """
1516    This is a stabilization optimization that is more general than
1517    crossentropy_to_crossentropy_with_softmax_with_bias.
1518
    It must be executed after the local_softmax_with_bias optimization in
    the specialize phase.

    TODO: This is a stabilization optimization! How can this be expressed
    more cleanly?
1523
1524    Notes
1525    -----
1526    Not a local optimization because we are replacing outputs from several
1527    nodes at once.
1528
1529    """
1530
1531    def search_make_one_sub():
1532        for node in fgraph.toposort():
1533            if node.op == crossentropy_categorical_1hot:
1534                nll, = node.outputs
1535                sm, one_of_n = node.inputs
1536                if sm.owner and sm.owner.op == softmax_op:
1537                    x, = sm.owner.inputs
1538                    new_nll, new_sm, new_am = crossentropy_softmax_argmax_1hot_with_bias(
1539                        x, tensor.zeros_like(x[0]), one_of_n)
1540                    fgraph.replace_all_validate(
1541                        [(nll, new_nll), (sm, new_sm)],
1542                        reason="crossentropy_to_crossentropy_with_softmax")
1543                    return True
1544                if sm.owner and sm.owner.op == softmax_with_bias:
1545                    x, b = sm.owner.inputs
1546                    new_nll, new_sm, new_am = crossentropy_softmax_argmax_1hot_with_bias(x, b,
1547                                                                                         one_of_n)
1548                    fgraph.replace_all_validate(
1549                        [(nll, new_nll), (sm, new_sm)],
1550                        reason="crossentropy_to_crossentropy_with_softmax")
1551                    return True
1552
1553        return False
1554
1555    while search_make_one_sub():
1556        pass
1557    return
1558
1559optdb.register('crossentropy_to_crossentropy_with_softmax',
1560               crossentropy_to_crossentropy_with_softmax, 2.01,
1561               'fast_run', 'xent', 'fast_compile_gpu')
1562
1563
1564@opt.register_specialize(
1565    'fast_compile_gpu',
1566    'local_crossentropy_to_crossentropy_with_softmax_grad')  # old name
1567@gof.local_optimizer([softmax_grad])
1568def local_softmax_grad_to_crossentropy_with_softmax_grad(node):
1569    if node.op == softmax_grad:
1570        g_coding_dist, coding_dist = node.inputs
1571        if (g_coding_dist.owner and
1572                g_coding_dist.owner.op == crossentropy_categorical_1hot_grad):
1573            g_nll, coding_dist, true_one_of_n = g_coding_dist.owner.inputs
1574            dx = crossentropy_softmax_1hot_with_bias_dx(g_nll, coding_dist,
1575                                                        true_one_of_n)
1576            copy_stack_trace(node.outputs[0], dx)
1577            return [dx]
1578
1579
1580@opt.register_specialize('fast_compile_gpu')
1581@gof.local_optimizer([tensor.MaxAndArgmax])
1582def local_argmax_pushdown(node):
1583    if isinstance(node.op, tensor.MaxAndArgmax) and node.inputs[0].owner and \
1584            len(node.outputs[0].clients) > 0 and node.inputs[0].owner.op in \
1585            (softmax_op, softplus, tensor.exp, tensor.log, tensor.tanh, sigmoid,
1586             softmax_with_bias):
1587        if theano.config.warn.argmax_pushdown_bug:
            logging.getLogger('theano.tensor.nnet.nnet').warn(
                "WARNING: there was a bug in Theano in this case, fixed on "
                "May 27th, 2010. I.e. when taking the max of a softplus, "
                "softmax, exp, log, tanh, sigmoid or softmax_with_bias op, "
                "we were taking the max of the parent of the input. To "
                "remove this warning, set the Theano flag "
                "'warn.argmax_pushdown_bug' to False.")
1596
1597    if (isinstance(node.op, tensor.MaxAndArgmax) and
1598            node.inputs[0].owner and len(node.outputs[0].clients) == 0):
1599        x_max, x_argmax = node.outputs
1600        x = node.inputs[0]
1601        axis = node.op.get_params(node)
1602        # TODO: Make a list/set of monotonic ops...
1603        if x.owner and x.owner.op in (softmax_op, softplus, tensor.exp,
1604                                      tensor.log, tensor.tanh, sigmoid):
1605            pre_x, = x.owner.inputs
1606            ret = tensor.max_and_argmax(pre_x, axis)
1607            copy_stack_trace(x_max, ret)
1608            return ret
1609        if x.owner and x.owner.op == softmax_with_bias:
1610            pre_x, pre_bias = x.owner.inputs
1611            ret = tensor.max_and_argmax(pre_x +
1612                                        tensor.DimShuffle(
1613                                            pre_bias.broadcastable,
1614                                            ('x', 0))(pre_bias), axis)
1615            # copy both stack traces
1616            copy_stack_trace(x_max, ret)
1617            return ret
1618
# Utility function used by the next two optimizations
1620
1621
1622def _check_rows_is_arange_len_labels(rows, labels):
1623    """
1624    Check that 'rows' is the same node as T.arange(labels.shape[0]).
1625
1626    Also considers the case where labels.shape[0] is constant and equal
1627    to 1, and T.arange(labels.shape[0]) has been constant-folded into 0.
1628
1629    """
1630
1631    if labels.owner and hasattr(labels.owner.fgraph, 'shape_feature'):
1632        shape_of = labels.owner.fgraph.shape_feature.shape_of
1633        # TODO: consider cases where shape_of[labels] is constant, and
1634        # has a value different from 1.
1635        # This case is harder, as _is_const only accepts a scalar value
1636        # as second argument, so checking for
1637        # _is_const(rows, numpy.arange(...)) does not work for the moment.
1638        if len(shape_of[labels]) == 1 and _is_const(shape_of[labels][0], 1):
1639            return _is_const(rows, 0)
1640
1641    if rows.owner and isinstance(rows.owner.op, tensor.ARange):
1642        start, stop, step = rows.owner.inputs
1643        if getattr(start, 'data', None) != 0:  # constants will have data
1644            return False
1645        if getattr(step, 'data', None) != 1:  # constant step will have data
1646            return False
1647        if not stop.owner:
1648            return False
1649
        # Not sure if this case still happens after the introduction of
        # ShapeOptimizer, but we keep it in case ShapeOptimizer is not present.
1652        if isinstance(stop.owner.op, subtensor.Subtensor):
1653            shape_subtensor = stop.owner
1654            if shape_subtensor.op.get_constant_idx(shape_subtensor.inputs,
1655                                                   allow_partial=True) == [0]:
1656                shape_var = shape_subtensor.inputs[0]
1657                if shape_var.owner and shape_var.owner.op == tensor.shape:
1658                    return shape_var.owner.inputs[0] is labels
1659        else:
1660            shape_of = stop.owner.fgraph.shape_feature.shape_of
1661            return shape_of[labels][0] is stop
1662
1663
1664def _is_const(z, val, approx=False):
1665    try:
1666        maybe = opt.get_scalar_constant_value(z)
1667    except tensor.NotScalarConstantError:
1668        return False
1669    if approx:
1670        return np.allclose(maybe, val)
1671    else:
1672        return np.all(maybe == val)
1673
1674
1675@opt.register_specialize('fast_compile_gpu')
1676@gof.local_optimizer([subtensor.AdvancedSubtensor, tensor.log])
1677def local_advanced_indexing_crossentropy_onehot(node):
1678    log = None
1679    sm = None
1680    # First case: log(softmax(x))[rows, labels]
1681    if isinstance(node.op, subtensor.AdvancedSubtensor):
1682        try:
1683            log, rows, labels = node.inputs
1684        except Exception:
1685            pass
1686        if log and log.owner and log.owner.op == tensor.log:
1687            sm = log.owner.inputs[0]
1688
1689    # Second case: log(softmax(x)[rows, labels])
1690    elif node.op == tensor.log:
1691        pre_log = node.inputs[0].owner
1692        if pre_log and isinstance(pre_log.op, subtensor.AdvancedSubtensor):
1693            try:
1694                sm, rows, labels = pre_log.inputs
1695            except Exception:
1696                pass
1697
1698    if sm is not None and sm.owner and sm.owner.op in (softmax_op,
1699                                                       softmax_with_bias):
1700        sm_w_bias = local_softmax_with_bias.transform(sm.owner)
1701        if sm_w_bias:
1702            assert sm_w_bias[0].owner.op == softmax_with_bias
1703            x_var, b_var = sm_w_bias[0].owner.inputs
1704        else:
1705            x_var = sm.owner.inputs[0]
1706            b_var = tensor.zeros_like(x_var[0])
1707
1708        # Check that rows == arange(labels.shape[0])
1709        if _check_rows_is_arange_len_labels(rows, labels):
1710            if labels.ndim == 1 and x_var.ndim == 2:
1711                minus_ret = crossentropy_softmax_argmax_1hot_with_bias(x_var,
1712                                                                       b_var,
1713                                                                       labels)[0]
1714                ret = -minus_ret
1715                copy_stack_trace(node.outputs[0], [minus_ret, ret])
1716                return [ret]
1717
1718
1719@opt.register_specialize('fast_compile_gpu')
1720@gof.local_optimizer([softmax_grad])
1721def local_advanced_indexing_crossentropy_onehot_grad(node):
1722    if not (node.op == softmax_grad):
1723        return
1724
1725    sm = None
1726    try:
1727        d_sm, sm = node.inputs
1728    except Exception:
1729        return
1730
1731    if (sm is not None) and sm.owner and (sm.owner.op in (softmax_op,
1732                                                          softmax_with_bias)):
1733        sm_w_bias = local_softmax_with_bias.transform(sm.owner)
1734        if sm_w_bias:
1735            assert sm_w_bias[0].owner.op == softmax_with_bias
1736            x_var, b_var = sm_w_bias[0].owner.inputs
1737        else:
1738            x_var = sm.owner.inputs[0]
1739    else:
1740        return
1741
1742    # Two cases are supported:
1743    # 1. AdvancedIncSubtensor(
1744    #           zeros_like(softmax(x)),
1745    #           -out_grad / AdvancedSubtensor(softmax(x), arange(y.shape[0]), y),
1746    #           arange(y.shape[0]),
1747    #           y)
1748    #   which arises from the gradient of log(softmax(x)[arange(y.shape[0]), y])
1749    #
1750    # 2. AdvancedIncSubtensor(
1751    #           zeros_like(log(softmax(x))),
1752    #           -out_grad,
1753    #           arange(y.shape[0]),
1754    #           y)
1755    #           / softmax(x)
1756    #   which arises from the gradient of log(softmax(x))[arange(y.shape[0]), y]
1757    #
1758    # out_grad represents the gradient of the (final) cost wrt the output.
1759
1760    #
1761    # N.B. Regarding clients -- This substitution is important for numerical stability, so we
1762    # perform the substitution even when intermediate values have multiple clients.
1763    #
1764
1765    # First case.
    # After the check for AdvancedIncSubtensor, if anything does not fit with
    # the formula above, there's no way to fit it with the second case,
    # so we return immediately.
1769    if d_sm.owner and isinstance(d_sm.owner.op, subtensor.AdvancedIncSubtensor):
1770        try:
1771            z, incr, rows, labels = d_sm.owner.inputs
1772        except Exception:
1773            return
1774        # Check that z == zeros_like(softmax(x))
1775        # We know z has the right size because z has the same size as d_sm,
1776        # and d_sm and sm are both inputs of softmax_grad (so they have
1777        # the same size).
1778        if not _is_const(z, 0):
1779            return
1780
1781        # In the base case (output gradient = 1), incr is -1./sm[arange(len(y)), y]
1782        # Here, we are looking for the AdvancedSubtensor term (sm[arange(len(y)), y]),
1783        # and constructing out_grad by incorporating the other terms.
        # out_grad will be constructed in 3 steps as follows:
1785        # out_grad = +/- 1. (according to sign)
1786        # out_grad *= -numerator
1787        # out_grad /= denominator
1788        # Then, if out_grad is a scalar, it will be allocated as a vector
1789        adv_subtensor = None
1790        out_grad = 1.
1791
1792        # If there's a 'minus' sign before the whole expression, put it in
1793        # out_grad and iterate
1794        if incr.owner and incr.owner.op == tensor.neg:
1795            out_grad = - out_grad
1796            incr = incr.owner.inputs[0]
1797
1798        if incr.owner and incr.owner.op == tensor.true_div:
1799            num, denom = incr.owner.inputs
1800
1801            # set out_grad according to the numerator, it may be divided later
1802            # num should be a vector or a scalar
1803            if num.ndim == 1 or np.all(num.broadcastable):
1804                out_grad *= -num
1805            else:
1806                return
1807
1808            if not denom.owner:
1809                return
1810
1811            if isinstance(denom.owner.op, subtensor.AdvancedSubtensor):
1812                # Base case
1813                adv_subtensor = denom
1814                # out_grad /= 1.
1815            elif denom.owner.op == tensor.mul:
                # Try to find the AdvancedSubtensor node mentioned above,
                # and the output gradient
1818                for i, input in enumerate(denom.owner.inputs):
1819                    if input.owner and isinstance(input.owner.op,
1820                                                  subtensor.AdvancedSubtensor):
1821                        other_inputs = [in_ for (j, in_) in
1822                                        enumerate(denom.owner.inputs) if j != i]
1823                        if len(other_inputs) == 1:
1824                            rest = other_inputs[0]
1825                        else:
                            rest = tensor.mul(*other_inputs)
1827
1828                        # Check that rest is a vector or a scalar
1829                        if rest.ndim == 1 or np.all(rest.broadcastable):
1830                            adv_subtensor = input
1831                            out_grad /= rest
1832                            break
1833            else:
1834                return
1835
1836            # The output gradient needs to be a vector
1837            out_grad = tensor.fill(x_var[:, 0], out_grad)
1838
1839            if adv_subtensor is not None:
1840                try:
1841                    maybe_sm, maybe_rows, \
1842                        maybe_labels = adv_subtensor.owner.inputs
1843                except Exception:
1844                    return
1845
1846                if (not (maybe_sm is sm and
1847                         maybe_rows is rows and
1848                         maybe_labels is labels)):
1849                    return
1850                # else: OK
1851            else:
1852                return
1853        else:
1854            return
1855
1856        # Check that rows is arange(labels.shape[0])
1857        if not _check_rows_is_arange_len_labels(rows, labels):
1858            return
1859        # else, arguments of AdvancedIncSubtensor are OK,
1860        # it was really case 1.
1861
1862    # Second case
1863    elif d_sm.owner and d_sm.owner.op == tensor.true_div:
1864        # we're looking for
1865        # AdvIncSubtensor(zeros, grad_nll, arange(len(y)), y) / softmax
1866        try:
1867            num, denom = d_sm.owner.inputs
1868        except Exception:
1869            return
1870
1871        if denom != sm:
1872            return
1873
1874        # Check the numerator (AdvancedIncSubtensor)
1875        if num.owner and isinstance(num.owner.op, subtensor.AdvancedIncSubtensor):
1876            try:
1877                z, incr, rows, labels = num.owner.inputs
1878            except Exception:
1879                return
1880
1881            # Check z is zeros_like(log(sm))
1882            if not _is_const(z, 0):
1883                return
            if z.broadcastable not in [(False, False), (True, False)]:
                return
1886            # here we know that we are incrementing a matrix of zeros
1887            # (or a broadcasted vector).
1888            # Since d_sm and sm are the inputs of softmax_grad,
1889            # if the graph is valid, they have the same shape, so we
1890            # also know that z has the right shape.
1891
1892            if incr.ndim != 1 or incr.dtype not in tensor.float_dtypes:
1893                return
1894
1895            # here we know that we are incrementing some part of
1896            # matrix z by a vector
1897
1898            # unless the user has taken care to mark that the data and
1899            # labels have the same number of rows, we cannot be sure
            # here that len(y) == len(z). However, in the common case
            # that these are predictions and labels it is true. We
1902            # leave it to the Op to crash (and the user to complain)
1903            # if this assumption is ever not true.
1904
1905            out_grad = -incr
1906
1907            # Check that rows is arange(labels.shape[0])
1908            if not _check_rows_is_arange_len_labels(rows, labels):
1909                return
1910            # else, arguments of AdvancedIncSubtensor are OK
1911        else:
1912            return
1913
1914        # numerator and denominator are OK,
1915        # it was really case 2.
1916
1917    else:
1918        return
1919
1920    # Dimension check before substitution
1921    if labels.ndim == 1 and x_var.ndim == 2:
1922        ret = crossentropy_softmax_1hot_with_bias_dx(out_grad, sm, labels)
1923        # The stack trace is not added to output_grad, sm and labels at
1924        # the moment but may need to be added at a future point
1925        copy_stack_trace(node.outputs[0], ret)
1926        return [ret]
1927    else:
1928        return
1929
1930
1931@opt.register_specialize('fast_compile_gpu')
1932@gof.local_optimizer([softmax_with_bias])
1933def graph_merge_softmax_with_crossentropy_softmax(node):
1934    if node.op == softmax_with_bias:
1935        x, b = node.inputs
1936        for x_client in x.clients:
1937            if x_client[0].op == crossentropy_softmax_argmax_1hot_with_bias:
1938                big_client = x_client[0]
1939                if big_client in [b_client[0] for b_client in b.clients]:
1940                    xx, bb, ll = big_client.inputs
1941                    mergeable_client = big_client.op(x, b, ll)
1942                    copy_stack_trace(node.outputs[0], mergeable_client[1])
1943                    return [mergeable_client[1]]
1944
1945
1946@opt.register_specialize
1947@opt.register_stabilize
1948@opt.register_canonicalize
1949@gof.local_optimizer([CrossentropySoftmax1HotWithBiasDx])
1950def local_useless_crossentropy_softmax_1hot_with_bias_dx_alloc(node):
1951    """
1952    Replace a CrossentropySoftmax1HotWithBiasDx op, whose incoming gradient is
1953    an `alloc` of a scalar variable or one that has either broadcastable or
1954    matching dimensions with the output variable, by one that skips the
1955    intermediate `alloc`.
1956
1957    """
1958    if isinstance(node.op, CrossentropySoftmax1HotWithBiasDx):
1959        dy, sm, y_idx = node.inputs
1960
1961        # Those cases are directly handled by the internal broadcasting of the
1962        # `CrossentropySoftmax1HotWithBiasDx` op.
1963        if dy.ndim == 0:
1964            return False
1965        if dy.ndim == 1 and dy.broadcastable[0]:
1966            return False
1967
1968        assert dy.ndim == 1
1969
1970        if dy.owner is not None and isinstance(dy.owner.op, tensor.Alloc):
1971            # dz is the input of the Alloc op, i.e. T.alloc(dz, <shape>)
1972            dz = dy.owner.inputs[0]
1973
1974            try:
1975                shape_feature = node.fgraph.shape_feature
1976            except AttributeError:
1977                # The shape feature may not be available in some mode, but we
1978                # need it for this optimization, so don't continue.
1979                return False
1980
1981            shape_of = shape_feature.shape_of
1982            same_shape = shape_feature.same_shape
1983
1984            # Build `dz_broad` explicitly to include extra implicit dimensions.
1985            dz_broad = (True,) * (dy.ndim - dz.ndim) + dz.broadcastable
1986
1987            # If we can infer statically that the shape of `sm` and
1988            # `dy` are the same in dimension `k` or the shape of `dy` is equal
1989            # to 1 (which triggers the internal broadcasting in
1990            # `CrossentropySoftmax1HotWithBiasDx`) we do not need to
1991            # check it at runtime.
1992            if (dz_broad[0] and
1993                    not same_shape(sm, dy, dim_x=0, dim_y=0) and
1994                    shape_of[dy][0] != 1):
1995                # If `dz` is broadcastable, we need to check whether the shapes
1996                # of `dy` and `sm` are the same or whether the shape of `dy` is
1997                # equal to 1.
1998                cond = tensor.or_(tensor.eq(dy.shape[0], 1),
1999                                  tensor.eq(dy.shape[0], sm.shape[0]))
2000                msg = '`sm` and `dy` do not have the same shape.'
2001                dz = opt.Assert(msg)(dz, cond)
2002
2003            ret = node.op(dz, sm, y_idx)
2004            copy_stack_trace(node.outputs[0], ret)
2005            return [ret]
2006
2007
2008def binary_crossentropy(output, target):
2009    """
2010    Compute the crossentropy of binary random variables.
2011
2012    Output and target are each expectations of binary random
2013    variables; target may be exactly 0 or 1 but output must
2014    lie strictly between 0 and 1.
2015
2016    Notes
2017    -----
2018    We could use the x log y op to support output=0 and output=1.
2019    The gradient would still be undefined though.
2020
    We do not sum; crossentropy is computed per component.
2022    TODO : Rewrite as a scalar, and then broadcast to tensor.
2023
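    Examples
    --------
    A minimal symbolic usage sketch (the variable names are illustrative
    only):

    >>> import theano.tensor as T
    >>> from theano.tensor.nnet import binary_crossentropy
    >>> p = T.matrix('p')  # predictions, strictly between 0 and 1
    >>> t = T.matrix('t')  # targets in [0, 1]
    >>> per_element_xent = binary_crossentropy(p, t)
    >>> loss = per_element_xent.mean()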
2024    """
2025    return -(target * tensor.log(output) + (1.0 - target) * tensor.log(1.0 - output))
2026
2027
2028def sigmoid_binary_crossentropy(output, target):
2029    """
2030    Compute the cross-entropy of binary random variables.
2031
2032    `output` should be real-valued (range (-inf, +inf)); `sigmoid` will be
2033    applied to produce a (0, 1) valued input.
2034
2035    `target` is assumed to be probabilities in [0, 1].
2036
2037    Notes
2038    -----
2039    Mathematically equivalent to `binary_crossentropy(sigmoid(output), target)`,
2040    but with more efficient and numerically stable computation.
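
    Examples
    --------
    A minimal symbolic usage sketch; this function is defined in this module
    (``theano.tensor.nnet.nnet``) and the variable names are illustrative
    only:

    >>> import theano.tensor as T
    >>> from theano.tensor.nnet.nnet import sigmoid_binary_crossentropy
    >>> logits = T.matrix('logits')    # unbounded real-valued scores
    >>> targets = T.matrix('targets')  # probabilities in [0, 1]
    >>> loss = sigmoid_binary_crossentropy(logits, targets).mean()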
2041    """
2042    def grad(inputs, out_grads):
2043        (output, target), (out_grad,) = inputs, out_grads
2044        g_output = out_grad * (sigmoid(output) - target)
2045        g_target = out_grad * (-output)
2046        return [g_output, g_target]
2047    inp = [output, target]
2048    outp = softplus(-abs(output)) + output * ((output > 0) - target)
2049    return theano.OpFromGraph(inp, [outp], grad_overrides=grad, inline=True,
2050                              name='sigmoid_binary_crossentropy')(*inp)
2051
2052
2053def categorical_crossentropy(coding_dist, true_dist):
2054    r"""
2055    Return the cross-entropy between an approximating distribution and a true
2056    distribution.
2057
    .. warning:: THIS FUNCTION IS UNNECESSARILY POLYMORPHIC.
       We ultimately don't want the polymorphism, and will move this function
       to pylearn.algorithms.cost. The 1hot version will be removed.
       The length of the documentation here is a form of code smell.
2062
2063    The cross entropy between two probability distributions measures the average
2064    number of bits needed to identify an event from a set of possibilities, if a
2065    coding scheme is used based on a given probability distribution q, rather
2066    than the "true" distribution p.
2067
2068    Mathematically it is defined as follows:
2069
2070    .. math::
2071
2072        H(p,q) = - \sum_x p(x) \log(q(x))
2073
2074    Parameters
2075    ----------
2076    coding_dist : a dense matrix
2077        Each slice along axis represents one distribution.
2078    true_dist : a dense matrix or sparse matrix or integer vector
2079        In the case of a matrix argument, each slice along axis represents one
2080        distribution. In the case of an integer vector argument, each element
2081        represents the position of the '1' in a 1-of-N encoding.
2082
2083    Returns
2084    -------
2085    tensor of rank one-less-than `coding_dist`
2086        The cross entropy between each coding and true distribution.
2087
2088    Notes
2089    -----
2090    axis : int
2091        The dimension over which each distribution runs
2092        (1 for row distributions, 0 for column distributions).
2093
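    Examples
    --------
    A minimal symbolic sketch of the two accepted forms of `true_dist`
    (the variable names are illustrative only):

    >>> import theano.tensor as T
    >>> from theano.tensor.nnet import categorical_crossentropy, softmax
    >>> coding = softmax(T.matrix('scores'))
    >>> # dense true distributions, one row per example
    >>> dense_targets = T.matrix('dense_targets')
    >>> xent_dense = categorical_crossentropy(coding, dense_targets)
    >>> # integer targets, 1-of-N encoding
    >>> int_targets = T.lvector('int_targets')
    >>> xent_1hot = categorical_crossentropy(coding, int_targets)
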
2094    """
2095    if true_dist.ndim == coding_dist.ndim:
2096        return -tensor.sum(true_dist * tensor.log(coding_dist),
2097                           axis=coding_dist.ndim - 1)
2098    elif true_dist.ndim == coding_dist.ndim - 1:
2099        return crossentropy_categorical_1hot(coding_dist, true_dist)
2100    else:
2101        raise TypeError('rank mismatch between coding and true distributions')
2102
2103
2104class Prepend_scalar_constant_to_each_row(gof.Op):
2105
2106    __props__ = ()
2107
2108    def __init__(self, val=0):
2109        if isinstance(val, float):
2110            val = scalar.constant(val)
2111        self.val = val
2112
2113    def __str__(self):
2114        return '%s{%s}' % (self.__class__.__name__, self.val)
2115
2116    def make_node(self, mat):
2117        # check type of input
2118        x = tensor.as_tensor_variable(mat)
2119        if not mat.type.broadcastable == (False, False):
2120            raise TypeError("Expected a matrix as input")
2121        y = tensor.as_tensor_variable(self.val)
2122        assert y.ndim == 0
2123        if x.type.dtype != y.type.dtype:
            raise TypeError(
                "the value to prepend does not have the same type "
                "as the matrix")
2126
2127        node = Apply(op=self, inputs=[mat], outputs=[mat.type()])
2128        return node
2129
2130    def perform(self, node, inp, out):
2131        mat, = inp
2132        output, = out
2133        new_shape = (mat.shape[0], mat.shape[1] + 1)
2134        if output[0] is None:
2135            output[0] = np.empty(new_shape, dtype=mat.dtype)
2136            out = output[0]
2137        else:
2138            if output[0].shape != new_shape:
2139                try:
2140                    output[0].resize(new_shape)
2141                except Exception:
2142                    output[0] = np.empty(new_shape, dtype=mat.dtype)
2143            out = output[0]
2144
2145        out[:, 0].fill(self.val.data)
2146        out[:, 1:] = mat
2147
2148    def infer_shape(self, node, in_shapes):
2149        shp = (in_shapes[0][0], in_shapes[0][1] + 1)
2150        return [shp]
2151
2152    def grad(self, inp, grads):
2153        mat, = inp
2154        goutput, = grads
2155        return goutput[:, 1:]
2156
2157
2158class Prepend_scalar_to_each_row(gof.Op):
2159
2160    __props__ = ()
2161
2162    def make_node(self, val, mat):
2163        # check type of input
2164        x = tensor.as_tensor_variable(mat)
2165        if isinstance(val, float):
2166            val = scalar.constant(val)
2167        if not mat.type.broadcastable == (False, False):
2168            raise TypeError("Expected a matrix as input")
2169        y = tensor.as_tensor_variable(val)
2170        assert y.ndim == 0
2171        if x.type.dtype != y.type.dtype:
            raise TypeError(
                "the value to prepend does not have the same type "
                "as the matrix")
2174
2175        node = Apply(op=self, inputs=[val, mat], outputs=[mat.type()])
2176        return node
2177
2178    def perform(self, node, inp, out):
2179        val, mat = inp
2180        output, = out
2181        new_shape = (mat.shape[0], mat.shape[1] + 1)
2182        if output[0] is None:
2183            output[0] = np.empty(new_shape, dtype=mat.dtype)
2184            out = output[0]
2185        else:
2186            if output[0].shape != new_shape:
2187                try:
2188                    output[0].resize(new_shape)
2189                except Exception:
2190                    output[0] = np.empty(new_shape, dtype=mat.dtype)
2191            out = output[0]
2192        out[:, 0].fill(val)
2193        out[:, 1:] = mat
2194
2195    def infer_shape(self, node, in_shapes):
2196        shp = (in_shapes[1][0], in_shapes[1][1] + 1)
2197        return [shp]
2198
2199    def grad(self, inp, grads):
2200        val, mat = inp
2201        goutput, = grads
2202        return goutput[:, 0], goutput[:, 1:]
2203
2204prepend_scalar_to_each_row = Prepend_scalar_to_each_row()
2205prepend_0_to_each_row = Prepend_scalar_constant_to_each_row(0.)
2206prepend_1_to_each_row = Prepend_scalar_constant_to_each_row(1.)
2207
2208
2209def relu(x, alpha=0):
2210    """
2211    Compute the element-wise rectified linear activation function.
2212
2213    .. versionadded:: 0.7.1
2214
2215    Parameters
2216    ----------
2217    x : symbolic tensor
2218        Tensor to compute the activation function for.
    alpha : scalar or tensor, optional
2220        Slope for negative input, usually between 0 and 1. The default value
2221        of 0 will lead to the standard rectifier, 1 will lead to
2222        a linear activation function, and any value in between will give a
2223        leaky rectifier. A shared variable (broadcastable against `x`) will
2224        result in a parameterized rectifier with learnable slope(s).
2225
2226    Returns
2227    -------
2228    symbolic tensor
2229        Element-wise rectifier applied to `x`.
2230
2231    Notes
2232    -----
2233    This is numerically equivalent to ``T.switch(x > 0, x, alpha * x)``
2234    (or ``T.maximum(x, alpha * x)`` for ``alpha < 1``), but uses a faster
    formulation or an optimized Op, so we encourage you to use this function.
2236
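    Examples
    --------
    A minimal symbolic usage sketch (the variable names are illustrative
    only):

    >>> import theano.tensor as T
    >>> from theano.tensor.nnet import relu
    >>> x = T.matrix('x')
    >>> y = relu(x)              # standard rectifier
    >>> y_leaky = relu(x, 0.01)  # leaky rectifier with slope 0.01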
2237    """
2238    # This is probably the fastest implementation for GPUs. Both the forward
2239    # pass and the gradient get compiled into a single GpuElemwise call.
2240    # TODO: Check if it's optimal for CPU as well; add an "if" clause if not.
2241    # TODO: Check if there's a faster way for the gradient; create an Op if so.
2242    if alpha == 0:
2243        return 0.5 * (x + abs(x))
2244    else:
        # Convert alpha to a tensor variable first: if alpha were a numpy
        # scalar, the Python constants 0.5 and 1 would be treated as
        # float64 and cause an upcast of the result to float64.
2248        alpha = tensor.as_tensor_variable(alpha)
2249        f1 = 0.5 * (1 + alpha)
2250        f2 = 0.5 * (1 - alpha)
2251        return f1 * x + f2 * abs(x)
2252
2253
2254def h_softmax(x, batch_size, n_outputs, n_classes, n_outputs_per_class,
2255              W1, b1, W2, b2, target=None):
2256    """ Two-level hierarchical softmax.
2257
2258    This function implements a two-layer hierarchical softmax. It is commonly
2259    used as an alternative of the softmax when the number of outputs is
2260    important (it is common to use it for millions of outputs). See
2261    reference [1]_ for more information about the computational gains.
2262
2263    The `n_outputs` outputs are organized in `n_classes` classes, each class
2264    containing the same number `n_outputs_per_class` of outputs.
2265    For an input `x` (last hidden activation), the first softmax layer predicts
2266    its class and the second softmax layer predicts its output among its class.
2267
2268    If `target` is specified, it will only compute the outputs of the
2269    corresponding targets. Otherwise, if `target` is `None`, it will compute
2270    all the outputs.
2271
2272    The outputs are grouped in classes in the same order as they are initially
2273    defined: if `n_outputs=10` and `n_classes=2`, then the first class is
2274    composed of the outputs labeled `{0,1,2,3,4}` while the second class is
2275    composed of `{5,6,7,8,9}`. If you need to change the classes, you have to
2276    re-label your outputs.
2277
2278    .. versionadded:: 0.7.1
2279
2280    Parameters
2281    ----------
2282    x: tensor of shape (batch_size, number of features)
2283        the minibatch input of the two-layer hierarchical softmax.
2284    batch_size: int
2285        the size of the minibatch input x.
2286    n_outputs: int
2287        the number of outputs.
2288    n_classes: int
2289        the number of classes of the two-layer hierarchical softmax. It
2290        corresponds to the number of outputs of the first softmax. See note at
2291        the end.
2292    n_outputs_per_class: int
2293        the number of outputs per class. See note at the end.
2294    W1: tensor of shape (number of features of the input x, n_classes)
2295        the weight matrix of the first softmax, which maps the input x to the
2296        probabilities of the classes.
2297    b1: tensor of shape (n_classes,)
2298        the bias vector of the first softmax layer.
2299    W2: tensor of shape (n_classes, number of features of the input x,
2300            n_outputs_per_class)
2301        the weight matrix of the second softmax, which maps the input x to
2302        the probabilities of the outputs.
2303    b2: tensor of shape (n_classes, n_outputs_per_class)
2304        the bias vector of the second softmax layer.
2305    target: tensor of shape either (batch_size,) or (batch_size, 1)
2306        (optional, default None)
2307        contains the indices of the targets for the minibatch
2308        input x. For each input, the function computes the output for its
2309        corresponding target. If target is None, then all the outputs are
2310        computed for each input.
2311
2312    Returns
2313    -------
2314    tensor of shape (`batch_size`, `n_outputs`) or (`batch_size`, 1)
2315        Output tensor of the two-layer hierarchical softmax for input `x`.
2316        Depending on argument `target`, it can have two different shapes.
2317        If `target` is not specified (`None`), then all the outputs are
2318        computed and the returned tensor has shape (`batch_size`, `n_outputs`).
2319        Otherwise, when `target` is specified, only the corresponding outputs
2320        are computed and the returned tensor has thus shape (`batch_size`, 1).
2321
2322    Notes
2323    -----
    The product of `n_outputs_per_class` and `n_classes` has to be greater
    than or equal to `n_outputs`. If it is strictly greater, the irrelevant
    outputs will be ignored.
    `n_outputs_per_class` and `n_classes` have to be the same as the
    corresponding dimensions of the tensors of `W1`, `b1`, `W2` and `b2`.
    The most computationally efficient configuration is when
    `n_outputs_per_class` and `n_classes` are equal to the square root of
    `n_outputs`.
2332
2333    Examples
2334    --------
2335    The following example builds a simple hierarchical softmax layer.
2336
2337    >>> import numpy as np
2338    >>> import theano
2339    >>> from theano import tensor
2340    >>> from theano.tensor.nnet import h_softmax
2341    >>>
2342    >>> # Parameters
2343    >>> batch_size = 32
2344    >>> n_outputs = 100
2345    >>> dim_x = 10  # dimension of the input
2346    >>> n_classes = int(np.ceil(np.sqrt(n_outputs)))
2347    >>> n_outputs_per_class = n_classes
2348    >>> output_size = n_outputs_per_class * n_outputs_per_class
2349    >>>
2350    >>> # First level of h_softmax
2351    >>> floatX = theano.config.floatX
2352    >>> W1 = theano.shared(
2353    ...     np.random.normal(0, 0.001, (dim_x, n_classes)).astype(floatX))
2354    >>> b1 = theano.shared(np.zeros((n_classes,), floatX))
2355    >>>
2356    >>> # Second level of h_softmax
2357    >>> W2 = np.random.normal(0, 0.001,
2358    ...     size=(n_classes, dim_x, n_outputs_per_class)).astype(floatX)
2359    >>> W2 = theano.shared(W2)
2360    >>> b2 = theano.shared(np.zeros((n_classes, n_outputs_per_class), floatX))
2361    >>>
2362    >>> # We can now build the graph to compute a loss function, typically the
2363    >>> # negative log-likelihood:
2364    >>>
2365    >>> x = tensor.imatrix('x')
2366    >>> target = tensor.imatrix('target')
2367    >>>
2368    >>> # This only computes the output corresponding to the target.
2369    >>> # The complexity is O(n_classes + n_outputs_per_class).
2370    >>> y_hat_tg = h_softmax(x, batch_size, output_size, n_classes,
2371    ...                      n_outputs_per_class, W1, b1, W2, b2, target)
2372    >>>
2373    >>> negll = -tensor.mean(tensor.log(y_hat_tg))
2374    >>>
2375    >>> # We may need to compute all the outputs (at test time usually):
2376    >>>
2377    >>> # This computes all the outputs.
2378    >>> # The complexity is O(n_classes * n_outputs_per_class).
2379    >>> output = h_softmax(x, batch_size, output_size, n_classes,
2380    ...                    n_outputs_per_class, W1, b1, W2, b2)
2381
2382
2383    References
2384    ----------
    .. [1] J. Goodman, "Classes for Fast Maximum Entropy Training,"
        ICASSP, 2001, http://arxiv.org/abs/cs/0108006
2387    """
2388
2389    # First softmax that computes the probabilities of belonging to each class
2390    class_probs = theano.tensor.nnet.softmax(tensor.dot(x, W1) + b1)
2391
    if target is None:  # Computes the probabilities of all the outputs
2393
2394        # Second softmax that computes the output probabilities
2395        activations = tensor.tensordot(x, W2, (1, 1)) + b2
2396        output_probs = theano.tensor.nnet.softmax(
2397            activations.reshape((-1, n_outputs_per_class)))
2398        output_probs = output_probs.reshape((batch_size, n_classes, -1))
2399        output_probs = class_probs.dimshuffle(0, 1, 'x') * output_probs
2400        output_probs = output_probs.reshape((batch_size, -1))
2401        # output_probs.shape[1] is n_classes * n_outputs_per_class, which might
2402        # be greater than n_outputs, so we ignore the potential irrelevant
2403        # outputs with the next line:
2404        output_probs = output_probs[:, :n_outputs]
2405
2406    else:  # Computes the probabilities of the outputs specified by the targets
2407
2408        target = target.flatten()
2409
        # Class to which each target belongs
2411        target_classes = target // n_outputs_per_class
2412
        # Output to which each target belongs within its class
2414        target_outputs_in_class = target % n_outputs_per_class
2415
2416        # Second softmax that computes the output probabilities
2417        activations = sparse_block_dot(
2418            W2.dimshuffle('x', 0, 1, 2), x.dimshuffle(0, 'x', 1),
2419            tensor.zeros((batch_size, 1), dtype='int32'), b2,
2420            target_classes.dimshuffle(0, 'x'))
2421
2422        output_probs = theano.tensor.nnet.softmax(activations.dimshuffle(0, 2))
2423        target_class_probs = class_probs[tensor.arange(batch_size),
2424                                         target_classes]
2425        output_probs = output_probs[tensor.arange(batch_size),
2426                                    target_outputs_in_class]
2427        output_probs = target_class_probs * output_probs
2428
2429    return output_probs
2430
2431
2432def elu(x, alpha=1):
2433    """
2434    Compute the element-wise exponential linear activation function [2]_.
2435
2436    .. versionadded:: 0.8.0
2437
2438    Parameters
2439    ----------
2440    x : symbolic tensor
2441        Tensor to compute the activation function for.
    alpha : scalar
        Scale for the negative part; the standard ELU uses ``alpha=1``.
2444
2445    Returns
2446    -------
2447    symbolic tensor
2448        Element-wise exponential linear activation function applied to `x`.
2449
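    Examples
    --------
    A minimal symbolic usage sketch (the variable names are illustrative
    only):

    >>> import theano.tensor as T
    >>> from theano.tensor.nnet import elu
    >>> x = T.matrix('x')
    >>> y = elu(x)        # alpha defaults to 1
    >>> y2 = elu(x, 0.5)  # smaller negative saturation value
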
    References
    ----------
    .. [2] Djork-Arne Clevert, Thomas Unterthiner, Sepp Hochreiter,
        "Fast and Accurate Deep Network Learning by
        Exponential Linear Units (ELUs)", http://arxiv.org/abs/1511.07289
2455    """
2456    return tensor.switch(x > 0, x, alpha * tensor.expm1(x))
2457
2458
2459def selu(x):
2460    """Compute the element-wise Scaled Exponential Linear unit [3]_.
2461
2462    .. versionadded:: 0.9.0
2463
2464    Parameters
2465    ----------
2466    x : symbolic tensor
2467        Tensor to compute the activation function for.
2468
2469    Returns
2470    -------
2471    symbolic tensor
2472        Element-wise scaled exponential linear activation function applied to `x`.
2473
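    Examples
    --------
    A minimal symbolic usage sketch; ``selu`` is defined in this module
    (``theano.tensor.nnet.nnet``) and the variable name is illustrative only:

    >>> import theano.tensor as T
    >>> from theano.tensor.nnet.nnet import selu
    >>> x = T.matrix('x')
    >>> y = selu(x)
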
2474    References
2475    ----------
2476    .. [3] Klambauer G, Unterthiner T, Mayr A, Hochreiter S.
        "Self-Normalizing Neural Networks", https://arxiv.org/abs/1706.02515
2478    """
2479    alpha = 1.6732632423543772848170429916717
2480    scale = 1.0507009873554804934193349852946
2481    return scale * elu(x, alpha)
2482
2483
2484class ScalarSoftsign(theano.scalar.UnaryScalarOp):
2485    """
2486    Softsign activation function
    :math:`\\varphi(\\mathbf{x}) = \\frac{x}{1+|x|}`
2488
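    Examples
    --------
    The element-wise ``softsign`` defined below in this module wraps this
    scalar op; a minimal symbolic usage sketch (the variable names are
    illustrative only):

    >>> import theano.tensor as T
    >>> from theano.tensor.nnet.nnet import softsign
    >>> x = T.matrix('x')
    >>> y = softsign(x)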
2489    """
2490    @staticmethod
2491    def static_impl(x):
2492        return x / (1.0 + abs(x))
2493
2494    def impl(self, x):
2495        return ScalarSoftsign.static_impl(x)
2496
2497    def grad(self, inp, grads):
2498        x, = inp
2499        gz, = grads
2500        if 'float' in x.type.dtype:
2501            d = (1.0 + abs(x))
2502            return [gz / (d * d)]
2503        else:
2504            return NotImplemented
2505
2506    def c_code(self, node, name, inp, out, sub):
2507        x, = inp
2508        z, = out
2509        if node.inputs[0].type in [theano.scalar.float32,
2510                                   theano.scalar.float64]:
2511            return "%(z)s = %(x)s / (1.0+fabs(%(x)s));" % locals()
2512        raise NotImplementedError('only floating point x is implemented')
2513
2514scalar_softsign = ScalarSoftsign(theano.scalar.upgrade_to_float,
2515                                 name='scalar_softsign')
2516softsign = elemwise.Elemwise(scalar_softsign, name='softsign')
2517
2518
2519def confusion_matrix(actual, pred):
2520    """
2521    Computes the confusion matrix of given vectors containing
2522    actual observations and predicted observations.
2523
2524    Parameters
2525    ----------
2526    actual : 1-d tensor variable
2527    pred : 1-d tensor variable
2528
2529    Returns
2530    -------
    conf_mat : Confusion matrix of actual and predicted observations, as shown below.
2532
2533               | Predicted
2534    ___________|___________
2535       Actual  |
2536               |
2537
    order : 1-d array giving the order of the entries along the rows and
        columns of `conf_mat`
2539
2540    Examples
2541    --------
2542    >>> import theano
2543    >>> from theano.tensor.nnet import confusion_matrix
2544
2545    >>> x = theano.tensor.vector()
2546    >>> y = theano.tensor.vector()
2547    >>> f = theano.function([x, y], confusion_matrix(x, y))
2548    >>> y_true = [2, 0, 2, 2, 0, 1]
2549    >>> y_pred = [0, 0, 2, 2, 0, 2]
2550    >>> print(f(y_true, y_pred))
2551    [array([[2, 0, 0],
2552       [0, 0, 1],
2553       [1, 0, 2]]), array([ 0.,  1.,  2.])]
2554    """
2555    if actual.ndim != 1:
2556        raise ValueError('actual must be 1-d tensor variable')
2557    if pred.ndim != 1:
2558        raise ValueError('pred must be 1-d tensor variable')
2559
2560    order = extra_ops.Unique(False, False, False)(tensor.concatenate([actual, pred]))
2561
2562    colA = actual.dimshuffle(0, 'x')
2563    colP = pred.dimshuffle(0, 'x')
2564
2565    oneHotA = tensor.eq(colA, order).astype('int64')
2566    oneHotP = tensor.eq(colP, order).astype('int64')
2567
2568    conf_mat = tensor.dot(oneHotA.T, oneHotP)
2569    return [conf_mat, order]
2570