"""
Provides neural-network specific Ops.

Notes
-----
TODO: factor this out into a neural-network toolbox.

We register all optimizations with the gpu tag because we don't
implement all the intermediate cases on the GPU (in particular
AdvancedSubtensor). So, to make sure it runs well on the GPU with
fast_compile, we register them as needed for the GPU. This can be
revisited later when all the intermediate parts are on the GPU.

"""
from __future__ import absolute_import, print_function, division
import logging
import warnings
import numpy as np
from six.moves import xrange

import theano
from theano import gof
from theano import scalar
from theano.tensor import extra_ops, as_tensor_variable
from theano.gof.opt import copy_stack_trace
from theano.tensor import basic as tensor, subtensor, opt, elemwise
from theano.tensor.type import (values_eq_approx_remove_inf,
                                values_eq_approx_remove_nan)
from theano.compile import optdb
from theano.gof import Apply

from theano.tensor.nnet.sigm import sigmoid, softplus
from theano.gradient import DisconnectedType
from theano.gradient import grad_not_implemented
from theano.tensor.nnet.blocksparse import sparse_block_dot

############
#
# TENSOR OPS
#


class SoftmaxWithBias(gof.Op):
    """
    An L{Op} for the output of neural-net multiclass classifiers.

    Attributes
    ----------
    x : a matrix of floats (32 or 64)
    b : a [row] vector of floats (32 or 64), length is the number of columns in x

    This L{Op}'s output is softmax(x+b).
    softmax(x[i]) is the i'th distribution over len(x[i]) options.

    """

    nin = 2
    nout = 1
    __props__ = ()

    def make_node(self, x, b):
        x = tensor.as_tensor_variable(x)
        b = tensor.as_tensor_variable(b)
        if x.type.ndim != 2 \
                or x.type.dtype not in tensor.float_dtypes:
            raise ValueError('x must be 2-d tensor of floats')
        if b.type.ndim != 1 \
                or b.type.dtype not in tensor.float_dtypes:
            raise ValueError('b must be 1-d tensor of floats')

        sm = x.type()
        return Apply(self, [x, b], [sm])

    def perform(self, node, input_storage, output_storage):
        x, b = input_storage
        if b.shape[0] != x.shape[1]:
            raise ValueError('b must have same number of columns as x')

        # sm = numpy.zeros_like(x)
        # for i in xrange(sm.shape[0]):
        #     row = x[i] + b
        #     sm[i] = numpy.exp(row - numpy.max(row))
        #     sm[i] *= 1.0 / numpy.sum(sm[i])
        # output_storage[0][0] = sm

        if x.size == 0:
            # Numpy doesn't like the max of a zero-sized object.
            output_storage[0][0] = np.zeros(x.shape, dtype=x.dtype)
            return

        x_dtype = x.dtype
        # Perform computations in float32; otherwise the result is too imprecise.
        if x.dtype == 'float16':
            x = x.astype('float32')

        x_plus_b = x + b[None, :]
        e_x = np.exp(x_plus_b - x_plus_b.max(axis=1)[:, None])
        e_x *= 1.0 / e_x.sum(axis=1)[:, None]
        # The default for copy is True, and we don't need a copy if the
        # data type already matches.
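        # Illustrative note (not part of the Op's behaviour): the computation
        # above is the standard numerically stable softmax. Subtracting the
        # per-row max of (x + b) before exponentiating does not change the
        # result (softmax is invariant to adding a constant to a whole row)
        # but keeps every exponent <= 0, so nothing overflows. A minimal
        # NumPy sketch of the same computation, with a hypothetical helper
        # name:
        #
        #     def softmax_with_bias_reference(x, b):
        #         z = x + b[None, :]
        #         e = np.exp(z - z.max(axis=1, keepdims=True))
        #         return e / e.sum(axis=1, keepdims=True)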
101 output_storage[0][0] = e_x.astype(x_dtype, copy=False) 102 103 def L_op(self, inp, outputs, grads): 104 x, b = inp 105 g_sm, = grads 106 107 if isinstance(g_sm.type, DisconnectedType): 108 return [DisconnectedType()(), DisconnectedType()()] 109 110 dx = softmax_grad(g_sm, outputs[0]) 111 db = tensor.sum(dx, axis=0) 112 return dx, db 113 114 def infer_shape(self, node, shape): 115 return [shape[0]] 116 117 def c_headers(self): 118 return ['<iostream>', '<cmath>'] 119 120 @staticmethod 121 def c_code_template(dtype): 122 # this implementation was lifted from 123 # /u/bergstrj/cvs/bergstrj/src/feb07/nn.cxx 124 125 # TODO: put this into a templated function, in the support code 126 # TODO: declare the max of each row as an Op output 127 128 # TODO: set error messages for failures in this code 129 130 # TODO: use this to accept float32 and int32: 131 # node.inputs[0].type.dtype_specs()[1] 132 init_decl = """ 133 npy_intp* Nx = PyArray_DIMS(%(x)s); 134 npy_intp Sx = 0; 135 npy_intp Sb = 0; 136 npy_intp Ssm = 0; 137 138 139 if (PyArray_NDIM(%(x)s) != 2) 140 { 141 PyErr_SetString(PyExc_ValueError, "not a 2d tensor"); 142 %(fail)s; 143 } 144 if (PyArray_NDIM(%(b)s) != 1) 145 { 146 PyErr_SetString(PyExc_ValueError, "b not 1d tensor"); 147 %(fail)s; 148 } 149 if ((PyArray_TYPE(%(x)s) != NPY_DOUBLE) && 150 (PyArray_TYPE(%(x)s) != NPY_FLOAT)) 151 { 152 PyErr_SetString(PyExc_TypeError, "not a float"); 153 %(fail)s; 154 } 155 if ((PyArray_TYPE(%(b)s) != NPY_DOUBLE) && 156 (PyArray_TYPE(%(b)s) != NPY_FLOAT)) 157 { 158 PyErr_SetString(PyExc_TypeError, "b not float"); 159 %(fail)s; 160 } 161 if ((PyArray_DIMS(%(x)s)[1] != PyArray_DIMS(%(b)s)[0])) 162 { 163 PyErr_Format(PyExc_ValueError, 164 "number of columns in x (%%ld) does not match length of b (%%ld)", 165 (long int)PyArray_DIMS(%(x)s)[1], (long int)PyArray_DIMS(%(b)s)[0]); 166 %(fail)s; 167 } 168 169 if ((NULL == %(sm)s) 170 || (PyArray_DIMS(%(sm)s)[0] != PyArray_DIMS(%(x)s)[0]) 171 || (PyArray_DIMS(%(sm)s)[1] != PyArray_DIMS(%(x)s)[1])) 172 { 173 if (NULL != %(sm)s) Py_XDECREF(%(sm)s); 174 %(sm)s = (PyArrayObject*)PyArray_SimpleNew(2, PyArray_DIMS(%(x)s), 175 PyArray_TYPE(%(x)s)); 176 if(!%(sm)s) { 177 PyErr_SetString(PyExc_MemoryError, 178 "failed to alloc sm output"); 179 %(fail)s 180 } 181 } 182 Sx = PyArray_STRIDES(%(x)s)[1]/sizeof(dtype_%(x)s); 183 Sb = PyArray_STRIDES(%(b)s)[0]/sizeof(dtype_%(b)s); 184 Ssm = PyArray_STRIDES(%(sm)s)[1]/sizeof(dtype_%(sm)s); 185 186 """ 187 188 begin_row_loop = """ 189 for (size_t i = 0; i < Nx[0]; ++i) 190 { 191 size_t j; 192 double sum = 0.0; 193 194 const dtype_%(x)s* __restrict__ x_i = (dtype_%(x)s*)(PyArray_BYTES(%(x)s) + PyArray_STRIDES(%(x)s)[0] * i); 195 const dtype_%(b)s* __restrict__ b_i = (dtype_%(b)s*)(PyArray_BYTES(%(b)s)); 196 dtype_%(sm) s* __restrict__ sm_i = (dtype_%(sm)s*)(PyArray_BYTES(%(sm)s) + PyArray_STRIDES(%(sm)s)[0] * i); 197 198 npy_intp Sx = PyArray_STRIDES(%(x)s)[1]/sizeof(dtype_%(x)s); 199 npy_intp Sb = PyArray_STRIDES(%(b)s)[0]/sizeof(dtype_%(b)s); 200 npy_intp Ssm = PyArray_STRIDES(%(sm)s)[1]/sizeof(dtype_%(sm)s); 201 202 size_t row_max_j=0; 203 dtype_%(sm)s row_max = x_i[0] + b_i[0]; 204 //std::cout << "0 " << row_max << "\\n"; 205 // Get the maximum value of the row 206 for (j = 1; j < Nx[1]; ++j) 207 { 208 dtype_%(sm)s row_ij = x_i[j * Sx] + b_i[j * Sb]; 209 //std::cout << "1 " << row_ij << "\\n"; 210 row_max_j = (row_ij > row_max) ? j : row_max_j; 211 row_max = (row_ij > row_max) ? 
row_ij : row_max; 212 } 213 214 """ 215 216 inside_row_loop = """ 217 for (j = 0; j < Nx[1]; ++j) 218 { 219 dtype_%(sm)s row_ij = x_i[j * Sx] + b_i[j * Sb]; 220 //std::cout << "2 " << j << " " << row_ij << " " << row_max << "\\n"; 221 dtype_%(sm)s sm_ij = exp(row_ij - row_max); 222 //std::cout << "3 " << j << " " << sm_ij << "\\n"; 223 sum += sm_ij; 224 sm_i[j * Ssm] = sm_ij; 225 } 226 227 //cblas_dscal(x.N, 1.0 / sum, &mat_at(s,i,0), s.n); 228 double sum_inv = 1.0 / sum; 229 for (j = 0; j < Nx[1]; ++j) 230 { 231 sm_i[j * Ssm] *= sum_inv; 232 } 233 234 """ 235 236 # Get the vectorized version of exp if it exist 237 try: 238 vec_exp = theano.scalar.exp.c_code_contiguous_raw(dtype, 239 "Nx[1]", "sm_i", "sm_i") 240 inside_row_loop_contig = """ 241 for (j = 0; j < Nx[1]; ++j) 242 { 243 dtype_%%(sm)s row_ij = x_i[j * Sx] + b_i[j * Sb]; 244 //std::cout << "2 " << j << " " << row_ij << " " << row_max << "\\n"; 245 dtype_%%(sm)s sm_ij = row_ij - row_max; 246 //std::cout << "3 " << j << " " << sm_ij << "\\n"; 247 sm_i[j * Ssm] = sm_ij; 248 } 249 %(vec_exp)s; 250 for (j = 0; j < Nx[1]; ++j) 251 { 252 sum += sm_i[j * Ssm]; 253 } 254 255 //cblas_dscal(x.N, 1.0 / sum, &mat_at(s,i,0), s.n); 256 double sum_inv = 1.0 / sum; 257 for (j = 0; j < Nx[1]; ++j) 258 { 259 sm_i[j * Ssm] *= sum_inv; 260 } 261 262 """ % locals() 263 inside_row_loop = """ 264 if(Ssm == 1){ 265 %(inside_row_loop_contig)s 266 }else{ 267 %(inside_row_loop)s 268 } 269 """ % locals() 270 except theano.gof.utils.MethodNotDefined: 271 pass 272 end_row_loop = """ 273 } 274 """ 275 276 return (init_decl, begin_row_loop, inside_row_loop, end_row_loop) 277 278 def c_code(self, node, name, inp, out, sub): 279 x, b = inp 280 sm, = out 281 code_template = ''.join(self.c_code_template( 282 node.inputs[0].type.dtype_specs()[1])) 283 return code_template % dict(locals(), **sub) 284 285 @staticmethod 286 def c_code_cache_version(): 287 return (8,) 288 289softmax_with_bias = SoftmaxWithBias() 290 291 292class SoftmaxGrad(gof.Op): 293 """ 294 Gradient wrt x of the Softmax Op. 295 296 """ 297 298 nin = 2 299 nout = 1 300 __props__ = () 301 302 def make_node(self, dy, sm): 303 dy = tensor.as_tensor_variable(dy) 304 sm = tensor.as_tensor_variable(sm) 305 if dy.type.ndim not in (1, 2) \ 306 or dy.type.dtype not in tensor.float_dtypes: 307 raise ValueError('dy must be 1-d or 2-d tensor of floats. 
Got ', 308 dy.type) 309 if dy.ndim == 1: 310 dy = tensor.shape_padleft(dy, n_ones=1) 311 if sm.ndim == 1: 312 sm = tensor.shape_padleft(sm, n_ones=1) 313 return Apply(self, [dy, sm], [sm.type()]) 314 315 def perform(self, node, input_storage, output_storage): 316 dy, sm = input_storage 317 dx = np.zeros_like(sm) 318 # dx[i,j] = - (\sum_k dy[i,k] sm[i,k]) sm[i,j] + dy[i,j] sm[i,j] 319 for i in xrange(sm.shape[0]): 320 dy_times_sm_i = dy[i] * sm[i] 321 dx[i] = dy_times_sm_i - sum(dy_times_sm_i) * sm[i] 322 output_storage[0][0] = dx 323 324 def grad(self, inp, grads): 325 dy, sm = inp 326 g, = grads 327 328 tmp = g + tensor.neg(tensor.sum(g * sm, axis=1).dimshuffle((0, 'x'))) 329 g_dy = tmp * sm 330 331 tmp2 = tensor.sum(dy * sm, axis=1).dimshuffle((0, 'x')) 332 g_sm = tmp * dy - g * tmp2 333 334 return g_dy, g_sm 335 336 def infer_shape(self, node, shape): 337 return [shape[1]] 338 339 def c_code_cache_version(self): 340 return (3,) 341 342 def c_code(self, node, name, inp, out, sub): 343 dy, sm = inp 344 dx, = out 345 return ''' 346 if ((PyArray_TYPE(%(dy)s) != NPY_DOUBLE) && 347 (PyArray_TYPE(%(dy)s) != NPY_FLOAT)) 348 { 349 PyErr_SetString(PyExc_TypeError, 350 "types should be float or float64"); 351 %(fail)s; 352 } 353 if ((PyArray_TYPE(%(sm)s) != NPY_DOUBLE) && 354 (PyArray_TYPE(%(sm)s) != NPY_FLOAT)) 355 { 356 PyErr_SetString(PyExc_TypeError, 357 "types should be float or float64"); 358 %(fail)s; 359 } 360 if ((PyArray_NDIM(%(dy)s) != 2) 361 || (PyArray_NDIM(%(sm)s) != 2)) 362 { 363 PyErr_SetString(PyExc_ValueError, "rank error"); 364 %(fail)s; 365 } 366 if (PyArray_DIMS(%(dy)s)[0] != PyArray_DIMS(%(sm)s)[0]) 367 { 368 PyErr_SetString(PyExc_ValueError, "dy.shape[0] != sm.shape[0]"); 369 %(fail)s; 370 } 371 if ((NULL == %(dx)s) 372 || (PyArray_DIMS(%(dx)s)[0] != PyArray_DIMS(%(sm)s)[0]) 373 || (PyArray_DIMS(%(dx)s)[1] != PyArray_DIMS(%(sm)s)[1])) 374 { 375 Py_XDECREF(%(dx)s); 376 %(dx)s = (PyArrayObject*) PyArray_SimpleNew(2, 377 PyArray_DIMS(%(sm)s), 378 PyArray_TYPE(%(sm)s)); 379 if (!%(dx)s) 380 { 381 PyErr_SetString(PyExc_MemoryError, 382 "failed to alloc dx output"); 383 %(fail)s; 384 } 385 } 386 387 for (size_t i = 0; i < PyArray_DIMS(%(dx)s)[0]; ++i) 388 { 389 const dtype_%(dy)s* __restrict__ dy_i = (dtype_%(dy)s*) (PyArray_BYTES(%(dy)s) + PyArray_STRIDES(%(dy)s)[0] * i); 390 npy_intp Sdy = PyArray_STRIDES(%(dy)s)[1]/sizeof(dtype_%(dy)s); 391 const dtype_%(sm)s* __restrict__ sm_i = (dtype_%(sm)s*) (PyArray_BYTES(%(sm)s) + PyArray_STRIDES(%(sm)s)[0] * i); 392 npy_intp Ssm = PyArray_STRIDES(%(sm)s)[1]/sizeof(dtype_%(sm)s); 393 dtype_%(dx) s* __restrict__ dx_i = (dtype_%(dx)s*) (PyArray_BYTES(%(dx)s) + PyArray_STRIDES(%(dx)s)[0] * i); 394 npy_intp Sdx = PyArray_STRIDES(%(dx)s)[1]/sizeof(dtype_%(dx)s); 395 396 double sum_dy_times_sm = 0.; 397 for (size_t j = 0; j < PyArray_DIMS(%(dx)s)[1]; ++j) 398 { 399 dx_i[j * Sdx] = dy_i[j * Sdy] * sm_i[j * Ssm]; 400 sum_dy_times_sm += dx_i[j * Sdx]; 401 } 402 for (size_t j = 0; j < PyArray_DIMS(%(dx)s)[1]; ++j) 403 { 404 dx_i[j * Sdx] -= sum_dy_times_sm * sm_i[j * Ssm]; 405 } 406 } 407 ''' % dict(locals(), **sub) 408softmax_grad = SoftmaxGrad() 409 410 411class Softmax(gof.Op): 412 r""" 413 Softmax activation function 414 :math:`\\varphi(\\mathbf{x})_j = 415 \\frac{e^{\mathbf{x}_j}}{\sum_{k=1}^K e^{\mathbf{x}_k}}` 416 where :math:`K` is the total number of neurons in the layer. This 417 activation function gets applied row-wise. 
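
    A small usage sketch (illustrative only; assumes a working Theano
    install):

    >>> import theano
    >>> import theano.tensor as T
    >>> x = T.matrix('x')
    >>> f = theano.function([x], T.nnet.softmax(x))

    Each row of the array returned by ``f`` sums to 1. The gradient is
    handled by ``SoftmaxGrad`` above, which computes
    ``dx = (dy - sum(dy * sm, axis=1, keepdims=True)) * sm``.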
418 419 """ 420 421 nin = 1 422 nout = 1 423 __props__ = () 424 425 def make_node(self, x): 426 x = tensor.as_tensor_variable(x) 427 if x.type.ndim not in (1, 2) \ 428 or x.type.dtype not in tensor.float_dtypes: 429 raise ValueError('x must be 1-d or 2-d tensor of floats. Got %s' % 430 x.type) 431 if x.ndim == 1: 432 warnings.warn("DEPRECATION: If x is a vector, Softmax will not automatically pad x " 433 "anymore in next releases. If you need it, please do it manually. The " 434 "vector case is gonna be supported soon and the output will be a vector.", 435 stacklevel=4) 436 x = tensor.shape_padleft(x, n_ones=1) 437 438 return Apply(self, [x], [x.type()]) 439 440 def perform(self, node, input_storage, output_storage): 441 x, = input_storage 442 e_x = np.exp(x - x.max(axis=1)[:, None]) 443 sm = e_x / e_x.sum(axis=1)[:, None] 444 output_storage[0][0] = sm 445 446 def L_op(self, inp, outputs, grads): 447 x, = inp 448 g_sm, = grads 449 return [softmax_grad(g_sm, outputs[0])] 450 451 def R_op(self, inputs, eval_points): 452 # I think the Jacobian is symmetric so the R_op 453 # is the same as the grad 454 if None in eval_points: 455 return [None] 456 return self.L_op(inputs, [self(*inputs)], eval_points) 457 458 def infer_shape(self, node, shape): 459 return shape 460 461 def c_headers(self): 462 return ['<iostream>', '<cmath>'] 463 464 @staticmethod 465 def c_code_template(dtype): 466 # this implementation was lifted from 467 # /u/bergstrj/cvs/bergstrj/src/feb07/nn.cxx 468 469 # TODO: put this into a templated function, in the support code 470 # TODO: declare the max of each row as an Op output 471 472 # TODO: set error messages for failures in this code 473 474 # TODO: use this to accept float32 and int32: node.inputs[0].type.dtype_specs()[1] 475 init_decl = """ 476 npy_intp* Nx = PyArray_DIMS(%(x)s); 477 npy_intp Sx1 = 0; 478 npy_intp Ssm1 = 0; 479 480 if (PyArray_NDIM(%(x)s) != 2) 481 { 482 PyErr_SetString(PyExc_ValueError, "not a 2d tensor"); 483 %(fail)s; 484 } 485 if ((PyArray_TYPE(%(x)s) != NPY_DOUBLE) && 486 (PyArray_TYPE(%(x)s) != NPY_FLOAT)) 487 { 488 PyErr_SetString(PyExc_TypeError, "not a float"); 489 %(fail)s; 490 } 491 492 if ((NULL == %(sm)s) 493 || (PyArray_DIMS(%(sm)s)[0] != PyArray_DIMS(%(x)s)[0]) 494 || (PyArray_DIMS(%(sm)s)[1] != PyArray_DIMS(%(x)s)[1])) 495 { 496 Py_XDECREF(%(sm)s); 497 %(sm)s = (PyArrayObject*)PyArray_SimpleNew(2, PyArray_DIMS(%(x)s), 498 PyArray_TYPE(%(x)s)); 499 if(!%(sm)s) { 500 PyErr_SetString(PyExc_MemoryError, 501 "failed to alloc sm output"); 502 %(fail)s 503 } 504 } 505 Sx1 = PyArray_STRIDES(%(x)s)[1]/sizeof(dtype_%(x)s); 506 Ssm1 = PyArray_STRIDES(%(sm)s)[1]/sizeof(dtype_%(sm)s); 507 """ 508 509 begin_row_loop = """ 510 for (size_t i = 0; i < Nx[0]; ++i) 511 { 512 size_t j; 513 double sum = 0.0; 514 515 const dtype_%(x)s* __restrict__ x_i = (dtype_%(x)s*)(PyArray_BYTES(%(x)s) + PyArray_STRIDES(%(x)s)[0] * i); 516 dtype_%(sm) s* __restrict__ sm_i = (dtype_%(sm)s*)(PyArray_BYTES(%(sm)s) + PyArray_STRIDES(%(sm)s)[0] * i); 517 518 dtype_%(sm)s row_max = x_i[0]; 519 //std::cout << "0 " << row_max << "\\n"; 520 // Get the maximum value of the row 521 for (j = 1; j < Nx[1]; ++j) 522 { 523 dtype_%(sm)s row_ij = x_i[j * Sx1] ; 524 //std::cout << "1 " << row_ij << "\\n"; 525 row_max = (row_ij > row_max) ? 
row_ij : row_max; 526 } 527 528 """ 529 530 inside_row_loop = """ 531 for (j = 0; j < Nx[1]; ++j) 532 { 533 dtype_%(sm)s row_ij = x_i[j * Sx1] ; 534 //std::cout << "2 " << j << " " << row_ij << " " << row_max << "\\n"; 535 dtype_%(sm)s sm_ij = exp(row_ij - row_max); 536 //std::cout << "3 " << j << " " << sm_ij << "\\n"; 537 sum += sm_ij; 538 sm_i[j * Ssm1] = sm_ij; 539 } 540 541 //cblas_dscal(x.N, 1.0 / sum, &mat_at(s,i,0), s.n); 542 double sum_inv = 1.0 / sum; 543 for (j = 0; j < Nx[1]; ++j) 544 { 545 sm_i[j * Ssm1] *= sum_inv; 546 } 547 548 """ 549 # Get the vectorized version of exp if it exist 550 try: 551 vec_exp = theano.scalar.exp.c_code_contiguous_raw(dtype, 552 "Nx[1]", "sm_i", "sm_i") 553 inside_row_loop_contig = """ 554 for (j = 0; j < Nx[1]; ++j) 555 { 556 sm_i[j * Ssm1] = x_i[j * Sx1] - row_max; 557 } 558 %(vec_exp)s; 559 for (j = 0; j < Nx[1]; ++j) 560 { 561 sum += sm_i[j * Ssm1]; 562 } 563 564 //cblas_dscal(x.N, 1.0 / sum, &mat_at(s,i,0), s.n); 565 double sum_inv = 1.0 / sum; 566 for (j = 0; j < Nx[1]; ++j) 567 { 568 sm_i[j * Ssm1] *= sum_inv; 569 } 570 571 """ % locals() 572 573 inside_row_loop = """ 574 if(Ssm1 == 1){ 575 %(inside_row_loop_contig)s 576 }else{ 577 %(inside_row_loop)s 578 } 579 """ % locals() 580 except theano.gof.utils.MethodNotDefined: 581 pass 582 583 end_row_loop = """ 584 } 585 """ 586 return (init_decl, begin_row_loop, inside_row_loop, end_row_loop) 587 588 def c_code(self, node, name, inp, out, sub): 589 x, = inp 590 sm, = out 591 code_template = ''.join(self.c_code_template( 592 node.inputs[0].type.dtype_specs()[1])) 593 return code_template % dict(locals(), **sub) 594 595 @staticmethod 596 def c_code_cache_version(): 597 return (3,) 598 599softmax_op = Softmax() 600 601 602class LogSoftmax(gof.Op): 603 r""" 604 LogSoftmax activation function 605 :math:`\\varphi(\\mathbf{x})_j = 606 \\e^{(\mathbf{x}_j - log{\sum_{k=1}^K e^{\mathbf{x}_k})}} 607 where :math:`K` is the total number of neurons in the layer. This 608 activation function gets applied row-wise. 609 610 """ 611 __props__ = () 612 613 def make_node(self, x): 614 x = tensor.as_tensor_variable(x) 615 if x.type.ndim not in (1, 2) \ 616 or x.type.dtype not in tensor.float_dtypes: 617 raise ValueError('x must be 1-d or 2-d tensor of floats. Got %s' % 618 x.type) 619 if x.ndim == 1: 620 warnings.warn("DEPRECATION: If x is a vector, LogSoftmax will not automatically pad x " 621 "anymore in next releases. If you need it, please do it manually. 
The " 622 "vector case is gonna be supported soon and the output will be a vector.", 623 stacklevel=4) 624 x = tensor.shape_padleft(x, n_ones=1) 625 626 return Apply(self, [x], [x.type()]) 627 628 def perform(self, node, input_storage, output_storage): 629 x, = input_storage 630 xdev = x - x.max(axis=1)[:, None] 631 lsm = xdev - np.log(np.sum(np.exp(xdev), axis=1, 632 keepdims=True)) 633 output_storage[0][0] = lsm 634 635 def grad(self, inp, grads): 636 x, = inp 637 sm = softmax_op(x) 638 return [grads[0] - tensor.sum(grads[0], axis=1, keepdims=True) * sm] 639 640 def R_op(self, inputs, eval_points): 641 # I think the Jacobian is symmetric so the R_op 642 # is the same as the grad 643 if None in eval_points: 644 return [None] 645 return self.grad(inputs, eval_points) 646 647 def infer_shape(self, node, shape): 648 return shape 649 650 def c_headers(self): 651 return ['<cmath>'] 652 653 @staticmethod 654 def c_code_template(dtype): 655 init_decl = """ 656 npy_intp* Nx = PyArray_DIMS(%(x)s); 657 npy_intp Sx1 = 0; 658 npy_intp Ssm1 = 0; 659 660 if (PyArray_NDIM(%(x)s) != 2) 661 { 662 PyErr_SetString(PyExc_ValueError, "not a 2d tensor"); 663 %(fail)s; 664 } 665 if ((PyArray_TYPE(%(x)s) != NPY_DOUBLE) && 666 (PyArray_TYPE(%(x)s) != NPY_FLOAT)) 667 { 668 PyErr_SetString(PyExc_TypeError, "not a float"); 669 %(fail)s; 670 } 671 672 if ((NULL == %(sm)s) 673 || (PyArray_DIMS(%(sm)s)[0] != PyArray_DIMS(%(x)s)[0]) 674 || (PyArray_DIMS(%(sm)s)[1] != PyArray_DIMS(%(x)s)[1])) 675 { 676 Py_XDECREF(%(sm)s); 677 %(sm)s = (PyArrayObject*)PyArray_SimpleNew( 678 2, PyArray_DIMS(%(x)s), 679 PyArray_TYPE(%(x)s)); 680 if(!%(sm)s) { 681 PyErr_SetString(PyExc_MemoryError, 682 "failed to alloc sm output"); 683 %(fail)s 684 } 685 } 686 Sx1 = PyArray_STRIDES(%(x)s)[1]/sizeof(dtype_%(x)s); 687 Ssm1 = PyArray_STRIDES(%(sm)s)[1]/sizeof(dtype_%(sm)s); 688 """ 689 690 begin_row_loop = """ 691 // minibatch loop 692 for (size_t i = 0; i < Nx[0]; ++i) 693 { 694 size_t j; 695 double sum = 0.0; 696 697 const dtype_%(x)s* __restrict__ x_i = (dtype_%(x)s*)( 698 PyArray_BYTES(%(x)s) + PyArray_STRIDES(%(x)s)[0] * i); 699 dtype_%(sm)s* __restrict__ sm_i = (dtype_%(sm)s*)( 700 PyArray_BYTES(%(sm)s) + PyArray_STRIDES(%(sm)s)[0] * i); 701 702 dtype_%(sm)s row_max = x_i[0]; 703 // Get the maximum value of the row 704 for (j = 1; j < Nx[1]; ++j) 705 { 706 dtype_%(sm)s x_ij = x_i[j * Sx1] ; 707 row_max = (x_ij > row_max) ? x_ij : row_max; 708 } 709 """ 710 711 inside_row_loop = """ 712 // Compute xdev and sum(exp(xdev), axis=1) 713 double xdev_exp_row_sum = 0.0; 714 for (j = 0; j < Nx[1]; j++) 715 { 716 // use sm_i to temporary store xdev 717 sm_i[j * Ssm1] = (dtype_%(sm)s) (x_i[j * Sx1] - row_max); 718 xdev_exp_row_sum += exp(sm_i[j * Ssm1]); 719 } 720 721 // Write sm = xdev - log(sum(exp(xdev), axis=1)) 722 xdev_exp_row_sum = log(xdev_exp_row_sum); 723 for (j = 0; j < Nx[1]; ++j) 724 { 725 sm_i[j * Ssm1] -= (dtype_%(sm)s) xdev_exp_row_sum; 726 } 727 """ 728 end_row_loop = """ 729 } 730 """ 731 return (init_decl, begin_row_loop, inside_row_loop, end_row_loop) 732 733 def c_code(self, node, name, inp, out, sub): 734 x, = inp 735 sm, = out 736 code_template = ''.join(self.c_code_template( 737 node.inputs[0].type.dtype_specs()[1])) 738 return code_template % dict(locals(), **sub) 739 740 @staticmethod 741 def c_code_cache_version(): 742 return (0,) 743 744logsoftmax_op = LogSoftmax() 745 746 747# This is not registered in stabilize, as it cause some crossentropy 748# optimization to not be inserted. 
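# Illustrative note (added for clarity): replacing log(softmax(x)) by
# LogSoftmax(x) is a numerical-stability rewrite. Computed naively, softmax
# can underflow to 0 for strongly negative logits and log(0) gives -inf,
# while LogSoftmax.perform subtracts the row max first and stays finite.
# A minimal NumPy sketch of that computation, with a hypothetical name:
#
#     def log_softmax_reference(x):
#         xdev = x - x.max(axis=1, keepdims=True)
#         return xdev - np.log(np.exp(xdev).sum(axis=1, keepdims=True))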
749@opt.register_specialize('stabilize', 'fast_compile') 750@gof.local_optimizer([tensor.Elemwise]) 751def local_logsoftmax(node): 752 """ 753 Detect Log(Softmax(x)) and replace it with LogSoftmax(x) 754 755 Note: only forward pass is affected 756 """ 757 if (isinstance(node.op, tensor.Elemwise) and 758 isinstance(node.op.scalar_op, scalar.basic.Log) and 759 len(node.inputs) == 1 and 760 node.inputs[0].owner is not None and 761 isinstance(node.inputs[0].owner.op, Softmax)): 762 inVars = node.inputs[0].owner.inputs[0] 763 new_op = LogSoftmax() 764 ret = new_op(inVars) 765 ret .tag.values_eq_approx = values_eq_approx_remove_inf 766 copy_stack_trace([node.inputs[0], node.outputs[0]], ret) 767 return [ret] 768 769 770# This is not registered in stabilize, as it cause some crossentropy 771# optimization to not be inserted. 772@opt.register_specialize('stabilize', 'fast_compile') 773@gof.local_optimizer([SoftmaxGrad]) 774def local_logsoftmax_grad(node): 775 """ 776 Detect Log(Softmax(x))'s grad and replace it with LogSoftmax(x)'s grad 777 778 Note: only grad is affected 779 """ 780 if (isinstance(node.op, SoftmaxGrad) and 781 len(node.inputs) == 2 and 782 node.inputs[0].owner is not None and 783 node.inputs[0].owner.op == tensor.true_div and 784 len(node.inputs[0].owner.inputs) >= 2 and 785 node.inputs[0].owner.inputs[1].owner is not None and 786 node.inputs[0].owner.inputs[1].owner.op == softmax_op and 787 node.inputs[1] == node.inputs[0].owner.inputs[1] and 788 not ( 789 # skip if it will be optimized by 790 # local_advanced_indexing_crossentropy_onehot_grad 791 node.inputs[0].owner.op == tensor.true_div and 792 node.inputs[0].owner.inputs[0].owner is not None and 793 isinstance(node.inputs[0].owner.inputs[0].owner.op, 794 subtensor.AdvancedIncSubtensor))): 795 # get parameters from unoptimized op 796 sm = node.inputs[0].owner.inputs[1] 797 # sm_input = node.inputs[1].owner.inputs[0] 798 grads = node.inputs[0].owner.inputs[0] 799 if grads.broadcastable[1] and not sm.broadcastable[1]: 800 grads = tensor.alloc(grads, grads.shape[0], sm.shape[1]) 801 ret = grads - tensor.sum(grads, axis=1, keepdims=True) * sm 802 ret.tag.values_eq_approx = values_eq_approx_remove_nan 803 copy_stack_trace(node.outputs[0], ret) 804 return [ret] 805 806 807def softmax_graph(c): 808 return tensor.exp(c) / tensor.exp(c).sum(axis=-1, keepdims=True) 809 810 811def softmax(c): 812 c = as_tensor_variable(c) 813 if c.broadcastable[-1]: 814 warnings.warn("The softmax is applied on a dimension of shape 1, which does not have a semantic meaning.") 815 return softmax_op(c) 816 817 818def logsoftmax(c): 819 return logsoftmax_op(c) 820 821 822@opt.register_specialize('fast_compile_gpu') 823@gof.local_optimizer([softmax_op]) 824def local_softmax_with_bias(node): 825 """ 826 Try to turn softmax(sum_of_stuff) -> softmax_w_bias(matrix, bias). 827 828 """ 829 if node.op == softmax_op: 830 x, = node.inputs 831 if x.owner and x.owner.op == tensor.add: 832 vectors = [] 833 non_vectors = [] 834 for x_in in x.owner.inputs: 835 if list(x_in.type.broadcastable) == [True, False]: 836 # print isinstance(x_in.owner.op, 837 # tensor.DimShuffle) since specialization comes 838 # relatively late in optimization, we don't want to 839 # put in extra DimShuffles un-necessarily. 
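                # Illustrative summary (added for clarity): the overall effect
                # of this optimization is, e.g.,
                #     softmax(tensor.dot(h, W) + b)
                #         -> softmax_with_bias(tensor.dot(h, W), b)
                # Terms of the sum that are broadcasted row vectors are
                # collected into the bias argument; everything else stays in
                # the matrix argument.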
840 if (x_in.owner and 841 isinstance(x_in.owner.op, tensor.DimShuffle) and 842 list(x_in.owner.inputs[0].type.broadcastable) == [False]): 843 # cut out the DimShuffle that was broadcasting a vector 844 vectors.append(x_in.owner.inputs[0]) 845 else: 846 # insert an extra DimShuffle to correct the old one 847 vectors.append(tensor. 848 DimShuffle((True, False), (1,))(x_in)) 849 else: 850 non_vectors.append(x_in) 851 852 # If all the inputs were vectors or broadcasted vectors, 853 # we broadcast one of them to be used as a matrix 854 if len(non_vectors) == 0: 855 assert len(vectors) > 0 # we should have at least 1 input... 856 promoted_vector = vectors.pop() 857 non_vectors.append(tensor.shape_padleft(promoted_vector)) 858 assert non_vectors # not empty 859 860 if vectors: 861 # we're in business... 862 if len(vectors) > 1: 863 vector_sum = tensor.add(*vectors) 864 copy_stack_trace(x_in, vector_sum) 865 else: 866 vector_sum = vectors[0] 867 868 if len(non_vectors) > 1: 869 non_vector_sum = tensor.add(*non_vectors) 870 copy_stack_trace(x_in, non_vector_sum) 871 else: 872 non_vector_sum = non_vectors[0] 873 874 try: 875 sm_bias = softmax_with_bias(non_vector_sum, vector_sum) 876 copy_stack_trace(node.outputs[0], sm_bias) 877 except Exception: 878 # if our arguments have the wrong types, then 879 # forget about it 880 return 881 882 if sm_bias.type == node.outputs[0].type: 883 # This condition is not always true. See the test 884 # nnet/tests/test_nnet.py:T_SoftmaxWithBias.test_broadcast 885 return [sm_bias] 886 887 888def softmax_simplifier(numerators, denominators): 889 for numerator in list(numerators): 890 # TODO: a single softmax'd vector?? 891 if not numerator.type.dtype.startswith('float'): 892 continue 893 894 if numerator.ndim != 2: 895 continue 896 if numerator.owner and numerator.owner.op == tensor.exp: 897 x = numerator.owner.inputs[0] 898 else: 899 continue 900 901 matching_denom = None 902 903 for denominator in denominators: 904 if denominator.owner and isinstance(denominator.owner.op, 905 tensor.DimShuffle): 906 if denominator.owner.op.new_order == (0, 'x'): 907 z = denominator.owner.inputs[0] 908 # thing getting dimshuffled 909 if z.owner and isinstance(z.owner.op, tensor.Sum): 910 # print 'ASDF', denominator.owner.op.new_order 911 # print z.owner.op.axis 912 if z.owner.op.axis == (1,): 913 # print "almost there.. softmax", x, z.owner.inputs[0] 914 if z.owner.inputs[0] is numerator: 915 matching_denom = denominator 916 break 917 if matching_denom: 918 softmax = softmax_op(x) 919 copy_stack_trace(numerator, softmax) 920 numerators.remove(numerator) 921 denominators.remove(matching_denom) 922 numerators.append(softmax) 923 924 return numerators, denominators 925opt.local_mul_canonizer.add_simplifier(softmax_simplifier, 'softmax_simplifier') 926 927 928class CrossentropySoftmaxArgmax1HotWithBias(gof.Op): 929 """ 930 A special compound L{Op} for the output of neural-net classifiers. 931 932 Parameters 933 ---------- 934 x : a matrix of floats (32 or 64) 935 b : a [row] vector of floats (32 or 64), length is number of cols in x 936 y_idx : a [column] vector of int (32 or 64), length is number of rows in x 937 938 Returns 939 ------- 940 object 941 row-wise NLL, softmax(x+b), row-wise argmax of (x+b). 
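        That is, for each row i (one example), nll[i] equals
        -log(softmax(x[i] + b)[y_idx[i]]), the negative log-probability
        assigned to the correct class of example i.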
942 943 @precondition: every entry in y_idx is a valid (non-negative) 944 column index into x 945 946 This L{Op} has three outputs: 947 - KL(softmax(x+b), y) 948 - softmax(x+b) 949 - argmax(x+b) 950 951 softmax(x[i]) is the i'th distribution over len(x[i]) options 952 argmax(x) is the index of x's greatest element 953 y_idx[i] is an integer index, encoding a 1-hot distribution. 954 955 In practice, when we are trying to do classification, we have one row in x 956 and y_idx per example, and y[i] is the index of the (correct) class of the 957 i'th example. 958 959 """ 960 961 nin = 3 962 nout = 3 963 __props__ = () 964 965 def __init__(self, **kwargs): 966 gof.Op.__init__(self, **kwargs) 967 968 def make_node(self, x, b, y_idx): 969 x = tensor.as_tensor_variable(x) 970 b = tensor.as_tensor_variable(b) 971 y_idx = tensor.as_tensor_variable(y_idx) 972 if x.type.ndim != 2 \ 973 or x.type.dtype not in tensor.float_dtypes: 974 raise ValueError('x must be 2-d tensor of floats', x.type) 975 if b.type.ndim != 1 \ 976 or x.type.dtype not in tensor.float_dtypes: 977 raise ValueError('b must be 1-d tensor of floats', b.type) 978 if y_idx.type.ndim != 1 \ 979 or y_idx.type.dtype not in tensor.discrete_dtypes: 980 raise ValueError('y_idx must be 1-d tensor of [u]ints', y_idx.type) 981 982# TODO: Is this correct? It used to be y, not y_idx 983 nll = tensor.TensorType(x.type.dtype, 984 y_idx.type.broadcastable).make_variable() 985# nll = TensorType(x.dtype, y.broadcastable) 986 sm = x.type() 987 am = y_idx.type() 988 return Apply(self, [x, b, y_idx], [nll, sm, am]) 989 990 def perform(self, node, input_storage, output_storage): 991 """ 992 The math, where x is an input vector, and t is a target index: 993 994 softmax(x)[i] = exp(x[i]) / sum_j(exp(x[j])) 995 nll(x,t) = -log(softmax(x)[t]) 996 997 We compute this by subtracting off the max of x. This avoids 998 numerical instability. 
999 1000 m = max_j x[j] 1001 softmax(x)[i] = exp(x[i] -m) / sum_j(exp(x[j] - m)) 1002 1003 nll = -log(exp(x[t] -m) / sum_j(exp(x[j] - m))) 1004 = -x[t] + m + log( sum_j(exp(x[j] - m))) 1005 1006 """ 1007 x, b, y_idx = input_storage 1008 if b.shape[0] != x.shape[1]: 1009 raise ValueError('b must have same number of columns as x') 1010 if y_idx.shape[0] != x.shape[0]: 1011 raise ValueError('y_idx must have same number of rows as x') 1012 if any(y_idx < 0): 1013 raise ValueError("y_i value out of bounds") 1014 sm = np.zeros_like(x) # softmax 1015 nll = np.zeros(x.shape[0], dtype=node.outputs[0].type.dtype) # nll(y | softmax(x)) 1016 am = np.zeros_like(y_idx) 1017 for i in xrange(sm.shape[0]): 1018 # add the bias vector to the i'th row of x 1019 row = x[i] + b 1020 1021 # get the maximum value of i'th row for numerically safe 1022 # softmax / nll 1023 am[i] = np.argmax(row) 1024 m = row[am[i]] 1025 1026 # compute the unnormalized softmax, and normalization constant 1027 sm[i] = np.exp(row - m) 1028 sum_j = np.sum(sm[i]) # sum_j(exp(x[j] - m)) 1029 1030 # normalized our softmax 1031 sm[i] *= 1.0 / sum_j 1032 1033 # store the nll 1034 nll[i] = -row[y_idx[i]] + m + np.log(sum_j) 1035 1036 output_storage[0][0] = nll 1037 output_storage[1][0] = sm 1038 output_storage[2][0] = am 1039 1040 def infer_shape(self, node, shapes): 1041 x_shp, b_shp, idx_shp = shapes 1042 nll_shp = (x_shp[0],) 1043 sm_shp = x_shp 1044 am_shp = idx_shp 1045 return [nll_shp, sm_shp, am_shp] 1046 1047 def connection_pattern(self, node): 1048 1049 return [[True, True, True], # x 1050 [True, True, True], # b 1051 [False, False, True]] # y_idx 1052 1053 def grad(self, inp, grads): 1054 x, b, y_idx = inp 1055 g_nll, g_sm, g_am = grads 1056 1057 dx_terms = [] 1058 db_terms = [] 1059 d_idx_terms = [] 1060 1061 if not isinstance(g_nll.type, DisconnectedType): 1062 nll, sm = crossentropy_softmax_1hot_with_bias(x, b, y_idx) 1063 dx = crossentropy_softmax_1hot_with_bias_dx(g_nll, sm, y_idx) 1064 db = tensor.sum(dx, axis=[0]) 1065 dx_terms.append(dx) 1066 db_terms.append(db) 1067 1068 if not isinstance(g_sm.type, DisconnectedType): 1069 dx, db = softmax_with_bias.L_op((x, b), [softmax_with_bias(x, b)], (g_sm, )) 1070 dx_terms.append(dx) 1071 db_terms.append(db) 1072 1073 if not isinstance(g_am.type, DisconnectedType): 1074 dx_terms.append(x.zeros_like()) 1075 db_terms.append(b.zeros_like()) 1076 d_idx_terms.append(y_idx.zeros_like()) 1077 1078 def fancy_sum(terms): 1079 if len(terms) == 0: 1080 return DisconnectedType()() 1081 rval = terms[0] 1082 for term in terms[1:]: 1083 rval = rval + term 1084 return rval 1085 1086 return [fancy_sum(terms) for terms in 1087 [dx_terms, db_terms, d_idx_terms]] 1088 1089 def c_headers(self): 1090 return ['<iostream>', '<cmath>'] 1091 1092 @staticmethod 1093 def c_code_template(dtype): 1094 # this implementation was lifted from 1095 # /u/bergstrj/cvs/bergstrj/src/feb07/nn.cxx 1096 1097 # TODO: put this into a templated function, in the support code 1098 # TODO: declare the max of each row as an Op output 1099 1100 # TODO: set error messages for failures in this code 1101 1102 # TODO: use this to accept float32 and int32: node.inputs[0].type.dtype_specs()[1] 1103 (init_decl, begin_row_loop, inside_row_loop, end_row_loop) = \ 1104 SoftmaxWithBias.c_code_template(dtype) 1105 return (init_decl, 1106 """ 1107 if (PyArray_NDIM(%(y_idx)s) != 1) 1108 { 1109 PyErr_SetString(PyExc_ValueError, "y_idx not 1d tensor"); 1110 %(fail)s; 1111 } 1112 if (PyArray_DIMS(%(x)s)[0] != PyArray_DIMS(%(y_idx)s)[0]) 1113 { 1114 
PyErr_Format(PyExc_ValueError, 1115 "number of rows in x (%%ld) does not match length of y (%%ld)", 1116 (long int)PyArray_DIMS(%(x)s)[0], 1117 (long int)PyArray_DIMS(%(y_idx)s)[0]); 1118 %(fail)s; 1119 } 1120 1121 if ((NULL == %(nll)s) //initial condition 1122 || (PyArray_DIMS(%(nll)s)[0] != PyArray_DIMS(%(y_idx)s)[0])) 1123 { 1124 if (NULL != %(nll)s) Py_XDECREF(%(nll)s); 1125 %(nll)s = (PyArrayObject*)PyArray_SimpleNew(1, 1126 PyArray_DIMS(%(y_idx)s), PyArray_TYPE(%(x)s)); 1127 if(!%(nll)s) 1128 { 1129 PyErr_SetString(PyExc_MemoryError, 1130 "failed to alloc nll output"); 1131 %(fail)s; 1132 } 1133 } 1134 if ((NULL == %(am)s) 1135 || (PyArray_DIMS(%(am)s)[0] != PyArray_DIMS(%(y_idx)s)[0])) 1136 { 1137 Py_XDECREF(%(am)s); 1138 %(am)s = (PyArrayObject*) PyArray_SimpleNew(1, 1139 PyArray_DIMS(%(y_idx)s), PyArray_TYPE(%(y_idx)s)); 1140 if(!%(am)s) 1141 { 1142 PyErr_SetString(PyExc_MemoryError, 1143 "failed to alloc am output"); 1144 %(fail)s; 1145 } 1146 } 1147 """, 1148 begin_row_loop, 1149 """ 1150 const %(y_idx_type) s y_i = ((%(y_idx_type)s*)(PyArray_BYTES(%(y_idx)s) + PyArray_STRIDES(%(y_idx)s)[0] * i))[0]; 1151 dtype_%(nll) s* __restrict__ nll_i = (dtype_%(nll)s*)(PyArray_BYTES(%(nll)s) + PyArray_STRIDES(%(nll)s)[0] * i); 1152 %(am_type)s* __restrict__ am_i = (%(am_type)s*) (PyArray_BYTES(%(am)s) + PyArray_STRIDES(%(am)s)[0] * i); 1153 """, 1154 inside_row_loop, 1155 """ 1156 if ((y_i >= PyArray_DIMS(%(x)s)[1]) || (y_i < 0)) 1157 { 1158 PyErr_SetString(PyExc_ValueError, "y_i value out of bounds"); 1159 %(fail)s; 1160 } 1161 nll_i[0] = - x_i[y_i*Sx] 1162 - b_i[y_i*Sb] 1163 + row_max 1164 + log(sum); 1165 am_i[0] = row_max_j; 1166 """, 1167 end_row_loop) 1168 1169 def c_code_cache_version(self): 1170 return (5,) + SoftmaxWithBias.c_code_cache_version() 1171 1172 def c_code(self, node, name, inp, out, sub): 1173 x, b, y_idx = inp 1174 nll, sm, am = out 1175 y_idx_type = node.inputs[2].type.dtype_specs()[1] 1176 am_type = y_idx_type 1177 dtype = node.inputs[0].type.dtype_specs()[1] 1178 code_template = ''.join(self.c_code_template(dtype)) 1179 return code_template % dict(locals(), **sub) 1180 1181 1182class CrossentropySoftmax1HotWithBiasDx(gof.Op): 1183 """ 1184 Gradient wrt x of the CrossentropySoftmaxArgmax1HotWithBias Op. 
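
    In index notation, the value computed by ``perform`` below is

        dx[i, j] = dy[i] * (sm[i, j] - 1{j == y_idx[i]})

    i.e. the usual "softmax minus one-hot" gradient of the per-row negative
    log-likelihood, scaled by the incoming gradient ``dy``.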
1185 1186 """ 1187 1188 nin = 3 1189 nout = 1 1190 __props__ = () 1191 1192 def make_node(self, dy, sm, y_idx, **kwargs): 1193 dy = tensor.as_tensor_variable(dy) 1194 sm = tensor.as_tensor_variable(sm) 1195 y_idx = tensor.as_tensor_variable(y_idx) 1196 if (dy.type.ndim > 1 or 1197 dy.type.dtype not in tensor.float_dtypes): 1198 raise ValueError('dy must be {0,1}-d tensor of floats', dy.type) 1199 if (sm.type.ndim != 2 or 1200 sm.type.dtype not in tensor.float_dtypes): 1201 raise ValueError('sm must be 2-d tensor of floats', sm.type) 1202 if (y_idx.type.ndim != 1 or 1203 y_idx.type.dtype not in tensor.discrete_dtypes): 1204 raise ValueError('y_idx must be 1-d tensor of [u]ints', y_idx.type) 1205 return Apply(self, [dy, sm, y_idx], [sm.type()]) 1206 1207 def perform(self, node, input_storage, output_storage): 1208 dy, sm, y_idx = input_storage 1209 if any(y_idx < 0): 1210 raise ValueError("y_i value out of bounds") 1211 dx = np.zeros_like(sm) 1212 if dy.ndim == 0: 1213 dy = dy[None] 1214 incr = int(dy.shape[0] > 1) 1215 for i in xrange(sm.shape[0]): 1216 dy_i = dy[i * incr] 1217 dx[i] = dy_i * sm[i] # vector scale 1218 dx[i, y_idx[i]] -= dy_i # scalar decrement 1219 output_storage[0][0] = dx 1220 1221 def infer_shape(self, node, shapes): 1222 return [shapes[1]] 1223 1224 def grad(self, inp, grads): 1225 dy, sm, y_idx = inp 1226 g_dx, = grads 1227 # TODO: currently we do not compute the gradient w.r.t. dy, because 1228 # advanced indexing is not working yet. When it works, do it to avoid 1229 # potentially misleading behavior in gradient computations! (although 1230 # typically we should not need the gradient w.r.t. dy). 1231 y_idx_range = tensor.arange(y_idx.shape[0]) 1232 g_dy = tensor.sum( 1233 g_dx * subtensor.AdvancedIncSubtensor()( 1234 sm, tensor.fill(dy, -1), y_idx_range, y_idx), axis=1) 1235 g_sm = dy.dimshuffle(0, 'x') * g_dx 1236 g_y_idx = grad_not_implemented(self, 2, y_idx) 1237 return [g_dy, g_sm, g_y_idx] 1238 1239 def c_code_cache_version(self): 1240 return (6,) 1241 1242 def c_code(self, node, name, inp, out, sub): 1243 dnll, sm, y_idx = inp 1244 dx, = out 1245 y_idx_type = node.inputs[2].type.dtype_specs()[1] 1246 return """ 1247 if ((PyArray_TYPE(%(dnll)s) != NPY_DOUBLE) && 1248 (PyArray_TYPE(%(dnll)s) != NPY_FLOAT)) 1249 { 1250 PyErr_SetString(PyExc_TypeError, 1251 "dnll type should be float32 or float64"); 1252 %(fail)s; 1253 } 1254 if ((PyArray_TYPE(%(sm)s) != NPY_DOUBLE) && 1255 (PyArray_TYPE(%(sm)s) != NPY_FLOAT)) 1256 { 1257 PyErr_SetString(PyExc_TypeError, 1258 "sm type should be float32 or float64"); 1259 %(fail)s; 1260 } 1261 1262 // new scope because of variable declaration 1263 // TODO: proper indentation, but the diff will get messy 1264 { 1265 // Get `dnll.shape[0]` or set it to zero if `dnll` is a scalar. 1266 const npy_intp %(dnll)s_dims0 = (PyArray_NDIM(%(dnll)s) > 0 ? 1267 PyArray_DIMS(%(dnll)s)[0] : 1268 (npy_intp) 0); 1269 1270 // Get `dnll.strides[0]` and set it to zero if `dnll` is a scalar 1271 // or a vector with just one element. 1272 const npy_intp %(dnll)s_strides0 = (%(dnll)s_dims0 > 1 ? 
1273 PyArray_STRIDES(%(dnll)s)[0] : 1274 (npy_intp) 0); 1275 1276 if ((PyArray_NDIM(%(dnll)s) > 1) 1277 || (PyArray_NDIM(%(sm)s) != 2) 1278 || (PyArray_NDIM(%(y_idx)s) != 1)) 1279 { 1280 PyErr_SetString(PyExc_ValueError, "rank error"); 1281 %(fail)s; 1282 } 1283 if (%(dnll)s_dims0 != PyArray_DIMS(%(sm)s)[0] && %(dnll)s_dims0 > 1) 1284 { 1285 PyErr_Format(PyExc_ValueError, 1286 "dnll.shape[0] (%%ld) != sm.shape[0] (%%ld)", 1287 (long int)%(dnll)s_dims0, 1288 (long int)PyArray_DIMS(%(sm)s)[0]); 1289 %(fail)s; 1290 } 1291 if (%(dnll)s_dims0 != PyArray_DIMS(%(y_idx)s)[0] && %(dnll)s_dims0 > 1) 1292 { 1293 PyErr_Format(PyExc_ValueError, 1294 "dnll.shape[0] (%%ld) != y_idx.shape[0] (%%ld)", 1295 (long int)%(dnll)s_dims0, 1296 (long int)PyArray_DIMS(%(y_idx)s)[0]); 1297 %(fail)s; 1298 } 1299 if (PyArray_DIMS(%(sm)s)[0] != 1300 PyArray_DIMS(%(y_idx)s)[0]) 1301 { 1302 PyErr_SetString(PyExc_ValueError, 1303 "sm.shape[0] != y_idx.shape[0]"); 1304 %(fail)s; 1305 } 1306 if ((NULL == %(dx)s) 1307 || (PyArray_DIMS(%(dx)s)[0] != PyArray_DIMS(%(sm)s)[0]) 1308 || (PyArray_DIMS(%(dx)s)[1] != PyArray_DIMS(%(sm)s)[1])) 1309 { 1310 if (NULL != %(dx)s) Py_XDECREF(%(dx)s); 1311 %(dx)s = (PyArrayObject*) PyArray_SimpleNew(2, 1312 PyArray_DIMS(%(sm)s), 1313 PyArray_TYPE(%(sm)s)); 1314 if(!%(dx)s) { 1315 PyErr_SetString(PyExc_MemoryError, 1316 "failed to alloc dx output"); 1317 %(fail)s 1318 } 1319 } 1320 1321 for (size_t i = 0; i < PyArray_DIMS(%(dx)s)[0]; ++i) 1322 { 1323 const dtype_%(dnll)s dnll_i = ((dtype_%(dnll)s*)(PyArray_BYTES(%(dnll)s) + %(dnll)s_strides0 * i))[0]; 1324 1325 const %(y_idx_type) s y_i = ((%(y_idx_type)s*)(PyArray_BYTES(%(y_idx)s) + PyArray_STRIDES(%(y_idx)s)[0] * i))[0]; 1326 1327 const dtype_%(sm)s* __restrict__ sm_i = (dtype_%(sm)s*)(PyArray_BYTES(%(sm)s) + PyArray_STRIDES(%(sm)s)[0] * i); 1328 npy_intp Ssm = PyArray_STRIDES(%(sm)s)[1]/sizeof(dtype_%(sm)s); 1329 1330 dtype_%(dx) s* __restrict__ dx_i = (dtype_%(dx)s*)(PyArray_BYTES(%(dx)s) + PyArray_STRIDES(%(dx)s)[0] * i); 1331 npy_intp Sdx = PyArray_STRIDES(%(dx)s)[1]/sizeof(dtype_%(dx)s); 1332 1333 for (size_t j = 0; j < PyArray_DIMS(%(dx)s)[1]; ++j) 1334 { 1335 dx_i[j * Sdx] = dnll_i * sm_i[j * Ssm]; 1336 } 1337 if (y_i >= PyArray_DIMS(%(dx)s)[1] || (y_i < 0)) 1338 { 1339 PyErr_SetString(PyExc_ValueError, "y_i >= dx dimensions[1] or y_i < 0."); 1340 %(fail)s; 1341 } 1342 dx_i[y_i * Sdx] -= dnll_i; 1343 } 1344 } 1345 """ % dict(locals(), **sub) 1346 1347crossentropy_softmax_argmax_1hot_with_bias = \ 1348 CrossentropySoftmaxArgmax1HotWithBias() 1349 1350crossentropy_softmax_1hot_with_bias_dx = \ 1351 CrossentropySoftmax1HotWithBiasDx() 1352 1353 1354def crossentropy_softmax_1hot_with_bias(x, b, y_idx, **kwargs): 1355 return crossentropy_softmax_argmax_1hot_with_bias(x, b, y_idx, 1356 **kwargs)[0:2] 1357 1358 1359def crossentropy_softmax_1hot(x, y_idx, **kwargs): 1360 b = tensor.zeros_like(x[0, :]) 1361 return crossentropy_softmax_1hot_with_bias(x, b, y_idx, **kwargs) 1362 1363 1364def crossentropy_softmax_max_and_argmax_1hot_with_bias(x, b, y_idx, **kwargs): 1365 """ 1366 Returns 1367 ------- 1368 object 1369 The cross-entropy, the softmax output, the max probability, 1370 and the argmax index. 1371 1372 TODO: Since we are recomputing the argmax, 1373 we might as well assert that it is correct. 1374 1375 TODO: Make this entire function is 1376 unnecessary? e.g. CrossentropySoftmaxArgmax1HotWithBias should return 1377 the appropriate information (i.e. the max probability)? 
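
    A small usage sketch (illustrative only; the variable names are
    hypothetical):

    >>> import theano.tensor as T
    >>> x = T.matrix('x')      # (batch, n_classes) logits
    >>> b = T.vector('b')      # (n_classes,) bias
    >>> y = T.ivector('y')     # (batch,) integer class labels
    >>> outs = crossentropy_softmax_max_and_argmax_1hot_with_bias(x, b, y)
    >>> xent, sm, max_pr, am = outs
    >>> cost = xent.mean()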
1378 1379 """ 1380 (xent, softmax) = crossentropy_softmax_1hot_with_bias(x, b, y_idx, **kwargs) 1381 (max_pr, argmax) = tensor.max_and_argmax(softmax, axis=-1) 1382 return (xent, softmax, max_pr, argmax) 1383 1384 1385def crossentropy_softmax_max_and_argmax_1hot(x, y_idx, **kwargs): 1386 b = tensor.zeros_like(x[0, :]) 1387 return crossentropy_softmax_max_and_argmax_1hot_with_bias(x, b, y_idx, 1388 **kwargs) 1389 1390 1391class CrossentropyCategorical1HotGrad(gof.Op): 1392 1393 __props__ = () 1394 1395 def make_node(self, g_y, coding_dist, true_one_of_n): 1396 return Apply(self, [g_y, coding_dist, true_one_of_n], 1397 [coding_dist.type()]) 1398 1399 def perform(self, node, inp, out): 1400 g_y, coding_dist, true_one_of_n = inp 1401 g_coding_strg, = out 1402 g_coding = np.zeros_like(coding_dist) 1403 for i in xrange(len(g_y)): 1404 g_coding[i, true_one_of_n[i]] = (-g_y[i] / 1405 coding_dist[i, true_one_of_n[i]]) 1406 g_coding_strg[0] = g_coding 1407 1408 def infer_shape(self, node, in_shapes): 1409 return [in_shapes[1]] 1410 1411crossentropy_categorical_1hot_grad = CrossentropyCategorical1HotGrad() 1412 1413 1414class CrossentropyCategorical1Hot(gof.Op): 1415 r""" 1416 Compute the cross entropy between a coding distribution and 1417 a true distribution of the form [0, 0, ... 0, 1, 0, ..., 0]. 1418 1419 .. math:: 1420 1421 y[i] = - \log(coding_dist[i, one_of_n[i]) 1422 1423 Notes 1424 ----- 1425 In the case that the coding distribution is the output of a 1426 softmax, an application of this Op will probably be optimized 1427 away in favour of one with a C implementation. 1428 1429 """ 1430 __props__ = () 1431 1432 def make_node(self, coding_dist, true_one_of_n): 1433 """ 1434 Parameters 1435 ---------- 1436 coding_dist : dense matrix 1437 true_one_of_n : lvector 1438 1439 Returns 1440 ------- 1441 dvector 1442 1443 """ 1444 _coding_dist = tensor.as_tensor_variable(coding_dist) 1445 _true_one_of_n = tensor.as_tensor_variable(true_one_of_n) 1446 if _coding_dist.type.ndim != 2: 1447 raise TypeError('matrix required for argument: coding_dist') 1448 if _true_one_of_n.type not in (tensor.lvector, tensor.ivector): 1449 raise TypeError( 1450 'integer vector required for argument: true_one_of_n' 1451 '(got type: %s instead of: %s)' % (_true_one_of_n.type, 1452 tensor.lvector)) 1453 1454 return Apply(self, [_coding_dist, _true_one_of_n], 1455 [tensor.Tensor(dtype=_coding_dist.dtype, 1456 broadcastable=[False])()]) 1457 1458 def perform(self, node, inp, out): 1459 coding, one_of_n = inp 1460 y_out, = out 1461 y = np.zeros_like(coding[:, 0]) 1462 for i in xrange(len(y)): 1463 y[i] = -np.log(coding[i, one_of_n[i]]) 1464 y_out[0] = y 1465 1466 def infer_shape(self, node, in_shapes): 1467 return [(in_shapes[0][0],)] 1468 1469 def grad(self, inp, grads): 1470 coding, one_of_n = inp 1471 g_y, = grads 1472 return [crossentropy_categorical_1hot_grad(g_y, coding, one_of_n), 1473 grad_not_implemented(self, 1, one_of_n)] 1474 1475crossentropy_categorical_1hot = CrossentropyCategorical1Hot() 1476 1477 1478@opt.register_stabilize('fast_compile_gpu') 1479@opt.register_specialize('fast_compile_gpu') 1480@gof.optimizer 1481def crossentropy_to_crossentropy_with_softmax_with_bias(fgraph): 1482 """ 1483 This is a stabilization optimization. 1484 1485 Notes 1486 ----- 1487 Not a local optimization because we are replacing outputs 1488 from several nodes at once. 
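
    Sketch of the targeted pattern (illustrative): in

        crossentropy_categorical_1hot(softmax_with_bias(x, b), y)

    the nll, and the softmax it is applied to, are replaced by the first two
    outputs of

        crossentropy_softmax_argmax_1hot_with_bias(x, b, y)

    so the nll is computed directly from the logits instead of as the log of
    an explicitly formed softmax, which is where the stability gain comes
    from.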
1489 1490 """ 1491 1492 def search_make_one_sub(): 1493 for node in fgraph.toposort(): 1494 if node.op == crossentropy_categorical_1hot: 1495 nll, = node.outputs 1496 sm, one_of_n = node.inputs 1497 if sm.owner and sm.owner.op == softmax_with_bias: 1498 x, b = sm.owner.inputs 1499 new_nll, new_sm, new_am = crossentropy_softmax_argmax_1hot_with_bias( 1500 x, b, one_of_n) 1501 fgraph.replace_all_validate( 1502 [(nll, new_nll), (sm, new_sm)], 1503 reason="crossentropy_to_crossentropy_with_softmax_with_bias") 1504 return True 1505 1506 return False 1507 1508 while search_make_one_sub(): 1509 pass 1510 return 1511 1512 1513@gof.optimizer 1514def crossentropy_to_crossentropy_with_softmax(fgraph): 1515 """ 1516 This is a stabilization optimization that is more general than 1517 crossentropy_to_crossentropy_with_softmax_with_bias. 1518 1519 It must be executed after local_softmax_with_bias optimization in 1520 specialize. 1521 1522 TODO : This is a stabilization optimization! How to make this more cleanly? 1523 1524 Notes 1525 ----- 1526 Not a local optimization because we are replacing outputs from several 1527 nodes at once. 1528 1529 """ 1530 1531 def search_make_one_sub(): 1532 for node in fgraph.toposort(): 1533 if node.op == crossentropy_categorical_1hot: 1534 nll, = node.outputs 1535 sm, one_of_n = node.inputs 1536 if sm.owner and sm.owner.op == softmax_op: 1537 x, = sm.owner.inputs 1538 new_nll, new_sm, new_am = crossentropy_softmax_argmax_1hot_with_bias( 1539 x, tensor.zeros_like(x[0]), one_of_n) 1540 fgraph.replace_all_validate( 1541 [(nll, new_nll), (sm, new_sm)], 1542 reason="crossentropy_to_crossentropy_with_softmax") 1543 return True 1544 if sm.owner and sm.owner.op == softmax_with_bias: 1545 x, b = sm.owner.inputs 1546 new_nll, new_sm, new_am = crossentropy_softmax_argmax_1hot_with_bias(x, b, 1547 one_of_n) 1548 fgraph.replace_all_validate( 1549 [(nll, new_nll), (sm, new_sm)], 1550 reason="crossentropy_to_crossentropy_with_softmax") 1551 return True 1552 1553 return False 1554 1555 while search_make_one_sub(): 1556 pass 1557 return 1558 1559optdb.register('crossentropy_to_crossentropy_with_softmax', 1560 crossentropy_to_crossentropy_with_softmax, 2.01, 1561 'fast_run', 'xent', 'fast_compile_gpu') 1562 1563 1564@opt.register_specialize( 1565 'fast_compile_gpu', 1566 'local_crossentropy_to_crossentropy_with_softmax_grad') # old name 1567@gof.local_optimizer([softmax_grad]) 1568def local_softmax_grad_to_crossentropy_with_softmax_grad(node): 1569 if node.op == softmax_grad: 1570 g_coding_dist, coding_dist = node.inputs 1571 if (g_coding_dist.owner and 1572 g_coding_dist.owner.op == crossentropy_categorical_1hot_grad): 1573 g_nll, coding_dist, true_one_of_n = g_coding_dist.owner.inputs 1574 dx = crossentropy_softmax_1hot_with_bias_dx(g_nll, coding_dist, 1575 true_one_of_n) 1576 copy_stack_trace(node.outputs[0], dx) 1577 return [dx] 1578 1579 1580@opt.register_specialize('fast_compile_gpu') 1581@gof.local_optimizer([tensor.MaxAndArgmax]) 1582def local_argmax_pushdown(node): 1583 if isinstance(node.op, tensor.MaxAndArgmax) and node.inputs[0].owner and \ 1584 len(node.outputs[0].clients) > 0 and node.inputs[0].owner.op in \ 1585 (softmax_op, softplus, tensor.exp, tensor.log, tensor.tanh, sigmoid, 1586 softmax_with_bias): 1587 if theano.config.warn.argmax_pushdown_bug: 1588 logging.getLogger('theano.tensor.nnet.nnet').warn( 1589 "WARNING: there " 1590 "was a bug in Theano fixed on May 27th, 2010 in this case." 1591 " I.E. 
when we take the max of a softplus, softmax, exp, " 1592 "log, tanh, sigmoid, softmax_with_bias op, we were doing " 1593 "the max of the parent of the input. To remove this " 1594 "warning set the Theano flags 'warn.argmax_pushdown_bug' " 1595 "to False") 1596 1597 if (isinstance(node.op, tensor.MaxAndArgmax) and 1598 node.inputs[0].owner and len(node.outputs[0].clients) == 0): 1599 x_max, x_argmax = node.outputs 1600 x = node.inputs[0] 1601 axis = node.op.get_params(node) 1602 # TODO: Make a list/set of monotonic ops... 1603 if x.owner and x.owner.op in (softmax_op, softplus, tensor.exp, 1604 tensor.log, tensor.tanh, sigmoid): 1605 pre_x, = x.owner.inputs 1606 ret = tensor.max_and_argmax(pre_x, axis) 1607 copy_stack_trace(x_max, ret) 1608 return ret 1609 if x.owner and x.owner.op == softmax_with_bias: 1610 pre_x, pre_bias = x.owner.inputs 1611 ret = tensor.max_and_argmax(pre_x + 1612 tensor.DimShuffle( 1613 pre_bias.broadcastable, 1614 ('x', 0))(pre_bias), axis) 1615 # copy both stack traces 1616 copy_stack_trace(x_max, ret) 1617 return ret 1618 1619# Utility function used by the two next optimizations 1620 1621 1622def _check_rows_is_arange_len_labels(rows, labels): 1623 """ 1624 Check that 'rows' is the same node as T.arange(labels.shape[0]). 1625 1626 Also considers the case where labels.shape[0] is constant and equal 1627 to 1, and T.arange(labels.shape[0]) has been constant-folded into 0. 1628 1629 """ 1630 1631 if labels.owner and hasattr(labels.owner.fgraph, 'shape_feature'): 1632 shape_of = labels.owner.fgraph.shape_feature.shape_of 1633 # TODO: consider cases where shape_of[labels] is constant, and 1634 # has a value different from 1. 1635 # This case is harder, as _is_const only accepts a scalar value 1636 # as second argument, so checking for 1637 # _is_const(rows, numpy.arange(...)) does not work for the moment. 
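        # Why this check matters (note added for clarity): the crossentropy
        # optimizations below only fire on patterns like sm[rows, labels]
        # where rows is arange(labels.shape[0]), i.e. exactly one probability
        # is selected per example row; for any other 'rows' the rewrite to
        # the fused crossentropy op would change the result.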
1638 if len(shape_of[labels]) == 1 and _is_const(shape_of[labels][0], 1): 1639 return _is_const(rows, 0) 1640 1641 if rows.owner and isinstance(rows.owner.op, tensor.ARange): 1642 start, stop, step = rows.owner.inputs 1643 if getattr(start, 'data', None) != 0: # constants will have data 1644 return False 1645 if getattr(step, 'data', None) != 1: # constant step will have data 1646 return False 1647 if not stop.owner: 1648 return False 1649 1650 # Not sure if that case happens any more after the introduction of 1651 # ShapeOptimizer, but we keep it if ShapeOptimizer is not present 1652 if isinstance(stop.owner.op, subtensor.Subtensor): 1653 shape_subtensor = stop.owner 1654 if shape_subtensor.op.get_constant_idx(shape_subtensor.inputs, 1655 allow_partial=True) == [0]: 1656 shape_var = shape_subtensor.inputs[0] 1657 if shape_var.owner and shape_var.owner.op == tensor.shape: 1658 return shape_var.owner.inputs[0] is labels 1659 else: 1660 shape_of = stop.owner.fgraph.shape_feature.shape_of 1661 return shape_of[labels][0] is stop 1662 1663 1664def _is_const(z, val, approx=False): 1665 try: 1666 maybe = opt.get_scalar_constant_value(z) 1667 except tensor.NotScalarConstantError: 1668 return False 1669 if approx: 1670 return np.allclose(maybe, val) 1671 else: 1672 return np.all(maybe == val) 1673 1674 1675@opt.register_specialize('fast_compile_gpu') 1676@gof.local_optimizer([subtensor.AdvancedSubtensor, tensor.log]) 1677def local_advanced_indexing_crossentropy_onehot(node): 1678 log = None 1679 sm = None 1680 # First case: log(softmax(x))[rows, labels] 1681 if isinstance(node.op, subtensor.AdvancedSubtensor): 1682 try: 1683 log, rows, labels = node.inputs 1684 except Exception: 1685 pass 1686 if log and log.owner and log.owner.op == tensor.log: 1687 sm = log.owner.inputs[0] 1688 1689 # Second case: log(softmax(x)[rows, labels]) 1690 elif node.op == tensor.log: 1691 pre_log = node.inputs[0].owner 1692 if pre_log and isinstance(pre_log.op, subtensor.AdvancedSubtensor): 1693 try: 1694 sm, rows, labels = pre_log.inputs 1695 except Exception: 1696 pass 1697 1698 if sm is not None and sm.owner and sm.owner.op in (softmax_op, 1699 softmax_with_bias): 1700 sm_w_bias = local_softmax_with_bias.transform(sm.owner) 1701 if sm_w_bias: 1702 assert sm_w_bias[0].owner.op == softmax_with_bias 1703 x_var, b_var = sm_w_bias[0].owner.inputs 1704 else: 1705 x_var = sm.owner.inputs[0] 1706 b_var = tensor.zeros_like(x_var[0]) 1707 1708 # Check that rows == arange(labels.shape[0]) 1709 if _check_rows_is_arange_len_labels(rows, labels): 1710 if labels.ndim == 1 and x_var.ndim == 2: 1711 minus_ret = crossentropy_softmax_argmax_1hot_with_bias(x_var, 1712 b_var, 1713 labels)[0] 1714 ret = -minus_ret 1715 copy_stack_trace(node.outputs[0], [minus_ret, ret]) 1716 return [ret] 1717 1718 1719@opt.register_specialize('fast_compile_gpu') 1720@gof.local_optimizer([softmax_grad]) 1721def local_advanced_indexing_crossentropy_onehot_grad(node): 1722 if not (node.op == softmax_grad): 1723 return 1724 1725 sm = None 1726 try: 1727 d_sm, sm = node.inputs 1728 except Exception: 1729 return 1730 1731 if (sm is not None) and sm.owner and (sm.owner.op in (softmax_op, 1732 softmax_with_bias)): 1733 sm_w_bias = local_softmax_with_bias.transform(sm.owner) 1734 if sm_w_bias: 1735 assert sm_w_bias[0].owner.op == softmax_with_bias 1736 x_var, b_var = sm_w_bias[0].owner.inputs 1737 else: 1738 x_var = sm.owner.inputs[0] 1739 else: 1740 return 1741 1742 # Two cases are supported: 1743 # 1. 
AdvancedIncSubtensor( 1744 # zeros_like(softmax(x)), 1745 # -out_grad / AdvancedSubtensor(softmax(x), arange(y.shape[0]), y), 1746 # arange(y.shape[0]), 1747 # y) 1748 # which arises from the gradient of log(softmax(x)[arange(y.shape[0]), y]) 1749 # 1750 # 2. AdvancedIncSubtensor( 1751 # zeros_like(log(softmax(x))), 1752 # -out_grad, 1753 # arange(y.shape[0]), 1754 # y) 1755 # / softmax(x) 1756 # which arises from the gradient of log(softmax(x))[arange(y.shape[0]), y] 1757 # 1758 # out_grad represents the gradient of the (final) cost wrt the output. 1759 1760 # 1761 # N.B. Regarding clients -- This substitution is important for numerical stability, so we 1762 # perform the substitution even when intermediate values have multiple clients. 1763 # 1764 1765 # First case. 1766 # After the check for AdvancedIncSubtensor, if anything does not fit with 1767 # the formula above, there's no way to fit it with the the second case, 1768 # so we return immediately. 1769 if d_sm.owner and isinstance(d_sm.owner.op, subtensor.AdvancedIncSubtensor): 1770 try: 1771 z, incr, rows, labels = d_sm.owner.inputs 1772 except Exception: 1773 return 1774 # Check that z == zeros_like(softmax(x)) 1775 # We know z has the right size because z has the same size as d_sm, 1776 # and d_sm and sm are both inputs of softmax_grad (so they have 1777 # the same size). 1778 if not _is_const(z, 0): 1779 return 1780 1781 # In the base case (output gradient = 1), incr is -1./sm[arange(len(y)), y] 1782 # Here, we are looking for the AdvancedSubtensor term (sm[arange(len(y)), y]), 1783 # and constructing out_grad by incorporating the other terms. 1784 # out_grad will be constructed in 3 steps as follow: 1785 # out_grad = +/- 1. (according to sign) 1786 # out_grad *= -numerator 1787 # out_grad /= denominator 1788 # Then, if out_grad is a scalar, it will be allocated as a vector 1789 adv_subtensor = None 1790 out_grad = 1. 1791 1792 # If there's a 'minus' sign before the whole expression, put it in 1793 # out_grad and iterate 1794 if incr.owner and incr.owner.op == tensor.neg: 1795 out_grad = - out_grad 1796 incr = incr.owner.inputs[0] 1797 1798 if incr.owner and incr.owner.op == tensor.true_div: 1799 num, denom = incr.owner.inputs 1800 1801 # set out_grad according to the numerator, it may be divided later 1802 # num should be a vector or a scalar 1803 if num.ndim == 1 or np.all(num.broadcastable): 1804 out_grad *= -num 1805 else: 1806 return 1807 1808 if not denom.owner: 1809 return 1810 1811 if isinstance(denom.owner.op, subtensor.AdvancedSubtensor): 1812 # Base case 1813 adv_subtensor = denom 1814 # out_grad /= 1. 
            elif denom.owner.op == tensor.mul:
                # Try to find the AdvancedSubtensor node mentioned above,
                # and the output gradient
                for i, input in enumerate(denom.owner.inputs):
                    if input.owner and isinstance(input.owner.op,
                                                  subtensor.AdvancedSubtensor):
                        other_inputs = [in_ for (j, in_) in
                                        enumerate(denom.owner.inputs) if j != i]
                        if len(other_inputs) == 1:
                            rest = other_inputs[0]
                        else:
                            rest = tensor.mul(*other_inputs)

                        # Check that rest is a vector or a scalar
                        if rest.ndim == 1 or np.all(rest.broadcastable):
                            adv_subtensor = input
                            out_grad /= rest
                            break
            else:
                return

            # The output gradient needs to be a vector
            out_grad = tensor.fill(x_var[:, 0], out_grad)

            if adv_subtensor is not None:
                try:
                    maybe_sm, maybe_rows, \
                        maybe_labels = adv_subtensor.owner.inputs
                except Exception:
                    return

                if (not (maybe_sm is sm and
                         maybe_rows is rows and
                         maybe_labels is labels)):
                    return
                # else: OK
            else:
                return
        else:
            return

        # Check that rows is arange(labels.shape[0])
        if not _check_rows_is_arange_len_labels(rows, labels):
            return
        # else, arguments of AdvancedIncSubtensor are OK,
        # it was really case 1.

    # Second case
    elif d_sm.owner and d_sm.owner.op == tensor.true_div:
        # we're looking for
        # AdvIncSubtensor(zeros, grad_nll, arange(len(y)), y) / softmax
        try:
            num, denom = d_sm.owner.inputs
        except Exception:
            return

        if denom != sm:
            return

        # Check the numerator (AdvancedIncSubtensor)
        if num.owner and isinstance(num.owner.op,
                                    subtensor.AdvancedIncSubtensor):
            try:
                z, incr, rows, labels = num.owner.inputs
            except Exception:
                return

            # Check z is zeros_like(log(sm))
            if not _is_const(z, 0):
                return
            if z.broadcastable not in [(False, False), (True, False)]:
                return
            # here we know that we are incrementing a matrix of zeros
            # (or a broadcasted vector).
            # Since d_sm and sm are the inputs of softmax_grad,
            # if the graph is valid, they have the same shape, so we
            # also know that z has the right shape.

            if incr.ndim != 1 or incr.dtype not in tensor.float_dtypes:
                return

            # here we know that we are incrementing some part of
            # matrix z by a vector

            # unless the user has taken care to mark that the data and
            # labels have the same number of rows, we cannot be sure
            # here that len(y) == len(z). However, in the common case
            # that these are predictions and labels it is true. We
            # leave it to the Op to crash (and the user to complain)
            # if this assumption is ever not true.

            out_grad = -incr

            # Check that rows is arange(labels.shape[0])
            if not _check_rows_is_arange_len_labels(rows, labels):
                return
            # else, arguments of AdvancedIncSubtensor are OK
        else:
            return

        # numerator and denominator are OK,
        # it was really case 2.
1916 1917 else: 1918 return 1919 1920 # Dimension check before substitution 1921 if labels.ndim == 1 and x_var.ndim == 2: 1922 ret = crossentropy_softmax_1hot_with_bias_dx(out_grad, sm, labels) 1923 # The stack trace is not added to output_grad, sm and labels at 1924 # the moment but may need to be added at a future point 1925 copy_stack_trace(node.outputs[0], ret) 1926 return [ret] 1927 else: 1928 return 1929 1930 1931@opt.register_specialize('fast_compile_gpu') 1932@gof.local_optimizer([softmax_with_bias]) 1933def graph_merge_softmax_with_crossentropy_softmax(node): 1934 if node.op == softmax_with_bias: 1935 x, b = node.inputs 1936 for x_client in x.clients: 1937 if x_client[0].op == crossentropy_softmax_argmax_1hot_with_bias: 1938 big_client = x_client[0] 1939 if big_client in [b_client[0] for b_client in b.clients]: 1940 xx, bb, ll = big_client.inputs 1941 mergeable_client = big_client.op(x, b, ll) 1942 copy_stack_trace(node.outputs[0], mergeable_client[1]) 1943 return [mergeable_client[1]] 1944 1945 1946@opt.register_specialize 1947@opt.register_stabilize 1948@opt.register_canonicalize 1949@gof.local_optimizer([CrossentropySoftmax1HotWithBiasDx]) 1950def local_useless_crossentropy_softmax_1hot_with_bias_dx_alloc(node): 1951 """ 1952 Replace a CrossentropySoftmax1HotWithBiasDx op, whose incoming gradient is 1953 an `alloc` of a scalar variable or one that has either broadcastable or 1954 matching dimensions with the output variable, by one that skips the 1955 intermediate `alloc`. 1956 1957 """ 1958 if isinstance(node.op, CrossentropySoftmax1HotWithBiasDx): 1959 dy, sm, y_idx = node.inputs 1960 1961 # Those cases are directly handled by the internal broadcasting of the 1962 # `CrossentropySoftmax1HotWithBiasDx` op. 1963 if dy.ndim == 0: 1964 return False 1965 if dy.ndim == 1 and dy.broadcastable[0]: 1966 return False 1967 1968 assert dy.ndim == 1 1969 1970 if dy.owner is not None and isinstance(dy.owner.op, tensor.Alloc): 1971 # dz is the input of the Alloc op, i.e. T.alloc(dz, <shape>) 1972 dz = dy.owner.inputs[0] 1973 1974 try: 1975 shape_feature = node.fgraph.shape_feature 1976 except AttributeError: 1977 # The shape feature may not be available in some mode, but we 1978 # need it for this optimization, so don't continue. 1979 return False 1980 1981 shape_of = shape_feature.shape_of 1982 same_shape = shape_feature.same_shape 1983 1984 # Build `dz_broad` explicitly to include extra implicit dimensions. 1985 dz_broad = (True,) * (dy.ndim - dz.ndim) + dz.broadcastable 1986 1987 # If we can infer statically that the shape of `sm` and 1988 # `dy` are the same in dimension `k` or the shape of `dy` is equal 1989 # to 1 (which triggers the internal broadcasting in 1990 # `CrossentropySoftmax1HotWithBiasDx`) we do not need to 1991 # check it at runtime. 1992 if (dz_broad[0] and 1993 not same_shape(sm, dy, dim_x=0, dim_y=0) and 1994 shape_of[dy][0] != 1): 1995 # If `dz` is broadcastable, we need to check whether the shapes 1996 # of `dy` and `sm` are the same or whether the shape of `dy` is 1997 # equal to 1. 1998 cond = tensor.or_(tensor.eq(dy.shape[0], 1), 1999 tensor.eq(dy.shape[0], sm.shape[0])) 2000 msg = '`sm` and `dy` do not have the same shape.' 2001 dz = opt.Assert(msg)(dz, cond) 2002 2003 ret = node.op(dz, sm, y_idx) 2004 copy_stack_trace(node.outputs[0], ret) 2005 return [ret] 2006 2007 2008def binary_crossentropy(output, target): 2009 """ 2010 Compute the crossentropy of binary random variables. 
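
    For each component, the value computed is

    .. math:: -(t \\log(o) + (1 - t) \\log(1 - o))

    where `t` is the target and `o` is the output.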
2011 2012 Output and target are each expectations of binary random 2013 variables; target may be exactly 0 or 1 but output must 2014 lie strictly between 0 and 1. 2015 2016 Notes 2017 ----- 2018 We could use the x log y op to support output=0 and output=1. 2019 The gradient would still be undefined though. 2020 2021 We do not sum, crossentropy is computed by component. 2022 TODO : Rewrite as a scalar, and then broadcast to tensor. 2023 2024 """ 2025 return -(target * tensor.log(output) + (1.0 - target) * tensor.log(1.0 - output)) 2026 2027 2028def sigmoid_binary_crossentropy(output, target): 2029 """ 2030 Compute the cross-entropy of binary random variables. 2031 2032 `output` should be real-valued (range (-inf, +inf)); `sigmoid` will be 2033 applied to produce a (0, 1) valued input. 2034 2035 `target` is assumed to be probabilities in [0, 1]. 2036 2037 Notes 2038 ----- 2039 Mathematically equivalent to `binary_crossentropy(sigmoid(output), target)`, 2040 but with more efficient and numerically stable computation. 2041 """ 2042 def grad(inputs, out_grads): 2043 (output, target), (out_grad,) = inputs, out_grads 2044 g_output = out_grad * (sigmoid(output) - target) 2045 g_target = out_grad * (-output) 2046 return [g_output, g_target] 2047 inp = [output, target] 2048 outp = softplus(-abs(output)) + output * ((output > 0) - target) 2049 return theano.OpFromGraph(inp, [outp], grad_overrides=grad, inline=True, 2050 name='sigmoid_binary_crossentropy')(*inp) 2051 2052 2053def categorical_crossentropy(coding_dist, true_dist): 2054 r""" 2055 Return the cross-entropy between an approximating distribution and a true 2056 distribution. 2057 2058 .. warning:: THIS FUNCTION IS UNNECESSARILY POLYMORPHIC. 2059 We ultimately don't want the polymorphism, and will move this function 2060 to pylearn.algorithms.cost. The 1hot version will be removed. 2061 The length of the documentation here is a form of code smell. 2062 2063 The cross entropy between two probability distributions measures the average 2064 number of bits needed to identify an event from a set of possibilities, if a 2065 coding scheme is used based on a given probability distribution q, rather 2066 than the "true" distribution p. 2067 2068 Mathematically it is defined as follows: 2069 2070 .. math:: 2071 2072 H(p,q) = - \sum_x p(x) \log(q(x)) 2073 2074 Parameters 2075 ---------- 2076 coding_dist : a dense matrix 2077 Each slice along axis represents one distribution. 2078 true_dist : a dense matrix or sparse matrix or integer vector 2079 In the case of a matrix argument, each slice along axis represents one 2080 distribution. In the case of an integer vector argument, each element 2081 represents the position of the '1' in a 1-of-N encoding. 2082 2083 Returns 2084 ------- 2085 tensor of rank one-less-than `coding_dist` 2086 The cross entropy between each coding and true distribution. 2087 2088 Notes 2089 ----- 2090 axis : int 2091 The dimension over which each distribution runs 2092 (1 for row distributions, 0 for column distributions). 
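
    Examples
    --------
    A minimal symbolic usage sketch (the variable names are illustrative
    only, they are not part of the API):

    >>> import theano
    >>> from theano import tensor as T
    >>> from theano.tensor.nnet import softmax, categorical_crossentropy
    >>> scores = T.fmatrix('scores')      # (batch, n_classes) pre-activations
    >>> targets = T.ivector('targets')    # integer class labels, one per row
    >>> xent = categorical_crossentropy(softmax(scores), targets)
    >>> cost = xent.mean()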

    """
    if true_dist.ndim == coding_dist.ndim:
        return -tensor.sum(true_dist * tensor.log(coding_dist),
                           axis=coding_dist.ndim - 1)
    elif true_dist.ndim == coding_dist.ndim - 1:
        return crossentropy_categorical_1hot(coding_dist, true_dist)
    else:
        raise TypeError('rank mismatch between coding and true distributions')


class Prepend_scalar_constant_to_each_row(gof.Op):

    __props__ = ()

    def __init__(self, val=0):
        if isinstance(val, float):
            val = scalar.constant(val)
        self.val = val

    def __str__(self):
        return '%s{%s}' % (self.__class__.__name__, self.val)

    def make_node(self, mat):
        # check type of input
        x = tensor.as_tensor_variable(mat)
        if not mat.type.broadcastable == (False, False):
            raise TypeError("Expected a matrix as input")
        y = tensor.as_tensor_variable(self.val)
        assert y.ndim == 0
        if x.type.dtype != y.type.dtype:
            raise TypeError(
                "the value to prepend does not have the same type "
                "as the matrix")

        node = Apply(op=self, inputs=[mat], outputs=[mat.type()])
        return node

    def perform(self, node, inp, out):
        mat, = inp
        output, = out
        new_shape = (mat.shape[0], mat.shape[1] + 1)
        if output[0] is None:
            output[0] = np.empty(new_shape, dtype=mat.dtype)
            out = output[0]
        else:
            if output[0].shape != new_shape:
                try:
                    output[0].resize(new_shape)
                except Exception:
                    output[0] = np.empty(new_shape, dtype=mat.dtype)
            out = output[0]

        out[:, 0].fill(self.val.data)
        out[:, 1:] = mat

    def infer_shape(self, node, in_shapes):
        shp = (in_shapes[0][0], in_shapes[0][1] + 1)
        return [shp]

    def grad(self, inp, grads):
        mat, = inp
        goutput, = grads
        return goutput[:, 1:]


class Prepend_scalar_to_each_row(gof.Op):

    __props__ = ()

    def make_node(self, val, mat):
        # check type of input
        x = tensor.as_tensor_variable(mat)
        if isinstance(val, float):
            val = scalar.constant(val)
        if not mat.type.broadcastable == (False, False):
            raise TypeError("Expected a matrix as input")
        y = tensor.as_tensor_variable(val)
        assert y.ndim == 0
        if x.type.dtype != y.type.dtype:
            raise TypeError(
                "the value to prepend does not have the same type "
                "as the matrix")

        node = Apply(op=self, inputs=[val, mat], outputs=[mat.type()])
        return node

    def perform(self, node, inp, out):
        val, mat = inp
        output, = out
        new_shape = (mat.shape[0], mat.shape[1] + 1)
        if output[0] is None:
            output[0] = np.empty(new_shape, dtype=mat.dtype)
            out = output[0]
        else:
            if output[0].shape != new_shape:
                try:
                    output[0].resize(new_shape)
                except Exception:
                    output[0] = np.empty(new_shape, dtype=mat.dtype)
            out = output[0]
        out[:, 0].fill(val)
        out[:, 1:] = mat

    def infer_shape(self, node, in_shapes):
        shp = (in_shapes[1][0], in_shapes[1][1] + 1)
        return [shp]

    def grad(self, inp, grads):
        val, mat = inp
        goutput, = grads
        return goutput[:, 0], goutput[:, 1:]

prepend_scalar_to_each_row = Prepend_scalar_to_each_row()
prepend_0_to_each_row = Prepend_scalar_constant_to_each_row(0.)
prepend_1_to_each_row = Prepend_scalar_constant_to_each_row(1.)


def relu(x, alpha=0):
    """
    Compute the element-wise rectified linear activation function.

    .. versionadded:: 0.7.1

    Parameters
    ----------
    x : symbolic tensor
        Tensor to compute the activation function for.
    alpha : scalar or tensor, optional
        Slope for negative input, usually between 0 and 1. The default value
        of 0 will lead to the standard rectifier, 1 will lead to
        a linear activation function, and any value in between will give a
        leaky rectifier. A shared variable (broadcastable against `x`) will
        result in a parameterized rectifier with learnable slope(s).

    Returns
    -------
    symbolic tensor
        Element-wise rectifier applied to `x`.

    Notes
    -----
    This is numerically equivalent to ``T.switch(x > 0, x, alpha * x)``
    (or ``T.maximum(x, alpha * x)`` for ``alpha < 1``), but uses a faster
    formulation or an optimized Op, so we encourage the use of this function.

    """
    # This is probably the fastest implementation for GPUs. Both the forward
    # pass and the gradient get compiled into a single GpuElemwise call.
    # TODO: Check if it's optimal for CPU as well; add an "if" clause if not.
    # TODO: Check if there's a faster way for the gradient; create an Op if so.
    if alpha == 0:
        return 0.5 * (x + abs(x))
    else:
        # We can't simply use the Python constants 0.5 and 1 for "half" and
        # "one" here: if alpha is a numpy dtype, they would be considered
        # float64 and would cause an upcast to float64.
        alpha = tensor.as_tensor_variable(alpha)
        f1 = 0.5 * (1 + alpha)
        f2 = 0.5 * (1 - alpha)
        return f1 * x + f2 * abs(x)


def h_softmax(x, batch_size, n_outputs, n_classes, n_outputs_per_class,
              W1, b1, W2, b2, target=None):
    """ Two-level hierarchical softmax.

    This function implements a two-layer hierarchical softmax. It is commonly
    used as an alternative to the regular softmax when the number of outputs
    is large (it is common to use it for millions of outputs). See
    reference [1]_ for more information about the computational gains.

    The `n_outputs` outputs are organized in `n_classes` classes, each class
    containing the same number `n_outputs_per_class` of outputs.
    For an input `x` (the last hidden activation), the first softmax layer
    predicts its class and the second softmax layer predicts its output
    within that class.

    If `target` is specified, only the outputs of the corresponding targets
    are computed. Otherwise, if `target` is `None`, all the outputs are
    computed.

    The outputs are grouped in classes in the same order as they are initially
    defined: if `n_outputs=10` and `n_classes=2`, then the first class is
    composed of the outputs labeled `{0,1,2,3,4}` while the second class is
    composed of `{5,6,7,8,9}`. If you need to change the classes, you have to
    re-label your outputs.

    .. versionadded:: 0.7.1

    Parameters
    ----------
    x: tensor of shape (batch_size, number of features)
        the minibatch input of the two-layer hierarchical softmax.
    batch_size: int
        the size of the minibatch input x.
    n_outputs: int
        the number of outputs.
    n_classes: int
        the number of classes of the two-layer hierarchical softmax. It
        corresponds to the number of outputs of the first softmax. See note at
        the end.
    n_outputs_per_class: int
        the number of outputs per class. See note at the end.
2294 W1: tensor of shape (number of features of the input x, n_classes) 2295 the weight matrix of the first softmax, which maps the input x to the 2296 probabilities of the classes. 2297 b1: tensor of shape (n_classes,) 2298 the bias vector of the first softmax layer. 2299 W2: tensor of shape (n_classes, number of features of the input x, 2300 n_outputs_per_class) 2301 the weight matrix of the second softmax, which maps the input x to 2302 the probabilities of the outputs. 2303 b2: tensor of shape (n_classes, n_outputs_per_class) 2304 the bias vector of the second softmax layer. 2305 target: tensor of shape either (batch_size,) or (batch_size, 1) 2306 (optional, default None) 2307 contains the indices of the targets for the minibatch 2308 input x. For each input, the function computes the output for its 2309 corresponding target. If target is None, then all the outputs are 2310 computed for each input. 2311 2312 Returns 2313 ------- 2314 tensor of shape (`batch_size`, `n_outputs`) or (`batch_size`, 1) 2315 Output tensor of the two-layer hierarchical softmax for input `x`. 2316 Depending on argument `target`, it can have two different shapes. 2317 If `target` is not specified (`None`), then all the outputs are 2318 computed and the returned tensor has shape (`batch_size`, `n_outputs`). 2319 Otherwise, when `target` is specified, only the corresponding outputs 2320 are computed and the returned tensor has thus shape (`batch_size`, 1). 2321 2322 Notes 2323 ----- 2324 The product of `n_outputs_per_class` and `n_classes` has to be greater or 2325 equal to `n_outputs`. If it is strictly greater, then the irrelevant 2326 outputs will be ignored. 2327 `n_outputs_per_class` and `n_classes` have to be the same as the 2328 corresponding dimensions of the tensors of `W1`, `b1`, `W2` and `b2`. 2329 The most computational efficient configuration is when 2330 `n_outputs_per_class` and `n_classes` are equal to the square root of 2331 `n_outputs`. 2332 2333 Examples 2334 -------- 2335 The following example builds a simple hierarchical softmax layer. 2336 2337 >>> import numpy as np 2338 >>> import theano 2339 >>> from theano import tensor 2340 >>> from theano.tensor.nnet import h_softmax 2341 >>> 2342 >>> # Parameters 2343 >>> batch_size = 32 2344 >>> n_outputs = 100 2345 >>> dim_x = 10 # dimension of the input 2346 >>> n_classes = int(np.ceil(np.sqrt(n_outputs))) 2347 >>> n_outputs_per_class = n_classes 2348 >>> output_size = n_outputs_per_class * n_outputs_per_class 2349 >>> 2350 >>> # First level of h_softmax 2351 >>> floatX = theano.config.floatX 2352 >>> W1 = theano.shared( 2353 ... np.random.normal(0, 0.001, (dim_x, n_classes)).astype(floatX)) 2354 >>> b1 = theano.shared(np.zeros((n_classes,), floatX)) 2355 >>> 2356 >>> # Second level of h_softmax 2357 >>> W2 = np.random.normal(0, 0.001, 2358 ... size=(n_classes, dim_x, n_outputs_per_class)).astype(floatX) 2359 >>> W2 = theano.shared(W2) 2360 >>> b2 = theano.shared(np.zeros((n_classes, n_outputs_per_class), floatX)) 2361 >>> 2362 >>> # We can now build the graph to compute a loss function, typically the 2363 >>> # negative log-likelihood: 2364 >>> 2365 >>> x = tensor.imatrix('x') 2366 >>> target = tensor.imatrix('target') 2367 >>> 2368 >>> # This only computes the output corresponding to the target. 2369 >>> # The complexity is O(n_classes + n_outputs_per_class). 2370 >>> y_hat_tg = h_softmax(x, batch_size, output_size, n_classes, 2371 ... 
n_outputs_per_class, W1, b1, W2, b2, target) 2372 >>> 2373 >>> negll = -tensor.mean(tensor.log(y_hat_tg)) 2374 >>> 2375 >>> # We may need to compute all the outputs (at test time usually): 2376 >>> 2377 >>> # This computes all the outputs. 2378 >>> # The complexity is O(n_classes * n_outputs_per_class). 2379 >>> output = h_softmax(x, batch_size, output_size, n_classes, 2380 ... n_outputs_per_class, W1, b1, W2, b2) 2381 2382 2383 References 2384 ---------- 2385 .. [1] J. Goodman, "Classes for Fast Maximum Entropy Training," 2386 ICASSP, 2001, <http://arxiv.org/abs/cs/0108006>`. 2387 """ 2388 2389 # First softmax that computes the probabilities of belonging to each class 2390 class_probs = theano.tensor.nnet.softmax(tensor.dot(x, W1) + b1) 2391 2392 if target is None: # Computes the probabilites of all the outputs 2393 2394 # Second softmax that computes the output probabilities 2395 activations = tensor.tensordot(x, W2, (1, 1)) + b2 2396 output_probs = theano.tensor.nnet.softmax( 2397 activations.reshape((-1, n_outputs_per_class))) 2398 output_probs = output_probs.reshape((batch_size, n_classes, -1)) 2399 output_probs = class_probs.dimshuffle(0, 1, 'x') * output_probs 2400 output_probs = output_probs.reshape((batch_size, -1)) 2401 # output_probs.shape[1] is n_classes * n_outputs_per_class, which might 2402 # be greater than n_outputs, so we ignore the potential irrelevant 2403 # outputs with the next line: 2404 output_probs = output_probs[:, :n_outputs] 2405 2406 else: # Computes the probabilities of the outputs specified by the targets 2407 2408 target = target.flatten() 2409 2410 # Classes to which belong each target 2411 target_classes = target // n_outputs_per_class 2412 2413 # Outputs to which belong each target inside a class 2414 target_outputs_in_class = target % n_outputs_per_class 2415 2416 # Second softmax that computes the output probabilities 2417 activations = sparse_block_dot( 2418 W2.dimshuffle('x', 0, 1, 2), x.dimshuffle(0, 'x', 1), 2419 tensor.zeros((batch_size, 1), dtype='int32'), b2, 2420 target_classes.dimshuffle(0, 'x')) 2421 2422 output_probs = theano.tensor.nnet.softmax(activations.dimshuffle(0, 2)) 2423 target_class_probs = class_probs[tensor.arange(batch_size), 2424 target_classes] 2425 output_probs = output_probs[tensor.arange(batch_size), 2426 target_outputs_in_class] 2427 output_probs = target_class_probs * output_probs 2428 2429 return output_probs 2430 2431 2432def elu(x, alpha=1): 2433 """ 2434 Compute the element-wise exponential linear activation function [2]_. 2435 2436 .. versionadded:: 0.8.0 2437 2438 Parameters 2439 ---------- 2440 x : symbolic tensor 2441 Tensor to compute the activation function for. 2442 alpha : scalar 2443 2444 2445 Returns 2446 ------- 2447 symbolic tensor 2448 Element-wise exponential linear activation function applied to `x`. 2449 2450 References 2451 ----- 2452 .. [2] Djork-Arne Clevert, Thomas Unterthiner, Sepp Hochreiter 2453 "Fast and Accurate Deep Network Learning by 2454 Exponential Linear Units (ELUs)" <http://arxiv.org/abs/1511.07289>`. 2455 """ 2456 return tensor.switch(x > 0, x, alpha * tensor.expm1(x)) 2457 2458 2459def selu(x): 2460 """Compute the element-wise Scaled Exponential Linear unit [3]_. 2461 2462 .. versionadded:: 0.9.0 2463 2464 Parameters 2465 ---------- 2466 x : symbolic tensor 2467 Tensor to compute the activation function for. 2468 2469 Returns 2470 ------- 2471 symbolic tensor 2472 Element-wise scaled exponential linear activation function applied to `x`. 2473 2474 References 2475 ---------- 2476 .. 
[3] Klambauer G, Unterthiner T, Mayr A, Hochreiter S.
        "Self-Normalizing Neural Networks" <https://arxiv.org/abs/1706.02515>
    """
    alpha = 1.6732632423543772848170429916717
    scale = 1.0507009873554804934193349852946
    return scale * elu(x, alpha)


class ScalarSoftsign(theano.scalar.UnaryScalarOp):
    """
    Softsign activation function
    :math:`\\varphi(x) = \\frac{x}{1 + |x|}`

    """
    @staticmethod
    def static_impl(x):
        return x / (1.0 + abs(x))

    def impl(self, x):
        return ScalarSoftsign.static_impl(x)

    def grad(self, inp, grads):
        x, = inp
        gz, = grads
        if 'float' in x.type.dtype:
            d = (1.0 + abs(x))
            return [gz / (d * d)]
        else:
            return NotImplemented

    def c_code(self, node, name, inp, out, sub):
        x, = inp
        z, = out
        if node.inputs[0].type in [theano.scalar.float32,
                                   theano.scalar.float64]:
            return "%(z)s = %(x)s / (1.0+fabs(%(x)s));" % locals()
        raise NotImplementedError('only floating point x is implemented')

scalar_softsign = ScalarSoftsign(theano.scalar.upgrade_to_float,
                                 name='scalar_softsign')
softsign = elemwise.Elemwise(scalar_softsign, name='softsign')


def confusion_matrix(actual, pred):
    """
    Compute the confusion matrix of two vectors containing the actual
    observations and the predicted observations, respectively.

    Parameters
    ----------
    actual : 1-d tensor variable
    pred : 1-d tensor variable

    Returns
    -------
    conf_mat : Confusion matrix of actual and predicted observations; rows
        are indexed by the actual values and columns by the predicted values,
        as laid out below.

                   | Predicted
        ___________|___________
            Actual |
                   |

    order : 1-d array of the labels indexing the rows and columns of
        `conf_mat`.

    Examples
    --------
    >>> import theano
    >>> from theano.tensor.nnet import confusion_matrix

    >>> x = theano.tensor.vector()
    >>> y = theano.tensor.vector()
    >>> f = theano.function([x, y], confusion_matrix(x, y))
    >>> y_true = [2, 0, 2, 2, 0, 1]
    >>> y_pred = [0, 0, 2, 2, 0, 2]
    >>> print(f(y_true, y_pred))
    [array([[2, 0, 0],
           [0, 0, 1],
           [1, 0, 2]]), array([ 0.,  1.,  2.])]
    """
    if actual.ndim != 1:
        raise ValueError('actual must be 1-d tensor variable')
    if pred.ndim != 1:
        raise ValueError('pred must be 1-d tensor variable')

    order = extra_ops.Unique(False, False, False)(tensor.concatenate([actual, pred]))

    colA = actual.dimshuffle(0, 'x')
    colP = pred.dimshuffle(0, 'x')

    oneHotA = tensor.eq(colA, order).astype('int64')
    oneHotP = tensor.eq(colP, order).astype('int64')

    conf_mat = tensor.dot(oneHotA.T, oneHotP)
    return [conf_mat, order]
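

if __name__ == "__main__":
    # Illustrative usage sketch only (not part of Theano's public API), kept
    # under a __main__ guard so that importing this module is unaffected.
    # It compiles the activation helpers defined above and evaluates them on
    # a tiny input; the variable names below are arbitrary.
    _x = tensor.matrix('x')
    _f = theano.function([_x], [relu(_x), elu(_x), selu(_x), softsign(_x)])
    _demo = np.array([[-2.0, -0.5, 0.0, 0.5, 2.0]],
                     dtype=theano.config.floatX)
    for _name, _out in zip(['relu', 'elu', 'selu', 'softsign'], _f(_demo)):
        print(_name, _out)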