1"""Driver for gradient calculations.""" 2from __future__ import absolute_import, print_function, division 3from collections import OrderedDict 4import six.moves.builtins as builtins 5import logging 6import time 7import warnings 8 9import numpy as np # for numeric_grad 10from six import itervalues 11 12import theano 13 14from theano import gof 15from theano.gof import utils, Variable 16from theano.compat import izip 17from six.moves import xrange, reduce 18from theano.gof.null_type import NullType, null_type 19from theano.gof.op import get_debug_values 20from theano.compile import ViewOp, FAST_RUN, DebugMode, get_mode 21 22__authors__ = "James Bergstra, Razvan Pascanu, Arnaud Bergeron, Ian Goodfellow" 23__copyright__ = "(c) 2011, Universite de Montreal" 24__license__ = "3-clause BSD License" 25__contact__ = "theano-dev <theano-dev@googlegroups.com>" 26 27__docformat__ = "restructuredtext en" 28_logger = logging.getLogger('theano.gradient') 29 30# we can't do "import theano.tensor" 31# tensor depends on theano.compile 32# theano.compile depends on theano.gradient (this file) 33# the reason theano.compile depends on theano.gradient 34# is that theano.compile.builders contains the op from graph 35# functionality and it uses theano.gradient to implement 36# the new op's grad method 37tensor = None 38 39_msg_retType = 'op.grad(...) returned a non-list' 40 41grad_time = 0 42 43 44def format_as(use_list, use_tuple, outputs): 45 """ 46 Formats the outputs according to the flags `use_list` and `use_tuple`. 47 48 If `use_list` is True, `outputs` is returned as a list (if `outputs` 49 is not a list or a tuple then it is converted in a one element list). 50 If `use_tuple` is True, `outputs` is returned as a tuple (if `outputs` 51 is not a list or a tuple then it is converted into a one element tuple). 52 Otherwise (if both flags are false), `outputs` is returned. 53 """ 54 assert not (use_list and use_tuple), \ 55 "Both flags cannot be simultaneously True" 56 if (use_list or use_tuple) and not isinstance(outputs, (list, tuple)): 57 if use_list: 58 return [outputs] 59 else: 60 return (outputs,) 61 elif not (use_list or use_tuple) and isinstance(outputs, (list, tuple)): 62 assert len(outputs) == 1, \ 63 "Wrong arguments. Expected a one element list" 64 return outputs[0] 65 elif use_list or use_tuple: 66 if use_list: 67 return list(outputs) 68 else: 69 return tuple(outputs) 70 else: 71 return outputs 72 73 74def grad_not_implemented(op, x_pos, x, comment=""): 75 """ 76 Return an un-computable symbolic variable of type `x.type`. 77 78 If any call to tensor.grad results in an expression containing this 79 un-computable variable, an exception (NotImplementedError) will be 80 raised indicating that the gradient on the 81 `x_pos`'th input of `op` has not been implemented. Likewise if 82 any call to theano.function involves this variable. 83 84 Optionally adds a comment to the exception explaining why this 85 gradient is not implemented. 86 """ 87 88 return (NullType(( 89 "This variable is Null because the grad method for " 90 "input %s (%s) of the %s op is not implemented. %s" 91 ) % (x_pos, x, op, comment)))() 92 93 94def grad_undefined(op, x_pos, x, comment=""): 95 """ 96 Return an un-computable symbolic variable of type `x.type`. 97 98 If any call to tensor.grad results in an expression containing this 99 un-computable variable, an exception (GradUndefinedError) will be 100 raised indicating that the gradient on the 101 `x_pos`'th input of `op` is mathematically undefined. 

class DisconnectedType(theano.gof.type.Type):

    """ A type indicating that a variable is a result
    of taking the gradient of c with respect to x
    when c is not a function of x.
    A symbolic placeholder for 0, but to convey
    the extra information that this gradient is 0
    because it is disconnected.
    """

    def filter(self, data, strict=False, allow_downcast=None):
        raise AssertionError(
            (
                "If you're assigning to a DisconnectedType you're"
                " doing something wrong. It should only be used as"
                " a symbolic placeholder."
            ))

    def filter_variable(self, other):
        raise AssertionError(
            (
                "If you're assigning to a DisconnectedType you're"
                " doing something wrong. It should only be used as"
                " a symbolic placeholder."
            ))

    def may_share_memory(a, b):
        return False

    def value_eq(a, b, force_same_dtype=True):
        raise AssertionError(
            (
                "If you're assigning to a DisconnectedType you're"
                " doing something wrong. It should only be used as"
                " a symbolic placeholder."
            ))

    def __str__(self):
        return 'DisconnectedType'


disconnected_type = DisconnectedType()


########################
# R Operator
########################


def Rop(f, wrt, eval_points, disconnected_outputs="raise",
        return_disconnected="zero"):
    """
    Computes the R operation on `f` wrt to `wrt` at `eval_points`.

    Mathematically this stands for the jacobian of `f` wrt
    to `wrt` right multiplied by the eval points.

    Parameters
    ----------
    f : :class:`~theano.gof.graph.Variable` or list of Variables
        `f` stands for the output of the computational graph to which you
        want to apply the R operator
    wrt : :class:`~theano.gof.graph.Variable` or list of Variables
        variables for which you compute the R operator of the expression
        described by `f`
    eval_points : :class:`~theano.gof.graph.Variable` or list of Variables
        evaluation points for each of the variables in `wrt`
    disconnected_outputs : str
        Defines the behaviour if some of the variables in `f`
        have no dependency on any of the variables in `wrt` (or if
        all links are non-differentiable). The possible values are:

        - 'ignore': considers that the gradient on these parameters is zero.
        - 'warn': consider the gradient zero, and print a warning.
        - 'raise': raise DisconnectedInputError.

    return_disconnected : {'zero', 'None', 'Disconnected'}
        - 'zero' : If wrt[i] is disconnected, return value i will be
          wrt[i].zeros_like()
        - 'None' : If wrt[i] is disconnected, return value i will be
          None
        - 'Disconnected' : returns variables of type DisconnectedType

    Returns
    -------
    :class:`~theano.gof.graph.Variable` or list/tuple of Variables depending on type of f
        Symbolic expression such that
        R_op[i] = sum_j (d f[i] / d wrt[j]) eval_point[j]
        where the indices in that expression are magic multidimensional
        indices that specify both the position within a list and all
        coordinates of the tensor element in the list.
        If `f` is a list/tuple, then return a list/tuple with the results.
    """
    from theano.tensor import as_tensor_variable
    using_list = isinstance(f, list)
    using_tuple = isinstance(f, tuple)
    if not isinstance(wrt, (list, tuple)):
        wrt = [wrt]

    if not isinstance(eval_points, (list, tuple)):
        eval_points = [eval_points]

    if not isinstance(f, (list, tuple)):
        f = [f]

    assert len(wrt) == len(eval_points)

    # Check that each element of wrt corresponds to an element
    # of eval_points with the same dimensionality.
    for pack in enumerate(zip(wrt, eval_points)):
        i = pack[0]
        wrt_elem, eval_point = pack[1]
        if not isinstance(wrt_elem, gof.Variable):
            wrt_elem = as_tensor_variable(wrt_elem)
        if not isinstance(eval_point, gof.Variable):
            eval_point = as_tensor_variable(eval_point)

        try:

            if wrt_elem.type.ndim != eval_point.type.ndim:
                raise ValueError('Element ' +
                                 str(i) +
                                 ' of wrt/eval_point have mismatched ' +
                                 'dimensionality: ' +
                                 str(wrt_elem.type.ndim) +
                                 ' versus ' +
                                 str(eval_point.type.ndim))
        except AttributeError:
            # wrt_elem and eval_point don't always have ndim like random type
            # Tensor, Sparse and GpuArray have the ndim attribute
            pass

    seen_nodes = OrderedDict()

    def _traverse(node):
        """ TODO: writeme """

        if node is None:
            return

        op = node.op
        inputs = node.inputs

        # Compute the evaluation points corresponding to each of the
        # inputs of the node
        local_eval_points = []
        for inp in inputs:
            if inp in wrt:
                local_eval_points.append(eval_points[wrt.index(inp)])
            elif inp.owner is None:
                try:
                    local_eval_points.append(inp.zeros_like())
                except Exception:
                    # None should be used for non-differentiable
                    # arguments, like for example random states
                    local_eval_points.append(None)
            elif inp.owner in seen_nodes:

                local_eval_points.append(
                    seen_nodes[inp.owner][inp.owner.outputs.index(inp)])

            else:
                # We actually need to compute the R_op for this node

                _traverse(inp.owner)
                local_eval_points.append(
                    seen_nodes[inp.owner][inp.owner.outputs.index(inp)])
        same_type_eval_points = []
        for x, y in zip(inputs, local_eval_points):
            if y is not None:
                if not isinstance(x, gof.Variable):
                    x = as_tensor_variable(x)
                if not isinstance(y, gof.Variable):
                    y = as_tensor_variable(y)
                try:
                    y = x.type.filter_variable(y)
                except TypeError:
                    # This is a hack
                    # Originally both grad and Rop were written
                    # with the assumption that a variable and the
                    # gradient wrt that variable would have the same
                    # dtype. This was a bad assumption because the
                    # gradient wrt an integer can take on non-integer
                    # values.
                    # grad is now fixed, but Rop is not, so when grad
                    # does the right thing and violates this assumption
                    # we have to make it be wrong for Rop to keep working
                    # Rop should eventually be upgraded to handle integers
                    # correctly, the same as grad
                    y = theano.tensor.cast(y, x.type.dtype)
                    y = x.type.filter_variable(y)
                assert x.type == y.type
                same_type_eval_points.append(y)
            else:
                same_type_eval_points.append(y)

        seen_nodes[node] = op.R_op(node.inputs, same_type_eval_points)
    # end _traverse

    # Populate the dictionary
    for out in f:
        _traverse(out.owner)

    rval = []
    for out in f:
        if out in wrt:
            rval.append(eval_points[wrt.index(out)])
        elif seen_nodes.get(out.owner, None) is None or \
                seen_nodes[out.owner][out.owner.outputs.index(out)] is None:
            message = ("Rop method was asked to compute the gradient "
                       "with respect to a variable that is not part of "
                       "the computational graph of variables in wrt, or is "
                       "used only by a non-differentiable operator: %s" % out)
            if disconnected_outputs == 'ignore':
                pass
            elif disconnected_outputs == 'warn':
                warnings.warn(message, stacklevel=2)
            elif disconnected_outputs == 'raise':
                message = utils.get_variable_trace_string(out)
                raise DisconnectedInputError(message)
            else:
                raise ValueError("Invalid value for keyword "
                                 "'disconnected_outputs', valid values are "
                                 "'ignore', 'warn' and 'raise'.")
            if return_disconnected.lower() == "zero":
                rval.append(tensor.zeros_like(out))
            elif return_disconnected.lower() == "none":
                rval.append(None)
            elif return_disconnected.lower() == "disconnected":
                rval.append(disconnected_type())
            else:
                raise ValueError("Invalid value for keyword "
                                 "'return_disconnected', valid values are "
                                 "'zero', 'None' and 'Disconnected'.")
        else:
            rval.append(seen_nodes[out.owner][out.owner.outputs.index(out)])

    return format_as(using_list, using_tuple, rval)


def Lop(f, wrt, eval_points, consider_constant=None,
        disconnected_inputs='raise'):
    """
    Computes the L operation on `f` wrt to `wrt` at `eval_points`.

    Mathematically this stands for the jacobian of `f` wrt
    to `wrt` left multiplied by the eval points.

    Parameters
    ----------
    f : :class:`~theano.gof.graph.Variable` or list of Variables
        `f` stands for the output of the computational graph to which you
        want to apply the L operator
    wrt : :class:`~theano.gof.graph.Variable` or list of Variables
        variables for which you compute the L operator of the expression
        described by `f`
    eval_points : :class:`~theano.gof.graph.Variable` or list of Variables
        evaluation points for each of the variables in `f`

    Returns
    -------
    :class:`~theano.gof.Variable` or list/tuple of Variables depending on type of wrt
        Symbolic expression such that
        L_op[j] = sum_i (d f[i] / d wrt[j]) eval_point[i]
        where the indices in that expression are magic multidimensional
        indices that specify both the position within a list and all
        coordinates of the tensor element in the list.
        If `wrt` is a list/tuple, then return a list/tuple with the results.
    """
    if type(eval_points) not in (list, tuple):
        eval_points = [eval_points]

    using_list = isinstance(wrt, list)
    using_tuple = isinstance(wrt, tuple)

    if not isinstance(f, (list, tuple)):
        f = [f]

    # make copies of f and grads so we don't modify the client's copy
    f = list(f)
    grads = list(eval_points)

    if not isinstance(wrt, (list, tuple)):
        wrt = [wrt]

    assert len(f) == len(grads)
    known = OrderedDict(izip(f, grads))

    ret = grad(cost=None, known_grads=known,
               consider_constant=consider_constant, wrt=wrt,
               disconnected_inputs=disconnected_inputs)

    return format_as(using_list, using_tuple, ret)
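
# Illustrative sketch (not part of the public API) of typical Rop/Lop usage:
# `Rop` builds a Jacobian-times-vector product, `Lop` a vector-times-Jacobian
# product. The variable names (`x`, `W`, `v`, `u`) are hypothetical; this
# helper is never called by the library.
def _rop_lop_usage_example():
    """Sketch showing how Rop and Lop are applied to a simple expression."""
    import theano.tensor as tt

    x = tt.vector('x')
    W = tt.matrix('W')
    y = tt.dot(x, W)        # f: a vector-valued expression of x

    v = tt.vector('v')      # direction in the input (wrt) space
    u = tt.vector('u')      # direction in the output (f) space

    # Jacobian of y wrt x, right-multiplied by v: J(x) v
    jv = Rop(y, x, v)
    # Jacobian of y wrt x, left-multiplied by u: u^T J(x)
    uj = Lop(y, x, u)

    f_jv = theano.function([x, W, v], jv)
    f_uj = theano.function([x, W, u], uj)
    return f_jv, f_uj
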
383 """ 384 if type(eval_points) not in (list, tuple): 385 eval_points = [eval_points] 386 387 using_list = isinstance(wrt, list) 388 using_tuple = isinstance(wrt, tuple) 389 390 if not isinstance(f, (list, tuple)): 391 f = [f] 392 393 # make copies of f and grads so we don't modify the client's copy 394 f = list(f) 395 grads = list(eval_points) 396 397 if not isinstance(wrt, (list, tuple)): 398 wrt = [wrt] 399 400 assert len(f) == len(grads) 401 known = OrderedDict(izip(f, grads)) 402 403 ret = grad(cost=None, known_grads=known, 404 consider_constant=consider_constant, wrt=wrt, 405 disconnected_inputs=disconnected_inputs) 406 407 return format_as(using_list, using_tuple, ret) 408 409 410######################### 411# Gradient 412######################### 413 414def grad(cost, wrt, consider_constant=None, 415 disconnected_inputs='raise', add_names=True, 416 known_grads=None, return_disconnected='zero', 417 null_gradients='raise'): 418 """ 419 Return symbolic gradients of one cost with respect to one or more variables. 420 421 For more information about how automatic differentiation works in Theano, 422 see :mod:`gradient`. For information on how to implement the gradient of 423 a certain Op, see :func:`grad`. 424 425 Parameters 426 ---------- 427 cost : :class:`~theano.gof.graph.Variable` scalar (0-dimensional) tensor variable or ``None`` 428 Value that we are differentiating (that we want the gradient of). 429 May be `None` if `known_grads` is provided. 430 wrt : :class:`~theano.gof.graph.Variable` or list of Variables 431 Term[s] with respect to which we want gradients 432 consider_constant : list of variables 433 Expressions not to backpropagate through 434 disconnected_inputs : {'ignore', 'warn', 'raise'} 435 Defines the behaviour if some of the variables in `wrt` are 436 not part of the computational graph computing `cost` (or if 437 all links are non-differentiable). The possible values are: 438 439 - 'ignore': considers that the gradient on these parameters is zero. 440 - 'warn': consider the gradient zero, and print a warning. 441 - 'raise': raise DisconnectedInputError. 442 add_names : bool 443 If True, variables generated by grad will be named 444 (d<cost.name>/d<wrt.name>) provided that both cost and wrt 445 have names 446 known_grads : OrderedDict, optional 447 A ordered dictionary mapping variables to their gradients. This is 448 useful in the case where you know the gradient on some 449 variables but do not know the original cost. 450 return_disconnected : {'zero', 'None', 'Disconnected'} 451 - 'zero' : If wrt[i] is disconnected, return value i will be 452 wrt[i].zeros_like() 453 - 'None' : If wrt[i] is disconnected, return value i will be 454 None 455 - 'Disconnected' : returns variables of type DisconnectedType 456 null_gradients : {'raise', 'return'} 457 Defines the behaviour if some of the variables in `wrt` have a 458 null gradient. The possibles values are: 459 460 - 'raise' : raise a NullTypeGradError exception 461 - 'return' : return the null gradients 462 463 Returns 464 ------- 465 variable or list/tuple of variables (matches `wrt`) 466 Symbolic expression of gradient of `cost` with respect to each 467 of the `wrt` terms. If an element of `wrt` is not 468 differentiable with respect to the output, then a zero 469 variable is returned. 
470 471 """ 472 t0 = time.time() 473 global tensor 474 if tensor is None: 475 from theano import tensor 476 477 if cost is None: 478 if known_grads is None: 479 raise AssertionError("cost and known_grads can't both be None.") 480 481 if cost is not None and isinstance(cost.type, NullType): 482 raise ValueError("Can't differentiate a NaN cost." 483 "cost is NaN because " + 484 cost.type.why_null) 485 486 if cost is not None and cost.ndim != 0: 487 raise TypeError("cost must be a scalar.") 488 489 if isinstance(wrt, set): 490 raise TypeError("wrt must not be a set. sets have no defined " 491 "iteration order, so we can't return gradients in a" 492 " matching order.") 493 494 using_list = isinstance(wrt, list) 495 using_tuple = isinstance(wrt, tuple) 496 if not using_list and not using_tuple: 497 wrt = [wrt] 498 499 for elem in wrt: 500 if not isinstance(elem, Variable): 501 raise TypeError("Expected Variable, got " + str(elem) + 502 " of type " + str(type(elem))) 503 504 outputs = [] 505 if cost is not None: 506 outputs.append(cost) 507 if known_grads is not None: 508 outputs.extend(list(known_grads.keys())) 509 510 var_to_app_to_idx = _populate_var_to_app_to_idx( 511 outputs, wrt, consider_constant) 512 513 # build a dict mapping var to the gradient of cost with respect to var 514 grad_dict = OrderedDict() 515 516 if known_grads is None: 517 known_grads = OrderedDict() 518 else: 519 m = "known_grads must be an OrderedDict. " 520 assert isinstance(known_grads, OrderedDict) or len(known_grads) <= 1, m 521 522 # The gradient of the cost is 1 unless specified otherwise by known_grads. 523 if cost is not None: 524 if cost in known_grads: 525 g_cost = known_grads[cost] 526 else: 527 g_cost = _float_ones_like(cost) 528 # g_cost may be Disconnected or NullType. A creative use of the 529 # function, sure, but nonetheless one we can and should support. 530 # So before we try to cast it make sure it even has a dtype 531 if (hasattr(g_cost.type, 'dtype') and 532 cost.type.dtype in tensor.continuous_dtypes): 533 # Here we enforce the constraint that floating point variables 534 # have the same dtype as their gradient. 535 g_cost = g_cost.astype(cost.type.dtype) 536 # DO NOT enforce g_cost to be 0 if cost is an integer. 537 # This is to be enforced by the Op.grad method for the 538 # Op that outputs cost. 539 if hasattr(g_cost.type, 'dtype'): 540 assert g_cost.type.dtype in tensor.continuous_dtypes 541 542 grad_dict[cost] = g_cost 543 544 for var in known_grads: 545 g_var = known_grads[var] 546 547 if not hasattr(g_var, 'type'): 548 raise TypeError('output grads must be theano variables.' 
549 'Ambiguous whether %s should be made into tensor' 550 ' or sparse theano variable' % str(type(g_var))) 551 552 if (not isinstance(g_var.type, (NullType, DisconnectedType)) and 553 'float' not in str(g_var.type.dtype)): 554 raise TypeError("Gradients must always be NullType, " 555 "DisconnectedType, or continuous, but grad was " 556 "given a known_grad of type " + str(g_var.type)) 557 558 # DO NOT check that these gradients are equal to 0 if var is int 559 # The gradient is allowed to be non-zero on var in that case 560 # Ops outputing var should not backpropagate its gradient further 561 # but that is enforced elsewhere (grep for only_connected_to_int) 562 563 grad_dict[var] = g_var 564 565 def handle_disconnected(var): 566 message = ("grad method was asked to compute the gradient " 567 "with respect to a variable that is not part of " 568 "the computational graph of the cost, or is used " 569 "only by a non-differentiable operator: %s" % var) 570 if disconnected_inputs == 'ignore': 571 pass 572 elif disconnected_inputs == 'warn': 573 warnings.warn(message, stacklevel=2) 574 elif disconnected_inputs == 'raise': 575 message = utils.get_variable_trace_string(var) 576 raise DisconnectedInputError(message) 577 else: 578 raise ValueError("Invalid value for keyword " 579 "'disconnected_inputs', valid values are " 580 "'ignore', 'warn' and 'raise'.") 581 582 # variables that do not influence the cost have zero gradient. 583 # if wrt is such a variable, populate the grad_dict with this info 584 # so that wrt not being in var_to_app_to_idx won't cause an error below 585 # according to the flag, possibly raise an error if wrt is disconnected 586 for elem in wrt: 587 if elem not in var_to_app_to_idx and elem is not cost \ 588 and elem not in grad_dict: 589 handle_disconnected(elem) 590 grad_dict[elem] = disconnected_type() 591 592 cost_name = None 593 if add_names and cost is not None: 594 cost_name = cost.name 595 596 # Make sure we didn't initialize the grad_dict with any ints 597 # The gradient may NEVER be an int, even if the variable is an int. 598 # Read the Op contract and talk to Ian Goodfellow before changing this! 599 for var in grad_dict: 600 g = grad_dict[var] 601 if hasattr(g.type, 'dtype'): 602 assert g.type.dtype in tensor.float_dtypes 603 604 rval = _populate_grad_dict(var_to_app_to_idx, 605 grad_dict, wrt, cost_name) 606 607 for i in xrange(len(rval)): 608 if isinstance(rval[i].type, NullType): 609 if null_gradients == 'raise': 610 raise NullTypeGradError("tensor.grad encountered a NaN. " + 611 rval[i].type.why_null) 612 else: 613 assert null_gradients == 'return' 614 if isinstance(rval[i].type, DisconnectedType): 615 handle_disconnected(rval[i]) 616 if return_disconnected == 'zero': 617 rval[i] = _float_zeros_like(wrt[i]) 618 elif return_disconnected == 'None': 619 rval[i] = None 620 else: 621 assert return_disconnected == 'Disconnected' 622 623 if using_tuple: 624 rval = tuple(rval) 625 elif not using_list: 626 rval, = rval 627 t1 = time.time() 628 global grad_time 629 grad_time += t1 - t0 630 return rval 631 632 633def subgraph_grad(wrt, end, start=None, cost=None, details=False): 634 ''' 635 With respect to `wrt`, computes gradients of cost and/or from 636 existing `start` gradients, up to the `end` variables of a 637 symbolic digraph. In other words, computes gradients for a 638 subgraph of the symbolic theano function. Ignores all disconnected 639 inputs. 640 641 This can be useful when one needs to perform the gradient descent 642 iteratively (e.g. 
one layer at a time in an MLP), or when a 643 particular operation is not differentiable in theano 644 (e.g. stochastic sampling from a multinomial). In the latter case, 645 the gradient of the non-differentiable process could be 646 approximated by user-defined formula, which could be calculated 647 using the gradients of a cost with respect to samples (0s and 648 1s). These gradients are obtained by performing a subgraph_grad 649 from the `cost` or previously known gradients (`start`) up to the 650 outputs of the stochastic process (`end`). A dictionary mapping 651 gradients obtained from the user-defined differentiation of the 652 process, to variables, could then be fed into another 653 subgraph_grad as `start` with any other `cost` (e.g. weight 654 decay). 655 656 In an MLP, we could use subgraph_grad to iteratively backpropagate: 657 658 .. code-block:: python 659 660 x, t = theano.tensor.fvector('x'), theano.tensor.fvector('t') 661 w1 = theano.shared(np.random.randn(3,4)) 662 w2 = theano.shared(np.random.randn(4,2)) 663 a1 = theano.tensor.tanh(theano.tensor.dot(x,w1)) 664 a2 = theano.tensor.tanh(theano.tensor.dot(a1,w2)) 665 cost2 = theano.tensor.sqr(a2 - t).sum() 666 cost2 += theano.tensor.sqr(w2.sum()) 667 cost1 = theano.tensor.sqr(w1.sum()) 668 669 params = [[w2],[w1]] 670 costs = [cost2,cost1] 671 grad_ends = [[a1], [x]] 672 673 next_grad = None 674 param_grads = [] 675 for i in xrange(2): 676 param_grad, next_grad = theano.subgraph_grad( 677 wrt=params[i], end=grad_ends[i], 678 start=next_grad, cost=costs[i] 679 ) 680 next_grad = dict(zip(grad_ends[i], next_grad)) 681 param_grads.extend(param_grad) 682 683 Parameters 684 ---------- 685 686 wrt : list of variables 687 Gradients are computed with respect to `wrt`. 688 689 end : list of variables 690 Theano variables at which to end gradient descent (they are 691 considered constant in theano.grad). For convenience, the 692 gradients with respect to these variables are also returned. 693 694 start : dictionary of variables 695 If not None, a dictionary mapping variables to their 696 gradients. This is useful when the gradient on some variables 697 are known. These are used to compute the gradients backwards up 698 to the variables in `end` (they are used as known_grad in 699 theano.grad). 700 701 cost : :class:`~theano.gof.Variable` scalar (0-dimensional) variable 702 Additional costs for which to compute the gradients. For 703 example, these could be weight decay, an l1 constraint, MSE, 704 NLL, etc. May optionally be None if start is provided. 705 706 .. warning:: 707 708 If the gradients of `cost` with respect to any of the `start` 709 variables is already part of the `start` dictionary, then it 710 may be counted twice with respect to `wrt` and `end`. 711 712 details : bool 713 When True, additionally returns the list of gradients from 714 `start` and of `cost`, respectively, with respect to `wrt` (not 715 `end`). 716 717 Returns 718 ------- 719 Tuple of 2 or 4 Lists of Variables 720 Returns lists of gradients with respect to `wrt` and `end`, 721 respectively. 722 723 724 .. 
versionadded:: 0.7 725 ''' 726 assert ((cost is not None) or (start is not None)) 727 assert isinstance(end, list) 728 assert isinstance(wrt, list) 729 if start is not None: 730 assert isinstance(start, dict) 731 732 params = list(set(wrt + end)) 733 734 start_grads = None 735 cost_grads = None 736 if start is not None: 737 start_grads = list( 738 theano.grad( 739 cost=None, wrt=params, known_grads=start, 740 consider_constant=end, 741 disconnected_inputs='ignore' 742 ) 743 ) 744 745 if cost is not None: 746 cost_grads = list( 747 theano.grad( 748 cost=cost, wrt=params, 749 consider_constant=end, 750 disconnected_inputs='ignore' 751 ) 752 ) 753 754 grads = None 755 if start is None: 756 grads = cost_grads 757 else: 758 grads = start_grads 759 if cost_grads is not None: 760 for i in range(len(grads)): 761 grads[i] += cost_grads[i] 762 763 pgrads = OrderedDict(izip(params, grads)) 764 # separate wrt from end grads: 765 wrt_grads = list(pgrads[k] for k in wrt) 766 end_grads = list(pgrads[k] for k in end) 767 768 if details: 769 return wrt_grads, end_grads, start_grads, cost_grads 770 771 return wrt_grads, end_grads 772 773 774def _node_to_pattern(node): 775 """ given an apply node, obtain its connection pattern 776 this is just a wrapper around Op.connection_pattern 777 that does type checking and supplies the default value 778 if the method is not implemented 779 """ 780 781 if hasattr(node.op, 'connection_pattern'): 782 connection_pattern = node.op.connection_pattern(node) 783 784 if not isinstance(connection_pattern, list): 785 raise TypeError( 786 "Op.connection_pattern should return " + 787 ("list of list of bool, but for Op=%s" % node.op) + 788 "got %s with type %s." % (connection_pattern, 789 type(connection_pattern))) 790 if len(connection_pattern) != len(node.inputs): 791 raise ValueError( 792 '%s.connection_pattern should have %d' % 793 (node.op, len(node.inputs)) + ' rows but has %d.' % 794 len(connection_pattern)) 795 for ii, output_pattern in enumerate(connection_pattern): 796 if not isinstance(output_pattern, list): 797 raise TypeError( 798 '%s.connection_pattern should return' % 799 node.op + ' a list of lists, but element %d' % ii + 800 'is %s of type %s.' % (output_pattern, 801 type(output_pattern))) 802 else: 803 connection_pattern = [[True for output in node.outputs] 804 for ipt in node.inputs] 805 assert isinstance(connection_pattern, list) 806 assert len(connection_pattern) == len(node.inputs) 807 for ii in xrange(len(node.inputs)): 808 assert isinstance(connection_pattern[ii], list) 809 assert len(connection_pattern[ii]) == len(node.outputs) 810 return connection_pattern 811 812 813def _populate_var_to_app_to_idx(outputs, wrt, consider_constant): 814 """ 815 Helper function for grad function. 816 817 Parameters 818 ---------- 819 outputs 820 a list of variables we want to take gradients of 821 822 wrt 823 a list of variables we want to take the gradient with 824 respect to. 825 826 consider_constant 827 a list of variables not to backpropagate through. 828 829 Returns 830 ------- 831 var_to_app_to_idx: 832 A dictionary mapping a variable to a second dictionary. 833 The second dictionary maps apply nodes acting on this 834 variable to the variable's index in the apply node's 835 input list. 836 837 This dictionary will only contain variables that 838 meet two criteria: 839 840 1) The elements of at least one output are a 841 function of the elements of the variable 842 843 2) The elements of the variable are a function of the 844 elements of at least one member of wrt. 
845 846 This set is exactly the set of variables that connect 847 the variables in wrt to the cost being differentiated. 848 849 (A variable in consider_constant is not a function of 850 anything) 851 852 """ 853 854 # Validate and format consider_constant 855 if consider_constant is None: 856 consider_constant = [] 857 else: 858 # error checking on consider_constant: verify that it is a collection 859 # of theano variables 860 # this is important, if someone accidentally passes a nested data 861 # structure with theano variables at the leaves, only the root will 862 # be properly considered constant 863 try: 864 iter(consider_constant) 865 except TypeError: 866 raise TypeError('consider_constant must be an iterable collection,' 867 ' got ' + str(type(consider_constant))) 868 for elem in consider_constant: 869 if not isinstance(elem, gof.Variable): 870 raise TypeError('Elements of consider_constant must be ' 871 'variables, but got ' + str(type(elem))) 872 873 # var_to_app_to_idx[var][node] = [i,j] means node has 874 # var as input at positions i and j 875 var_to_app_to_idx = OrderedDict() 876 877 # Set of variables that have been added to their true parents 878 # ('true' here means that the elements of the variable are a function 879 # of the elements of the parent, according to the op's 880 # connection_pattern) 881 # Note: we need to revisit the apply nodes repeatedly, because 882 # different outputs of the apply node are connected to 883 # different subsets of the inputs. 884 accounted_for = set([]) 885 886 def account_for(var): 887 # Don't visit the same variable twice 888 if var in accounted_for: 889 return 890 accounted_for.add(var) 891 892 # Constants are not a function of anything 893 if var in consider_constant: 894 return 895 896 # Recursively add the variables that this variable is 897 # a function of. 898 if var.owner is not None: 899 app = var.owner 900 901 connection_pattern = _node_to_pattern(app) 902 903 var_idx = app.outputs.index(var) 904 905 for i, ipt in enumerate(app.inputs): 906 907 # don't process ipt if it is not a true 908 # parent of var 909 if not connection_pattern[i][var_idx]: 910 continue 911 912 if ipt not in var_to_app_to_idx: 913 # This object here *must* be an OrderedDict, because 914 # we iterate over its keys when adding up the terms of the 915 # gradient on ipt. If it is a regular dict, the grad method 916 # will return something that is analytically correct, but 917 # whose order of doing additions depends on the memory 918 # location of the apply nodes. 919 var_to_app_to_idx[ipt] = OrderedDict() 920 app_to_idx = var_to_app_to_idx[ipt] 921 if app not in app_to_idx: 922 app_to_idx[app] = [] 923 idx = app_to_idx[app] 924 if i not in idx: 925 idx.append(i) 926 account_for(ipt) 927 928 # add all variables that are true ancestors of the cost 929 for output in outputs: 930 account_for(output) 931 932 # determine which variables have elements of wrt as a true 933 # ancestor. 
Do this with an upward pass starting from wrt, 934 # following only true connections 935 visited = set([]) 936 937 def visit(var): 938 if var in visited: 939 return 940 if var not in var_to_app_to_idx: 941 return 942 visited.add(var) 943 nodes = var_to_app_to_idx[var] 944 for node in nodes: 945 connection_pattern = _node_to_pattern(node) 946 for idx in nodes[node]: 947 for ii, output in enumerate(node.outputs): 948 if connection_pattern[idx][ii]: 949 visit(output) 950 951 for elem in wrt: 952 visit(elem) 953 954 # Remove variables that don't have wrt as a true ancestor 955 orig_vars = list(var_to_app_to_idx.keys()) 956 for var in orig_vars: 957 if var not in visited: 958 del var_to_app_to_idx[var] 959 960 return var_to_app_to_idx 961 962 963class NullTypeGradError(TypeError): 964 """ 965 Raised when grad encounters a NullType. 966 """ 967 968 969class DisconnectedInputError(ValueError): 970 """ 971 Raised when grad is asked to compute the gradient 972 with respect to a disconnected input and 973 disconnected_inputs='raise'. 974 """ 975 976 977def _populate_grad_dict(var_to_app_to_idx, 978 grad_dict, wrt, cost_name=None): 979 """Helper function for grad function. 980 981 Parameters 982 ---------- 983 var_to_app_to_idx : dict 984 a dictionary mapping a variable to a second dictionary. 985 the second dictionary maps apply nodes acting on 986 this variable to the variable's index in the apply 987 node's input list 988 grad_dict : dict 989 A dictionary mapping variables to their gradients. 990 Should be populated by grad function, which should: 991 992 - Set the gradient with respect to the cost to 1 993 - Load all gradients from known_grads, possibly 994 overriding the cost 995 - Set the gradient for disconnected 996 inputs to a variable with type DisconnectedType() 997 998 wrt : list of Variables 999 the minimal set of variables that must be included in `grad_dict` 1000 cost_name: string 1001 The name of the cost being differentiated, optional. 
1002 Used to name the grad with respect to x as (d<cost_name>/dx) 1003 1004 Returns 1005 ------- 1006 list of Variables 1007 A list of gradients corresponding to `wrt` 1008 1009 """ 1010 # build a dict mapping node to the terms node contributes to each of 1011 # its inputs' gradients 1012 term_dict = OrderedDict() 1013 1014 def access_term_cache(node): 1015 """ Populates term_dict[node] and returns it """ 1016 1017 if node not in term_dict: 1018 1019 inputs = node.inputs 1020 1021 output_grads = [access_grad_cache(var) for var in node.outputs] 1022 1023 # list of bools indicating if each output is connected to the cost 1024 outputs_connected = [not isinstance(g.type, DisconnectedType) 1025 for g in output_grads] 1026 1027 connection_pattern = _node_to_pattern(node) 1028 1029 # list of bools indicating if each input is connected to the cost 1030 inputs_connected = [ 1031 (True in [input_to_output and output_to_cost for 1032 input_to_output, output_to_cost in 1033 zip(input_to_outputs, outputs_connected)]) for 1034 input_to_outputs in connection_pattern 1035 ] 1036 1037 # List of bools indicating if each output is an integer dtype 1038 output_is_int = [hasattr(output.type, 'dtype') and 1039 output.type.dtype in theano.tensor.discrete_dtypes 1040 for output in node.outputs] 1041 1042 # List of bools indicating if each output is NullType 1043 ograd_is_nan = [isinstance(output.type, NullType) 1044 for output in output_grads] 1045 1046 # List of bools indicating if each input only has NullType outputs 1047 only_connected_to_nan = [ 1048 (True not in 1049 [in_to_out and out_to_cost and not out_nan 1050 for in_to_out, out_to_cost, out_nan in 1051 zip(in_to_outs, outputs_connected, ograd_is_nan)]) 1052 for in_to_outs in connection_pattern] 1053 1054 if True not in inputs_connected: 1055 # All outputs of this op are disconnected so we can skip 1056 # Calling the op's grad method and report that the inputs 1057 # are disconnected 1058 # (The op's grad method could do this too, but this saves the 1059 # implementer the trouble of worrying about this case) 1060 input_grads = [disconnected_type() for ipt in inputs] 1061 elif False not in only_connected_to_nan: 1062 # All inputs are only connected to nan gradients, so we don't 1063 # need to bother calling the grad method. We know the gradient 1064 # with respect to all connected inputs is nan. 1065 input_grads = [] 1066 for connected in inputs_connected: 1067 if connected: 1068 input_grads.append(null_type()) 1069 else: 1070 input_grads.append(disconnected_type()) 1071 else: 1072 # At least one input of this op is connected to the cost so and 1073 # not all output gradients are undefined so we must 1074 # call the op's grad method 1075 1076 # Each Op's grad function requires inputs and output_grads 1077 # If the Op destroys any input, but the grad expression uses 1078 # it, then chances are the resulting graph will have a 1079 # dependency cycle. We avoid this cycle by passing (symbolic) 1080 # copies of each destroyed input. 1081 try: 1082 dinputs = [node.inputs[x[0]] for x in 1083 itervalues(node.op.destroy_map)] 1084 except AttributeError: 1085 dinputs = [] 1086 1087 def try_to_copy_if_needed(var): 1088 if var in dinputs and hasattr(var, 'copy'): 1089 return var.copy() 1090 return var 1091 1092 inputs = [try_to_copy_if_needed(ipt) for ipt in inputs] 1093 1094 # Build a list of output gradients with the same dtype as 1095 # the corresponding output variable. 
1096 # If an output is of a float dtype, we want to cast the 1097 # output gradient into the same dtype, to avoid having a 1098 # gradient graph with double precision (taking more memory, 1099 # and more computation). 1100 # If an output is of an integer dtype, then we just leave it 1101 # alone. 1102 # DO NOT force integer variables to have zero grad. This causes 1103 # bugs where we fail to detect disconnected or undefined 1104 # gradients. 1105 # DO NOT force integer variables to have integer dtype. 1106 # This is a violation of the op contract. 1107 new_output_grads = [] 1108 for o, og in zip(node.outputs, output_grads): 1109 o_dt = getattr(o.type, 'dtype', None) 1110 og_dt = getattr(og.type, 'dtype', None) 1111 if (o_dt not in theano.tensor.discrete_dtypes and 1112 og_dt and o_dt != og_dt): 1113 new_output_grads.append(og.astype(o_dt)) 1114 else: 1115 new_output_grads.append(og) 1116 1117 # Make sure that, if new_output_grads[i] has a floating point 1118 # dtype, it is the same dtype as outputs[i] 1119 for o, ng in zip(node.outputs, new_output_grads): 1120 o_dt = getattr(o.type, 'dtype', None) 1121 ng_dt = getattr(ng.type, 'dtype', None) 1122 if (ng_dt is not None and 1123 o_dt not in theano.tensor.discrete_dtypes): 1124 assert ng_dt == o_dt 1125 1126 # Someone who had obviously not read the Op contract tried 1127 # to modify this part of the function. 1128 # If you ever think it is a good idea to make an integer 1129 # valued gradient, please 1130 # 1) Read the Op contract again 1131 # 2) Talk to Ian Goodfellow 1132 # (Both of these sources will tell you not to do it) 1133 for ng in new_output_grads: 1134 assert (getattr(ng.type, 'dtype', None) 1135 not in theano.tensor.discrete_dtypes) 1136 1137 # If config.compute_test_value is turned on, check that the 1138 # gradients on the outputs of this node have the right shape. 1139 # We also check the gradient on the inputs later--both checks 1140 # are needed, because some gradients are only ever specified 1141 # by the user, not computed by Op.grad, and some gradients are 1142 # only computed and returned, but never passed as another 1143 # node's output grads. 1144 for idx, packed in enumerate(izip(node.outputs, 1145 new_output_grads)): 1146 orig_output, new_output_grad = packed 1147 if not hasattr(orig_output, 'shape'): 1148 continue 1149 if isinstance(new_output_grad.type, DisconnectedType): 1150 continue 1151 for orig_output_v, new_output_grad_v in get_debug_values( 1152 *packed): 1153 o_shape = orig_output_v.shape 1154 g_shape = new_output_grad_v.shape 1155 if o_shape != g_shape: 1156 raise ValueError( 1157 "Got a gradient of shape " + 1158 str(o_shape) + " on an output of shape " + 1159 str(g_shape)) 1160 1161 input_grads = node.op.L_op(inputs, node.outputs, 1162 new_output_grads) 1163 1164 if input_grads is None: 1165 raise TypeError("%s.grad returned NoneType, " 1166 "expected iterable." % str(node.op)) 1167 1168 if len(input_grads) != len(inputs): 1169 raise ValueError(("%s returned the wrong number of" + 1170 " gradient terms.") % str(node.op)) 1171# We can not enforce this, as AdvancedSubtensor1 has an option to 1172# return the sparse grad for optimization reason. 1173 1174 # for ig, i in zip(input_grads, inputs): 1175# if (not isinstance(ig.type, (DisconnectedType, NullType)) and 1176# type(ig.type) != type(i.type)): 1177# raise ValueError( 1178# "%s returned the wrong type for gradient terms." 1179# " Sparse inputs must have sparse grads and dense" 1180# " inputs must have dense grad. 
Got %s, expected %s" %( 1181# str(node.op), ig.type, i.type)) 1182 1183 # must convert to list in case the op returns a tuple 1184 # we won't be able to post-process out the Nones if it does that 1185 input_grads = list(input_grads) 1186 1187 # Need to propagate the NullType gradients; if an input grad is 1188 # not disconnected and the corresponding input is connected 1189 # to at least one output whose gradient is NullType then the input 1190 # grad should be NullType. 1191 for inp_idx in range(len(input_grads)): 1192 for out_idx in range(len(ograd_is_nan)): 1193 if (ograd_is_nan[out_idx] and 1194 connection_pattern[inp_idx][out_idx] and 1195 not isinstance(input_grads[inp_idx].type, 1196 DisconnectedType)): 1197 input_grads[inp_idx] = output_grads[out_idx] 1198 1199 # Do type checking on the result 1200 1201 # List of bools indicating if each input only has integer outputs 1202 only_connected_to_int = [ 1203 (True not in 1204 [in_to_out and out_to_cost and not out_int 1205 for in_to_out, out_to_cost, out_int in 1206 zip(in_to_outs, outputs_connected, output_is_int)]) 1207 for in_to_outs in connection_pattern] 1208 1209 for i, term in enumerate(input_grads): 1210 1211 # Disallow Nones 1212 if term is None: 1213 # We don't know what None means. in the past it has been 1214 # used to mean undefined, zero, or disconnected. 1215 # We therefore don't allow it because its usage has become 1216 # so muddied. 1217 raise TypeError( 1218 ('%s.grad returned None for' + 1219 ' a gradient term, ' 1220 'this is prohibited. Instead of None,' 1221 'return zeros_like(input), disconnected_type(),' 1222 ' or a NullType variable such as those made with ' 1223 'the grad_undefined or grad_unimplemented helper ' 1224 'functions.') % node.op) 1225 1226 # Check that the gradient term for this input 1227 # has the right shape 1228 if hasattr(term, 'shape'): 1229 orig_ipt = inputs[i] 1230 for orig_ipt_v, term_v in get_debug_values(orig_ipt, term): 1231 i_shape = orig_ipt_v.shape 1232 t_shape = term_v.shape 1233 if i_shape != t_shape: 1234 raise ValueError( 1235 "%s.grad returned object of " 1236 "shape %s as gradient term on input %d " 1237 "of shape %s" % (node.op, t_shape, i, i_shape)) 1238 1239 if not isinstance(term.type, 1240 (NullType, DisconnectedType)): 1241 if term.type.dtype not in theano.tensor.float_dtypes: 1242 raise TypeError(str(node.op) + '.grad illegally ' 1243 ' returned an integer-valued variable.' 1244 ' (Input index %d, dtype %s)' % ( 1245 i, term.type.dtype)) 1246 1247 if only_connected_to_nan[i]: 1248 assert isinstance(term.type, NullType) 1249 1250 if only_connected_to_int[i]: 1251 # This term has only integer outputs and we know 1252 # it's not undefined or disconnected 1253 # The only other valid thing it can be is 0 1254 1255 is_zero = _is_zero(term) 1256 assert is_zero in ['yes', 'no', 'maybe'] 1257 if is_zero == 'maybe': 1258 msg = ("%s.grad returned %s of type %s for input" 1259 " %d. This input's only connections to " 1260 "the cost through this op are via " 1261 "integer-valued outputs so it should be " 1262 "NullType, DisconnectedType, or some form " 1263 "of zeros. It is not NullType or " 1264 "DisconnectedType and theano can't " 1265 "simplify it to a constant, so it's not " 1266 "verifiably zeros.") 1267 1268 msg %= (node.op, term, type(term), i) 1269 1270 elif is_zero == 'no': 1271 msg = ("%s.grad returned %s of type %s for input" 1272 " %d. 
Since this input is only connected " 1273 "to integer-valued outputs, it should " 1274 "evaluate to zeros, but it evaluates to" 1275 "%s.") 1276 1277 msg %= (node.op, term, type(term), i, 1278 theano.get_scalar_constant_value(term)) 1279 1280 raise ValueError(msg) 1281 1282 # Check that op.connection_pattern matches the connectivity 1283 # logic driving the op.grad method 1284 for i, (ipt, ig, connected) in enumerate( 1285 zip(inputs, input_grads, inputs_connected) 1286 ): 1287 actually_connected = \ 1288 not isinstance(ig.type, DisconnectedType) 1289 1290 if actually_connected and not connected: 1291 msg = ("%s.grad returned %s of type %s for input %d." 1292 " Expected DisconnectedType instance based on " 1293 " the output of the op's connection_pattern " 1294 "method.") 1295 msg %= (str(node.op), str(ig), str(ig.type), i) 1296 raise TypeError(msg) 1297 1298 elif connected and not actually_connected: 1299 msg = "%s.grad returned DisconnectedType for input %d." 1300 msg %= (str(node.op), i) 1301 if hasattr(node.op, 'connection_pattern'): 1302 msg += (' Its connection_pattern method does not' 1303 ' allow this.') 1304 raise TypeError(msg) 1305 else: 1306 msg += (' You may want to implement a ' 1307 'connection_pattern method for it.') 1308 warnings.warn(msg) 1309 1310 # cache the result 1311 term_dict[node] = input_grads 1312 1313 return term_dict[node] 1314 1315 # populate grad_dict[var] and return it 1316 def access_grad_cache(var): 1317 if var not in grad_dict: 1318 # If var is not in grad_dict already, we must compute it 1319 if var in var_to_app_to_idx: 1320 null_terms = [] 1321 terms = [] 1322 node_to_idx = var_to_app_to_idx[var] 1323 for node in node_to_idx: 1324 for idx in node_to_idx[node]: 1325 1326 term = access_term_cache(node)[idx] 1327 1328 if not isinstance(term, gof.Variable): 1329 raise TypeError( 1330 "%s.grad returned %s, expected" 1331 " Variable instance." 
% (str(node.op), 1332 type(term))) 1333 1334 if isinstance(term.type, NullType): 1335 null_terms.append(term) 1336 continue 1337 1338 # Don't try to sum up DisconnectedType placeholders 1339 if isinstance(term.type, DisconnectedType): 1340 continue 1341 1342 if hasattr(var, 'ndim') and term.ndim != var.ndim: 1343 raise ValueError( 1344 ("%s.grad returned a term with" 1345 " %d dimensions, but %d are required.") % ( 1346 str(node.op), term.ndim, var.ndim)) 1347 1348 terms.append(term) 1349 1350 # Add up the terms to get the total gradient on this variable 1351 if len(null_terms) > 0: 1352 # At least one term is a NullType : the total gradient 1353 # will also be a NullType 1354 grad_dict[var] = null_terms[0] 1355 elif len(terms) > 0: 1356 # the next line is like sum(terms) but doesn't add an 1357 # extraneous TensorConstant(0) 1358 grad_dict[var] = reduce(lambda x, y: x + y, terms) 1359 else: 1360 grad_dict[var] = disconnected_type() 1361 1362 if cost_name is not None and var.name is not None: 1363 grad_dict[var].name = '(d%s/d%s)' % (cost_name, var.name) 1364 else: 1365 # this variable isn't connected to the cost in the 1366 # computational graph 1367 grad_dict[var] = disconnected_type() 1368 # end if cache miss 1369 return grad_dict[var] 1370 1371 rval = [access_grad_cache(elem) for elem in wrt] 1372 1373 return rval 1374 1375 1376def _float_zeros_like(x): 1377 """ Like zeros_like, but forces the object to have a 1378 a floating point dtype """ 1379 1380 rval = x.zeros_like() 1381 1382 if rval.type.dtype.find('float') != -1: 1383 return rval 1384 1385 return rval.astype(theano.config.floatX) 1386 1387 1388def _float_ones_like(x): 1389 """ Like ones_like, but forces the object to have a 1390 floating point dtype """ 1391 1392 dtype = x.type.dtype 1393 if dtype not in tensor.float_dtypes: 1394 dtype = theano.config.floatX 1395 1396 return x.ones_like(dtype=dtype) 1397 1398 1399class numeric_grad(object): 1400 """ 1401 Compute the numeric derivative of a scalar-valued function at a particular 1402 point. 1403 """ 1404 1405 # Note on step sizes and tolerances: 1406 # 1407 # There is a relationship between the step size and the function value and 1408 # the measurement error that is incurred due to rounding. The finite 1409 # difference we measure is 1410 # delta = f(x0) - f(x0+eps) 1411 # 1412 # For maximum precision, f should be close to zero. 1413 # For every power of 2 that f departs from zero, we lose a bit of precision 1414 # in delta. 1415 # 1416 # Even in this case of maximum accuracy, there is a tradeoff between 1417 # stepsize and measurement error. 1418 # Taking small steps allows us to measure large derivatives accuractly, 1419 # but longer steps are required to measure small derivatives accurately. 1420 # However longer steps introduce bias into our measurement in general 1421 # for non-linear functions. 1422 # 1423 # It would be interesting to have a version of numeric grad that used an 1424 # adaptive stepsize. 1425 # 1426 # For now, we use a heuristic that catches very bad gradients, but is not 1427 # perfectly accurate. 1428 type_eps = {'float64': 1e-7, 1429 'float32': 3e-4, 1430 'float16': 1e-1, 1431 np.dtype('float64'): 1e-7, 1432 np.dtype('float32'): 3e-4, 1433 np.dtype('float16'): 1e-1} 1434 1435 def __init__(self, f, pt, eps=None, out_type=None): 1436 """Return the gradient of f at pt. 1437 1438 This function computes the gradient by a one-sided finite 1439 differences of a fixed step size (eps). 
1440 1441 Parameters 1442 ---------- 1443 f : a differentiable function such that f(*pt) is a scalar 1444 The function to compute the gradient of. 1445 It is assumed that f(...) will return a scalar. 1446 It is assumed that all f's inputs are numpy.ndarray objects. 1447 pt : an ndarray, a list of ndarrays or tuple of ndarrays 1448 The point where to evaluate the gradient 1449 out_type: float 1450 dtype of output, if complex (i.e. 'complex32' or 'complex64') 1451 eps : float, optional 1452 The stepsize for the finite differencing. None means 1453 input dtype-dependent. See `type_eps`. 1454 """ 1455 1456 def prod(inputs): 1457 rval = 1 1458 for i in inputs: 1459 rval *= i 1460 return rval 1461 1462 packed_pt = False 1463 if not isinstance(pt, (list, tuple)): 1464 pt = [pt] 1465 packed_pt = True 1466 1467 apt = [np.array(p) for p in pt] 1468 1469 shapes = [p.shape for p in apt] 1470 dtypes = [str(p.dtype) for p in apt] 1471 1472 # TODO: remove this eventually (why was this here in the first place ?) 1473 # In the case of CSM, the arguments are a mixture of floats and 1474 # integers... 1475 # if not dtypes == [dtypes[0]] * len(apt): 1476 # raise TypeError('All function arguments must have same dtype') 1477 1478 total_size = builtins.sum(prod(sh) for sh in shapes) 1479 1480 working_dtype = builtins.min( 1481 (self.type_eps[dt], dt) for dt in dtypes)[1] 1482 1483 # create un-initialized memory 1484 x = np.ndarray((total_size,), dtype=working_dtype) 1485 # (not out_type is None) --> (out_type is not None) ??? 1486 if (out_type is not None) and (out_type.startswith('complex')): 1487 gx = np.ndarray((total_size,), dtype=out_type) 1488 else: 1489 gx = np.ndarray((total_size,), dtype=working_dtype) 1490 1491 if eps is None: 1492 eps = builtins.max(self.type_eps[dt] for dt in dtypes) 1493 1494 # set up aliases so that apt[i] is backed by memory in x 1495 # and self.gf is backed by memory in gx 1496 cur_pos = 0 1497 self.gf = [] 1498 for i, p in enumerate(apt): 1499 p_size = prod(p.shape) 1500 # set up alias 1501 apt[i] = x[cur_pos: cur_pos + p_size].reshape(p.shape) 1502 self.gf.append(gx[cur_pos: cur_pos + p_size].reshape(p.shape)) 1503 # initialize with p's value 1504 apt[i][...] = p 1505 cur_pos += p_size 1506 1507 f_x = f(*[p.copy() for p in apt]) 1508 1509 # now iterate over the elements of x, and call f on apt. 1510 x_copy = x.copy() 1511 for i in xrange(total_size): 1512 x[:] = x_copy 1513 1514 x[i] += eps 1515 f_eps = f(*apt) 1516 1517 # TODO: remove this when it is clear that the next 1518 # replacemement does not pose problems of its own. It was replaced 1519 # for its inability to handle complex variables. 1520 # gx[i] = numpy.asarray((f_eps - f_x) / eps) 1521 1522 gx[i] = ((f_eps - f_x) / eps) 1523 1524 if packed_pt: 1525 self.gf = self.gf[0] 1526 1527 @staticmethod 1528 def abs_rel_err(a, b): 1529 """Return absolute and relative error between a and b. 1530 1531 The relative error is a small number when a and b are close, relative 1532 to how big they are. 1533 1534 Formulas used: 1535 abs_err = abs(a - b) 1536 1537 rel_err = abs_err / max(abs(a) + abs(b), 1e-8) 1538 1539 The denominator is clipped at 1e-8 to avoid dividing by 0 when a and b 1540 are both close to 0. 1541 1542 The tuple (abs_err, rel_err) is returned 1543 """ 1544 abs_err = abs(a - b) 1545 # 1e-8 is to prevent division by zeros. 1546 # [] is to make sure that if a and b are float16, 1e-8 don't get 1547 # dowcasted to float16 as that give 0! 
This would add back the 1548 # division by zero 1549 rel_err = abs_err / np.maximum(abs(a) + abs(b), [1e-8]) 1550 # The numpy.asarray are needed as if a or b is a sparse matrix 1551 # this would result in a numpy.matrix and not a numpy.ndarray 1552 # and the behave differently causing problem later. 1553 # In particular a_npy_matrix.flatten().shape == (1, n_element) 1554 abs_err = np.asarray(abs_err) 1555 rel_err = np.asarray(rel_err) 1556 return (abs_err, rel_err) 1557 1558 def abs_rel_errors(self, g_pt): 1559 """Return the abs and rel error of gradient estimate `g_pt` 1560 1561 `g_pt` must be a list of ndarrays of the same length as self.gf, 1562 otherwise a ValueError is raised. 1563 1564 Corresponding ndarrays in `g_pt` and `self.gf` must have the same 1565 shape or ValueError is raised. 1566 1567 """ 1568 if len(g_pt) != len(self.gf): 1569 raise ValueError('argument has wrong number of elements', 1570 len(g_pt)) 1571 errs = [] 1572 for i, (a, b) in enumerate(zip(g_pt, self.gf)): 1573 if a.shape != b.shape: 1574 raise ValueError('argument element %i has wrong shape %s' % ( 1575 i, str((a.shape, b.shape)))) 1576 errs.append(numeric_grad.abs_rel_err(a, b)) 1577 return errs 1578 1579 def max_err(self, g_pt, abs_tol, rel_tol): 1580 """Find the biggest error between g_pt and self.gf. 1581 1582 What is measured is the violation of relative and absolute errors, 1583 wrt the provided tolerances (abs_tol, rel_tol). 1584 A value > 1 means both tolerances are exceeded. 1585 1586 Return the argmax of min(abs_err / abs_tol, rel_err / rel_tol) over 1587 g_pt, as well as abs_err and rel_err at this point. 1588 """ 1589 pos = [] 1590 errs = [] 1591 abs_errs = [] 1592 rel_errs = [] 1593 1594 abs_rel_errs = self.abs_rel_errors(g_pt) 1595 for abs_err, rel_err in abs_rel_errs: 1596 if not np.all(np.isfinite(abs_err)): 1597 raise ValueError('abs_err not finite', repr(abs_err)) 1598 if not np.all(np.isfinite(rel_err)): 1599 raise ValueError('rel_err not finite', repr(rel_err)) 1600 scaled_err = np.minimum(abs_err / abs_tol, rel_err / rel_tol) 1601 max_i = scaled_err.argmax() 1602 1603 pos.append(max_i) 1604 errs.append(scaled_err.flatten()[max_i]) 1605 abs_errs.append(abs_err.flatten()[max_i]) 1606 rel_errs.append(rel_err.flatten()[max_i]) 1607 1608 # max over the arrays in g_pt 1609 max_arg = np.argmax(errs) 1610 max_pos = pos[max_arg] 1611 return (max_arg, max_pos, abs_errs[max_arg], rel_errs[max_arg]) 1612 1613 1614def mode_not_slow(mode): 1615 if mode == 'FAST_COMPILE': 1616 return FAST_RUN 1617 mode = get_mode(mode) 1618 if isinstance(mode, DebugMode): 1619 opt = mode.optimizer 1620 return FAST_RUN.clone(optimizer=opt) 1621 else: 1622 return mode 1623 1624 1625def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, 1626 out_type=None, abs_tol=None, 1627 rel_tol=None, mode=None, cast_to_output_type=False, 1628 no_debug_ref=True): 1629 """Test a gradient by Finite Difference Method. Raise error on failure. 1630 1631 Raises an Exception if the difference between the analytic gradient and 1632 numerical gradient (computed through the Finite Difference Method) of a 1633 random projection of the fun's output to a scalar exceeds the given 1634 tolerance. 1635 1636 Examples 1637 -------- 1638 >>> verify_grad(theano.tensor.tanh, 1639 ... (np.asarray([[2, 3, 4], [-1, 3.3, 9.9]]),), 1640 ... rng=np.random) 1641 1642 Parameters 1643 ---------- 1644 fun : a Python function 1645 `fun` takes Theano variables as inputs, and returns a Theano variable. 1646 For instance, an Op instance with a single output. 
1647 pt : list of numpy.ndarrays 1648 Input values, points where the gradient is estimated. 1649 These arrays must be either float16, float32, or float64 arrays. 1650 n_tests : int 1651 number of times to run the test 1652 rng : numpy.random.RandomState, optional 1653 random number generator used to sample the output random projection `u`, 1654 we test gradient of sum(u * fun) at `pt` 1655 eps : float, optional 1656 stepsize used in the Finite Difference Method (Default 1657 None is type-dependent). 1658 Raising the value of eps can raise or lower the absolute 1659 and relative errors of the verification depending on the 1660 Op. Raising eps does not lower the verification quality for 1661 linear operations. It is better to raise `eps` than raising 1662 `abs_tol` or `rel_tol`. 1663 out_type : string 1664 dtype of output, if complex (i.e., 'complex32' or 'complex64') 1665 abs_tol : float 1666 absolute tolerance used as threshold for gradient comparison 1667 rel_tol : float 1668 relative tolerance used as threshold for gradient comparison 1669 cast_to_output_type : bool 1670 if the output is float32 and cast_to_output_type is True, cast 1671 the random projection to float32. Otherwise it is float64. 1672 float16 is not handled here. 1673 no_debug_ref : bool 1674 Don't use DebugMode for the numerical gradient function. 1675 1676 Note 1677 ---- 1678 This function does not support multiple outputs. In 1679 tests/test_scan.py there is an experimental verify_grad that 1680 covers that case as well by using random projections. 1681 1682 """ 1683 # The import is here to prevent circular import. 1684 from theano import compile, shared 1685 import theano.tensor 1686 from theano.tensor import as_tensor_variable, TensorType 1687 assert isinstance(pt, (list, tuple)) 1688 pt = [np.array(p) for p in pt] 1689 1690 for i, p in enumerate(pt): 1691 if p.dtype not in ('float16', 'float32', 'float64'): 1692 raise TypeError( 1693 ('verify_grad can work only with floating point ' 1694 'inputs, but input %i has dtype "%s".') % (i, p.dtype)) 1695 1696 _type_tol = dict( # relative error tolerances for different types 1697 float16=5e-2, 1698 float32=1e-2, 1699 float64=1e-4) 1700 1701 if abs_tol is None: 1702 abs_tol = builtins.max(_type_tol[str(p.dtype)] for p in pt) 1703 if rel_tol is None: 1704 rel_tol = builtins.max(_type_tol[str(p.dtype)] for p in pt) 1705 1706 if rng is None: 1707 raise TypeError(('rng should be a valid instance of ' 1708 'numpy.random.RandomState. You may ' 1709 'want to use theano.tests.unittest' 1710 '_tools.verify_grad instead of ' 1711 'theano.gradient.verify_grad.')) 1712 1713 # We allow input downcast in function, because numeric_grad works in the 1714 # most precise dtype used among the inputs, so we may need to cast some. 
1715 def function(inputs, output, name, mode=mode): 1716 f = compile.function(inputs, output, accept_inplace=True, 1717 allow_input_downcast=True, mode=mode, 1718 on_unused_input='ignore', name=name) 1719 return f 1720 1721 tensor_pt = [ 1722 TensorType( 1723 as_tensor_variable(p).dtype, 1724 as_tensor_variable(p).broadcastable)(name='input %i' % i) 1725 for i, p in enumerate(pt)] 1726 1727 # fun can be either a function or an actual Op instance 1728 o_output = fun(*tensor_pt) 1729 1730 if isinstance(o_output, list): 1731 raise NotImplementedError(('cant (yet) autotest gradient of fun ' 1732 'with multiple outputs')) 1733 # we could make loop over outputs making random projections R for each, 1734 # but this doesn't handle the case where not all the outputs are 1735 # differentiable... so I leave this as TODO for now -JB. 1736 1737 o_fn = function(tensor_pt, o_output, name='gradient.py fwd') 1738 o_fn_out = o_fn(*[p.copy() for p in pt]) 1739 1740 if isinstance(o_fn_out, tuple) or isinstance(o_fn_out, list): 1741 raise TypeError( 1742 'It seems like you are trying to use verify_grad ' 1743 'on an op or a function which outputs a list: there should' 1744 ' be a single (array-like) output instead') 1745 1746 # random_projection should not have elements too small, 1747 # otherwise too much precision is lost in numerical gradient 1748 def random_projection(): 1749 plain = rng.rand(*o_fn_out.shape) + 0.5 1750 if cast_to_output_type and o_output.dtype == "float32": 1751 return np.array(plain, o_output.dtype) 1752 return plain 1753 1754 t_r = shared(random_projection(), borrow=True) 1755 t_r.name = 'random_projection' 1756 1757 # random projection of o onto t_r 1758 # This sum() is defined above, it's not the builtin sum. 1759 cost = theano.tensor.sum(t_r * o_output) 1760 1761 if no_debug_ref: 1762 mode_for_cost = mode_not_slow(mode) 1763 else: 1764 mode_for_cost = mode 1765 1766 cost_fn = function(tensor_pt, cost, name='gradient.py cost', 1767 mode=mode_for_cost) 1768 1769 symbolic_grad = grad(cost, tensor_pt, 1770 disconnected_inputs='ignore') 1771 1772 grad_fn = function(tensor_pt, symbolic_grad, 1773 name='gradient.py symbolic grad') 1774 1775 for test_num in xrange(n_tests): 1776 try: 1777 num_grad = numeric_grad(cost_fn, [p.copy() for p in pt], 1778 eps, out_type) 1779 1780 analytic_grad = grad_fn(*[p.copy() for p in pt]) 1781 1782 # Since `tensor_pt` is a list, `analytic_grad` should be one too. 
1783 assert isinstance(analytic_grad, list) 1784 1785 max_arg, max_err_pos, max_abs_err, max_rel_err = num_grad.max_err( 1786 analytic_grad, abs_tol, rel_tol) 1787 1788 if max_abs_err > abs_tol and max_rel_err > rel_tol: 1789 1790 raise verify_grad.E_grad(max_arg, max_err_pos, 1791 analytic_grad[max_arg].shape, 1792 analytic_grad[max_arg].flatten()[max_err_pos], 1793 num_grad.gf[max_arg].flatten()[max_err_pos], 1794 max_abs_err, max_rel_err, 1795 abs_tol, rel_tol) 1796 1797 # get new random projection for next test 1798 if test_num < n_tests - 1: 1799 t_r.set_value(random_projection(), borrow=True) 1800 except Exception as e: 1801 e.args += ("\nThe error happened with the following inputs:", pt, 1802 "\nThe value of eps is:", eps, 1803 "\nThe out_type is:", out_type) 1804 raise 1805 1806 1807class GradientError(Exception): 1808 """This error is raised when a gradient is calculated, but incorrect.""" 1809 def __init__(self, arg, err_pos, shape, val1, val2, 1810 abs_err, rel_err, abs_tol, rel_tol): 1811 Exception.__init__(self) # to be compatible with python2.4 1812 self.arg = arg 1813 self.err_pos = err_pos 1814 self.shape = shape 1815 self.val1 = val1 1816 self.val2 = val2 1817 self.abs_err = abs_err 1818 self.rel_err = rel_err 1819 self.abs_tol = abs_tol 1820 self.rel_tol = rel_tol 1821 1822 def __str__(self): 1823 # args may have been inserted by e.g. makeTester 1824 args_msg = ", ".join(str(a) for a in self.args) 1825 return """\ 1826GradientError: numeric gradient and analytic gradient exceed tolerance: 1827 At position %i of argument %i with shape %s, 1828 val1 = %f , val2 = %f 1829 abs. error = %f, abs. tolerance = %f 1830 rel. error = %f, rel. tolerance = %f 1831Exception args: %s""" % (self.err_pos, self.arg, 1832 self.shape, 1833 self.val1, self.val2, 1834 self.abs_err, self.abs_tol, 1835 self.rel_err, self.rel_tol, 1836 args_msg) 1837 1838 1839verify_grad.E_grad = GradientError 1840 1841 1842def jacobian(expression, wrt, consider_constant=None, 1843 disconnected_inputs='raise'): 1844 """ 1845 Compute the full Jacobian, row by row. 1846 1847 Parameters 1848 ---------- 1849 expression : Vector (1-dimensional) :class:`~theano.gof.graph.Variable` 1850 Values that we are differentiating (that we want the Jacobian of) 1851 wrt : :class:`~theano.gof.graph.Variable` or list of Variables 1852 Term[s] with respect to which we compute the Jacobian 1853 consider_constant : list of variables 1854 Expressions not to backpropagate through 1855 1856 disconnected_inputs: string 1857 Defines the behaviour if some of the variables 1858 in `wrt` are not part of the computational graph computing `cost` 1859 (or if all links are non-differentiable). The possible values are: 1860 1861 - 'ignore': considers that the gradient on these parameters is zero. 1862 - 'warn': consider the gradient zero, and print a warning. 1863 - 'raise': raise an exception. 1864 1865 Returns 1866 ------- 1867 :class:`~theano.gof.graph.Variable` or list/tuple of Variables (depending upon `wrt`) 1868 The Jacobian of `expression` with respect to (elements of) `wrt`. 1869 If an element of `wrt` is not differentiable with respect to the 1870 output, then a zero variable is returned. The return value is 1871 of same type as `wrt`: a list/tuple or TensorVariable in all cases. 
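
    Examples
    --------
    A minimal sketch of the intended use (the names ``x`` and ``y`` below are
    illustrative, not part of the API):

    >>> x = theano.tensor.dvector('x')
    >>> y = x ** 2
    >>> J = jacobian(y, x)  # symbolic 2-D Jacobian; equals diag(2 * x)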
1872 """ 1873 from theano.tensor import arange 1874 # Check inputs have the right format 1875 assert isinstance(expression, Variable), \ 1876 "tensor.jacobian expects a Variable as `expression`" 1877 assert expression.ndim < 2, \ 1878 ("tensor.jacobian expects a 1 dimensional variable as " 1879 "`expression`. If not use flatten to make it a vector") 1880 1881 using_list = isinstance(wrt, list) 1882 using_tuple = isinstance(wrt, tuple) 1883 1884 if isinstance(wrt, (list, tuple)): 1885 wrt = list(wrt) 1886 else: 1887 wrt = [wrt] 1888 1889 if expression.ndim == 0: 1890 # expression is just a scalar, use grad 1891 return format_as(using_list, using_tuple, 1892 grad(expression, 1893 wrt, 1894 consider_constant=consider_constant, 1895 disconnected_inputs=disconnected_inputs)) 1896 1897 def inner_function(*args): 1898 idx = args[0] 1899 expr = args[1] 1900 rvals = [] 1901 for inp in args[2:]: 1902 rval = grad(expr[idx], 1903 inp, 1904 consider_constant=consider_constant, 1905 disconnected_inputs=disconnected_inputs) 1906 rvals.append(rval) 1907 return rvals 1908 # Computing the gradients does not affect the random seeds on any random 1909 # generator used n expression (because during computing gradients we are 1910 # just backtracking over old values. (rp Jan 2012 - if anyone has a 1911 # counter example please show me) 1912 jacobs, updates = theano.scan(inner_function, 1913 sequences=arange(expression.shape[0]), 1914 non_sequences=[expression] + wrt) 1915 assert not updates, \ 1916 ("Scan has returned a list of updates. This should not " 1917 "happen! Report this to theano-users (also include the " 1918 "script that generated the error)") 1919 return format_as(using_list, using_tuple, jacobs) 1920 1921 1922def hessian(cost, wrt, consider_constant=None, 1923 disconnected_inputs='raise'): 1924 """ 1925 Parameters 1926 ---------- 1927 cost: Scalar (0-dimensional) variable. 1928 wrt: Vector (1-dimensional tensor) 'Variable' or list of 1929 vectors (1-dimensional tensors) Variables 1930 consider_constant: 1931 a list of expressions not to backpropagate through 1932 disconnected_inputs: string 1933 Defines the behaviour if some of the variables 1934 in ``wrt`` are not part of the computational graph computing ``cost`` 1935 (or if all links are non-differentiable). The possible values are: 1936 1937 - 'ignore': considers that the gradient on these parameters is zero. 1938 - 'warn': consider the gradient zero, and print a warning. 1939 - 'raise': raise an exception. 1940 1941 Returns 1942 ------- 1943 :class:`~theano.gof.graph.Variable` or list/tuple of Variables 1944 The Hessian of the `cost` with respect to (elements of) `wrt`. 1945 If an element of `wrt` is not differentiable with respect to the 1946 output, then a zero variable is returned. The return value is 1947 of same type as `wrt`: a list/tuple or TensorVariable in all cases. 
1948 """ 1949 from theano.tensor import arange 1950 # Check inputs have the right format 1951 assert isinstance(cost, Variable), \ 1952 "tensor.hessian expects a Variable as `cost`" 1953 assert cost.ndim == 0, \ 1954 "tensor.hessian expects a 0 dimensional variable as `cost`" 1955 1956 using_list = isinstance(wrt, list) 1957 using_tuple = isinstance(wrt, tuple) 1958 1959 if isinstance(wrt, (list, tuple)): 1960 wrt = list(wrt) 1961 else: 1962 wrt = [wrt] 1963 1964 hessians = [] 1965 for input in wrt: 1966 assert isinstance(input, Variable), \ 1967 "tensor.hessian expects a (list of) Variable as `wrt`" 1968 assert input.ndim == 1, \ 1969 "tensor.hessian expects a (list of) 1 dimensional variable "\ 1970 "as `wrt`" 1971 expr = grad(cost, input, consider_constant=consider_constant, 1972 disconnected_inputs=disconnected_inputs) 1973 1974 # It is possible that the inputs are disconnected from expr, 1975 # even if they are connected to cost. 1976 # This should not be an error. 1977 hess, updates = theano.scan(lambda i, y, x: grad( 1978 y[i], 1979 x, 1980 consider_constant=consider_constant, 1981 disconnected_inputs='ignore'), 1982 sequences=arange(expr.shape[0]), 1983 non_sequences=[expr, input]) 1984 assert not updates, \ 1985 ("Scan has returned a list of updates. This should not " 1986 "happen! Report this to theano-users (also include the " 1987 "script that generated the error)") 1988 hessians.append(hess) 1989 return format_as(using_list, using_tuple, hessians) 1990 1991 1992def _is_zero(x): 1993 """ 1994 Returns 'yes', 'no', or 'maybe' indicating whether x 1995 is always 0. 1996 'maybe' means that x is an expression that is complicated enough 1997 that we can't tell that it simplifies to 0. 1998 """ 1999 if not hasattr(x, 'type'): 2000 return np.all(x == 0.) 2001 if isinstance(x.type, NullType): 2002 return 'no' 2003 if isinstance(x.type, DisconnectedType): 2004 return 'yes' 2005 2006 no_constant_value = True 2007 try: 2008 constant_value = theano.get_scalar_constant_value(x) 2009 no_constant_value = False 2010 except theano.tensor.basic.NotScalarConstantError: 2011 pass 2012 2013 if no_constant_value: 2014 return 'maybe' 2015 2016 if constant_value != 0.: 2017 return 'no' 2018 2019 return 'yes' 2020 2021 2022class ConsiderConstant(ViewOp): 2023 def grad(self, args, g_outs): 2024 return [g_out.zeros_like(g_out) for g_out in g_outs] 2025 2026 2027consider_constant_ = ConsiderConstant() 2028 2029 2030# I create a function only to have the doc show well. 2031def consider_constant(x): 2032 """ 2033 DEPRECATED: use zero_grad() or disconnected_grad() instead. 2034 2035 Consider an expression constant when computing gradients. 2036 2037 The expression itself is unaffected, but when its gradient is 2038 computed, or the gradient of another expression that this 2039 expression is a subexpression of, it will not be backpropagated 2040 through. In other words, the gradient of the expression is 2041 truncated to 0. 2042 2043 :param x: A Theano expression whose gradient should be truncated. 2044 2045 :return: The expression is returned unmodified, but its gradient 2046 is now truncated to 0. 2047 2048 .. 
versionadded:: 0.7
    """
    warnings.warn((
        "consider_constant() is deprecated, use zero_grad() or "
        "disconnected_grad() instead."), stacklevel=3)

    return consider_constant_(x)


class ZeroGrad(ViewOp):
    def grad(self, args, g_outs):
        return [g_out.zeros_like(g_out) for g_out in g_outs]

    def R_op(self, inputs, eval_points):
        if eval_points[0] is None:
            return [None]

        # R_op results must be returned as a list, one entry per output.
        return [theano.tensor.zeros(1)]


zero_grad_ = ZeroGrad()


def zero_grad(x):
    """
    Consider an expression constant when computing gradients.

    The expression itself is unaffected, but when its gradient is
    computed, or the gradient of another expression that this
    expression is a subexpression of, it will be backpropagated
    through with a value of zero. In other words, the gradient of
    the expression is truncated to 0.

    Parameters
    ----------
    x: :class:`~theano.gof.graph.Variable`
        A Theano expression whose gradient should be truncated.

    Returns
    -------
    :class:`~theano.gof.graph.Variable`
        An expression equivalent to ``x``, with its gradient
        truncated to 0.
    """
    return zero_grad_(x)


class UndefinedGrad(ViewOp):
    def grad(self, args, g_outs):
        return [grad_undefined(self, i, arg) for i, arg in enumerate(args)]

    def R_op(self, inputs, eval_points):
        return [None]

    def connection_pattern(self, node):
        return [[True]]


undefined_grad_ = UndefinedGrad()


def undefined_grad(x):
    """
    Consider the gradient of this variable undefined.

    This will generate an error message if its gradient is taken.

    The expression itself is unaffected, but when its gradient is
    computed, or the gradient of another expression that this
    expression is a subexpression of, an error message will be generated
    stating that this gradient is not defined.

    Parameters
    ----------
    x: :class:`~theano.gof.graph.Variable`
        A Theano expression whose gradient should be undefined.

    Returns
    -------
    :class:`~theano.gof.graph.Variable`
        An expression equivalent to ``x``, with its gradient undefined.
    """
    return undefined_grad_(x)


class DisconnectedGrad(ViewOp):
    def grad(self, args, g_outs):
        return [disconnected_type() for g_out in g_outs]

    def R_op(self, inputs, eval_points):
        return [None]

    def connection_pattern(self, node):
        return [[False]]


disconnected_grad_ = DisconnectedGrad()


def disconnected_grad(x):
    """
    Consider an expression constant when computing gradients.

    Gradients will effectively not be backpropagated through it.

    The expression itself is unaffected, but when its gradient is
    computed, or the gradient of another expression that this
    expression is a subexpression of, it will not be backpropagated
    through. This is effectively equivalent to truncating the gradient
    expression to 0, but is executed faster than zero_grad(), which still
    has to go through the underlying computational graph related to the
    expression.

    Parameters
    ----------
    x: :class:`~theano.gof.graph.Variable`
        A Theano expression whose gradient should not be
        backpropagated through.

    Returns
    -------
    :class:`~theano.gof.graph.Variable`
        An expression equivalent to ``x``, with its gradient
        now effectively truncated to 0.
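
    Examples
    --------
    A small sketch (``x`` is an illustrative scalar; with the default
    ``disconnected_inputs='raise'``, taking the gradient below would raise
    ``DisconnectedInputError``):

    >>> x = theano.tensor.scalar('x')
    >>> y = disconnected_grad(x) * 3
    >>> g = grad(y, x, disconnected_inputs='ignore')  # a symbolic zero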
    """
    return disconnected_grad_(x)


class GradClip(ViewOp):
    # See doc in the user-facing function grad_clip.
    __props__ = ()

    def __init__(self, clip_lower_bound, clip_upper_bound):
        # We do not put these members in __eq__ or __hash__
        # as they do not influence the perform() of this op.
        self.clip_lower_bound = clip_lower_bound
        self.clip_upper_bound = clip_upper_bound
        assert self.clip_upper_bound >= self.clip_lower_bound

    def grad(self, args, g_outs):
        return [theano.tensor.clip(g_out, self.clip_lower_bound,
                                   self.clip_upper_bound)
                for g_out in g_outs]


def grad_clip(x, lower_bound, upper_bound):
    """
    This op is a view in the forward pass, but clips the gradient.

    This is an elemwise operation.

    Parameters
    ----------
    x:
        The variable whose gradient is to be clipped.
    lower_bound:
        The lower bound of the gradient value.
    upper_bound:
        The upper bound of the gradient value.

    Examples
    --------
    >>> x = theano.tensor.scalar()
    >>> z = theano.tensor.grad(grad_clip(x, -1, 1)**2, x)
    >>> z2 = theano.tensor.grad(x**2, x)
    >>> f = theano.function([x], outputs=[z, z2])
    >>> print(f(2.0))
    [array(1.0), array(4.0)]

    Note
    ----
    We register an optimization in tensor/opt.py that removes the GradClip op,
    so it has zero cost in the forward pass and only does work in the gradient.

    """
    return GradClip(lower_bound, upper_bound)(x)


class GradScale(ViewOp):
    def __init__(self, multiplier):
        self.multiplier = multiplier

    def grad(self, args, g_outs):
        return [self.multiplier * g_out for g_out in g_outs]


def grad_scale(x, multiplier):
    """
    This op scales or inverts the gradient during backpropagation.

    Parameters
    ----------
    x:
        The variable whose gradient is to be scaled.
    multiplier:
        Scale factor applied to the gradient.

    Examples
    --------
    >>> x = theano.tensor.fscalar()
    >>> fx = theano.tensor.sin(x)
    >>> fp = theano.tensor.grad(fx, wrt=x)
    >>> fprime = theano.function([x], fp)
    >>> print(fprime(2))  # doctest: +ELLIPSIS
    -0.416...
    >>> f_inverse = grad_scale(fx, -1.)
    >>> fpp = theano.tensor.grad(f_inverse, wrt=x)
    >>> fpprime = theano.function([x], fpp)
    >>> print(fpprime(2))  # doctest: +ELLIPSIS
    0.416...
    """
    return GradScale(multiplier)(x)
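

# A minimal, self-contained usage sketch (not part of the library API): it
# exercises verify_grad on an op whose gradient is known to be correct, so the
# finite-difference estimate and the analytic gradient should agree. The op
# (tanh), the input point and the seed are illustrative choices.
if __name__ == "__main__":
    _rng = np.random.RandomState(42)
    _pt = (_rng.rand(2, 3) + 0.5,)  # a single float64 input point
    verify_grad(theano.tensor.tanh, _pt, rng=_rng)
    print("verify_grad: analytic and numeric gradients of tanh agree")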