1"""Driver for gradient calculations."""
2from __future__ import absolute_import, print_function, division
3from collections import OrderedDict
4import six.moves.builtins as builtins
5import logging
6import time
7import warnings
8
9import numpy as np  # for numeric_grad
10from six import itervalues
11
12import theano
13
14from theano import gof
15from theano.gof import utils, Variable
16from theano.compat import izip
17from six.moves import xrange, reduce
18from theano.gof.null_type import NullType, null_type
19from theano.gof.op import get_debug_values
20from theano.compile import ViewOp, FAST_RUN, DebugMode, get_mode
21
22__authors__ = "James Bergstra, Razvan Pascanu, Arnaud Bergeron, Ian Goodfellow"
23__copyright__ = "(c) 2011, Universite de Montreal"
24__license__ = "3-clause BSD License"
25__contact__ = "theano-dev <theano-dev@googlegroups.com>"
26
27__docformat__ = "restructuredtext en"
28_logger = logging.getLogger('theano.gradient')
29
# We can't do "import theano.tensor" here:
# tensor depends on theano.compile, and
# theano.compile depends on theano.gradient (this file).
# The reason theano.compile depends on theano.gradient is that
# theano.compile.builders contains the op-from-graph functionality
# (OpFromGraph), which uses theano.gradient to implement the new op's
# grad method.
37tensor = None
38
39_msg_retType = 'op.grad(...) returned a non-list'
40
41grad_time = 0
42
43
44def format_as(use_list, use_tuple, outputs):
45    """
46    Formats the outputs according to the flags `use_list` and `use_tuple`.
47
48    If `use_list` is True, `outputs` is returned as a list (if `outputs`
49    is not a list or a tuple then it is converted in a one element list).
50    If `use_tuple` is True, `outputs` is returned as a tuple (if `outputs`
51    is not a list or a tuple then it is converted into a one element tuple).
52    Otherwise (if both flags are false), `outputs` is returned.
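
    For illustration, a few representative cases (the values are arbitrary
    placeholders):

    >>> format_as(True, False, 5)
    [5]
    >>> format_as(False, True, [5, 6])
    (5, 6)
    >>> format_as(False, False, [5])
    5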
53    """
54    assert not (use_list and use_tuple), \
55        "Both flags cannot be simultaneously True"
56    if (use_list or use_tuple) and not isinstance(outputs, (list, tuple)):
57        if use_list:
58            return [outputs]
59        else:
60            return (outputs,)
61    elif not (use_list or use_tuple) and isinstance(outputs, (list, tuple)):
62        assert len(outputs) == 1, \
63            "Wrong arguments. Expected a one element list"
64        return outputs[0]
65    elif use_list or use_tuple:
66        if use_list:
67            return list(outputs)
68        else:
69            return tuple(outputs)
70    else:
71        return outputs
72
73
74def grad_not_implemented(op, x_pos, x, comment=""):
75    """
76    Return an un-computable symbolic variable of type `x.type`.
77
78    If any call to tensor.grad results in an expression containing this
79    un-computable variable, an exception (NotImplementedError) will be
80    raised indicating that the gradient on the
81    `x_pos`'th input of `op` has not been implemented. Likewise if
82    any call to theano.function involves this variable.
83
84    Optionally adds a comment to the exception explaining why this
85    gradient is not implemented.
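
    As an illustrative sketch (the Op and its inputs here are hypothetical),
    an Op that does not provide the gradient for its second input might
    return something like::

        def grad(self, inputs, output_grads):
            x, y = inputs
            gz, = output_grads
            return [gz * y,
                    grad_not_implemented(self, 1, y)]

    `grad_undefined` is used in the same way when the gradient is
    mathematically undefined rather than merely not implemented.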
86    """
87
88    return (NullType((
89        "This variable is Null because the grad method for "
90        "input %s (%s) of the %s op is not implemented. %s"
91    ) % (x_pos, x, op, comment)))()
92
93
94def grad_undefined(op, x_pos, x, comment=""):
95    """
96    Return an un-computable symbolic variable of type `x.type`.
97
98    If any call to tensor.grad results in an expression containing this
99    un-computable variable, an exception (GradUndefinedError) will be
100    raised indicating that the gradient on the
101    `x_pos`'th input of `op` is mathematically undefined. Likewise if
102    any call to theano.function involves this variable.
103
104    Optionally adds a comment to the exception explaining why this
105    gradient is not defined.
106    """
107
108    return (NullType(
109        (
110            "This variable is Null because the grad method for "
111            "input %s (%s) of the %s op is mathematically undefined. %s"
112        ) % (x_pos, x, op, comment)))()
113
114
115class DisconnectedType(theano.gof.type.Type):
116
    """ A type indicating that a variable is the result
        of taking the gradient of c with respect to x
        when c is not a function of x.
        It is a symbolic placeholder for 0, but it conveys
        the extra information that this gradient is 0
        because it is disconnected.
    """
124
125    def filter(self, data, strict=False, allow_downcast=None):
126        raise AssertionError(
127            (
128                "If you're assigning to a DisconnectedType you're"
129                " doing something wrong. It should only be used as"
130                " a symbolic placeholder."
131            ))
132
    def filter_variable(self, other):
134        raise AssertionError(
135            (
136                "If you're assigning to a DisconnectedType you're"
137                " doing something wrong. It should only be used as"
138                " a symbolic placeholder."
139            ))
140
141    def may_share_memory(a, b):
142        return False
143
144    def value_eq(a, b, force_same_dtype=True):
145        raise AssertionError(
146            (
147                "If you're assigning to a DisconnectedType you're"
148                " doing something wrong. It should only be used as"
149                " a symbolic placeholder."
150            ))
151
152    def __str__(self):
153        return 'DisconnectedType'
154
155
156disconnected_type = DisconnectedType()
157
158
159########################
160# R Operator
161########################
162
163
164def Rop(f, wrt, eval_points, disconnected_outputs="raise",
165        return_disconnected="zero"):
166    """
167    Computes the R operation on `f` wrt to `wrt` at `eval_points`.
168
    Mathematically this stands for the Jacobian of `f` with respect
    to `wrt`, right-multiplied by the eval points.
171
172    Parameters
173    ----------
174    f : :class:`~theano.gof.graph.Variable` or list of Variables
175        `f` stands for the output of the computational graph to which you
176        want to apply the R operator
177    wrt : :class:`~theano.gof.graph.Variable` or list of Variables
178        variables for which you compute the R operator of the expression
179        described by `f`
180    eval_points : :class:`~theano.gof.graph.Variable` or list of Variables
        evaluation points for each of the variables in `wrt`
    disconnected_outputs : str
        Defines the behaviour if some of the variables in `f`
        have no dependency on any of the variables in `wrt` (or if
        all links are non-differentiable). The possible values are:
186
187        - 'ignore': considers that the gradient on these parameters is zero.
188        - 'warn': consider the gradient zero, and print a warning.
189        - 'raise': raise DisconnectedInputError.
190
    return_disconnected : {'zero', 'None', 'Disconnected'}
        - 'zero' : If f[i] is disconnected, return value i will be
          f[i].zeros_like()
        - 'None' : If f[i] is disconnected, return value i will be
          None
        - 'Disconnected' : returns variables of type DisconnectedType
197
198    Returns
199    -------
200    :class:`~theano.gof.graph.Variable` or list/tuple of Variables depending on type of f
201        Symbolic expression such that
202        R_op[i] = sum_j (d f[i] / d wrt[j]) eval_point[j]
        where the indices in that expression are magic multidimensional
        indices that specify both the position within a list and all
        coordinates of the tensor element in the latter.
        If `f` is a list/tuple, then return a list/tuple with the results.
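
    Examples
    --------
    A minimal sketch, mirroring the R-operator example from the Theano
    tutorial (it assumes `Rop` is also exposed as ``theano.tensor.Rop``)::

        import theano
        import theano.tensor as T

        W = T.dmatrix('W')
        V = T.dmatrix('V')
        x = T.dvector('x')
        y = T.dot(x, W)
        # Jacobian of y wrt W, right-multiplied by V
        JV = T.Rop(y, W, V)
        f = theano.function([W, V, x], JV)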
207    """
208    from theano.tensor import as_tensor_variable
209    using_list = isinstance(f, list)
210    using_tuple = isinstance(f, tuple)
211    if not isinstance(wrt, (list, tuple)):
212        wrt = [wrt]
213
214    if not isinstance(eval_points, (list, tuple)):
215        eval_points = [eval_points]
216
217    if not isinstance(f, (list, tuple)):
218        f = [f]
219
220    assert len(wrt) == len(eval_points)
221
222    # Check that each element of wrt corresponds to an element
223    # of eval_points with the same dimensionality.
    for i, (wrt_elem, eval_point) in enumerate(zip(wrt, eval_points)):
227        if not isinstance(wrt_elem, gof.Variable):
228            wrt_elem = as_tensor_variable(wrt_elem)
229        if not isinstance(eval_point, gof.Variable):
230            eval_point = as_tensor_variable(eval_point)
231
232        try:
233
234            if wrt_elem.type.ndim != eval_point.type.ndim:
235                raise ValueError('Element ' +
236                                 str(i) +
237                                 ' of wrt/eval_point have mismatched ' +
238                                 'dimensionality: ' +
239                                 str(wrt_elem.type.ndim) +
240                                 ' versus ' +
241                                 str(eval_point.type.ndim))
242        except AttributeError:
            # wrt_elem and eval_point don't always have an ndim attribute
            # (e.g. random state types); Tensor, Sparse and GpuArray do.
245            pass
246
247    seen_nodes = OrderedDict()
248
249    def _traverse(node):
        """ Compute the R operation for `node`, recursing through its
        inputs and caching the results in `seen_nodes`. """
251
252        if node is None:
253            return
254
255        op = node.op
256        inputs = node.inputs
257
258        # Compute the evaluation points corresponding to each of the
259        # inputs of the node
260        local_eval_points = []
261        for inp in inputs:
262            if inp in wrt:
263                local_eval_points.append(eval_points[wrt.index(inp)])
264            elif inp.owner is None:
265                try:
266                    local_eval_points.append(inp.zeros_like())
267                except Exception:
268                    # None should be used for non-differentiable
269                    # arguments, like for example random states
270                    local_eval_points.append(None)
271            elif inp.owner in seen_nodes:
272
273                local_eval_points.append(
274                    seen_nodes[inp.owner][inp.owner.outputs.index(inp)])
275
276            else:
277                # We actually need to compute the R_op for this node
278
279                _traverse(inp.owner)
280                local_eval_points.append(
281                    seen_nodes[inp.owner][inp.owner.outputs.index(inp)])
282        same_type_eval_points = []
283        for x, y in zip(inputs, local_eval_points):
284            if y is not None:
285                if not isinstance(x, gof.Variable):
286                    x = as_tensor_variable(x)
287                if not isinstance(y, gof.Variable):
288                    y = as_tensor_variable(y)
289                try:
290                    y = x.type.filter_variable(y)
291                except TypeError:
292                    # This is a hack
293                    # Originally both grad and Rop were written
294                    # with the assumption that a variable and the
295                    # gradient wrt that variable would have the same
296                    # dtype. This was a bad assumption because the
297                    # gradient wrt an integer can take on non-integer
298                    # values.
299                    # grad is now fixed, but Rop is not, so when grad
300                    # does the right thing and violates this assumption
301                    # we have to make it be wrong for Rop to keep working
302                    # Rop should eventually be upgraded to handle integers
303                    # correctly, the same as grad
304                    y = theano.tensor.cast(y, x.type.dtype)
305                    y = x.type.filter_variable(y)
306                assert x.type == y.type
307                same_type_eval_points.append(y)
308            else:
309                same_type_eval_points.append(y)
310
311        seen_nodes[node] = op.R_op(node.inputs, same_type_eval_points)
312    # end _traverse
313
314    # Populate the dictionary
315    for out in f:
316        _traverse(out.owner)
317
318    rval = []
319    for out in f:
320        if out in wrt:
321            rval.append(eval_points[wrt.index(out)])
322        elif seen_nodes.get(out.owner, None) is None or \
323                seen_nodes[out.owner][out.owner.outputs.index(out)] is None:
            message = ("Rop method was asked to compute the R-operation "
                       "of a variable that does not depend on any of the "
                       "variables in wrt, or that depends on them only "
                       "through non-differentiable operators: %s" % out)
328            if disconnected_outputs == 'ignore':
329                pass
330            elif disconnected_outputs == 'warn':
331                warnings.warn(message, stacklevel=2)
332            elif disconnected_outputs == 'raise':
333                message = utils.get_variable_trace_string(out)
334                raise DisconnectedInputError(message)
335            else:
                raise ValueError("Invalid value for keyword "
                                 "'disconnected_outputs', valid values are "
                                 "'ignore', 'warn' and 'raise'.")
339            if return_disconnected.lower() == "zero":
340                rval.append(tensor.zeros_like(out))
341            elif return_disconnected.lower() == "none":
342                rval.append(None)
343            elif return_disconnected.lower() == "disconnected":
344                rval.append(disconnected_type())
345            else:
346                raise ValueError("Invalid value for keyword "
347                                 "'return_disconnected', valid values are "
348                                 "'zero', 'None' and 'Disconnected'.")
349        else:
350            rval.append(seen_nodes[out.owner][out.owner.outputs.index(out)])
351
352    return format_as(using_list, using_tuple, rval)
353
354
355def Lop(f, wrt, eval_points, consider_constant=None,
356        disconnected_inputs='raise'):
357    """
358    Computes the L operation on `f` wrt to `wrt` at `eval_points`.
359
    Mathematically this stands for the Jacobian of `f` with respect
    to `wrt`, left-multiplied by the eval points.
362
363    Parameters
364    ----------
365    f : :class:`~theano.gof.graph.Variable` or list of Variables
366        `f` stands for the output of the computational graph to which you
367        want to apply the L operator
368    wrt : :class:`~theano.gof.graph.Variable` or list of Variables
369        variables for which you compute the L operator of the expression
370        described by `f`
371    eval_points : :class:`~theano.gof.graph.Variable` or list of Variables
        evaluation points for each of the variables in `f`
373
374    Returns
375    -------
376    :class:`~theano.gof.Variable` or list/tuple of Variables depending on type of f
377        Symbolic expression such that
        L_op[j] = sum_i (d f[i] / d wrt[j]) eval_point[i]
        where the indices in that expression are magic multidimensional
        indices that specify both the position within a list and all
        coordinates of the tensor element in the latter.
        If `wrt` is a list/tuple, then return a list/tuple with the results.
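
    Examples
    --------
    A minimal sketch, mirroring the L-operator example from the Theano
    tutorial (it assumes `Lop` is also exposed as ``theano.tensor.Lop``)::

        import theano
        import theano.tensor as T

        W = T.dmatrix('W')
        v = T.dvector('v')
        x = T.dvector('x')
        y = T.dot(x, W)
        # v left-multiplied by the Jacobian of y wrt W
        VJ = T.Lop(y, W, v)
        f = theano.function([v, x], VJ)

    Note that ``W`` is not needed as an input of ``f`` here: the resulting
    expression depends only on ``x`` and ``v``.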
383    """
384    if type(eval_points) not in (list, tuple):
385        eval_points = [eval_points]
386
387    using_list = isinstance(wrt, list)
388    using_tuple = isinstance(wrt, tuple)
389
390    if not isinstance(f, (list, tuple)):
391        f = [f]
392
393    # make copies of f and grads so we don't modify the client's copy
394    f = list(f)
395    grads = list(eval_points)
396
397    if not isinstance(wrt, (list, tuple)):
398        wrt = [wrt]
399
400    assert len(f) == len(grads)
401    known = OrderedDict(izip(f, grads))
402
403    ret = grad(cost=None, known_grads=known,
404               consider_constant=consider_constant, wrt=wrt,
405               disconnected_inputs=disconnected_inputs)
406
407    return format_as(using_list, using_tuple, ret)
408
409
410#########################
411# Gradient
412#########################
413
414def grad(cost, wrt, consider_constant=None,
415         disconnected_inputs='raise', add_names=True,
416         known_grads=None, return_disconnected='zero',
417         null_gradients='raise'):
418    """
419    Return symbolic gradients of one cost with respect to one or more variables.
420
421    For more information about how automatic differentiation works in Theano,
    see :mod:`gradient`. For information on how to implement the gradient of
    a certain Op, see the documentation of that Op's ``grad`` method.
424
425    Parameters
426    ----------
427    cost : :class:`~theano.gof.graph.Variable` scalar (0-dimensional) tensor variable or ``None``
428        Value that we are differentiating (that we want the gradient of).
429        May be `None` if `known_grads` is provided.
430    wrt : :class:`~theano.gof.graph.Variable` or list of Variables
431        Term[s] with respect to which we want gradients
432    consider_constant : list of variables
433        Expressions not to backpropagate through
434    disconnected_inputs : {'ignore', 'warn', 'raise'}
435        Defines the behaviour if some of the variables in `wrt` are
436        not part of the computational graph computing `cost` (or if
437        all links are non-differentiable). The possible values are:
438
439        - 'ignore': considers that the gradient on these parameters is zero.
440        - 'warn': consider the gradient zero, and print a warning.
441        - 'raise': raise DisconnectedInputError.
442    add_names : bool
443        If True, variables generated by grad will be named
444        (d<cost.name>/d<wrt.name>) provided that both cost and wrt
445        have names
446    known_grads : OrderedDict, optional
        An ordered dictionary mapping variables to their gradients. This is
448        useful in the case where you know the gradient on some
449        variables but do not know the original cost.
450    return_disconnected : {'zero', 'None', 'Disconnected'}
451        - 'zero' : If wrt[i] is disconnected, return value i will be
452          wrt[i].zeros_like()
453        - 'None' : If wrt[i] is disconnected, return value i will be
454          None
455        - 'Disconnected' : returns variables of type DisconnectedType
456    null_gradients : {'raise', 'return'}
457        Defines the behaviour if some of the variables in `wrt` have a
        null gradient. The possible values are:
459
460        - 'raise' : raise a NullTypeGradError exception
461        - 'return' : return the null gradients
462
463    Returns
464    -------
465    variable or list/tuple of variables (matches `wrt`)
466        Symbolic expression of gradient of `cost` with respect to each
467        of the `wrt` terms.  If an element of `wrt` is not
468        differentiable with respect to the output, then a zero
469        variable is returned.
470
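    Examples
    --------
    A small, illustrative sketch (the variable names are arbitrary)::

        import theano
        import theano.tensor as T

        x = T.dscalar('x')
        y = x ** 2
        gy = theano.grad(y, x)      # symbolic expression for dy/dx, i.e. 2*x
        f = theano.function([x], gy)
        f(3.0)                      # -> array(6.0)
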
471    """
472    t0 = time.time()
473    global tensor
474    if tensor is None:
475        from theano import tensor
476
477    if cost is None:
478        if known_grads is None:
479            raise AssertionError("cost and known_grads can't both be None.")
480
481    if cost is not None and isinstance(cost.type, NullType):
        raise ValueError("Can't differentiate a NaN cost. "
                         "The cost is NaN because " +
                         cost.type.why_null)
485
486    if cost is not None and cost.ndim != 0:
487        raise TypeError("cost must be a scalar.")
488
489    if isinstance(wrt, set):
        raise TypeError("wrt must not be a set. Sets have no defined "
                        "iteration order, so we can't return gradients in a "
                        "matching order.")
493
494    using_list = isinstance(wrt, list)
495    using_tuple = isinstance(wrt, tuple)
496    if not using_list and not using_tuple:
497        wrt = [wrt]
498
499    for elem in wrt:
500        if not isinstance(elem, Variable):
501            raise TypeError("Expected Variable, got " + str(elem) +
502                            " of type " + str(type(elem)))
503
504    outputs = []
505    if cost is not None:
506        outputs.append(cost)
507    if known_grads is not None:
508        outputs.extend(list(known_grads.keys()))
509
510    var_to_app_to_idx = _populate_var_to_app_to_idx(
511        outputs, wrt, consider_constant)
512
513    # build a dict mapping var to the gradient of cost with respect to var
514    grad_dict = OrderedDict()
515
516    if known_grads is None:
517        known_grads = OrderedDict()
518    else:
519        m = "known_grads must be an OrderedDict. "
520        assert isinstance(known_grads, OrderedDict) or len(known_grads) <= 1, m
521
522    # The gradient of the cost is 1 unless specified otherwise by known_grads.
523    if cost is not None:
524        if cost in known_grads:
525            g_cost = known_grads[cost]
526        else:
527            g_cost = _float_ones_like(cost)
528        # g_cost may be Disconnected or NullType. A creative use of the
529        # function, sure, but nonetheless one we can and should support.
530        # So before we try to cast it make sure it even has a dtype
531        if (hasattr(g_cost.type, 'dtype') and
532                cost.type.dtype in tensor.continuous_dtypes):
            # Here we enforce the constraint that floating point variables
            # have the same dtype as their gradient.
            g_cost = g_cost.astype(cost.type.dtype)
536        # DO NOT enforce g_cost to be 0 if cost is an integer.
537        # This is to be enforced by the Op.grad method for the
538        # Op that outputs cost.
539        if hasattr(g_cost.type, 'dtype'):
540            assert g_cost.type.dtype in tensor.continuous_dtypes
541
542        grad_dict[cost] = g_cost
543
544    for var in known_grads:
545        g_var = known_grads[var]
546
547        if not hasattr(g_var, 'type'):
            raise TypeError('output grads must be theano variables. '
                            'Ambiguous whether %s should be made into a tensor'
                            ' or a sparse theano variable' % str(type(g_var)))
551
552        if (not isinstance(g_var.type, (NullType, DisconnectedType)) and
553                'float' not in str(g_var.type.dtype)):
554            raise TypeError("Gradients must always be NullType, "
555                            "DisconnectedType, or continuous, but grad was "
556                            "given a known_grad of type " + str(g_var.type))
557
558        # DO NOT check that these gradients are equal to 0 if var is int
559        # The gradient is allowed to be non-zero on var in that case
        # Ops outputting var should not backpropagate its gradient further
561        # but that is enforced elsewhere (grep for only_connected_to_int)
562
563        grad_dict[var] = g_var
564
    def handle_disconnected(var):
        message = ("grad method was asked to compute the gradient "
                   "with respect to a variable that is not part of "
                   "the computational graph of the cost, or is used "
                   "only by a non-differentiable operator: %s" % var)
        if disconnected_inputs == 'ignore':
            pass
        elif disconnected_inputs == 'warn':
            warnings.warn(message, stacklevel=2)
        elif disconnected_inputs == 'raise':
            message = utils.get_variable_trace_string(var)
            raise DisconnectedInputError(message)
        else:
            raise ValueError("Invalid value for keyword "
                             "'disconnected_inputs', valid values are "
                             "'ignore', 'warn' and 'raise'.")
581
582    # variables that do not influence the cost have zero gradient.
583    # if wrt is such a variable, populate the grad_dict with this info
584    # so that wrt not being in var_to_app_to_idx won't cause an error below
585    # according to the flag, possibly raise an error if wrt is disconnected
586    for elem in wrt:
587        if elem not in var_to_app_to_idx and elem is not cost \
588                and elem not in grad_dict:
589            handle_disconnected(elem)
590            grad_dict[elem] = disconnected_type()
591
592    cost_name = None
593    if add_names and cost is not None:
594        cost_name = cost.name
595
596    # Make sure we didn't initialize the grad_dict with any ints
597    # The gradient may NEVER be an int, even if the variable is an int.
598    # Read the Op contract and talk to Ian Goodfellow before changing this!
599    for var in grad_dict:
600        g = grad_dict[var]
601        if hasattr(g.type, 'dtype'):
602            assert g.type.dtype in tensor.float_dtypes
603
604    rval = _populate_grad_dict(var_to_app_to_idx,
605                               grad_dict, wrt, cost_name)
606
607    for i in xrange(len(rval)):
608        if isinstance(rval[i].type, NullType):
609            if null_gradients == 'raise':
610                raise NullTypeGradError("tensor.grad encountered a NaN. " +
611                                        rval[i].type.why_null)
612            else:
613                assert null_gradients == 'return'
614        if isinstance(rval[i].type, DisconnectedType):
615            handle_disconnected(rval[i])
616            if return_disconnected == 'zero':
617                rval[i] = _float_zeros_like(wrt[i])
618            elif return_disconnected == 'None':
619                rval[i] = None
620            else:
621                assert return_disconnected == 'Disconnected'
622
623    if using_tuple:
624        rval = tuple(rval)
625    elif not using_list:
626        rval, = rval
627    t1 = time.time()
628    global grad_time
629    grad_time += t1 - t0
630    return rval
631
632
633def subgraph_grad(wrt, end, start=None, cost=None, details=False):
634    '''
635    With respect to `wrt`, computes gradients of cost and/or from
636    existing `start` gradients, up to the `end` variables of a
637    symbolic digraph.  In other words, computes gradients for a
638    subgraph of the symbolic theano function. Ignores all disconnected
639    inputs.
640
641    This can be useful when one needs to perform the gradient descent
642    iteratively (e.g. one layer at a time in an MLP), or when a
643    particular operation is not differentiable in theano
644    (e.g. stochastic sampling from a multinomial). In the latter case,
645    the gradient of the non-differentiable process could be
646    approximated by user-defined formula, which could be calculated
647    using the gradients of a cost with respect to samples (0s and
648    1s). These gradients are obtained by performing a subgraph_grad
649    from the `cost` or previously known gradients (`start`) up to the
650    outputs of the stochastic process (`end`).  A dictionary mapping
651    gradients obtained from the user-defined differentiation of the
652    process, to variables, could then be fed into another
653    subgraph_grad as `start` with any other `cost` (e.g. weight
654    decay).
655
656    In an MLP, we could use subgraph_grad to iteratively backpropagate:
657
658    .. code-block:: python
659
        import numpy as np
        import theano

        x, t = theano.tensor.fvector('x'), theano.tensor.fvector('t')
        w1 = theano.shared(np.random.randn(3, 4))
        w2 = theano.shared(np.random.randn(4, 2))
663        a1 = theano.tensor.tanh(theano.tensor.dot(x,w1))
664        a2 = theano.tensor.tanh(theano.tensor.dot(a1,w2))
665        cost2 = theano.tensor.sqr(a2 - t).sum()
666        cost2 += theano.tensor.sqr(w2.sum())
667        cost1 = theano.tensor.sqr(w1.sum())
668
669        params = [[w2],[w1]]
670        costs = [cost2,cost1]
671        grad_ends = [[a1], [x]]
672
673        next_grad = None
674        param_grads = []
        for i in range(2):
676            param_grad, next_grad = theano.subgraph_grad(
677                wrt=params[i], end=grad_ends[i],
678                start=next_grad, cost=costs[i]
679            )
680            next_grad = dict(zip(grad_ends[i], next_grad))
681            param_grads.extend(param_grad)
682
683    Parameters
684    ----------
685
686    wrt : list of variables
687        Gradients are computed with respect to `wrt`.
688
689    end : list of variables
690        Theano variables at which to end gradient descent (they are
691        considered constant in theano.grad).  For convenience, the
692        gradients with respect to these variables are also returned.
693
694    start : dictionary of variables
695        If not None, a dictionary mapping variables to their
696        gradients. This is useful when the gradient on some variables
697        are known. These are used to compute the gradients backwards up
698        to the variables in `end` (they are used as known_grad in
699        theano.grad).
700
701    cost : :class:`~theano.gof.Variable` scalar (0-dimensional) variable
702        Additional costs for which to compute the gradients.  For
703        example, these could be weight decay, an l1 constraint, MSE,
704        NLL, etc. May optionally be None if start is provided.
705
706        .. warning::
707
708            If the gradients of `cost` with respect to any of the `start`
709            variables is already part of the `start` dictionary, then it
710            may be counted twice with respect to `wrt` and `end`.
711
712    details : bool
713        When True, additionally returns the list of gradients from
714        `start` and of `cost`, respectively, with respect to `wrt` (not
715        `end`).
716
717    Returns
718    -------
719    Tuple of 2 or 4 Lists of Variables
720        Returns lists of gradients with respect to `wrt` and `end`,
721        respectively.
722
723
724    .. versionadded:: 0.7
725    '''
726    assert ((cost is not None) or (start is not None))
727    assert isinstance(end, list)
728    assert isinstance(wrt, list)
729    if start is not None:
730        assert isinstance(start, dict)
731
732    params = list(set(wrt + end))
733
734    start_grads = None
735    cost_grads = None
736    if start is not None:
737        start_grads = list(
738            theano.grad(
739                cost=None, wrt=params, known_grads=start,
740                consider_constant=end,
741                disconnected_inputs='ignore'
742            )
743        )
744
745    if cost is not None:
746        cost_grads = list(
747            theano.grad(
748                cost=cost, wrt=params,
749                consider_constant=end,
750                disconnected_inputs='ignore'
751            )
752        )
753
754    grads = None
755    if start is None:
756        grads = cost_grads
757    else:
758        grads = start_grads
759        if cost_grads is not None:
760            for i in range(len(grads)):
761                grads[i] += cost_grads[i]
762
763    pgrads = OrderedDict(izip(params, grads))
764    # separate wrt from end grads:
765    wrt_grads = list(pgrads[k] for k in wrt)
766    end_grads = list(pgrads[k] for k in end)
767
768    if details:
769        return wrt_grads, end_grads, start_grads, cost_grads
770
771    return wrt_grads, end_grads
772
773
774def _node_to_pattern(node):
    """ Given an apply node, obtain its connection pattern.

    This is just a wrapper around Op.connection_pattern that does type
    checking and supplies the default value if the method is not
    implemented.
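
    For example (illustrative), for an apply node with two inputs and one
    output where the output depends only on the first input, a valid
    connection pattern would be ``[[True], [False]]``: one row per input,
    one column per output.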
779    """
780
781    if hasattr(node.op, 'connection_pattern'):
782        connection_pattern = node.op.connection_pattern(node)
783
784        if not isinstance(connection_pattern, list):
            raise TypeError(
                "Op.connection_pattern should return " +
                ("a list of lists of bool, but for Op=%s " % node.op) +
                "got %s with type %s." % (connection_pattern,
                                          type(connection_pattern)))
790        if len(connection_pattern) != len(node.inputs):
791            raise ValueError(
792                '%s.connection_pattern should have %d' %
793                (node.op, len(node.inputs)) + ' rows but has %d.' %
794                len(connection_pattern))
795        for ii, output_pattern in enumerate(connection_pattern):
796            if not isinstance(output_pattern, list):
                raise TypeError(
                    '%s.connection_pattern should return' %
                    node.op + ' a list of lists, but element %d ' % ii +
                    'is %s of type %s.' % (output_pattern,
                                           type(output_pattern)))
802    else:
803        connection_pattern = [[True for output in node.outputs]
804                              for ipt in node.inputs]
805    assert isinstance(connection_pattern, list)
806    assert len(connection_pattern) == len(node.inputs)
807    for ii in xrange(len(node.inputs)):
808        assert isinstance(connection_pattern[ii], list)
809        assert len(connection_pattern[ii]) == len(node.outputs)
810    return connection_pattern
811
812
813def _populate_var_to_app_to_idx(outputs, wrt, consider_constant):
814    """
815    Helper function for grad function.
816
817    Parameters
818    ----------
819    outputs
820        a list of variables we want to take gradients of
821
822    wrt
823        a list of variables we want to take the gradient with
824        respect to.
825
826    consider_constant
827        a list of variables not to backpropagate through.
828
829    Returns
830    -------
831    var_to_app_to_idx:
832        A dictionary mapping a variable to a second dictionary.
833        The second dictionary maps apply nodes acting on this
834        variable to the variable's index in the apply node's
835        input list.
836
837        This dictionary will only contain variables that
838        meet two criteria:
839
840        1) The elements of at least one output are a
841           function of the elements of the variable
842
843        2) The elements of the variable are a function of the
844           elements of at least one member of wrt.
845
846    This set is exactly the set of variables that connect
847    the variables in wrt to the cost being differentiated.
848
849    (A variable in consider_constant is not a function of
850    anything)
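
    As an illustrative example, for a graph ``z = x + y`` with
    ``outputs=[z]`` and ``wrt=[x]``, the returned mapping would contain a
    single entry for ``x``, mapping ``z``'s apply node to ``[0]`` (``x`` is
    input 0 of that node); ``y`` is dropped because it is not a function of
    any variable in `wrt`.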
851
852    """
853
854    # Validate and format consider_constant
855    if consider_constant is None:
856        consider_constant = []
857    else:
858        # error checking on consider_constant: verify that it is a collection
859        # of theano variables
860        # this is important, if someone accidentally passes a nested data
861        # structure with theano variables at the leaves, only the root will
862        # be properly considered constant
863        try:
864            iter(consider_constant)
865        except TypeError:
866            raise TypeError('consider_constant must be an iterable collection,'
867                            ' got ' + str(type(consider_constant)))
868        for elem in consider_constant:
869            if not isinstance(elem, gof.Variable):
870                raise TypeError('Elements of consider_constant must be '
871                                'variables, but got ' + str(type(elem)))
872
873    # var_to_app_to_idx[var][node] = [i,j] means node has
874    # var as input at positions i and j
875    var_to_app_to_idx = OrderedDict()
876
877    # Set of variables that have been added to their true parents
878    # ('true' here means that the elements of the variable are a function
879    #  of the elements of the parent, according to the op's
880    #  connection_pattern)
881    # Note: we need to revisit the apply nodes repeatedly, because
882    #       different outputs of the apply node are connected to
883    #       different subsets of the inputs.
884    accounted_for = set([])
885
886    def account_for(var):
887        # Don't visit the same variable twice
888        if var in accounted_for:
889            return
890        accounted_for.add(var)
891
892        # Constants are not a function of anything
893        if var in consider_constant:
894            return
895
896        # Recursively add the variables that this variable is
897        # a function of.
898        if var.owner is not None:
899            app = var.owner
900
901            connection_pattern = _node_to_pattern(app)
902
903            var_idx = app.outputs.index(var)
904
905            for i, ipt in enumerate(app.inputs):
906
907                # don't process ipt if it is not a true
908                # parent of var
909                if not connection_pattern[i][var_idx]:
910                    continue
911
912                if ipt not in var_to_app_to_idx:
913                    # This object here *must* be an OrderedDict, because
914                    # we iterate over its keys when adding up the terms of the
915                    # gradient on ipt. If it is a regular dict, the grad method
916                    # will return something that is analytically correct, but
917                    # whose order of doing additions depends on the memory
918                    # location of the apply nodes.
919                    var_to_app_to_idx[ipt] = OrderedDict()
920                app_to_idx = var_to_app_to_idx[ipt]
921                if app not in app_to_idx:
922                    app_to_idx[app] = []
923                idx = app_to_idx[app]
924                if i not in idx:
925                    idx.append(i)
926                account_for(ipt)
927
928    # add all variables that are true ancestors of the cost
929    for output in outputs:
930        account_for(output)
931
932    # determine which variables have elements of wrt as a true
933    # ancestor. Do this with an upward pass starting from wrt,
934    # following only true connections
935    visited = set([])
936
937    def visit(var):
938        if var in visited:
939            return
940        if var not in var_to_app_to_idx:
941            return
942        visited.add(var)
943        nodes = var_to_app_to_idx[var]
944        for node in nodes:
945            connection_pattern = _node_to_pattern(node)
946            for idx in nodes[node]:
947                for ii, output in enumerate(node.outputs):
948                    if connection_pattern[idx][ii]:
949                        visit(output)
950
951    for elem in wrt:
952        visit(elem)
953
954    # Remove variables that don't have wrt as a true ancestor
955    orig_vars = list(var_to_app_to_idx.keys())
956    for var in orig_vars:
957        if var not in visited:
958            del var_to_app_to_idx[var]
959
960    return var_to_app_to_idx
961
962
963class NullTypeGradError(TypeError):
964    """
965    Raised when grad encounters a NullType.
966    """
967
968
969class DisconnectedInputError(ValueError):
970    """
971    Raised when grad is asked to compute the gradient
972    with respect to a disconnected input and
973    disconnected_inputs='raise'.
974    """
975
976
977def _populate_grad_dict(var_to_app_to_idx,
978                        grad_dict, wrt, cost_name=None):
979    """Helper function for grad function.
980
981    Parameters
982    ----------
983    var_to_app_to_idx : dict
984        a dictionary mapping a variable to a second dictionary.
985        the second dictionary maps apply nodes acting on
986        this variable to the variable's index in the apply
987        node's input list
988    grad_dict : dict
989        A dictionary mapping variables to their gradients.
990        Should be populated by grad function, which should:
991
992        - Set the gradient with respect to the cost to 1
993        - Load all gradients from known_grads, possibly
994          overriding the cost
995        - Set the gradient for disconnected
996          inputs to a variable with type DisconnectedType()
997
998    wrt : list of Variables
999        the minimal set of variables that must be included in `grad_dict`
1000    cost_name: string
1001        The name of the cost being differentiated, optional.
1002        Used to name the grad with respect to x as (d<cost_name>/dx)
1003
1004    Returns
1005    -------
1006    list of Variables
1007        A list of gradients corresponding to `wrt`
1008
1009    """
1010    # build a dict mapping node to the terms node contributes to each of
1011    # its inputs' gradients
1012    term_dict = OrderedDict()
1013
1014    def access_term_cache(node):
1015        """ Populates term_dict[node] and returns it """
1016
1017        if node not in term_dict:
1018
1019            inputs = node.inputs
1020
1021            output_grads = [access_grad_cache(var) for var in node.outputs]
1022
1023            # list of bools indicating if each output is connected to the cost
1024            outputs_connected = [not isinstance(g.type, DisconnectedType)
1025                                 for g in output_grads]
1026
1027            connection_pattern = _node_to_pattern(node)
1028
1029            # list of bools indicating if each input is connected to the cost
1030            inputs_connected = [
1031                (True in [input_to_output and output_to_cost for
1032                          input_to_output, output_to_cost in
1033                          zip(input_to_outputs, outputs_connected)]) for
1034                input_to_outputs in connection_pattern
1035            ]
1036
1037            # List of bools indicating if each output is an integer dtype
1038            output_is_int = [hasattr(output.type, 'dtype') and
1039                             output.type.dtype in theano.tensor.discrete_dtypes
1040                             for output in node.outputs]
1041
1042            # List of bools indicating if each output is NullType
1043            ograd_is_nan = [isinstance(output.type, NullType)
1044                            for output in output_grads]
1045
1046            # List of bools indicating if each input only has NullType outputs
1047            only_connected_to_nan = [
1048                (True not in
1049                 [in_to_out and out_to_cost and not out_nan
1050                  for in_to_out, out_to_cost, out_nan in
1051                  zip(in_to_outs, outputs_connected, ograd_is_nan)])
1052                for in_to_outs in connection_pattern]
1053
1054            if True not in inputs_connected:
                # No input of this node is connected to the cost, so we can
                # skip calling the op's grad method and report that the
                # inputs are disconnected.
                # (The op's grad method could do this too, but this saves the
                # implementer the trouble of worrying about this case.)
1060                input_grads = [disconnected_type() for ipt in inputs]
1061            elif False not in only_connected_to_nan:
1062                # All inputs are only connected to nan gradients, so we don't
1063                # need to bother calling the grad method. We know the gradient
1064                # with respect to all connected inputs is nan.
1065                input_grads = []
1066                for connected in inputs_connected:
1067                    if connected:
1068                        input_grads.append(null_type())
1069                    else:
1070                        input_grads.append(disconnected_type())
1071            else:
                # At least one input of this op is connected to the cost, and
                # not all output gradients are undefined, so we must
                # call the op's grad method.
1075
1076                # Each Op's grad function requires inputs and output_grads
1077                # If the Op destroys any input, but the grad expression uses
1078                # it, then chances are the resulting graph will have a
1079                # dependency cycle. We avoid this cycle by passing (symbolic)
1080                # copies of each destroyed input.
1081                try:
1082                    dinputs = [node.inputs[x[0]] for x in
1083                               itervalues(node.op.destroy_map)]
1084                except AttributeError:
1085                    dinputs = []
1086
1087                def try_to_copy_if_needed(var):
1088                    if var in dinputs and hasattr(var, 'copy'):
1089                        return var.copy()
1090                    return var
1091
1092                inputs = [try_to_copy_if_needed(ipt) for ipt in inputs]
1093
1094                # Build a list of output gradients with the same dtype as
1095                # the corresponding output variable.
1096                # If an output is of a float dtype, we want to cast the
1097                # output gradient into the same dtype, to avoid having a
1098                # gradient graph with double precision (taking more memory,
1099                # and more computation).
1100                # If an output is of an integer dtype, then we just leave it
1101                # alone.
1102                # DO NOT force integer variables to have zero grad. This causes
1103                # bugs where we fail to detect disconnected or undefined
1104                # gradients.
1105                # DO NOT force integer variables to have integer dtype.
1106                # This is a violation of the op contract.
1107                new_output_grads = []
1108                for o, og in zip(node.outputs, output_grads):
1109                    o_dt = getattr(o.type, 'dtype', None)
1110                    og_dt = getattr(og.type, 'dtype', None)
1111                    if (o_dt not in theano.tensor.discrete_dtypes and
1112                            og_dt and o_dt != og_dt):
1113                        new_output_grads.append(og.astype(o_dt))
1114                    else:
1115                        new_output_grads.append(og)
1116
1117                # Make sure that, if new_output_grads[i] has a floating point
1118                # dtype, it is the same dtype as outputs[i]
1119                for o, ng in zip(node.outputs, new_output_grads):
1120                    o_dt = getattr(o.type, 'dtype', None)
1121                    ng_dt = getattr(ng.type, 'dtype', None)
1122                    if (ng_dt is not None and
1123                            o_dt not in theano.tensor.discrete_dtypes):
1124                        assert ng_dt == o_dt
1125
1126                # Someone who had obviously not read the Op contract tried
1127                # to modify this part of the function.
1128                # If you ever think it is a good idea to make an integer
1129                # valued gradient, please
1130                # 1) Read the Op contract again
1131                # 2) Talk to Ian Goodfellow
1132                # (Both of these sources will tell you not to do it)
1133                for ng in new_output_grads:
1134                    assert (getattr(ng.type, 'dtype', None)
1135                            not in theano.tensor.discrete_dtypes)
1136
1137                # If config.compute_test_value is turned on, check that the
1138                # gradients on the outputs of this node have the right shape.
1139                # We also check the gradient on the inputs later--both checks
1140                # are needed, because some gradients are only ever specified
1141                # by the user, not computed by Op.grad, and some gradients are
1142                # only computed and returned, but never passed as another
1143                # node's output grads.
1144                for idx, packed in enumerate(izip(node.outputs,
1145                                             new_output_grads)):
1146                    orig_output, new_output_grad = packed
1147                    if not hasattr(orig_output, 'shape'):
1148                        continue
1149                    if isinstance(new_output_grad.type, DisconnectedType):
1150                        continue
1151                    for orig_output_v, new_output_grad_v in get_debug_values(
1152                            *packed):
1153                        o_shape = orig_output_v.shape
1154                        g_shape = new_output_grad_v.shape
                        if o_shape != g_shape:
                            raise ValueError(
                                "Got a gradient of shape " +
                                str(g_shape) + " on an output of shape " +
                                str(o_shape))
1160
1161                input_grads = node.op.L_op(inputs, node.outputs,
1162                                           new_output_grads)
1163
1164                if input_grads is None:
1165                    raise TypeError("%s.grad returned NoneType, "
1166                                    "expected iterable." % str(node.op))
1167
1168                if len(input_grads) != len(inputs):
1169                    raise ValueError(("%s returned the wrong number of" +
1170                                      " gradient terms.") % str(node.op))
                # We can not enforce this, as AdvancedSubtensor1 has an
                # option to return the sparse grad for optimization reasons.
                #
                #     for ig, i in zip(input_grads, inputs):
                #         if (not isinstance(ig.type,
                #                            (DisconnectedType, NullType)) and
                #                 type(ig.type) != type(i.type)):
                #             raise ValueError(
                #                 "%s returned the wrong type for gradient "
                #                 "terms. Sparse inputs must have sparse grads "
                #                 "and dense inputs must have dense grad. "
                #                 "Got %s, expected %s" % (
                #                     str(node.op), ig.type, i.type))
1182
1183            # must convert to list in case the op returns a tuple
1184            # we won't be able to post-process out the Nones if it does that
1185            input_grads = list(input_grads)
1186
1187            # Need to propagate the NullType gradients; if an input grad is
1188            # not disconnected and the corresponding input is connected
1189            # to at least one output whose gradient is NullType then the input
1190            # grad should be NullType.
1191            for inp_idx in range(len(input_grads)):
1192                for out_idx in range(len(ograd_is_nan)):
1193                    if (ograd_is_nan[out_idx] and
1194                            connection_pattern[inp_idx][out_idx] and
1195                            not isinstance(input_grads[inp_idx].type,
1196                                           DisconnectedType)):
1197                        input_grads[inp_idx] = output_grads[out_idx]
1198
1199            # Do type checking on the result
1200
1201            # List of bools indicating if each input only has integer outputs
1202            only_connected_to_int = [
1203                (True not in
1204                 [in_to_out and out_to_cost and not out_int
1205                  for in_to_out, out_to_cost, out_int in
1206                  zip(in_to_outs, outputs_connected, output_is_int)])
1207                for in_to_outs in connection_pattern]
1208
1209            for i, term in enumerate(input_grads):
1210
1211                # Disallow Nones
1212                if term is None:
                    # We don't know what None means. In the past it has been
                    # used to mean undefined, zero, or disconnected.
                    # We therefore don't allow it because its usage has become
                    # so muddied.
                    raise TypeError(
                        ('%s.grad returned None for a gradient term, '
                         'this is prohibited. Instead of None, '
                         'return zeros_like(input), disconnected_type(),'
                         ' or a NullType variable such as those made with '
                         'the grad_undefined or grad_not_implemented helper '
                         'functions.') % node.op)
1225
1226                # Check that the gradient term for this input
1227                # has the right shape
1228                if hasattr(term, 'shape'):
1229                    orig_ipt = inputs[i]
1230                    for orig_ipt_v, term_v in get_debug_values(orig_ipt, term):
1231                        i_shape = orig_ipt_v.shape
1232                        t_shape = term_v.shape
1233                        if i_shape != t_shape:
1234                            raise ValueError(
1235                                "%s.grad returned object of "
1236                                "shape %s as gradient term on input %d "
1237                                "of shape %s" % (node.op, t_shape, i, i_shape))
1238
1239                if not isinstance(term.type,
1240                                  (NullType, DisconnectedType)):
1241                    if term.type.dtype not in theano.tensor.float_dtypes:
                        raise TypeError(str(node.op) + '.grad illegally '
                                        'returned an integer-valued variable.'
                                        ' (Input index %d, dtype %s)' % (
                                            i, term.type.dtype))
1246
1247                    if only_connected_to_nan[i]:
1248                        assert isinstance(term.type, NullType)
1249
1250                    if only_connected_to_int[i]:
1251                        # This term has only integer outputs and we know
1252                        # it's not undefined or disconnected
1253                        # The only other valid thing it can be is 0
1254
1255                        is_zero = _is_zero(term)
1256                        assert is_zero in ['yes', 'no', 'maybe']
1257                        if is_zero == 'maybe':
1258                            msg = ("%s.grad returned %s of type %s for input"
1259                                   " %d. This input's only connections to "
1260                                   "the cost through this op are via "
1261                                   "integer-valued outputs so it should be "
1262                                   "NullType, DisconnectedType, or some form "
1263                                   "of zeros. It is not NullType or "
1264                                   "DisconnectedType and theano can't "
1265                                   "simplify it to a constant, so it's not "
1266                                   "verifiably zeros.")
1267
1268                            msg %= (node.op, term, type(term), i)
1269
1270                        elif is_zero == 'no':
1271                            msg = ("%s.grad returned %s of type %s for input"
1272                                   " %d. Since this input is only connected "
1273                                   "to integer-valued outputs, it should "
                                   "evaluate to zeros, but it evaluates to "
                                   "%s.")
1276
1277                            msg %= (node.op, term, type(term), i,
1278                                    theano.get_scalar_constant_value(term))
1279
1280                            raise ValueError(msg)
1281
1282            # Check that op.connection_pattern matches the connectivity
1283            # logic driving the op.grad method
1284            for i, (ipt, ig, connected) in enumerate(
1285                zip(inputs, input_grads, inputs_connected)
1286            ):
1287                actually_connected = \
1288                    not isinstance(ig.type, DisconnectedType)
1289
1290                if actually_connected and not connected:
                    msg = ("%s.grad returned %s of type %s for input %d."
                           " Expected a DisconnectedType instance based on "
                           "the output of the op's connection_pattern "
                           "method.")
1295                    msg %= (str(node.op), str(ig), str(ig.type), i)
1296                    raise TypeError(msg)
1297
1298                elif connected and not actually_connected:
1299                    msg = "%s.grad returned DisconnectedType for input %d."
1300                    msg %= (str(node.op), i)
1301                    if hasattr(node.op, 'connection_pattern'):
1302                        msg += (' Its connection_pattern method does not'
1303                                ' allow this.')
1304                        raise TypeError(msg)
1305                    else:
1306                        msg += (' You may want to implement a '
1307                                'connection_pattern method for it.')
1308                        warnings.warn(msg)
1309
1310            # cache the result
1311            term_dict[node] = input_grads
1312
1313        return term_dict[node]
1314
1315    # populate grad_dict[var] and return it
1316    def access_grad_cache(var):
1317        if var not in grad_dict:
1318            # If var is not in grad_dict already, we must compute it
1319            if var in var_to_app_to_idx:
1320                null_terms = []
1321                terms = []
1322                node_to_idx = var_to_app_to_idx[var]
1323                for node in node_to_idx:
1324                    for idx in node_to_idx[node]:
1325
1326                        term = access_term_cache(node)[idx]
1327
1328                        if not isinstance(term, gof.Variable):
1329                            raise TypeError(
1330                                "%s.grad returned %s, expected"
1331                                " Variable instance." % (str(node.op),
1332                                                         type(term)))
1333
1334                        if isinstance(term.type, NullType):
1335                            null_terms.append(term)
1336                            continue
1337
1338                        # Don't try to sum up DisconnectedType placeholders
1339                        if isinstance(term.type, DisconnectedType):
1340                            continue
1341
1342                        if hasattr(var, 'ndim') and term.ndim != var.ndim:
1343                            raise ValueError(
1344                                ("%s.grad returned a term with"
1345                                 " %d dimensions, but %d are required.") % (
1346                                     str(node.op), term.ndim, var.ndim))
1347
1348                        terms.append(term)
1349
1350                # Add up the terms to get the total gradient on this variable
1351                if len(null_terms) > 0:
1352                    # At least one term is a NullType : the total gradient
1353                    # will also be a NullType
1354                    grad_dict[var] = null_terms[0]
1355                elif len(terms) > 0:
1356                    # the next line is like sum(terms) but doesn't add an
1357                    # extraneous TensorConstant(0)
1358                    grad_dict[var] = reduce(lambda x, y: x + y, terms)
1359                else:
1360                    grad_dict[var] = disconnected_type()
1361
1362                if cost_name is not None and var.name is not None:
1363                    grad_dict[var].name = '(d%s/d%s)' % (cost_name, var.name)
1364            else:
1365                # this variable isn't connected to the cost in the
1366                # computational graph
1367                grad_dict[var] = disconnected_type()
1368        # end if cache miss
1369        return grad_dict[var]
1370
1371    rval = [access_grad_cache(elem) for elem in wrt]
1372
1373    return rval
1374
1375
1376def _float_zeros_like(x):
1377    """ Like zeros_like, but forces the object to have a
1378    a floating point dtype """
1379
1380    rval = x.zeros_like()
1381
1382    if rval.type.dtype.find('float') != -1:
1383        return rval
1384
1385    return rval.astype(theano.config.floatX)
1386
1387
1388def _float_ones_like(x):
1389    """ Like ones_like, but forces the object to have a
1390    floating point dtype """
1391
1392    dtype = x.type.dtype
1393    if dtype not in tensor.float_dtypes:
1394        dtype = theano.config.floatX
1395
1396    return x.ones_like(dtype=dtype)
1397
1398
1399class numeric_grad(object):
1400    """
1401    Compute the numeric derivative of a scalar-valued function at a particular
1402    point.
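
    Examples
    --------
    A minimal sketch; ``f`` below is an arbitrary NumPy function returning
    a scalar, which is all `numeric_grad` requires:

    >>> def f(x):
    ...     return (x ** 2).sum()
    >>> ng = numeric_grad(f, np.ones(3))
    >>> np.allclose(ng.gf, 2 * np.ones(3), atol=1e-3)
    True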
1403    """
1404
1405    # Note on step sizes and tolerances:
1406    #
1407    # There is a relationship between the step size and the function value and
1408    # the measurement error that is incurred due to rounding.  The finite
1409    # difference we measure is
    # delta = f(x0 + eps) - f(x0)
1411    #
1412    # For maximum precision, f should be close to zero.
1413    # For every power of 2 that f departs from zero, we lose a bit of precision
1414    # in delta.
1415    #
1416    # Even in this case of maximum accuracy, there is a tradeoff between
1417    # stepsize and measurement error.
    # Taking small steps allows us to measure large derivatives accurately,
1419    # but longer steps are required to measure small derivatives accurately.
1420    # However longer steps introduce bias into our measurement in general
1421    # for non-linear functions.
1422    #
1423    # It would be interesting to have a version of numeric grad that used an
1424    # adaptive stepsize.
1425    #
1426    # For now, we use a heuristic that catches very bad gradients, but is not
1427    # perfectly accurate.
1428    type_eps = {'float64': 1e-7,
1429                'float32': 3e-4,
1430                'float16': 1e-1,
1431                np.dtype('float64'): 1e-7,
1432                np.dtype('float32'): 3e-4,
1433                np.dtype('float16'): 1e-1}
1434
1435    def __init__(self, f, pt, eps=None, out_type=None):
1436        """Return the gradient of f at pt.
1437
1438        This function computes the gradient by a one-sided finite
1439        differences of a fixed step size (eps).
1440
1441        Parameters
1442        ----------
1443        f : a differentiable function such that f(*pt) is a scalar
1444            The function to compute the gradient of.
1445            It is assumed that f(...) will return a scalar.
1446            It is assumed that all f's inputs are numpy.ndarray objects.
        pt : an ndarray, a list of ndarrays or tuple of ndarrays
            The point where to evaluate the gradient
        eps : float, optional
            The stepsize for the finite differencing.  None means
            input dtype-dependent. See `type_eps`.
        out_type : string, optional
            dtype of the output, if complex (i.e. 'complex64' or
            'complex128')
1454        """
1455
1456        def prod(inputs):
1457            rval = 1
1458            for i in inputs:
1459                rval *= i
1460            return rval
1461
1462        packed_pt = False
1463        if not isinstance(pt, (list, tuple)):
1464            pt = [pt]
1465            packed_pt = True
1466
1467        apt = [np.array(p) for p in pt]
1468
1469        shapes = [p.shape for p in apt]
1470        dtypes = [str(p.dtype) for p in apt]
1471
1472        # TODO: remove this eventually (why was this here in the first place ?)
1473        # In the case of CSM, the arguments are a mixture of floats and
1474        # integers...
1475        # if not dtypes == [dtypes[0]] * len(apt):
1476        #      raise TypeError('All function arguments must have same dtype')
1477
1478        total_size = builtins.sum(prod(sh) for sh in shapes)
1479
1480        working_dtype = builtins.min(
1481            (self.type_eps[dt], dt) for dt in dtypes)[1]
1482
1483        # create un-initialized memory
1484        x = np.ndarray((total_size,), dtype=working_dtype)
1486        if (out_type is not None) and (out_type.startswith('complex')):
1487            gx = np.ndarray((total_size,), dtype=out_type)
1488        else:
1489            gx = np.ndarray((total_size,), dtype=working_dtype)
1490
1491        if eps is None:
1492            eps = builtins.max(self.type_eps[dt] for dt in dtypes)
1493
1494        # set up aliases so that apt[i] is backed by memory in x
1495        # and self.gf is backed by memory in gx
1496        cur_pos = 0
1497        self.gf = []
1498        for i, p in enumerate(apt):
1499            p_size = prod(p.shape)
1500            # set up alias
1501            apt[i] = x[cur_pos: cur_pos + p_size].reshape(p.shape)
1502            self.gf.append(gx[cur_pos: cur_pos + p_size].reshape(p.shape))
1503            # initialize with p's value
1504            apt[i][...] = p
1505            cur_pos += p_size
1506
1507        f_x = f(*[p.copy() for p in apt])
1508
1509        # now iterate over the elements of x, and call f on apt.
1510        x_copy = x.copy()
1511        for i in xrange(total_size):
1512            x[:] = x_copy
1513
1514            x[i] += eps
1515            f_eps = f(*apt)
1516
            # TODO: remove this when it is clear that the next
            # replacement does not pose problems of its own.  The old line
            # below was replaced because of its inability to handle complex
            # variables.
1520            # gx[i] = numpy.asarray((f_eps - f_x) / eps)
1521
1522            gx[i] = ((f_eps - f_x) / eps)
1523
1524        if packed_pt:
1525            self.gf = self.gf[0]
1526
1527    @staticmethod
1528    def abs_rel_err(a, b):
1529        """Return absolute and relative error between a and b.
1530
1531        The relative error is a small number when a and b are close, relative
1532        to how big they are.
1533
1534        Formulas used:
1535            abs_err = abs(a - b)
1536
1537            rel_err = abs_err / max(abs(a) + abs(b), 1e-8)
1538
1539        The denominator is clipped at 1e-8 to avoid dividing by 0 when a and b
1540        are both close to 0.
1541
1542        The tuple (abs_err, rel_err) is returned
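
        Examples
        --------
        A small illustrative check with scalar inputs:

        >>> abs_err, rel_err = numeric_grad.abs_rel_err(np.float64(1.0),
        ...                                             np.float64(1.1))
        >>> bool(np.isclose(abs_err, 0.1))
        True
        >>> bool(np.isclose(rel_err, 0.1 / 2.1))
        True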
1543        """
1544        abs_err = abs(a - b)
        # 1e-8 is to prevent division by zero.
        # [] is to make sure that if a and b are float16, 1e-8 doesn't get
        # downcast to float16, as that gives 0! This would bring back the
        # division by zero.
1549        rel_err = abs_err / np.maximum(abs(a) + abs(b), [1e-8])
        # The numpy.asarray calls are needed because if a or b is a sparse
        # matrix, we would get a numpy.matrix and not a numpy.ndarray,
        # and they behave differently, causing problems later.
        # In particular a_npy_matrix.flatten().shape == (1, n_element)
1554        abs_err = np.asarray(abs_err)
1555        rel_err = np.asarray(rel_err)
1556        return (abs_err, rel_err)
1557
1558    def abs_rel_errors(self, g_pt):
1559        """Return the abs and rel error of gradient estimate `g_pt`
1560
1561        `g_pt` must be a list of ndarrays of the same length as self.gf,
1562        otherwise a ValueError is raised.
1563
1564        Corresponding ndarrays in `g_pt` and `self.gf` must have the same
1565        shape or ValueError is raised.
1566
1567        """
1568        if len(g_pt) != len(self.gf):
1569            raise ValueError('argument has wrong number of elements',
1570                             len(g_pt))
1571        errs = []
1572        for i, (a, b) in enumerate(zip(g_pt, self.gf)):
1573            if a.shape != b.shape:
1574                raise ValueError('argument element %i has wrong shape %s' % (
1575                    i, str((a.shape, b.shape))))
1576            errs.append(numeric_grad.abs_rel_err(a, b))
1577        return errs
1578
1579    def max_err(self, g_pt, abs_tol, rel_tol):
1580        """Find the biggest error between g_pt and self.gf.
1581
1582        What is measured is the violation of relative and absolute errors,
1583        wrt the provided tolerances (abs_tol, rel_tol).
1584        A value > 1 means both tolerances are exceeded.
1585
        Return the argmax of min(abs_err / abs_tol, rel_err / rel_tol) over
        g_pt, the position of the worst element within that array, as well
        as abs_err and rel_err at this position.
1588        """
1589        pos = []
1590        errs = []
1591        abs_errs = []
1592        rel_errs = []
1593
1594        abs_rel_errs = self.abs_rel_errors(g_pt)
1595        for abs_err, rel_err in abs_rel_errs:
1596            if not np.all(np.isfinite(abs_err)):
1597                raise ValueError('abs_err not finite', repr(abs_err))
1598            if not np.all(np.isfinite(rel_err)):
1599                raise ValueError('rel_err not finite', repr(rel_err))
1600            scaled_err = np.minimum(abs_err / abs_tol, rel_err / rel_tol)
1601            max_i = scaled_err.argmax()
1602
1603            pos.append(max_i)
1604            errs.append(scaled_err.flatten()[max_i])
1605            abs_errs.append(abs_err.flatten()[max_i])
1606            rel_errs.append(rel_err.flatten()[max_i])
1607
1608        # max over the arrays in g_pt
1609        max_arg = np.argmax(errs)
1610        max_pos = pos[max_arg]
1611        return (max_arg, max_pos, abs_errs[max_arg], rel_errs[max_arg])
1612
1613
1614def mode_not_slow(mode):
1615    if mode == 'FAST_COMPILE':
1616        return FAST_RUN
1617    mode = get_mode(mode)
1618    if isinstance(mode, DebugMode):
1619        opt = mode.optimizer
1620        return FAST_RUN.clone(optimizer=opt)
1621    else:
1622        return mode
1623
1624
1625def verify_grad(fun, pt, n_tests=2, rng=None, eps=None,
1626                out_type=None, abs_tol=None,
1627                rel_tol=None, mode=None, cast_to_output_type=False,
1628                no_debug_ref=True):
1629    """Test a gradient by Finite Difference Method. Raise error on failure.
1630
1631    Raises an Exception if the difference between the analytic gradient and
1632    numerical gradient (computed through the Finite Difference Method) of a
1633    random projection of the fun's output to a scalar exceeds the given
1634    tolerance.
1635
1636    Examples
1637    --------
1638    >>> verify_grad(theano.tensor.tanh,
1639    ...             (np.asarray([[2, 3, 4], [-1, 3.3, 9.9]]),),
1640    ...             rng=np.random)
1641
1642    Parameters
1643    ----------
1644    fun : a Python function
1645        `fun` takes Theano variables as inputs, and returns a Theano variable.
        For instance, an Op instance with a single output.
1647    pt : list of numpy.ndarrays
1648        Input values, points where the gradient is estimated.
1649        These arrays must be either float16, float32, or float64 arrays.
1650    n_tests : int
1651        number of times to run the test
1652    rng : numpy.random.RandomState, optional
        random number generator used to sample the output random
        projection `u`; we test the gradient of sum(u * fun) at `pt`
1655    eps : float, optional
1656        stepsize used in the Finite Difference Method (Default
1657        None is type-dependent).
1658        Raising the value of eps can raise or lower the absolute
1659        and relative errors of the verification depending on the
1660        Op. Raising eps does not lower the verification quality for
        linear operations. It is better to raise `eps` than to raise
        `abs_tol` or `rel_tol`.
1663    out_type : string
        dtype of output, if complex (i.e., 'complex64' or 'complex128')
1665    abs_tol : float
1666        absolute tolerance used as threshold for gradient comparison
1667    rel_tol : float
1668        relative tolerance used as threshold for gradient comparison
1669    cast_to_output_type : bool
1670        if the output is float32 and cast_to_output_type is True, cast
1671        the random projection to float32. Otherwise it is float64.
1672        float16 is not handled here.
1673    no_debug_ref : bool
1674        Don't use DebugMode for the numerical gradient function.
1675
1676    Note
1677    ----
1678    This function does not support multiple outputs. In
1679    tests/test_scan.py there is an experimental verify_grad that
1680    covers that case as well by using random projections.
1681
1682    """
1683    # The import is here to prevent circular import.
1684    from theano import compile, shared
1685    import theano.tensor
1686    from theano.tensor import as_tensor_variable, TensorType
1687    assert isinstance(pt, (list, tuple))
1688    pt = [np.array(p) for p in pt]
1689
1690    for i, p in enumerate(pt):
1691        if p.dtype not in ('float16', 'float32', 'float64'):
1692            raise TypeError(
1693                ('verify_grad can work only with floating point '
1694                 'inputs, but input %i has dtype "%s".') % (i, p.dtype))
1695
1696    _type_tol = dict(  # relative error tolerances for different types
1697        float16=5e-2,
1698        float32=1e-2,
1699        float64=1e-4)
1700
1701    if abs_tol is None:
1702        abs_tol = builtins.max(_type_tol[str(p.dtype)] for p in pt)
1703    if rel_tol is None:
1704        rel_tol = builtins.max(_type_tol[str(p.dtype)] for p in pt)
1705
1706    if rng is None:
1707        raise TypeError(('rng should be a valid instance of '
1708                        'numpy.random.RandomState. You may '
1709                         'want to use theano.tests.unittest'
1710                         '_tools.verify_grad instead of '
1711                         'theano.gradient.verify_grad.'))
1712
1713    # We allow input downcast in function, because numeric_grad works in the
1714    # most precise dtype used among the inputs, so we may need to cast some.
1715    def function(inputs, output, name, mode=mode):
1716        f = compile.function(inputs, output, accept_inplace=True,
1717                             allow_input_downcast=True, mode=mode,
1718                             on_unused_input='ignore', name=name)
1719        return f
1720
1721    tensor_pt = [
1722        TensorType(
1723            as_tensor_variable(p).dtype,
1724            as_tensor_variable(p).broadcastable)(name='input %i' % i)
1725        for i, p in enumerate(pt)]
1726
1727    # fun can be either a function or an actual Op instance
1728    o_output = fun(*tensor_pt)
1729
1730    if isinstance(o_output, list):
        raise NotImplementedError(('cannot (yet) autotest gradient of fun '
1732                                   'with multiple outputs'))
1733        # we could make loop over outputs making random projections R for each,
1734        # but this doesn't handle the case where not all the outputs are
1735        # differentiable... so I leave this as TODO for now -JB.
1736
1737    o_fn = function(tensor_pt, o_output, name='gradient.py fwd')
1738    o_fn_out = o_fn(*[p.copy() for p in pt])
1739
    if isinstance(o_fn_out, (tuple, list)):
1741        raise TypeError(
1742            'It seems like you are trying to use verify_grad '
1743            'on an op or a function which outputs a list: there should'
1744            ' be a single (array-like) output instead')
1745
1746    # random_projection should not have elements too small,
1747    # otherwise too much precision is lost in numerical gradient
1748    def random_projection():
1749        plain = rng.rand(*o_fn_out.shape) + 0.5
1750        if cast_to_output_type and o_output.dtype == "float32":
1751            return np.array(plain, o_output.dtype)
1752        return plain
1753
1754    t_r = shared(random_projection(), borrow=True)
1755    t_r.name = 'random_projection'
1756
1757    # random projection of o onto t_r
    # This sum() is theano.tensor.sum, not the builtin sum.
1759    cost = theano.tensor.sum(t_r * o_output)
1760
1761    if no_debug_ref:
1762        mode_for_cost = mode_not_slow(mode)
1763    else:
1764        mode_for_cost = mode
1765
1766    cost_fn = function(tensor_pt, cost, name='gradient.py cost',
1767                       mode=mode_for_cost)
1768
1769    symbolic_grad = grad(cost, tensor_pt,
1770                         disconnected_inputs='ignore')
1771
1772    grad_fn = function(tensor_pt, symbolic_grad,
1773                       name='gradient.py symbolic grad')
1774
1775    for test_num in xrange(n_tests):
1776        try:
1777            num_grad = numeric_grad(cost_fn, [p.copy() for p in pt],
1778                                    eps, out_type)
1779
1780            analytic_grad = grad_fn(*[p.copy() for p in pt])
1781
1782            # Since `tensor_pt` is a list, `analytic_grad` should be one too.
1783            assert isinstance(analytic_grad, list)
1784
1785            max_arg, max_err_pos, max_abs_err, max_rel_err = num_grad.max_err(
1786                analytic_grad, abs_tol, rel_tol)
1787
1788            if max_abs_err > abs_tol and max_rel_err > rel_tol:
1789
1790                raise verify_grad.E_grad(max_arg, max_err_pos,
1791                                         analytic_grad[max_arg].shape,
1792                                         analytic_grad[max_arg].flatten()[max_err_pos],
1793                                         num_grad.gf[max_arg].flatten()[max_err_pos],
1794                                         max_abs_err, max_rel_err,
1795                                         abs_tol, rel_tol)
1796
1797            # get new random projection for next test
1798            if test_num < n_tests - 1:
1799                t_r.set_value(random_projection(), borrow=True)
1800        except Exception as e:
1801            e.args += ("\nThe error happened with the following inputs:", pt,
1802                       "\nThe value of eps is:", eps,
1803                       "\nThe out_type is:", out_type)
1804            raise
1805
1806
1807class GradientError(Exception):
1808    """This error is raised when a gradient is calculated, but incorrect."""
1809    def __init__(self, arg, err_pos, shape, val1, val2,
1810                 abs_err, rel_err, abs_tol, rel_tol):
1811        Exception.__init__(self)  # to be compatible with python2.4
1812        self.arg = arg
1813        self.err_pos = err_pos
1814        self.shape = shape
1815        self.val1 = val1
1816        self.val2 = val2
1817        self.abs_err = abs_err
1818        self.rel_err = rel_err
1819        self.abs_tol = abs_tol
1820        self.rel_tol = rel_tol
1821
1822    def __str__(self):
1823        # args may have been inserted by e.g. makeTester
1824        args_msg = ", ".join(str(a) for a in self.args)
1825        return """\
1826GradientError: numeric gradient and analytic gradient exceed tolerance:
1827        At position %i of argument %i with shape %s,
1828            val1 = %f      ,  val2 = %f
1829            abs. error = %f,  abs. tolerance = %f
1830            rel. error = %f,  rel. tolerance = %f
1831Exception args: %s""" % (self.err_pos, self.arg,
1832                         self.shape,
1833                         self.val1, self.val2,
1834                         self.abs_err, self.abs_tol,
1835                         self.rel_err, self.rel_tol,
1836                         args_msg)
1837
1838
1839verify_grad.E_grad = GradientError
1840
1841
1842def jacobian(expression, wrt, consider_constant=None,
1843             disconnected_inputs='raise'):
1844    """
1845    Compute the full Jacobian, row by row.
1846
1847    Parameters
1848    ----------
1849    expression : Vector (1-dimensional) :class:`~theano.gof.graph.Variable`
1850        Values that we are differentiating (that we want the Jacobian of)
1851    wrt : :class:`~theano.gof.graph.Variable` or list of Variables
1852        Term[s] with respect to which we compute the Jacobian
1853    consider_constant : list of variables
1854        Expressions not to backpropagate through
1855
1856    disconnected_inputs: string
        Defines the behaviour if some of the variables
        in `wrt` are not part of the computational graph computing
        `expression` (or if all links are non-differentiable). The possible
        values are:
1860
1861        - 'ignore': considers that the gradient on these parameters is zero.
1862        - 'warn': consider the gradient zero, and print a warning.
1863        - 'raise': raise an exception.
1864
1865    Returns
1866    -------
1867    :class:`~theano.gof.graph.Variable` or list/tuple of Variables (depending upon `wrt`)
1868        The Jacobian of `expression` with respect to (elements of) `wrt`.
1869        If an element of `wrt` is not differentiable with respect to the
1870        output, then a zero variable is returned. The return value is
1871        of same type as `wrt`: a list/tuple or TensorVariable in all cases.
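
    Examples
    --------
    A minimal sketch (the expression and shapes are illustrative only):

    >>> x = theano.tensor.dvector('x')
    >>> J = jacobian(x ** 2, x)
    >>> f = theano.function([x], J)
    >>> np.allclose(f([1., 2.]), np.diag([2., 4.]))
    True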
1872    """
1873    from theano.tensor import arange
1874    # Check inputs have the right format
1875    assert isinstance(expression, Variable), \
1876        "tensor.jacobian expects a Variable as `expression`"
1877    assert expression.ndim < 2, \
1878        ("tensor.jacobian expects a 1 dimensional variable as "
1879         "`expression`. If not use flatten to make it a vector")
1880
1881    using_list = isinstance(wrt, list)
1882    using_tuple = isinstance(wrt, tuple)
1883
1884    if isinstance(wrt, (list, tuple)):
1885        wrt = list(wrt)
1886    else:
1887        wrt = [wrt]
1888
1889    if expression.ndim == 0:
1890        # expression is just a scalar, use grad
1891        return format_as(using_list, using_tuple,
1892                         grad(expression,
1893                              wrt,
1894                              consider_constant=consider_constant,
1895                              disconnected_inputs=disconnected_inputs))
1896
1897    def inner_function(*args):
1898        idx = args[0]
1899        expr = args[1]
1900        rvals = []
1901        for inp in args[2:]:
1902            rval = grad(expr[idx],
1903                        inp,
1904                        consider_constant=consider_constant,
1905                        disconnected_inputs=disconnected_inputs)
1906            rvals.append(rval)
1907        return rvals
    # Computing the gradients does not affect the random seeds on any random
    # generator used in expression (because during gradient computation we
    # are just backtracking over old values). (rp Jan 2012 - if anyone has a
    # counter example please show me)
1912    jacobs, updates = theano.scan(inner_function,
1913                                  sequences=arange(expression.shape[0]),
1914                                  non_sequences=[expression] + wrt)
1915    assert not updates, \
1916        ("Scan has returned a list of updates. This should not "
1917         "happen! Report this to theano-users (also include the "
1918         "script that generated the error)")
1919    return format_as(using_list, using_tuple, jacobs)
1920
1921
1922def hessian(cost, wrt, consider_constant=None,
1923            disconnected_inputs='raise'):
1924    """
1925    Parameters
1926    ----------
    cost : Scalar (0-dimensional) :class:`~theano.gof.graph.Variable`
        The cost that we are differentiating.
    wrt : Vector (1-dimensional tensor) Variable or list of such Variables
        Term[s] with respect to which we compute the Hessian.
    consider_constant : list of variables
        Expressions not to backpropagate through.
1932    disconnected_inputs: string
1933        Defines the behaviour if some of the variables
1934        in ``wrt`` are not part of the computational graph computing ``cost``
1935        (or if all links are non-differentiable). The possible values are:
1936
1937        - 'ignore': considers that the gradient on these parameters is zero.
1938        - 'warn': consider the gradient zero, and print a warning.
1939        - 'raise': raise an exception.
1940
1941    Returns
1942    -------
1943    :class:`~theano.gof.graph.Variable` or list/tuple of Variables
1944        The Hessian of the `cost` with respect to (elements of) `wrt`.
1945        If an element of `wrt` is not differentiable with respect to the
1946        output, then a zero variable is returned. The return value is
1947        of same type as `wrt`: a list/tuple or TensorVariable in all cases.
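
    Examples
    --------
    A minimal sketch (the cost is illustrative only):

    >>> x = theano.tensor.dvector('x')
    >>> H = hessian(theano.tensor.sum(x ** 2), x)
    >>> f = theano.function([x], H)
    >>> np.allclose(f([1., 2.]), 2 * np.eye(2))
    True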
1948    """
1949    from theano.tensor import arange
1950    # Check inputs have the right format
1951    assert isinstance(cost, Variable), \
1952        "tensor.hessian expects a Variable as `cost`"
1953    assert cost.ndim == 0, \
1954        "tensor.hessian expects a 0 dimensional variable as `cost`"
1955
1956    using_list = isinstance(wrt, list)
1957    using_tuple = isinstance(wrt, tuple)
1958
1959    if isinstance(wrt, (list, tuple)):
1960        wrt = list(wrt)
1961    else:
1962        wrt = [wrt]
1963
1964    hessians = []
1965    for input in wrt:
1966        assert isinstance(input, Variable), \
1967            "tensor.hessian expects a (list of) Variable as `wrt`"
1968        assert input.ndim == 1, \
1969            "tensor.hessian expects a (list of) 1 dimensional variable "\
1970            "as `wrt`"
1971        expr = grad(cost, input, consider_constant=consider_constant,
1972                    disconnected_inputs=disconnected_inputs)
1973
1974        # It is possible that the inputs are disconnected from expr,
1975        # even if they are connected to cost.
1976        # This should not be an error.
1977        hess, updates = theano.scan(lambda i, y, x: grad(
1978            y[i],
1979            x,
1980            consider_constant=consider_constant,
1981            disconnected_inputs='ignore'),
1982            sequences=arange(expr.shape[0]),
1983            non_sequences=[expr, input])
1984        assert not updates, \
1985            ("Scan has returned a list of updates. This should not "
1986             "happen! Report this to theano-users (also include the "
1987             "script that generated the error)")
1988        hessians.append(hess)
1989    return format_as(using_list, using_tuple, hessians)
1990
1991
1992def _is_zero(x):
1993    """
    Returns 'yes', 'no', or 'maybe' indicating whether x
    is always 0.
    'maybe' means that x is an expression that is complicated enough
    that we can't tell that it simplifies to 0.
    If x is not a symbolic variable (it has no 'type' attribute),
    a boolean is returned instead.
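
    A few illustrative cases (a plain ndarray, a constant, and a free
    symbolic scalar):

    >>> _is_zero(np.zeros(2))
    True
    >>> _is_zero(theano.tensor.constant(0.))
    'yes'
    >>> _is_zero(theano.tensor.dscalar())
    'maybe'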
1998    """
1999    if not hasattr(x, 'type'):
2000        return np.all(x == 0.)
2001    if isinstance(x.type, NullType):
2002        return 'no'
2003    if isinstance(x.type, DisconnectedType):
2004        return 'yes'
2005
2006    no_constant_value = True
2007    try:
2008        constant_value = theano.get_scalar_constant_value(x)
2009        no_constant_value = False
2010    except theano.tensor.basic.NotScalarConstantError:
2011        pass
2012
2013    if no_constant_value:
2014        return 'maybe'
2015
2016    if constant_value != 0.:
2017        return 'no'
2018
2019    return 'yes'
2020
2021
2022class ConsiderConstant(ViewOp):
2023    def grad(self, args, g_outs):
2024        return [g_out.zeros_like(g_out) for g_out in g_outs]
2025
2026
2027consider_constant_ = ConsiderConstant()
2028
2029
# I create a function only so that the documentation shows up well.
2031def consider_constant(x):
2032    """
2033    DEPRECATED: use zero_grad() or disconnected_grad() instead.
2034
2035    Consider an expression constant when computing gradients.
2036
2037    The expression itself is unaffected, but when its gradient is
2038    computed, or the gradient of another expression that this
2039    expression is a subexpression of, it will not be backpropagated
2040    through. In other words, the gradient of the expression is
2041    truncated to 0.
2042
2043    :param x: A Theano expression whose gradient should be truncated.
2044
2045    :return: The expression is returned unmodified, but its gradient
2046        is now truncated to 0.
2047
2048    .. versionadded:: 0.7
2049    """
2050    warnings.warn((
2051        "consider_constant() is deprecated, use zero_grad() or "
2052        "disconnected_grad() instead."), stacklevel=3)
2053
2054    return consider_constant_(x)
2055
2056
2057class ZeroGrad(ViewOp):
2058    def grad(self, args, g_outs):
2059        return [g_out.zeros_like(g_out) for g_out in g_outs]
2060
2061    def R_op(self, inputs, eval_points):
2062        if eval_points[0] is None:
2063            return [None]
2064
2065        return theano.tensor.zeros(1)
2066
2067
2068zero_grad_ = ZeroGrad()
2069
2070
2071def zero_grad(x):
2072    """
2073    Consider an expression constant when computing gradients.
2074
2075    The expression itself is unaffected, but when its gradient is
2076    computed, or the gradient of another expression that this
2077    expression is a subexpression of, it will be backpropagated
2078    through with a value of zero. In other words, the gradient of
2079    the expression is truncated to 0.
2080
2081    Parameters
2082    ----------
2083    x: :class:`~theano.gof.graph.Variable`
2084        A Theano expression whose gradient should be truncated.
2085
2086    Returns
2087    -------
2088    :class:`~theano.gof.graph.Variable`
2089        An expression equivalent to ``x``, with its gradient
2090        truncated to 0.
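
    Examples
    --------
    A minimal sketch of the effect on the gradient:

    >>> x = theano.tensor.dscalar('x')
    >>> g = theano.tensor.grad(zero_grad(x) ** 2, x)
    >>> float(theano.function([x], g)(3.0))
    0.0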
2091    """
2092    return zero_grad_(x)
2093
2094
2095class UndefinedGrad(ViewOp):
2096    def grad(self, args, g_outs):
2097        return [grad_undefined(self, i, arg) for i, arg in enumerate(args)]
2098
2099    def R_op(self, inputs, eval_points):
2100        return [None]
2101
2102    def connection_pattern(self, node):
2103        return [[True]]
2104
2105
2106undefined_grad_ = UndefinedGrad()
2107
2108
2109def undefined_grad(x):
2110    """
2111    Consider the gradient of this variable undefined.
2112
2113    This will generate an error message if its gradient is taken.
2114
2115    The expression itself is unaffected, but when its gradient is
2116    computed, or the gradient of another expression that this
    expression is a subexpression of, an error message will be generated
    specifying that this gradient is not defined.
2119
2120    Parameters
2121    ----------
2122    x: :class:`~theano.gof.graph.Variable`
2123        A Theano expression whose gradient should be undefined.
2124
2125    Returns
2126    -------
2127    :class:`~theano.gof.graph.Variable`
2128        An expression equivalent to ``x``, with its gradient undefined.
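
    Examples
    --------
    A sketch of the failure mode; we assume here that `NullTypeGradError`
    (defined earlier in this module) is the error eventually raised by
    `grad`:

    >>> x = theano.tensor.dscalar('x')
    >>> try:
    ...     theano.tensor.grad(undefined_grad(x) ** 2, x)
    ... except NullTypeGradError:
    ...     print('the gradient is undefined')
    the gradient is undefined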
2129    """
2130    return undefined_grad_(x)
2131
2132
2133class DisconnectedGrad(ViewOp):
2134    def grad(self, args, g_outs):
2135        return [disconnected_type() for g_out in g_outs]
2136
2137    def R_op(self, inputs, eval_points):
2138        return [None]
2139
2140    def connection_pattern(self, node):
2141        return [[False]]
2142
2143
2144disconnected_grad_ = DisconnectedGrad()
2145
2146
2147def disconnected_grad(x):
2148    """
2149    Consider an expression constant when computing gradients.
2150
    Gradients will effectively not be backpropagated through it.
2152
2153    The expression itself is unaffected, but when its gradient is
2154    computed, or the gradient of another expression that this
2155    expression is a subexpression of, it will not be backpropagated
2156    through. This is effectively equivalent to truncating the gradient
    expression to 0, but is executed faster than zero_grad(), which still
2158    has to go through the underlying computational graph related to the
2159    expression.
2160
2161    Parameters
2162    ----------
2163    x: :class:`~theano.gof.graph.Variable`
2164        A Theano expression whose gradient should not be
2165        backpropagated through.
2166
2167    Returns
2168    -------
2169    :class:`~theano.gof.graph.Variable`
2170        An expression equivalent to ``x``, with its gradient
2171        now effectively truncated to 0.
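
    Examples
    --------
    A minimal sketch; note that `grad` must be told to ignore the
    resulting disconnected input:

    >>> x = theano.tensor.dscalar('x')
    >>> g = theano.tensor.grad(disconnected_grad(x) ** 2, x,
    ...                        disconnected_inputs='ignore')
    >>> float(theano.function([x], g)(3.0))
    0.0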
2172    """
2173    return disconnected_grad_(x)
2174
2175
2176class GradClip(ViewOp):
2177    # See doc in user fct grad_clip
2178    __props__ = ()
2179
2180    def __init__(self, clip_lower_bound, clip_upper_bound):
        # We do not put those members in __eq__ or __hash__
        # as they do not influence the perform of this op.
2183        self.clip_lower_bound = clip_lower_bound
2184        self.clip_upper_bound = clip_upper_bound
        assert self.clip_upper_bound >= self.clip_lower_bound
2186
2187    def grad(self, args, g_outs):
2188        return [theano.tensor.clip(g_out, self.clip_lower_bound,
2189                                   self.clip_upper_bound)
2190                for g_out in g_outs]
2191
2192
2193def grad_clip(x, lower_bound, upper_bound):
2194    """
    This op does a view in the forward pass, but clips the gradient.
2196
2197    This is an elemwise operation.
2198
2199    Parameters
2200    ----------
    x:
        The variable whose gradient inputs we want clipped.
    lower_bound:
        The lower bound of the gradient value.
    upper_bound:
        The upper bound of the gradient value.
2207
2208    Examples
2209    --------
2210    >>> x = theano.tensor.scalar()
2211    >>> z = theano.tensor.grad(grad_clip(x, -1, 1)**2, x)
2212    >>> z2 = theano.tensor.grad(x**2, x)
2213    >>> f = theano.function([x], outputs = [z, z2])
2214    >>> print(f(2.0))
2215    [array(1.0), array(4.0)]
2216
2217    Note
2218    ----
    We register an optimization in tensor/opt.py that removes the GradClip op,
    so it has zero cost in the forward pass and only does work in the grad.
2221
2222    """
2223    return GradClip(lower_bound, upper_bound)(x)
2224
2225
2226class GradScale(ViewOp):
2227    def __init__(self, multiplier):
2228        self.multiplier = multiplier
2229
2230    def grad(self, args, g_outs):
2231        return [self.multiplier * g_out for g_out in g_outs]
2232
2233
2234def grad_scale(x, multiplier):
2235    """
    This op scales (and can reverse the sign of) the gradient during
    backpropagation.
2237
2238    Parameters
2239    ----------
    x:
        The variable whose gradient inputs we want scaled.
    multiplier:
        Scale factor applied to the gradient.
2244
2245    Examples
2246    --------
2247    >>> x = theano.tensor.fscalar()
2248    >>> fx = theano.tensor.sin(x)
2249    >>> fp = theano.tensor.grad(fx, wrt=x)
2250    >>> fprime = theano.function([x], fp)
2251    >>> print(fprime(2))  # doctest: +ELLIPSIS
2252    -0.416...
2253    >>> f_inverse=grad_scale(fx, -1.)
2254    >>> fpp = theano.tensor.grad(f_inverse, wrt=x)
2255    >>> fpprime = theano.function([x], fpp)
2256    >>> print(fpprime(2))  # doctest: +ELLIPSIS
2257    0.416...
2258    """
2259    return GradScale(multiplier)(x)
2260