1from __future__ import absolute_import, print_function, division
2import copy
3import numpy as np
4import logging
5import pdb
6import time
7from six import iteritems
8from six.moves import xrange
9import sys
10
11import theano
12from theano import tensor, scalar, gof, config
13from theano.compile import optdb
14from theano.compile.ops import shape_i
15from theano.gof import (local_optimizer, EquilibriumDB, TopoOptimizer,
16                        LocalGroupDB,
17                        SequenceDB, Optimizer, DB, toolbox, graph)
18from theano.gof.opt import (LocalMetaOptimizer, copy_stack_trace,
19                            inherit_stack_trace)
20from theano.ifelse import IfElse
21from theano.misc.ordered_set import OrderedSet
22
23from theano.scalar.basic import Scalar, Pow, Cast
24from theano.scalar.basic import log, neg, true_div
25from theano.scalar.basic_scipy import Erfinv, Erfcinv
26from theano.scan_module import scan_utils, scan_op, scan_opt
27
28from theano.tensor.nnet import bn, conv3d2d
29from theano.tensor.nnet.conv import ConvOp
30from theano.tensor.nnet.blocksparse import SparseBlockGemv, SparseBlockOuter
31from theano.tensor.nnet.abstract_conv import (BaseAbstractConv,
32                                              AbstractConv2d,
33                                              AbstractConv2d_gradWeights,
34                                              AbstractConv2d_gradInputs,
35                                              AbstractConv3d,
36                                              AbstractConv3d_gradWeights,
37                                              AbstractConv3d_gradInputs,
38                                              get_conv_output_shape)
39from theano.tensor.nnet.neighbours import Images2Neibs
40from theano.tensor.nnet.ctc import ConnectionistTemporalClassification
41import theano.tensor.nlinalg as nlinalg
42import theano.tensor.signal.pool as pool
43import theano.tensor.slinalg as slinalg
44from collections import Counter
45
46from theano.tests.breakpoint import PdbBreakpoint
47
48from .type import (GpuArrayType, GpuArrayConstant, get_context,
49                   ContextNotDefined, move_to_gpu)
50from .basic_ops import (as_gpuarray_variable, infer_context_name,
51                        host_from_gpu, GpuToGpu,
52                        HostFromGpu, GpuFromHost,
53                        GpuSplit, GpuContiguous, gpu_contiguous,
54                        GpuAlloc, GpuAllocEmpty, GpuReshape,
55                        GpuEye, GpuTri, gpu_join, GpuJoin)
56from .blas import (gpu_dot22, GpuGemm, GpuGer, GpuGemmBatch,
57                   gpugemm_no_inplace, gpugemm_inplace,
58                   gpugemmbatch_no_inplace,
59                   gpugemv_no_inplace, gpugemv_inplace,
60                   GpuCorrMM, GpuCorrMM_gradInputs, GpuCorrMM_gradWeights,
61                   GpuCorr3dMM, GpuCorr3dMM_gradInputs, GpuCorr3dMM_gradWeights)
62from .pool import (GpuPool, GpuMaxPoolGrad, GpuAveragePoolGrad, GpuMaxPoolRop,
63                   GpuDownsampleFactorMaxGradGrad)
64from .blocksparse import (GpuSparseBlockGemv, GpuSparseBlockOuter,
65                          gpu_sparse_block_outer,
66                          gpu_sparse_block_outer_inplace,
67                          gpu_sparse_block_gemv, gpu_sparse_block_gemv_inplace)
68from .nnet import (gpu_crossentropy_softmax_1hot_with_bias_dx,
69                   gpu_crossentropy_softmax_argmax_1hot_with_bias,
70                   gpu_softmax_with_bias, gpu_softmax)
71from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda,
72                       GpuCAReduceCPY, gpu_erfinv, gpu_erfcinv,
73                       max_inputs_to_GpuElemwise)
74from .subtensor import (GpuIncSubtensor, GpuSubtensor,
75                        GpuAdvancedSubtensor,
76                        GpuAdvancedSubtensor1,
77                        GpuAdvancedBooleanSubtensor,
78                        GpuAdvancedIncSubtensor,
79                        GpuAdvancedIncSubtensor1,
80                        GpuAdvancedIncSubtensor1_dev20,
81                        GpuAdvancedBooleanIncSubtensor,
82                        GpuAllocDiag, GpuExtractDiag)
83from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims
84from .reduction import GpuMaxAndArgmax
85from .linalg import (GpuCusolverSolve, MATRIX_STRUCTURES_SOLVE, GpuCholesky,
86                     cusolver_available, GpuMagmaMatrixInverse, gpu_svd,
87                     GpuMagmaCholesky, gpu_qr, GpuMagmaEigh,
88                     GpuCublasTriangularSolve, cublas_available)
89from .neighbours import GpuImages2Neibs
90from .ctc import GpuConnectionistTemporalClassification
91
92_logger = logging.getLogger("theano.gpuarray.opt")
93
94
95gpu_optimizer = EquilibriumDB()
96gpu_cut_copies = EquilibriumDB()
97
98# Not used for an EquilibriumOptimizer. It has the "tracks" that we need for GraphToGPUDB.
99gpu_optimizer2 = EquilibriumDB()
100
101
102class GraphToGPUDB(DB):
103    """
104    Retrieves the list local optimizers based on the optimizer flag's value
105    from EquilibriumOptimizer by calling the method query.
106
107    """
108
109    def query(self, *tags, **kwtags):
110        opt = gpu_optimizer2.query(*tags, **kwtags)
111        return GraphToGPU(opt.local_optimizers_all, opt.local_optimizers_map)
112
113
114gpu_seqopt = SequenceDB()
115
116gpu_seqopt.register('gpuarray_graph_optimization', GraphToGPUDB(), -0.5,
117                    'fast_compile', 'fast_run', 'gpuarray')
118
119gpu_seqopt.register('gpuarray_local_optimizations', gpu_optimizer, 1,
120                    'fast_compile', 'fast_run', 'gpuarray', 'gpuarray_local_optimiziations')
121gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2,
122                    'fast_compile', 'fast_run', 'gpuarray')
123
124# do not add 'fast_run' to these two as this would always enable gpuarray mode
125optdb.register('gpuarray_opt', gpu_seqopt,
126               optdb.__position__.get('add_destroy_handler', 49.5) - 1,
127               'gpuarray')
128
129
130def register_opt(*tags, **kwargs):
131    def f(local_opt):
132        name = (kwargs and kwargs.pop('name')) or local_opt.__name__
133        gpu_optimizer.register(name, local_opt, 'fast_run', 'gpuarray', *tags)
134        return local_opt
135    return f
136
137
138def register_opt2(tracks, *tags, **kwargs):
139    '''
140    Decorator for the new GraphToGPU optimizer.
141    Takes an extra parameter(Op) compared to register_opt decorator.
142
143    Parameters
144    ----------
145    tracks : List of Op class Or Op instance or None
146        The Node's Op to which optimization is being applied.
147
148    tags : String
149        The optimization tag to which the optimizer will be registered.
150
151    '''
152    def f(local_opt):
153        name = (kwargs and kwargs.pop('name')) or local_opt.__name__
154        if isinstance(local_opt, theano.gof.DB):
155            opt = local_opt
156        else:
157            opt = theano.gof.local_optimizer(tracks)(local_opt)
158        gpu_optimizer2.register(name, opt, 'fast_run', 'gpuarray', *tags)
159        return local_opt
160    return f
161
162
163def register_inplace(*tags, **kwargs):
164    def f(local_opt):
165        name = (kwargs and kwargs.pop('name')) or local_opt.__name__
166        optdb.register(
167            name, TopoOptimizer(
168                local_opt, failure_callback=TopoOptimizer.warn_inplace),
169            60, 'fast_run', 'inplace', 'gpuarray', *tags)
170        return local_opt
171    return f
172
173
174register_opt('fast_compile')(theano.tensor.opt.local_track_shape_i)
175register_opt(final_opt=True, name='gpua_constant_folding')(
176    tensor.opt.constant_folding)
177gpu_optimizer.register('local_remove_all_assert',
178                       theano.tensor.opt.local_remove_all_assert,
179                       'unsafe')
180
181
182# Define a few operations to use in optimizations,
183# in order to avoid introducin new CPU Ops, or useless ones.
184def safe_to_gpu(x, ctx_name):
185    if isinstance(x.type, tensor.TensorType):
186        return GpuFromHost(ctx_name)(x)
187    else:
188        return x
189
190
191def safe_to_cpu(x):
192    if isinstance(x.type, GpuArrayType):
193        return x.transfer('cpu')
194    else:
195        return x
196
197gpu_log = GpuElemwise(log)
198gpu_neg = GpuElemwise(neg)
199gpu_true_div = GpuElemwise(true_div)
200
201
202def op_lifter(OP, cuda_only=False):
203    """
204    OP(..., host_from_gpu(), ...) -> host_from_gpu(GpuOP(...))
205
206    gpu_from_host(OP(inp0, ...)) -> GpuOP(inp0, ...)
207
208    """
209    def f(maker):
210        def local_opt(node):
211            if type(node.op) in OP:
212                # Either one of our inputs is on the gpu or
213                # all of our clients are on the gpu
214                replace = False
215                # TODO: Maybe set context_name with infer_context_name()?
216                context_name = None
217                # We replace if any input is a host_from_gpu
218                for i in node.inputs:
219                    if (i.owner and i.owner.op == host_from_gpu and
220                            move_to_gpu(i)):
221                        context_name = i.owner.inputs[0].type.context_name
222                        replace = True
223                        break
224
225                if not replace:
226                    # We replace if *all* clients are on the GPU
227                    clients = [c for o in node.outputs for c in o.clients]
228                    replace = len(clients) != 0
229                    for c, idx in clients:
230                        if (c == 'output' or
231                                not isinstance(c.op, GpuFromHost)):
232                            replace = False
233                    # TODO: check that the clients want the same context?
234                    if replace:
235                        # All clients are GpuFromHost and we have at least one
236                        context_name = clients[0][0].op.context_name
237
238                # Check if we should replace
239                if (not replace or
240                        (cuda_only and
241                         get_context(context_name).kind != b'cuda') or
242                        any(["complex" in getattr(i, 'dtype', "")
243                             for i in node.inputs])):
244                    return False
245
246                # tag the inputs with the context in case
247                # the context was derived from the outputs
248                for i in node.inputs:
249                    i.tag.context_name = context_name
250
251                new_op = maker(node.op, context_name, node.inputs, node.outputs)
252
253                # This is needed as sometimes new_op inherits from OP.
254                if new_op and new_op != node.op:
255                    if isinstance(new_op, theano.Op):
256                        new_outputs = new_op(*node.inputs, return_list=True)
257                        to_cpu_fn = safe_to_cpu
258                    elif isinstance(new_op, (tuple, list)):
259                        new_outputs = new_op
260                        to_cpu_fn = safe_to_cpu
261                    else:  # suppose it is a variable on the GPU
262                        new_outputs = [new_op]
263
264                        def to_cpu_fn(x):
265                            return x.transfer('cpu')
266                    # copy stack traces onto gpu outputs
267                    # also copy the stack traces onto HostFromGpu outputs
268                    on_cpu = []
269                    for old_output, new_output in zip(node.outputs, new_outputs):
270                        copy_stack_trace(old_output, new_output)
271                        cpu = to_cpu_fn(new_output)
272                        on_cpu.append(cpu)
273                        copy_stack_trace(old_output, cpu)
274                    return on_cpu
275            return False
276        local_opt.__name__ = maker.__name__
277        return local_optimizer(OP)(local_opt)
278    return f
279
280
281class InputToGpuOptimizer(Optimizer):
282    """
283    Transfer the input to the gpu to start the rolling wave.
284
285    """
286    def add_requirements(self, fgraph):
287        fgraph.attach_feature(toolbox.ReplaceValidate())
288
289    def apply(self, fgraph):
290        for input in fgraph.inputs:
291            if isinstance(input.type, GpuArrayType):
292                continue
293
294            # If all clients are outputs or transfers don't do anything.
295            if (all(cl[0] == 'output' or isinstance(cl[0].op, GpuFromHost)
296                    for cl in input.clients)):
297                continue
298
299            target = getattr(input.tag, 'target', None)
300            if target == 'cpu':
301                continue
302            if (isinstance(input.type, tensor.TensorType) and
303                    not move_to_gpu(input)):
304                continue
305
306            try:
307                new_input = GpuFromHost(target)(input).transfer('cpu')
308                fgraph.replace_validate(input, new_input,
309                                        "InputToGpuOptimizer")
310            except TypeError:
311                # This could fail if the inputs are not TensorTypes
312                pass
313            except ContextNotDefined:
314                if hasattr(input.tag, 'target'):
315                    raise
316                # If there is no context tag and no default context
317                # then it stays on the CPU
318                pass
319
320
321gpu_seqopt.register('InputToGpuArrayOptimizer', InputToGpuOptimizer(),
322                    0, 'fast_run', 'fast_compile', 'merge')
323
324
325class GraphToGPU(Optimizer):
326    """
327    Transfer the graph as a whole to GPU instead of transferring node by node.
328
329    Parameters
330    ----------
331    local_optimizers_all : List or SortedSet
332        The local optimizations to apply to a node.
333    local_optimizers_map : Dict
334        Dictionary object containing the mapping of Op to list of
335        LocalOptimizers.
336    """
337
338    def __init__(self, local_optimizers_all, local_optimizers_map):
339        self.local_optimizers_all = local_optimizers_all
340        self.local_optimizers_map = local_optimizers_map
341
342    def add_requirements(self, fgraph):
343        fgraph.attach_feature(toolbox.ReplaceValidate())
344
345    def apply(self, fgraph):
346        mapping = {}
347        time_opts = {}
348        node_created = {}
349        process_count = {}
350        t_topo = time.time()
351        topo = fgraph.toposort()
352        time_topo = time.time()
353        toposort_timing = time_topo - t_topo
354
355        # Building a new graph
356        # Iterating through inputs of graph
357        target = infer_context_name(*fgraph.inputs)
358        for i in fgraph.inputs:
359            if isinstance(i.type, tensor.TensorType) and move_to_gpu(i):
360                mapping[i] = i.transfer(getattr(i.tag, 'target', target))
361            else:
362                mapping[i] = i
363        for i in fgraph.variables:
364            if isinstance(i, theano.Constant):
365                mapping[i] = i
366        for node in topo:
367            for lopt in (self.local_optimizers_map.get(node.op, []) +
368                         self.local_optimizers_map.get(type(node.op), []) +
369                         self.local_optimizers_all):
370                process_count.setdefault(lopt, 0)
371                time_opts.setdefault(lopt, 0)
372                node_created.setdefault(lopt, 0)
373
374        for node in topo:
375
376            if isinstance(node.op, HostFromGpu):
377                mapping[node.outputs[0]] = mapping[node.inputs[0]]
378                continue
379
380            # Move only if any of the inputs are on the GPU.
381            move_to_GPU = False
382
383            context_name = None
384            for i in [mapping[i] for i in node.inputs]:
385                if isinstance(i.type, GpuArrayType):
386                    context_name = i.type.context_name
387                    move_to_GPU = True
388                    break
389            if (not move_to_GPU and
390                    isinstance(node.op, (theano.tensor.Alloc,
391                                         theano.tensor.AllocEmpty,
392                                         theano.tensor.basic.Eye,
393                                         theano.tensor.basic.Tri))):
394                # If the Alloc[Empty] have a client that will be moved
395                # to the GPU, we should move the Alloc* on the GPU.
396
397                # We approximate this by supposing that if we have an
398                # optimization for one of the clients op, then we will
399                # move the client to the GPU.
400                for c, _ in node.outputs[0].clients:
401                    if (c != 'output' and
402                        (self.local_optimizers_map.get(c.op, []) +
403                         self.local_optimizers_map.get(type(c.op), []))):
404                        move_to_GPU = True
405            new_ops = None
406            if move_to_GPU and any(["complex" in getattr(i, 'dtype', "")
407                                    for i in node.inputs]):
408                move_to_GPU = False
409
410            # Apply the lifter
411            if move_to_GPU:
412                for lopt in (self.local_optimizers_map.get(node.op, []) +
413                             self.local_optimizers_map.get(type(node.op), []) +
414                             self.local_optimizers_all):
415                        t_opt = time.time()
416                        new_ops = lopt.transform(node.op, context_name,
417                                                 [mapping[i] for i in node.inputs],
418                                                 node.outputs)
419                        t_opt2 = time.time()
420                        time_opts[lopt] += t_opt2 - t_opt
421
422                        if new_ops:
423                            process_count[lopt] += 1
424                            break
425            outputs = []
426
427            if isinstance(new_ops, theano.Op):
428                with inherit_stack_trace(node.outputs):
429                    outputs = new_ops(*[mapping[i] for i in node.inputs], return_list=True)
430            elif not new_ops:
431                newnode = node.clone_with_new_inputs([mapping.get(i) for i in node.inputs])
432                outputs = newnode.outputs
433            elif isinstance(new_ops, (tuple, list)):
434                outputs = new_ops
435            elif isinstance(new_ops, theano.Variable):
436                outputs = [new_ops]
437
438            for old_output, new_output in zip(node.outputs, outputs):
439                copy_stack_trace(old_output, new_output)
440
441            if new_ops:
442                node_created[lopt] += len(graph.ops([mapping[i] for i in node.inputs], outputs))
443                if any([getattr(old_o, 'dtype', None) != getattr(new_o, 'dtype', None)
444                        for old_o, new_o in zip(outputs, node.outputs)]):
445                    _logger.warning(
446                        "The optimization %s returned bad dtype. Skipping it."
447                        " Write to theano-dev mailing list about this." %
448                        str(lopt))
449                    newnode = node.clone_with_new_inputs([mapping.get(i) for i in node.inputs])
450                    outputs = newnode.outputs
451
452            for new_o, old_o in zip(outputs, node.outputs):
453                assert len(outputs) == len(node.outputs)
454                mapping[old_o] = new_o
455
456        new_nodes = []
457        for o in fgraph.outputs:
458            new_o = mapping[o]
459            if new_o.type != o.type:
460                assert isinstance(o.type, tensor.TensorType)
461                assert isinstance(new_o.type, GpuArrayType)
462
463                # This condition is needed in the case one input is an
464                # output of the graph. Without this, it would
465                # introduce cycle as we don't replace correctly that
466                # case. It would also add extra transfer to/from the
467                # gpu.
468                if (new_o.owner and
469                        isinstance(new_o.owner.op, GpuFromHost) and
470                        new_o.owner.inputs[0].type == o.type):
471                    new_o = new_o.owner.inputs[0]
472                else:
473                    new_o = copy_stack_trace(o, safe_to_cpu(new_o))
474            new_nodes.append(new_o)
475        fgraph.replace_all_validate(zip(fgraph.outputs, new_nodes),
476                                    reason=self.__class__.__name__)
477
478        return (self, toposort_timing, time_opts, node_created, process_count)
479
480    @staticmethod
481    def print_profile(stream, prof, level=0):
482        (opt, toposort_timing, time_opts, node_created, process_count) = prof
483        blanc = ('    ' * level)
484        print(blanc, "GraphToGPUOptimizer", end=' ', file=stream)
485
486        print(blanc, getattr(opt, "name",
487                             getattr(opt, "__name__", "")), file=stream)
488
489        print(blanc, "  time io_toposort %.3fs" % toposort_timing, file=stream)
490
491        s = sum(time_opts.values())
492        print(blanc, "Total time taken by local optimizers %.3fs " % s, file=stream)
493
494        count_opt = []
495        not_used = []
496        not_used_time = 0
497
498        for o, count in iteritems(process_count):
499            if count > 0:
500                count_opt.append((time_opts[o], count,
501                                  node_created[o], o))
502            else:
503                not_used.append((time_opts[o], o))
504                not_used_time += time_opts[o]
505
506        if count_opt:
507            print(blanc,
508                  '  times - times applied - Node created - name:',
509                  file=stream)
510            count_opt.sort()
511            for (t, count, n_created, o) in count_opt[::-1]:
512                print(blanc, '  %.3fs - %d - %d - %s' % (
513                    t, count, n_created, o), file=stream)
514            print(blanc, '  %.3fs - in %d optimization that were not used (display only those with a runtime > 0)' % (
515                not_used_time, len(not_used)), file=stream)
516            not_used.sort(key=lambda nu: (nu[0], str(nu[1])))
517            for (t, o) in not_used[::-1]:
518                if t > 0:
519                    # Skip opt that have 0 times, they probably wasn't even tried.
520                    print(blanc + "  ", '  %.3fs - %s' % (t, o), file=stream)
521            print(file=stream)
522
523    @staticmethod
524    def merge_profile(prof1, prof2):
525        # (opt, toposort_timing, time_opts, node_created, process_count) = prof1
526        local_optimizers = OrderedSet(prof1[0].local_optimizers_all).union(
527            prof2[0].local_optimizers_all)
528
529        def merge_dict(d1, d2):
530            """
531            merge 2 dicts by adding the values.
532            """
533            d = d1.copy()
534            for k, v in iteritems(d2):
535                if k in d:
536                    d[k] += v
537                else:
538                    d[k] = v
539            return d
540
541        local_optimizers_map = merge_dict(prof1[0].local_optimizers_map,
542                                          prof2[0].local_optimizers_map)
543        new_opt = GraphToGPU(local_optimizers, local_optimizers_map)
544
545        toposort_timing = prof1[1] + prof2[1]
546        time_opts = merge_dict(prof1[2], prof2[2])
547        node_created = merge_dict(prof1[3], prof2[3])
548        process_count = merge_dict(prof1[4], prof2[4])
549        return (new_opt,
550                toposort_timing,
551                time_opts,
552                node_created,
553                process_count)
554
555    def print_summary(self, stream=sys.stdout, level=0, depth=-1):
556        print("%s%s (%i)" % (
557            (' ' * level), self.__class__.__name__, id(self)), file=stream)
558        if depth != 0:
559            map_values = []
560            for opts in self.local_optimizers_map.values():
561                map_values += opts
562            for opt in self.local_optimizers_all + map_values:
563                opt.print_summary(stream, level=(level + 2), depth=(depth - 1))
564
565
566@local_optimizer([GpuFromHost, GpuToGpu, HostFromGpu])
567def local_cut_gpu_transfers(node):
568    # gpu[ab] -> host -> gpub
569    if (isinstance(node.op, GpuFromHost) and
570            node.inputs[0].owner and
571            isinstance(node.inputs[0].owner.op, HostFromGpu)):
572        other = node.inputs[0].owner.inputs[0]
573        if node.op.context_name == other.type.context_name:
574            return [other]
575        else:
576            return [GpuToGpu(node.op.context_name)(other)]
577
578    # ? -> gpua -> host
579    elif (isinstance(node.op, HostFromGpu) and
580          node.inputs[0].owner):
581        n2 = node.inputs[0].owner
582
583        # host ->
584        if isinstance(n2.op, GpuFromHost):
585            return [n2.inputs[0]]
586
587        # gpub ->
588        if isinstance(n2.op, GpuToGpu):
589            return [n2.inputs[0].transfer('cpu')]
590
591    # ? -> gpua -> gpub
592    elif isinstance(node.op, GpuToGpu):
593        # Transfer within same context
594        if node.inputs[0].type.context_name == node.op.context_name:
595            return [node.inputs[0]]
596
597        if node.inputs[0].owner:
598            n2 = node.inputs[0].owner
599
600            # host ->
601            if isinstance(n2.op, GpuFromHost):
602                return [as_gpuarray_variable(n2.inputs[0],
603                                             node.op.context_name)]
604
605            # gpuc ->
606            if isinstance(n2.op, GpuToGpu):
607                if node.op.context_name == n2.inputs[0].type.context_name:
608                    return [n2.inputs[0]]
609                else:
610                    return [node.op(n2.inputs[0])]
611
612
613gpu_cut_copies.register('cut_gpua_host_transfers', local_cut_gpu_transfers,
614                        'fast_compile', 'fast_run', 'gpuarray')
615gpu_cut_copies.register('cut_gpua_constant_transfers',
616                        tensor.opt.constant_folding,
617                        'fast_compile', 'fast_run', 'gpuarray')
618optdb['canonicalize'].register('local_cut_gpua_host_gpua',
619                               local_cut_gpu_transfers,
620                               'fast_compile', 'fast_run', 'gpuarray')
621
622
623@register_opt('fast_compile')
624@local_optimizer([tensor.Alloc])
625def local_gpua_alloc2(node):
626    """
627    Join(axis, {Alloc or HostFromGPU}, ...) -> Join(axis, GpuAlloc, Alloc, ...)
628
629    Moves an alloc that is an input to join to the gpu.
630
631    """
632    try:
633        get_context(None)
634    except ContextNotDefined:
635        # If there is no default context then we do not perform the move here.
636        return
637    if (isinstance(node.op, tensor.Alloc) and
638        all(c != 'output' and
639            isinstance(c.op, tensor.Join) and
640            all(i.owner and
641                i.owner.op in [host_from_gpu, tensor.alloc]
642                for i in c.inputs[1:])
643            for c, idx in node.outputs[0].clients)):
644        return [GpuAlloc(None)(*node.inputs).transfer('cpu')]
645
646
647@register_opt('fast_compile')
648@op_lifter([tensor.Alloc])
649@register_opt2([tensor.Alloc], 'fast_compile')
650def local_gpuaalloc(op, context_name, inputs, outputs):
651    return GpuAlloc(context_name)(*inputs)
652
653
654@register_opt('fast_compile')
655@op_lifter([tensor.AllocEmpty])
656@register_opt2([tensor.AllocEmpty], 'fast_compile')
657def local_gpua_alloc_empty(op, context_name, inputs, outputs):
658    # We use _props_dict() to make sure that the GPU op know all the
659    # CPU op props.
660    return GpuAllocEmpty(context_name=context_name, **op._props_dict())(*inputs)
661
662
663@register_opt()
664@local_optimizer([GpuAlloc])
665def local_gpualloc_memset_0(node):
666    if isinstance(node.op, GpuAlloc) and not node.op.memset_0:
667        inp = node.inputs[0]
668        if (isinstance(inp, GpuArrayConstant) and
669                inp.data.size == 1 and
670                (np.asarray(inp.data) == 0).all()):
671            new_op = GpuAlloc(node.op.context_name, memset_0=True)
672            with inherit_stack_trace(node.outputs):
673                return new_op(*node.inputs, return_list=True)
674
675
676# Don't register by default.
677@gof.local_optimizer([GpuAllocEmpty])
678def local_gpua_alloc_empty_to_zeros(node):
679    if isinstance(node.op, GpuAllocEmpty):
680        context_name = infer_context_name(*node.inputs)
681        z = np.asarray(0, dtype=node.outputs[0].dtype)
682        with inherit_stack_trace(node.outputs):
683            return [GpuAlloc(context_name)(
684                as_gpuarray_variable(z, context_name), *node.inputs)]
685optdb.register('local_gpua_alloc_empty_to_zeros',
686               theano.tensor.opt.in2out(local_gpua_alloc_empty_to_zeros),
687               # After move to gpu and merge2, before inplace.
688               49.3,
689               'alloc_empty_to_zeros',)
690
691
692@register_opt()
693@local_optimizer([GpuContiguous])
694def local_gpu_contiguous_gpu_contiguous(node):
695    """
696    gpu_contiguous(gpu_contiguous(x)) -> gpu_contiguous(x)
697
698    """
699    if isinstance(node.op, GpuContiguous):
700        inp = node.inputs[0]
701        if inp.owner and isinstance(inp.owner.op, GpuContiguous):
702            return [inp]
703
704
705@register_opt('fast_compile')
706@op_lifter([tensor.extra_ops.CpuContiguous])
707@register_opt2([tensor.extra_ops.CpuContiguous], 'fast_compile')
708def local_gpua_contiguous(op, context_name, inputs, outputs):
709    return gpu_contiguous
710
711
712@register_opt('fast_compile')
713@op_lifter([tensor.Reshape])
714@register_opt2([tensor.Reshape], 'fast_compile')
715def local_gpua_reshape(op, context_name, inputs, outputs):
716    res = GpuReshape(op.ndim)
717    return res
718
719
720@register_opt('fast_compile')
721@op_lifter([tensor.Rebroadcast])
722@register_opt2([tensor.Rebroadcast], 'fast_compile')
723def local_gpua_rebroadcast(op, context_name, inputs, outputs):
724    return op(as_gpuarray_variable(inputs[0], context_name))
725
726
727@register_opt('fast_compile')
728@op_lifter([tensor.Flatten])
729@register_opt2([tensor.Flatten], 'fast_compile')
730def local_gpua_flatten(op, context_name, inputs, outputs):
731    shp = []
732    if op.outdim != 1:
733        shp = [inputs[0].shape[i] for i in range(op.outdim - 1)]
734    shp += [-1]
735    res = GpuReshape(op.outdim)
736    o = res(inputs[0], theano.tensor.as_tensor_variable(shp))
737    return o
738
739
740@register_opt('fast_compile')
741@op_lifter([tensor.Elemwise])
742@register_opt2([tensor.Elemwise], 'fast_compile')
743def local_gpua_elemwise(op, context_name, inputs, outputs):
744    scal_op = op.scalar_op
745    name = op.name
746    if name:
747        name = 'Gpu' + name
748    if len(outputs) > 1:
749        return
750
751    have_cuda = False
752    have_opencl = False
753    if inputs and isinstance(inputs[0].type, GpuArrayType):
754        kind = inputs[0].type.context.kind
755        if kind.startswith(b'opencl'):
756            have_opencl = True
757        elif kind.startswith(b'cuda'):
758            have_cuda = True
759    convert = {Erfinv: gpu_erfinv,
760               Erfcinv: gpu_erfcinv}
761
762    if scal_op.__class__ in convert:
763        scal_op = convert[scal_op.__class__]
764        if have_opencl:
765            _logger.warning(
766                'Function "%s" is not supported with OpenCL. Use "device=cuda" instead.' %
767                scal_op)
768        if not have_cuda:
769            return None
770    if not scal_op.supports_c_code(inputs, outputs):
771        return
772    res = GpuElemwise(scal_op, name=name,
773                      inplace_pattern=copy.copy(op.inplace_pattern),
774                      nfunc_spec=op.nfunc_spec)
775
776    # If the elemwise operation is a pow, casts might be required on the
777    # inputs and or outputs because only the (float, float)->float and
778    # (double, double)->double cases are implemented at the moment.
779    if isinstance(op.scalar_op, Pow):
780
781        # Only transfer the computation on the gpu if the output dtype is
782        # floating point. Else, give up on the transfer to the gpu.
783        out_dtype = outputs[0].dtype
784        if out_dtype not in ['float16', 'float32', 'float64']:
785            return
786
787        # Transfer the inputs on the GPU and cast them to the right dtype.
788        new_inputs = []
789        for inp in inputs:
790            if inp.dtype != out_dtype:
791                gpu_cast_op = GpuElemwise(Cast(Scalar(out_dtype)))
792                new_inputs.append(gpu_cast_op(as_gpuarray_variable(inp, context_name)))
793            else:
794                new_inputs.append(as_gpuarray_variable(inp, context_name))
795
796        # Perform the exponent on the gpu and transfer the output back to the
797        # cpu.
798        gpu_output = res(*new_inputs)
799        return [gpu_output]
800    elif op.scalar_op in (scalar.add, scalar.mul):
801        try:
802            return [split_inputs(inputs, max_inputs_to_GpuElemwise(outputs), res)]
803        except ValueError:
804            return False
805    else:
806        return res
807
808
809def split_inputs(inputs, max_nb_inputs, op):
810    """
811    For some ops like add and mul, a large number of inputs can make nvcc fail
812    compilation of our current code. We don't want node in the graph that can't
813    execute as this break DebugMode.
814
815    This should not happen for other GpuElemwise as their is only the fusion
816    that can generate op with too much input and it check for that.
817
818    Parameters
819    ----------
820    inputs: List of theano variables.
821            List of inputs to node.
822    max_nb_inputs: int
823                   Maximum number of inputs the node can handle without
824                   compilation fail.
825    op : Theano operator instance.
826         Operator that should be used to rebuild the computation graph with smaller
827         number of inputs per node.
828    """
829    if max_nb_inputs <= 1 and len(inputs) > 1:
830        raise ValueError("Can not split nodes because inputs' dimensionality and/or"
831                         " number of outputs is too large")
832
833    while len(inputs) > max_nb_inputs:
834        inner_ops = []
835        for i in range(0, len(inputs), max_nb_inputs):
836            inner_ops.append(op(*inputs[i: i + max_nb_inputs]))
837        inputs = inner_ops
838
839    return op(*inputs)
840
841
842gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(
843    GpuElemwise,
844    max_inputs_to_GpuElemwise)
845optdb.register('gpua_elemwise_fusion',
846               # 48.5 move to gpu
847               # 48.6 specialize
848               # 49 cpu fusion
849               # 49.5 add destroy handler
850               tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion), 49,
851               'fast_run', 'fusion', 'local_elemwise_fusion', 'gpuarray')
852
853inplace_gpu_elemwise_opt = tensor.opt.InplaceElemwiseOptimizer(
854    GpuElemwise)
855optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75,
856               'inplace_elemwise_optimizer', 'fast_run', 'inplace', 'gpuarray')
857
858register_opt(tensor.opt.local_useless_elemwise)
859
860
861@register_opt('fast_compile')
862@op_lifter([tensor.DimShuffle])
863@register_opt2([tensor.DimShuffle], 'fast_compile')
864def local_gpua_dimshuffle(op, context_name, inputs, outputs):
865    return GpuDimShuffle(op.input_broadcastable,
866                         op.new_order)
867
868
869@register_opt('fast_compile')
870@op_lifter([tensor.SpecifyShape])
871@register_opt2([tensor.SpecifyShape], 'fast_compile')
872def local_gpua_specifyShape(op, context_name, inputs, outputs):
873    if isinstance(inputs[0].type, GpuArrayType):
874        return
875    return local_gpua_specifyShape_graph(op, context_name, inputs, outputs)
876
877
878@register_opt2([tensor.SpecifyShape], 'fast_compile')
879def local_gpua_specifyShape_graph(op, context_name, inputs, outputs):
880    inp = [as_gpuarray_variable(inputs[0], context_name)]
881    inp += inputs[1:]
882    return tensor.specify_shape(*inp)
883
884
885@register_opt('fast_compile')
886@op_lifter([theano.compile.ops.Shape])
887def local_gpua_shape(op, context_name, inputs, outputs):
888    # op_lifter will call this opt too frequently as the output is
889    # always on the CPU.
890    if isinstance(inputs[0].type, GpuArrayType):
891        return
892    return local_gpua_shape_graph(op, context_name, inputs, outputs)
893
894
895@register_opt2([tensor.compile.ops.Shape], 'fast_compile')
896def local_gpua_shape_graph(op, context_name, inputs, outputs):
897    return [as_gpuarray_variable(inputs[0], context_name).shape]
898
899
900def gpu_print_wrapper(op, cnda):
901    op.old_op.global_fn(op.old_op, np.asarray(cnda))
902
903
904@register_opt('fast_compile')
905@op_lifter([tensor.printing.Print])
906@register_opt2([tensor.printing.Print], 'fast_compile')
907def local_gpua_print_op(op, context_name, inputs, outputs):
908    x, = inputs
909    with inherit_stack_trace(outputs):
910        gpu_x = as_gpuarray_variable(x, context_name=context_name)
911        new_op = op.__class__(global_fn=gpu_print_wrapper)
912        new_op.old_op = op
913        return new_op(gpu_x)
914
915
916@register_opt('fast_compile')
917@local_optimizer([PdbBreakpoint])
918def local_gpu_pdbbreakpoint_op(node):
919    if isinstance(node.op, PdbBreakpoint):
920
921        old_inputs = node.inputs
922        old_outputs = node.outputs
923
924        new_inputs = node.inputs[:1]
925        input_transfered = []
926
927        # Go through the monitored variables, only transferring on GPU those
928        # for which the input comes from the GPU or the output will be
929        # transferred on the GPU.
930        nb_monitored_vars = len(node.outputs)
931        for i in range(nb_monitored_vars):
932
933            inp = old_inputs[i + 1]
934            out = old_outputs[i]
935
936            input_is_from_gpu = (inp.owner and
937                                 isinstance(inp.owner.op, HostFromGpu))
938            output_goes_to_gpu = False
939            for c in out.clients:
940                if c == 'output':
941                    continue
942                if isinstance(c[0].op, GpuFromHost):
943                    output_goes_to_gpu = True
944                    context_name = c[0].op.context_name
945                    break
946
947            if input_is_from_gpu:
948                # The op should be applied on the GPU version of the input
949                new_inputs.append(inp.owner.inputs[0])
950                input_transfered.append(True)
951
952            elif output_goes_to_gpu:
953                # The input should be transferred to the gpu
954                new_inputs.append(as_gpuarray_variable(inp, context_name))
955                input_transfered.append(True)
956
957            else:
958                # No transfer is required.
959                new_inputs.append(inp)
960                input_transfered.append(False)
961
962        # Only continue the optimization if at least one input has been
963        # transferred to the gpu
964        if not any(input_transfered):
965            return False
966
967        # Apply the op on the new inputs
968        with inherit_stack_trace(node.outputs):
969            new_op_outputs = node.op(*new_inputs, return_list=True)
970
971            # Propagate the transfer to the gpu through the outputs that require
972            # it
973            new_outputs = []
974            for i in range(len(new_op_outputs)):
975                if input_transfered[i]:
976                    new_outputs.append(new_op_outputs[i].transfer('cpu'))
977                else:
978                    new_outputs.append(new_op_outputs[i])
979
980            return new_outputs
981
982    return False
983
984
985@register_opt('fast_compile')
986@op_lifter([IfElse])
987@register_opt2([IfElse], 'fast_compile')
988def local_gpua_lazy_ifelse(op, context_name, inputs, outputs):
989    if op.gpu:
990        return
991    c = inputs[0]
992    inps = []
993    falses = []
994    # ifelse need corresponding true/false inputs variables to be of the same type.
995    # But we can't rely on inputs to respect that, as GraphToGPU don't enforce that.
996    # So we need to take care of this here.
997    for v1, v2 in zip(inputs[1:1 + op.n_outs], inputs[1 + op.n_outs:]):
998        if ((isinstance(v1.type, tensor.TensorType) and move_to_gpu(v1)) or
999                isinstance(v1.type, GpuArrayType) or
1000                isinstance(v2.type, GpuArrayType)):
1001            inps.append(as_gpuarray_variable(v1, context_name))
1002            falses.append(as_gpuarray_variable(v2, context_name))
1003        else:
1004            inps.append(v1)
1005            falses.append(v2)
1006    inps.extend(falses)
1007    return IfElse(op.n_outs, gpu=True)(c, *inps, return_list=True)
1008
1009
1010@register_opt('fast_compile')
1011@op_lifter([tensor.Join])
1012@register_opt2([tensor.Join], 'fast_compile')
1013def local_gpua_join(op, context_name, inputs, outputs):
1014    return gpu_join
1015
1016
1017@register_opt('fast_compile')
1018@local_optimizer([GpuJoin])
1019def local_gpua_join_1(node):
1020    # join of a single element
1021    if (isinstance(node.op, GpuJoin) and
1022            len(node.inputs) == 2):
1023        return [node.inputs[1]]
1024
1025
1026@register_opt('fast_compile')
1027@op_lifter([tensor.Split])
1028@register_opt2([tensor.Split], 'fast_compile')
1029def local_gpua_split(op, context_name, inputs, outputs):
1030    # TODO use props
1031    return GpuSplit(op.len_splits)
1032
1033
1034@register_opt('fast_compile')
1035@op_lifter([tensor.Subtensor])
1036def local_gpua_subtensor(op, context_name, inputs, outputs):
1037    x = inputs[0]
1038    if (x.owner and isinstance(x.owner.op, HostFromGpu)):
1039        gpu_x = x.owner.inputs[0]
1040        if (gpu_x.owner and
1041                isinstance(gpu_x.owner.op, GpuFromHost) and
1042                # And it is a shared var or an input of the graph.
1043                not gpu_x.owner.inputs[0].owner):
1044            if len(x.clients) == 1:
1045                if any([n == 'output' or any([isinstance(v.type, GpuArrayType)
1046                                              for v in n.inputs + n.outputs])
1047                        for n, _ in outputs[0].clients]):
1048                    return
1049                else:
1050                    return [gpu_x.owner.op(outputs[0]).transfer('cpu')]
1051
1052    return GpuSubtensor(op.idx_list)
1053
1054
1055@register_opt2([tensor.Subtensor], 'fast_compile')
1056def local_gpua_subtensor_graph(op, context_name, inputs, outputs):
1057    # We need different code as the condition is different as inputs
1058    # aren't the same.
1059    x = inputs[0]
1060    # We don't want to move the subtensor to the GPU if the inputs is
1061    # on the CPU and the only client of the CPU node is this
1062    # subtensor. This allow to have a smaller transfer.
1063
1064    if (x.owner and isinstance(x.owner.op, GpuFromHost)):
1065        cpu_x = x.owner.inputs[0]
1066        # And it is a shared var or an input of the graph.
1067        # and is used by only 1 node.
1068        # x is in the new graph, so we can't tests its number of clients.
1069        if not cpu_x.owner and len(cpu_x.clients) == 1:
1070            c = outputs[0].clients
1071            # If the subtensor have only 1 client, do it on the CPU.
1072            # We let the other optimization to take care to move the
1073            # next node or not.
1074            if len(c) == 1:
1075                return
1076    return GpuSubtensor(op.idx_list)
1077
1078
1079@register_opt('fast_compile')
1080@op_lifter([tensor.IncSubtensor])
1081@register_opt2([tensor.IncSubtensor], 'fast_compile')
1082def local_gpua_inc_subtensor(op, context_name, inputs, outputs):
1083    op = GpuIncSubtensor(op.idx_list, op.inplace,
1084                         op.set_instead_of_inc,
1085                         op.destroyhandler_tolerate_aliased)
1086    ret = op(*inputs)
1087    val = getattr(outputs[0].tag, 'nan_guard_mode_check', True)
1088    ret.tag.nan_guard_mode_check = val
1089    return ret
1090
1091
1092@register_opt('fast_compile')
1093@op_lifter([tensor.AdvancedSubtensor1])
1094@register_opt2([tensor.AdvancedSubtensor1], 'fast_compile')
1095def local_gpua_advanced_subtensor1(op, context_name, inputs, outputs):
1096    return GpuAdvancedSubtensor1()
1097
1098
1099@register_opt('fast_compile')
1100@op_lifter([tensor.AdvancedSubtensor])
1101@register_opt2([tensor.AdvancedSubtensor], 'fast_compile')
1102def local_gpua_advanced_subtensor(op, context_name, inputs, outputs):
1103    return GpuAdvancedSubtensor()
1104
1105
1106@register_opt('fast_compile')
1107@op_lifter([tensor.AdvancedBooleanSubtensor])
1108@register_opt2([tensor.AdvancedBooleanSubtensor], 'fast_compile')
1109def local_gpua_advanced_boolean_subtensor(op, context_name, inputs, outputs):
1110    return GpuAdvancedBooleanSubtensor()
1111
1112
1113@register_opt('fast_compile')
1114@op_lifter([tensor.AdvancedIncSubtensor1])
1115@register_opt2([tensor.AdvancedIncSubtensor1], 'fast_compile')
1116def local_gpua_advanced_incsubtensor1(op, context_name, inputs, outputs):
1117    x, y, ilist = inputs
1118
1119    set_instead_of_inc = op.set_instead_of_inc
1120
1121    if (x.ndim == 1 and y.ndim == 0 and
1122        config.deterministic == 'default' and
1123            x.dtype not in ('int8', 'int16')):
1124        x = x.dimshuffle(0, 'x')
1125        y = y.dimshuffle('x', 'x')
1126        ret = GpuAdvancedIncSubtensor1_dev20(
1127            set_instead_of_inc=set_instead_of_inc)(x, y, ilist)
1128        ret = GpuDimShuffle(ret.type.broadcastable, [0])(ret)
1129        return ret
1130    elif (x.ndim != 2 or y.ndim != 2 or
1131          config.deterministic == 'more' or
1132          x.dtype in ('int8', 'int16')):
1133        return GpuAdvancedIncSubtensor1(
1134            set_instead_of_inc=set_instead_of_inc)
1135    else:
1136        return GpuAdvancedIncSubtensor1_dev20(
1137            set_instead_of_inc=set_instead_of_inc)
1138
1139
1140# Do not register this optimization for now, as it slows down the
1141# execution by a lot in important cases.
1142# @register_opt('fast_compile')
1143# @op_lifter([tensor.AdvancedIncSubtensor])
1144# @register_opt2([tensor.AdvancedIncSubtensor], 'fast_compile')
1145def local_gpua_advanced_incsubtensor(op, context_name, inputs, outputs):
1146    if not op.set_instead_of_inc:
1147        return GpuAdvancedIncSubtensor()
1148    else:
1149        return False
1150
1151
1152# Do not register this optimization for now, as it slows down the
1153# execution by a lot in important cases.
1154# @register_opt('fast_compile')
1155# @op_lifter([tensor.AdvancedBooleanIncSubtensor])
1156# @register_opt2([tensor.AdvancedBooleanIncSubtensor], 'fast_compile')
1157def local_gpua_advanced_boolean_incsubtensor(op, context_name, inputs, outputs):
1158    # GpuAdvancedIncSubtensor only works with a single boolean mask,
1159    # but not with fancy combinations.
1160    if not op.set_instead_of_inc and len(inputs) == 3:
1161        return GpuAdvancedBooleanIncSubtensor()
1162    else:
1163        return False
1164
1165
1166@register_inplace()
1167@local_optimizer([GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1_dev20])
1168def local_advincsub1_gpua_inplace(node):
1169    if isinstance(node.op, (GpuAdvancedIncSubtensor1,
1170                            GpuAdvancedIncSubtensor1_dev20)):
1171        if not node.op.inplace:
1172            return [node.op.clone_inplace()(*node.inputs)]
1173
1174
1175# AllocDiag
1176@register_opt('fast_compile')
1177@op_lifter([tensor.AllocDiag])
1178@register_opt2([theano.tensor.AllocDiag], 'fast_compile')
1179def local_gpu_alloc_diag(op, context_name, inputs, outputs):
1180    if outputs[0].ndim != 2:
1181        # AllocDiag only supports 2d output
1182        return False
1183    return GpuAllocDiag(offset=op.offset)
1184
1185
1186# ExtractDiag
1187@register_opt('fast_compile')
1188@op_lifter([tensor.ExtractDiag])
1189@register_opt2([theano.tensor.ExtractDiag], 'fast_compile')
1190def local_gpu_extract_diag(op, context_name, inputs, outputs):
1191    return GpuExtractDiag(offset=op.offset, axis1=op.axis1, axis2=op.axis2, view=op.view)
1192
1193
1194@register_opt('fast_compile')
1195@op_lifter([tensor.CAReduce, tensor.Sum, tensor.elemwise.Prod])
1196@register_opt2([tensor.CAReduce, tensor.Sum, tensor.elemwise.Prod], 'fast_compile')
1197def local_gpua_careduce(op, context_name, inputs, outputs):
1198    if isinstance(op.scalar_op, (scalar.Add, scalar.Mul,
1199                                 scalar.Maximum, scalar.Minimum)):
1200
1201        ctx = get_context(context_name)
1202        if ctx.kind == b'opencl':
1203            op2 = GpuCAReduceCPY
1204            if op.scalar_op not in [scalar.add, scalar.mul]:
1205                # We don't support yet all reduction with cpy code.
1206                return
1207        elif ctx.kind == b'cuda':
1208            op2 = GpuCAReduceCuda
1209        else:
1210            return False
1211        x, = inputs
1212        idtype = x.dtype
1213        adtype = getattr(op, 'acc_dtype', None)
1214        odtype = getattr(op, 'dtype', outputs[0].dtype)
1215
1216        # Force accumulator to float32 for float32 inputs since tree
1217        # reduction will not loose as much precision as linear
1218        # accumulation and float64 is much slower on GPU.
1219        if idtype == 'float32' and odtype == 'float32':
1220            adtype = 'float32'
1221
1222        greduce = op2(
1223            op.scalar_op, axis=op.axis,
1224            dtype=odtype,
1225            acc_dtype=adtype)
1226        with inherit_stack_trace(outputs):
1227            gvar = greduce(x)
1228        # We need to have the make node called, otherwise the mask can
1229        # be None
1230        if (op2 is GpuCAReduceCPY or
1231                gvar.owner.op.supports_c_code([
1232                    as_gpuarray_variable(x, context_name)])):
1233            return greduce
1234        else:
1235            # Try to make a simpler pattern based on reshaping
1236            # The principle is that if two adjacent dimensions have
1237            # the same value in the reduce_mask, then we can reshape
1238            # to make them a single dimension, do the reduction, and
1239            # then reshape to get them back.
1240
1241            if op.axis is None:
1242                reduce_mask = [1] * x.type.ndim
1243            else:
1244                reduce_mask = [0] * x.type.ndim
1245                for a in op.axis:
1246                    assert reduce_mask[a] == 0
1247                    reduce_mask[a] = 1
1248
1249            new_in_shp = [shape_i(x, 0)]
1250            new_mask = [reduce_mask[0]]
1251            for i in xrange(1, x.type.ndim):
1252                if reduce_mask[i] == reduce_mask[i - 1]:
1253                    new_in_shp[-1] *= shape_i(x, i)
1254                else:
1255                    new_mask.append(reduce_mask[i])
1256                    new_in_shp.append(shape_i(x, i))
1257            new_axis = []
1258            for idx, m in enumerate(new_mask):
1259                if m == 1:
1260                    new_axis.append(idx)
1261            greduce = op2(
1262                op.scalar_op,
1263                axis=new_axis, reduce_mask=new_mask,
1264                dtype=odtype,
1265                acc_dtype=adtype)
1266            with inherit_stack_trace(outputs):
1267                reshaped_x = x.reshape(tensor.stack(new_in_shp))
1268                gpu_reshaped_x = as_gpuarray_variable(reshaped_x, context_name)
1269                # We need to have the make node called, otherwise the mask can
1270                # be None
1271                gvar = greduce(gpu_reshaped_x)
1272                reshaped_gpu_inputs = [gpu_reshaped_x]
1273                if greduce.supports_c_code(reshaped_gpu_inputs):
1274                    reduce_reshaped_x = greduce(gpu_reshaped_x)
1275
1276                    if reduce_reshaped_x.ndim != outputs[0].ndim:
1277                        out_shp = []
1278                        for i in range(x.ndim):
1279                            if i not in op.axis:
1280                                out_shp.append(shape_i(x, i))
1281                        unreshaped_reduce = GpuReshape(len(out_shp))(
1282                            reduce_reshaped_x,
1283                            tensor.stack(out_shp))
1284                    else:
1285                        unreshaped_reduce = reduce_reshaped_x
1286                    return [unreshaped_reduce]
1287
1288
1289@register_opt('fast_compile')
1290@op_lifter([tensor.blas.Gemv, tensor.blas_c.CGemv])
1291@register_opt2([tensor.blas.Gemv], 'fast_compile')
1292def local_gpua_gemv(op, context_name, inputs, outputs):
1293    if inputs[0].dtype == 'float16':
1294        # Use gemm implementation as cublas gemv don't support float16
1295        return gpugemm_no_inplace(inputs[0][:, None],
1296                                  inputs[1],
1297                                  inputs[2],
1298                                  inputs[3][:, None],
1299                                  inputs[4]).dimshuffle(0)
1300
1301    if inputs[0].dtype not in ['float32', 'float64']:
1302        return
1303    if op.inplace:
1304        return gpugemv_inplace
1305    else:
1306        return gpugemv_no_inplace
1307
1308
1309@register_opt('fast_compile')
1310@op_lifter([tensor.blas.Gemm])
1311@register_opt2([tensor.blas.Gemm], 'fast_compile')
1312def local_gpua_gemm(op, context_name, inputs, outputs):
1313    if inputs[0].dtype not in ['float16', 'float32', 'float64']:
1314        return
1315    if op.inplace:
1316        return gpugemm_inplace
1317    else:
1318        return gpugemm_no_inplace
1319
1320
1321@register_opt('fast_compile')
1322@op_lifter([tensor.blas.BatchedDot])
1323@register_opt2([tensor.blas.BatchedDot], 'fast_compile')
1324def local_gpua_gemmbatch(op, context_name, inputs, outputs):
1325    if inputs[0].dtype not in ['float16', 'float32', 'float64']:
1326        return
1327    with inherit_stack_trace(outputs):
1328        a, b = inputs
1329        # Since GpuGemmBatch only supports 3D inputs and output,
1330        # we need to add broadcastable dims to the inputs, and drop
1331        # them from outputs
1332        output_dims = [0, 1, 2]
1333        if a.ndim == 2:
1334            a = GpuDimShuffle(a.broadcastable, (0, 'x', 1))(a)
1335            del output_dims[1]
1336        if b.ndim == 2:
1337            b = GpuDimShuffle(b.broadcastable, (0, 1, 'x'))(b)
1338            del output_dims[-1]
1339        # In case of mismatched dtypes, we also have to upcast
1340        out_dtype = outputs[0].dtype
1341        if a.dtype != out_dtype or b.dtype != out_dtype:
1342            gpu_cast_op = GpuElemwise(Cast(Scalar(out_dtype)))
1343            if a.dtype != out_dtype:
1344                a = gpu_cast_op(a)
1345            if b.dtype != out_dtype:
1346                b = gpu_cast_op(b)
1347
1348        c = GpuAllocEmpty(out_dtype, context_name)(
1349            a.shape[0], a.shape[1], b.shape[2])
1350        out = gpugemmbatch_no_inplace(c, np.asarray(1.0, dtype=out_dtype),
1351                                      a, b, np.asarray(0.0, dtype=out_dtype))
1352        if len(output_dims) != 3:
1353            out = GpuDimShuffle(out.broadcastable, output_dims)(out)
1354        return out
1355
1356
1357@register_opt()
1358@alpha_merge(GpuGemm, alpha_in=1, beta_in=4)
1359def local_gpua_gemm_alpha_merge(node, *inputs):
1360    return [gpugemm_no_inplace(*inputs)]
1361
1362
1363@register_opt()
1364@output_merge(GpuGemm, alpha_in=1, beta_in=4, out_in=0)
1365def local_gpua_gemm_output_merge(node, *inputs):
1366    return [gpugemm_no_inplace(*inputs)]
1367
1368
1369@register_opt()
1370@alpha_merge(GpuGemmBatch, alpha_in=1, beta_in=4)
1371def local_gpua_gemmbatch_alpha_merge(node, *inputs):
1372    return [gpugemmbatch_no_inplace(*inputs)]
1373
1374
1375@register_opt()
1376@output_merge(GpuGemmBatch, alpha_in=1, beta_in=4, out_in=0)
1377def local_gpua_gemmbatch_output_merge(node, *inputs):
1378    return [gpugemmbatch_no_inplace(*inputs)]
1379
1380
1381@register_opt('fast_compile')
1382@op_lifter([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer])
1383@register_opt2([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer], 'fast_compile')
1384def local_gpua_ger(op, context_name, inputs, outputs):
1385    if inputs[0].dtype not in ['float32', 'float64']:
1386        return
1387    return GpuGer(inplace=op.destructive)
1388
1389
1390@register_opt('fast_compile')
1391@op_lifter([tensor.blas.Dot22])
1392@register_opt2([tensor.blas.Dot22], 'fast_compile')
1393def local_gpua_dot22(op, context_name, inputs, outputs):
1394    return gpu_dot22
1395
1396
1397@register_opt('fast_compile')
1398@op_lifter([tensor.blas.Dot22Scalar])
1399@register_opt2([tensor.blas.Dot22Scalar], 'fast_compile')
1400def local_gpua_dot22scalar(op, context_name, inputs, outputs):
1401    with inherit_stack_trace(outputs):
1402        x, y, a = inputs
1403        x = as_gpuarray_variable(x, context_name)
1404        y = as_gpuarray_variable(y, context_name)
1405        z = GpuAllocEmpty(x.dtype, context_name)(x.shape[0], y.shape[1])
1406        return [gpugemm_no_inplace(z, a, x, y, 0)]
1407
1408
1409@register_opt('fast_compile')
1410@op_lifter([tensor.basic.Eye])
1411@register_opt2([tensor.basic.Eye], 'fast_compile')
1412def local_gpua_eye(op, context_name, inputs, outputs):
1413    return GpuEye(dtype=op.dtype, context_name=context_name)
1414
1415
1416@register_opt('fast_compile')
1417@op_lifter([tensor.basic.Tri])
1418@register_opt2([tensor.basic.Tri], 'fast_compile')
1419def local_gpua_tri(op, context_name, inputs, outputs):
1420    return GpuTri(dtype=op.dtype, context_name=context_name)
1421
1422
1423@register_opt('fast_compile')
1424@op_lifter([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias])
1425@register_opt2([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias], 'fast_compile')
1426def local_gpua_crossentropysoftmaxargmax1hotwithbias(op, context_name, inputs, outputs):
1427    return gpu_crossentropy_softmax_argmax_1hot_with_bias
1428
1429
1430@register_opt('fast_compile')
1431@op_lifter([tensor.nnet.CrossentropySoftmax1HotWithBiasDx])
1432@register_opt2([tensor.nnet.CrossentropySoftmax1HotWithBiasDx], 'fast_compile')
1433def local_gpua_crossentropysoftmax1hotwithbiasdx(op, context_name, inputs, outputs):
1434    return gpu_crossentropy_softmax_1hot_with_bias_dx
1435
1436
1437@register_opt('fast_compile')
1438@op_lifter([tensor.nnet.Softmax])
1439@register_opt2([tensor.nnet.Softmax], 'fast_compile')
1440def local_gpua_softmax(op, context_name, inputs, outputs):
1441    return gpu_softmax
1442
1443
1444@register_opt('fast_compile')
1445@op_lifter([tensor.nnet.SoftmaxWithBias])
1446@register_opt2([tensor.nnet.SoftmaxWithBias], 'fast_compile')
1447def local_gpua_softmaxwithbias(op, context_name, inputs, outputs):
1448    return gpu_softmax_with_bias
1449
1450
1451@register_opt('fast_compile')
1452@op_lifter([tensor.nnet.CrossentropyCategorical1Hot])
1453@register_opt2([tensor.nnet.CrossentropyCategorical1Hot], 'fast_compile')
1454def local_gpu_crossentropycategorical1hot(op, context_name, inputs, outputs):
1455    # There is no corresponding GPU Op, but we can express it as:
1456    #   coding, one_of_n = inputs
1457    #   -log(coding[arange(coding.shape[0]), one_of_n])
1458    coding, one_of_n = inputs
1459    idx0 = theano.tensor.arange(shape_i(coding, 0))
1460    return [gpu_neg(gpu_log(coding[idx0, one_of_n]))]
1461
1462
1463@register_opt('fast_compile')
1464@op_lifter([tensor.nnet.CrossentropyCategorical1HotGrad])
1465@register_opt2([tensor.nnet.CrossentropyCategorical1HotGrad], 'fast_compile')
1466def local_gpu_crossentropycategorical1hotgrad(op, context_name, inputs, outputs):
1467    # There is no corresponding GPU Op, but we can express it as:
1468    #   gy, coding, one_of_n = inputs
1469    #   gcoding = zeros_like(coding)
1470    #   gcoding[arange(coding.shape[0]), one_of_n] = -g / (
1471    #       coding[arange(coding.shape[0]), one_of_n])
1472    gy, coding, one_of_n = inputs
1473    idx0 = theano.tensor.arange(shape_i(coding, 0))
1474    z = GpuAlloc(context_name, memset_0=True)(
1475        as_gpuarray_variable(np.zeros((), dtype=coding.dtype), context_name),
1476        *[shape_i(coding, i) for i in xrange(coding.ndim)])
1477    gcoding = tensor.set_subtensor(
1478        z[idx0, one_of_n],
1479        gpu_neg(gpu_true_div(gy, coding[idx0, one_of_n])))
1480    return [gcoding.transfer(context_name)]
1481
1482
1483@register_opt('fast_compile')
1484@op_lifter([theano.tensor.opt.Assert])
1485def local_gpua_assert(op, context_name, inputs, outputs):
1486    if isinstance(inputs[0].type, GpuArrayType):
1487        return
1488    return local_gpua_assert_graph(op, context_name, inputs, outputs)
1489
1490
1491@register_opt2([theano.tensor.opt.Assert], 'fast_compile')
1492def local_gpua_assert_graph(op, context_name, inputs, outputs):
1493    return [op(as_gpuarray_variable(inputs[0], context_name),
1494               *inputs[1:])]
1495
1496
1497@register_opt('fast_compile')
1498@op_lifter([ConvOp])
1499@register_opt2([ConvOp], 'fast_compile')
1500def local_gpua_error_convop(op, context_name, inputs, outputs):
1501    assert False, """
1502ConvOp does not work with the gpuarray backend.
1503
1504Use the new convolution interface to have GPU convolution working:
1505theano.tensor.nnet.conv2d()
1506"""
1507
1508
1509@register_opt('fast_compile')
1510@op_lifter([SparseBlockGemv])
1511@register_opt2([SparseBlockGemv], 'fast_compile')
1512def local_gpua_sparseblockgemv(op, context_name, inputs, outputs):
1513    if inputs[0].dtype == 'float16':
1514        return
1515    if op.inplace:
1516        return gpu_sparse_block_gemv_inplace
1517    else:
1518        return gpu_sparse_block_gemv
1519
1520
1521@register_opt('fast_compile')
1522@op_lifter([SparseBlockOuter])
1523@register_opt2([SparseBlockOuter], 'fast_compile')
1524def local_gpua_sparseblockouter(op, context_name, inputs, outputs):
1525    if inputs[0].dtype == 'float16':
1526        return
1527    if op.inplace:
1528        return gpu_sparse_block_outer_inplace
1529    else:
1530        return gpu_sparse_block_outer
1531
1532
1533@register_inplace()
1534@local_optimizer([GpuSparseBlockGemv], inplace=True)
1535def local_inplace_sparseblockgemv(node):
1536    if isinstance(node.op, GpuSparseBlockGemv) and not node.op.inplace:
1537        return [gpu_sparse_block_gemv_inplace(*node.inputs)]
1538
1539
1540@register_inplace()
1541@local_optimizer([GpuSparseBlockOuter], inplace=True)
1542def local_inplace_sparseblockouter(node):
1543    if isinstance(node.op, GpuSparseBlockOuter) and not node.op.inplace:
1544        return [GpuSparseBlockOuter(inplace=True)(*node.inputs)]
1545
1546
1547# Move to Gpu optimization
1548@local_optimizer([GpuFromHost,
1549                  AbstractConv2d,
1550                  AbstractConv2d_gradWeights,
1551                  AbstractConv2d_gradInputs,
1552                  AbstractConv3d,
1553                  AbstractConv3d_gradWeights,
1554                  AbstractConv3d_gradInputs])
1555def local_conv_gpu_conv(node):
1556    """
1557    gpu_from_host(AbstractConv) -> AbstractConv(gpu_from_host)
1558
1559    AbstractConv(host_from_gpu) -> host_from_gpu(AbstractConv)
1560    """
1561    if isinstance(node.op, GpuFromHost):
1562        host_input = node.inputs[0]
1563        if host_input.owner and isinstance(host_input.owner.op,
1564                                           BaseAbstractConv):
1565
1566            conv = host_input.owner.op
1567            inps = list(host_input.owner.inputs)
1568            ctx = infer_context_name(*inps)
1569            inps[0] = as_gpuarray_variable(inps[0], context_name=ctx)
1570            inps[1] = as_gpuarray_variable(inps[1], context_name=ctx)
1571            out = conv(*inps)
1572            # out is on the GPU because both inputs are.
1573            out = theano.tensor.patternbroadcast(out,
1574                                                 node.outputs[0].broadcastable)
1575            return [out]
1576
1577    if isinstance(node.op, BaseAbstractConv):
1578        # conv(host_from_gpu) -> host_from_gpu(gpu_conv)
1579        inp1 = node.inputs[0]
1580        inp2 = node.inputs[1]
1581        if ((isinstance(inp1.type, GpuArrayType) and
1582             isinstance(inp2.type, GpuArrayType))):
1583            # Both inputs are already directly on the GPU, nothing to do
1584            return
1585
1586        inp1_on_gpu = (isinstance(inp1.type, GpuArrayType) or
1587                       (inp1.owner and isinstance(inp1.owner.op, HostFromGpu)))
1588        inp2_on_gpu = (isinstance(inp2.type, GpuArrayType) or
1589                       (inp2.owner and isinstance(inp2.owner.op, HostFromGpu)))
1590
1591        if inp1_on_gpu or inp2_on_gpu:
1592            conv = node.op
1593            inps = list(node.inputs)
1594            ctx = infer_context_name(*inps)
1595            inps[0] = as_gpuarray_variable(inps[0], context_name=ctx)
1596            inps[1] = as_gpuarray_variable(inps[1], context_name=ctx)
1597            out = conv(*inps)
1598            # out is on the GPU because both inputs are.
1599            out = theano.tensor.patternbroadcast(
1600                out,
1601                node.outputs[0].broadcastable)
1602            # If the original output was on CPU, we have to transfer it
1603            if isinstance(node.outputs[0].type, tensor.TensorType):
1604                return [tensor.as_tensor_variable(out)]
1605            else:
1606                return [out]
1607
1608
1609register_opt()(local_conv_gpu_conv)
1610
1611
1612# CorrMM opt
1613@local_optimizer([AbstractConv2d])
1614def local_abstractconv_gemm(node):
1615    if not isinstance(node.op, AbstractConv2d):
1616        return None
1617    img, kern = node.inputs
1618    if (not isinstance(img.type, GpuArrayType) or
1619            not isinstance(kern.type, GpuArrayType)):
1620        return None
1621    ctx = infer_context_name(img, kern)
1622
1623    border_mode = node.op.border_mode
1624    subsample = node.op.subsample
1625    filter_dilation = node.op.filter_dilation
1626    num_groups = node.op.num_groups
1627    unshared = node.op.unshared
1628
1629    flip = (slice(None),) * (kern.ndim - 2) + \
1630        (slice(None, None, -1),) * 2
1631    kern_axes = (1, 0) + tuple(i for i in range(2, kern.ndim))
1632    if ((border_mode == 'full') and (subsample == (1, 1)) and num_groups == 1 and not unshared):
1633        if not node.op.filter_flip:
1634            kern = kern[flip]
1635        # need to dimshuffle the kernel for full convolution
1636        kern = kern.dimshuffle(kern_axes)
1637        # call GpuCorrMM_gradInputs
1638        rval = GpuCorrMM_gradInputs('valid',
1639                                    subsample,
1640                                    filter_dilation)(
1641            gpu_contiguous(kern), gpu_contiguous(img))
1642    else:
1643        # need to flip the kernel if necessary
1644        if node.op.filter_flip:
1645            kern = kern[flip]
1646        # By default use GpuCorrMM
1647        rval = GpuCorrMM(border_mode,
1648                         subsample,
1649                         filter_dilation,
1650                         num_groups,
1651                         unshared)(gpu_contiguous(img),
1652                                   gpu_contiguous(kern))
1653
1654        # call GpuCorrMM_gradWeights if good
1655        # (the latter is faster if batchsize * kernelHeight * kernelWidth
1656        # is larger than inputChannels * outputHeight * outputWidth.
1657        # GpuConv does not always store information on the batchsize and
1658        # channels, though, so we only use what information we have.)
1659        if ((subsample == (1, 1)) and (filter_dilation == (1, 1)) and
1660                (node.op.imshp is not None) and
1661                (None not in node.op.imshp[-2:]) and
1662                (node.op.kshp is not None) and
1663                (None not in node.op.kshp) and
1664                border_mode != "half" and
1665                num_groups == 1 and
1666                not unshared):
1667            # we know the kernel and output size
1668            prod1 = node.op.kshp[0] * node.op.kshp[-3]
1669            prod2 = ((node.op.imshp[-2] - node.op.kshp[0] + 1) *
1670                     (node.op.imshp[-1] - node.op.kshp[-3] + 1))
1671            if (None not in node.op.imshp[:1]):
1672                # we also know batchsize and input channels
1673                prod1 *= node.op.imshp[0]
1674                prod2 *= node.op.imshp[1]
1675            # compare to decide
1676            if prod1 > prod2:
1677                rval = GpuCorrMM_gradWeights(border_mode,
1678                                             subsample,
1679                                             filter_dilation)(
1680                    gpu_contiguous(img.dimshuffle(1, 0, 2, 3)),
1681                    gpu_contiguous(kern.dimshuffle(1, 0, 2, 3)))
1682                # (we need to wrap the result in as_gpuarray_variable,
1683                # because we are not allowed to replace a GpuArray with
1684                # a DimShuffle instance in a graph optimization)
1685                rval = as_gpuarray_variable(
1686                    rval.dimshuffle(1, 0, 2, 3),
1687                    context_name=ctx)
1688    return [rval]
1689
1690
1691# CorrMM opt used for Meta-optimizer
1692@local_optimizer([AbstractConv2d])
1693def local_abstractconv_gemm_def(node):
1694    if not isinstance(node.op, AbstractConv2d):
1695        return None
1696    img, kern = node.inputs
1697    if (not isinstance(img.type, GpuArrayType) or
1698            not isinstance(kern.type, GpuArrayType)):
1699        return None
1700
1701    border_mode = node.op.border_mode
1702    subsample = node.op.subsample
1703    filter_dilation = node.op.filter_dilation
1704    num_groups = node.op.num_groups
1705    unshared = node.op.unshared
1706
1707    if node.op.filter_flip:
1708        flip = (slice(None),) * (kern.ndim - 2) + \
1709            (slice(None, None, -1),) * 2
1710        kern = kern[flip]
1711    rval = GpuCorrMM(border_mode,
1712                     subsample,
1713                     filter_dilation,
1714                     num_groups,
1715                     unshared)(gpu_contiguous(img),
1716                               gpu_contiguous(kern))
1717    return [rval]
1718
1719
1720@local_optimizer([AbstractConv2d])
1721def local_abstractconv_gemm_alt(node):
1722    if not isinstance(node.op, AbstractConv2d):
1723        return None
1724    img, kern = node.inputs
1725    if (not isinstance(img.type, GpuArrayType) or
1726            not isinstance(kern.type, GpuArrayType)):
1727        return None
1728    ctx = infer_context_name(img, kern)
1729
1730    border_mode = node.op.border_mode
1731    subsample = node.op.subsample
1732    filter_dilation = node.op.filter_dilation
1733    num_groups = node.op.num_groups
1734    unshared = node.op.unshared
1735
1736    if border_mode == 'full' and subsample == (1, 1) and num_groups == 1 and not unshared:
1737        if not node.op.filter_flip:
1738            kern = kern[:, :, ::-1, ::-1]
1739
1740        kern = kern.dimshuffle(1, 0, 2, 3)
1741        rval = GpuCorrMM_gradInputs('valid',
1742                                    subsample,
1743                                    filter_dilation)(
1744            gpu_contiguous(kern), gpu_contiguous(img))
1745
1746    elif (border_mode == 'valid' and subsample == (1, 1) and filter_dilation == (1, 1) and
1747          num_groups == 1 and not unshared):
1748        if node.op.filter_flip:
1749            kern = kern[:, :, ::-1, ::-1]
1750
1751        rval = GpuCorrMM_gradWeights(border_mode,
1752                                     subsample,
1753                                     filter_dilation)(
1754            gpu_contiguous(img.dimshuffle(1, 0, 2, 3)),
1755            gpu_contiguous(kern.dimshuffle(1, 0, 2, 3)))
1756        rval = as_gpuarray_variable(rval.dimshuffle(1, 0, 2, 3),
1757                                    context_name=ctx)
1758    else:
1759        return None
1760
1761    return [rval]
1762
1763
1764@local_optimizer([AbstractConv3d])
1765def local_abstractconv3d_gemm(node):
1766    if not isinstance(node.op, AbstractConv3d):
1767        return None
1768    img, kern = node.inputs
1769    if (not isinstance(img.type, GpuArrayType) or
1770            not isinstance(kern.type, GpuArrayType)):
1771        return None
1772    ctx = infer_context_name(img, kern)
1773
1774    border_mode = node.op.border_mode
1775    subsample = node.op.subsample
1776    filter_dilation = node.op.filter_dilation
1777    num_groups = node.op.num_groups
1778    if ((border_mode == 'full') and (subsample == (1, 1, 1)) and num_groups == 1):
1779        if not node.op.filter_flip:
1780            kern = kern[:, :, ::-1, ::-1, ::-1]
1781        # need to dimshuffle the kernel for full convolution
1782        kern = kern.dimshuffle(1, 0, 2, 3, 4)
1783        # call GpuCorr3dMM_gradInputs
1784        rval = GpuCorr3dMM_gradInputs('valid',
1785                                      subsample,
1786                                      filter_dilation)(
1787            gpu_contiguous(kern), gpu_contiguous(img))
1788    else:
1789        # need to flip the kernel if necessary
1790        if node.op.filter_flip:
1791            kern = kern[:, :, ::-1, ::-1, ::-1]
1792        # By default use GpuCorr3dMM
1793        rval = GpuCorr3dMM(border_mode,
1794                           subsample,
1795                           filter_dilation,
1796                           num_groups)(gpu_contiguous(img),
1797                                       gpu_contiguous(kern))
1798
1799        # call GpuCorr3dMM_gradWeights if good
1800        # (the latter is faster if batchsize * kernelHeight * kernelWidth * kernelDepth
1801        # is larger than inputChannels * outputHeight * outputWidth * outputDepth.
1802        # GpuConv does not always store information on the batchsize and
1803        # channels, though, so we only use what information we have.)
1804        if ((subsample == (1, 1, 1)) and (filter_dilation == (1, 1, 1)) and
1805                (node.op.imshp is not None) and
1806                (None not in node.op.imshp[-3:]) and
1807                (node.op.kshp is not None) and
1808                (None not in node.op.kshp) and
1809                border_mode != "half" and
1810                num_groups == 1):
1811            # we know the kernel and output size
1812            prod1 = node.op.kshp[0] * node.op.kshp[1] * node.op.kshp[2]
1813            prod2 = ((node.op.imshp[-3] - node.op.kshp[0] + 1) *
1814                     (node.op.imshp[-2] - node.op.kshp[1] + 1) *
1815                     (node.op.imshp[-1] - node.op.kshp[2] + 1))
1816            if (None not in node.op.imshp[:1]):
1817                # we also know batchsize and input channels
1818                prod1 *= node.op.imshp[0]
1819                prod2 *= node.op.imshp[1]
1820            # compare to decide
1821            if prod1 > prod2:
1822                rval = GpuCorr3dMM_gradWeights(border_mode,
1823                                               subsample,
1824                                               filter_dilation)(
1825                    gpu_contiguous(img.dimshuffle(1, 0, 2, 3, 4)),
1826                    gpu_contiguous(kern.dimshuffle(1, 0, 2, 3, 4)))
1827                # (we need to wrap the result in as_gpuarray_variable,
1828                # because we are not allowed to replace a GpuArray with
1829                # a DimShuffle instance in a graph optimization)
1830                rval = as_gpuarray_variable(
1831                    rval.dimshuffle(1, 0, 2, 3, 4),
1832                    context_name=ctx)
1833    return [rval]
1834
1835
1836# Corr3dMM opt used for Meta-optimizer
1837@local_optimizer([AbstractConv3d])
1838def local_abstractconv3d_gemm_def(node):
1839    if not isinstance(node.op, AbstractConv3d):
1840        return None
1841    img, kern = node.inputs
1842    if (not isinstance(img.type, GpuArrayType) or
1843            not isinstance(kern.type, GpuArrayType)):
1844        return None
1845
1846    border_mode = node.op.border_mode
1847    subsample = node.op.subsample
1848    filter_dilation = node.op.filter_dilation
1849    if node.op.filter_flip:
1850        kern = kern[:, :, ::-1, ::-1, ::-1]
1851    # By default use GpuCorr3dMM
1852    rval = GpuCorr3dMM(border_mode,
1853                       subsample,
1854                       filter_dilation,
1855                       node.op.num_groups)(gpu_contiguous(img),
1856                                           gpu_contiguous(kern))
1857    return [rval]
1858
1859
1860@local_optimizer([AbstractConv3d])
1861def local_abstractconv3d_alt(node):
1862    if not isinstance(node.op, AbstractConv3d):
1863        return None
1864    img, kern = node.inputs
1865    if (not isinstance(img.type, GpuArrayType) or
1866            not isinstance(kern.type, GpuArrayType)):
1867        return None
1868    ctx = infer_context_name(img, kern)
1869
1870    border_mode = node.op.border_mode
1871    subsample = node.op.subsample
1872    filter_dilation = node.op.filter_dilation
1873    num_groups = node.op.num_groups
1874
1875    if((border_mode == 'full') and (subsample == (1, 1, 1)) and
1876       (num_groups == 1)):
1877        if not node.op.filter_flip:
1878            kern = kern[:, :, ::-1, ::-1, ::-1]
1879        kern = kern.dimshuffle(1, 0, 2, 3, 4)
1880        rval = GpuCorr3dMM_gradInputs('valid',
1881                                      subsample,
1882                                      filter_dilation)(
1883            gpu_contiguous(kern), gpu_contiguous(img))
1884
1885    elif(subsample == (1, 1, 1) and filter_dilation == (1, 1, 1) and
1886         border_mode == 'valid' and num_groups == 1):
1887        if node.op.filter_flip:
1888            kern = kern[:, :, ::-1, ::-1, ::-1]
1889        rval = GpuCorr3dMM_gradWeights(border_mode,
1890                                       subsample,
1891                                       filter_dilation)(
1892            gpu_contiguous(img.dimshuffle(1, 0, 2, 3, 4)),
1893            gpu_contiguous(kern.dimshuffle(1, 0, 2, 3, 4)))
1894        rval = as_gpuarray_variable(rval.dimshuffle(1, 0, 2, 3, 4),
1895                                    context_name=ctx)
1896    else:
1897        return None
1898    return [rval]
1899
1900
1901@local_optimizer([AbstractConv3d])
1902def local_abstractconv3d2d(node):
1903    if not isinstance(node.op, AbstractConv3d):
1904        return None
1905    img, kern = node.inputs
1906    if (not isinstance(img.type, GpuArrayType) or
1907            not isinstance(kern.type, GpuArrayType)):
1908        return None
1909
1910    ctx = infer_context_name(img, kern)
1911    border_mode = node.op.border_mode
1912    subsample = node.op.subsample
1913    filter_dilation = node.op.filter_dilation
1914    num_groups = node.op.num_groups
1915
1916    if(subsample == (1, 1, 1) and filter_dilation == (1, 1, 1) and
1917       num_groups == 1):
1918        reorder_array = [0, 2, 1, 3, 4]
1919        rval = conv3d2d.conv3d(gpu_contiguous(img.dimshuffle(*reorder_array)),
1920                               gpu_contiguous(kern.dimshuffle(*reorder_array)),
1921                               [node.op.imshp[i] for i in reorder_array],
1922                               [node.op.kshp[i] for i in reorder_array],
1923                               border_mode=border_mode)
1924        rval = as_gpuarray_variable(rval.dimshuffle(*reorder_array),
1925                                    context_name=ctx)
1926
1927        return [rval]
1928    else:
1929        return None
1930
1931
1932@local_optimizer([AbstractConv2d_gradWeights])
1933def local_abstractconv_gradweights_gemm(node):
1934    if not isinstance(node.op, AbstractConv2d_gradWeights):
1935        return None
1936    img, topgrad, shape = node.inputs
1937    if not isinstance(img.type, GpuArrayType) or \
1938            not isinstance(topgrad.type, GpuArrayType):
1939        return None
1940    ctx = infer_context_name(img, topgrad)
1941
1942    rval = GpuCorrMM_gradWeights(border_mode=node.op.border_mode,
1943                                 subsample=node.op.subsample,
1944                                 filter_dilation=node.op.filter_dilation,
1945                                 num_groups=node.op.num_groups,
1946                                 unshared=node.op.unshared)(
1947        gpu_contiguous(img), gpu_contiguous(topgrad), shape)
1948    flip = (slice(None),) * (rval.ndim - 2) + \
1949        (slice(None, None, -1),) * 2
1950    if node.op.filter_flip:
1951        rval = rval[flip]
1952    rval = tensor.patternbroadcast(rval, node.outputs[0].broadcastable)
1953    rval = as_gpuarray_variable(rval, context_name=ctx)
1954    return [rval]
1955
1956
1957@local_optimizer([AbstractConv2d_gradWeights])
1958def local_abstractconv_gemm_gradweights_alt(node):
1959    if not isinstance(node.op, AbstractConv2d_gradWeights):
1960        return None
1961    img, topgrad, shape = node.inputs
1962    if not isinstance(img.type, GpuArrayType) or \
1963            not isinstance(topgrad.type, GpuArrayType):
1964        return None
1965    ctx = infer_context_name(img, topgrad)
1966    border_mode = node.op.border_mode
1967    subsample = node.op.subsample
1968    filter_dilation = node.op.filter_dilation
1969    num_groups = node.op.num_groups
1970    unshared = node.op.unshared
1971
1972    if(border_mode == 'valid' and subsample == (1, 1) and filter_dilation == (1, 1) and
1973       num_groups == 1 and not unshared):
1974        rval = GpuCorrMM(border_mode,
1975                         subsample,
1976                         filter_dilation)(
1977            gpu_contiguous(img.dimshuffle(1, 0, 2, 3)),
1978            gpu_contiguous(topgrad.dimshuffle(1, 0, 2, 3)))
1979
1980        if node.op.filter_flip:
1981            rval = rval[:, :, ::-1, ::-1]
1982
1983        rval = rval.dimshuffle(1, 0, 2, 3)
1984        rval = tensor.patternbroadcast(rval, node.outputs[0].broadcastable)
1985        rval = as_gpuarray_variable(rval, context_name=ctx)
1986        return [rval]
1987    else:
1988        return None
1989
1990
1991@local_optimizer([AbstractConv3d_gradWeights])
1992def local_abstractconv3d_gemm_gradweights_alt(node):
1993    if not isinstance(node.op, AbstractConv3d_gradWeights):
1994        return None
1995    img, topgrad, shape = node.inputs
1996    if not isinstance(img.type, GpuArrayType) or \
1997            not isinstance(topgrad.type, GpuArrayType):
1998        return None
1999    ctx = infer_context_name(img, topgrad)
2000    border_mode = node.op.border_mode
2001    subsample = node.op.subsample
2002    filter_dilation = node.op.filter_dilation
2003    num_groups = node.op.num_groups
2004
2005    if(border_mode == 'valid' and subsample == (1, 1, 1) and
2006       filter_dilation == (1, 1, 1) and num_groups == 1):
2007        rval = GpuCorr3dMM(border_mode,
2008                           subsample,
2009                           filter_dilation)(
2010            gpu_contiguous(img.dimshuffle(1, 0, 2, 3, 4)),
2011            gpu_contiguous(topgrad.dimshuffle(1, 0, 2, 3, 4)))
2012
2013        if node.op.filter_flip:
2014            rval = rval[:, :, ::-1, ::-1, ::-1]
2015
2016        rval = rval.dimshuffle(1, 0, 2, 3, 4)
2017        rval = tensor.patternbroadcast(rval, node.outputs[0].broadcastable)
2018        rval = as_gpuarray_variable(rval, context_name=ctx)
2019        return [rval]
2020    else:
2021        return None
2022
2023
2024@local_optimizer([AbstractConv3d_gradWeights])
2025def local_abstractconv3d_gradweights_gemm(node):
2026    if not isinstance(node.op, AbstractConv3d_gradWeights):
2027        return None
2028    img, topgrad, shape = node.inputs
2029    if not isinstance(img.type, GpuArrayType) or \
2030            not isinstance(topgrad.type, GpuArrayType):
2031        return None
2032    ctx = infer_context_name(img, topgrad)
2033
2034    rval = GpuCorr3dMM_gradWeights(border_mode=node.op.border_mode,
2035                                   subsample=node.op.subsample,
2036                                   filter_dilation=node.op.filter_dilation,
2037                                   num_groups=node.op.num_groups)(
2038        gpu_contiguous(img), gpu_contiguous(topgrad), shape)
2039    if node.op.filter_flip:
2040        rval = rval[:, :, ::-1, ::-1, ::-1]
2041    rval = tensor.patternbroadcast(rval, node.outputs[0].broadcastable)
2042    rval = as_gpuarray_variable(rval, context_name=ctx)
2043    return [rval]
2044
2045
2046@local_optimizer([AbstractConv2d_gradInputs])
2047def local_abstractconv_gradinputs_gemm(node):
2048    if not isinstance(node.op, AbstractConv2d_gradInputs):
2049        return None
2050    kern, topgrad, shape = node.inputs
2051    if not isinstance(kern.type, GpuArrayType) or \
2052            not isinstance(topgrad.type, GpuArrayType):
2053        return None
2054
2055    if node.op.filter_flip:
2056        flip = (slice(None),) * (kern.ndim - 2) + \
2057            (slice(None, None, -1),) * 2
2058        kern = kern[flip]
2059
2060    rval = GpuCorrMM_gradInputs(border_mode=node.op.border_mode,
2061                                subsample=node.op.subsample,
2062                                filter_dilation=node.op.filter_dilation,
2063                                num_groups=node.op.num_groups,
2064                                unshared=node.op.unshared)(
2065        gpu_contiguous(kern), gpu_contiguous(topgrad), shape)
2066    return [rval]
2067
2068
2069@local_optimizer([AbstractConv2d_gradInputs])
2070def local_abstractconv_gradinputs_gemm_alt(node):
2071    if not isinstance(node.op, AbstractConv2d_gradInputs):
2072        return None
2073    kern, topgrad, shape = node.inputs
2074    if not isinstance(kern.type, GpuArrayType) or \
2075            not isinstance(topgrad.type, GpuArrayType):
2076        return None
2077    border_mode = node.op.border_mode
2078    subsample = node.op.subsample
2079    filter_dilation = node.op.filter_dilation
2080    num_groups = node.op.num_groups
2081    unshared = node.op.unshared
2082
2083    if border_mode == 'valid' and subsample == (1, 1) and num_groups == 1 and not unshared:
2084        if not node.op.filter_flip:
2085            kern = kern[:, :, ::-1, ::-1]
2086
2087        rval = GpuCorrMM(border_mode='full',
2088                         subsample=subsample,
2089                         filter_dilation=filter_dilation)(
2090            gpu_contiguous(topgrad),
2091            gpu_contiguous(kern.dimshuffle(1, 0, 2, 3)))
2092        return [rval]
2093    else:
2094        return None
2095
2096
2097@local_optimizer([AbstractConv3d_gradInputs])
2098def local_abstractconv3d_gradinputs_gemm(node):
2099    if not isinstance(node.op, AbstractConv3d_gradInputs):
2100        return None
2101    kern, topgrad, shape = node.inputs
2102    if not isinstance(kern.type, GpuArrayType) or \
2103            not isinstance(topgrad.type, GpuArrayType):
2104        return None
2105
2106    if node.op.filter_flip:
2107        kern = kern[:, :, ::-1, ::-1, ::-1]
2108
2109    rval = GpuCorr3dMM_gradInputs(border_mode=node.op.border_mode,
2110                                  subsample=node.op.subsample,
2111                                  filter_dilation=node.op.filter_dilation,
2112                                  num_groups=node.op.num_groups)(
2113        gpu_contiguous(kern), gpu_contiguous(topgrad), shape)
2114    return [rval]
2115
2116
2117@local_optimizer([AbstractConv3d_gradInputs])
2118def local_abstractconv3d_gradinputs_gemm_alt(node):
2119    if not isinstance(node.op, AbstractConv3d_gradInputs):
2120        return None
2121    kern, topgrad, shape = node.inputs
2122    if not isinstance(kern.type, GpuArrayType) or \
2123            not isinstance(topgrad.type, GpuArrayType):
2124        return None
2125    border_mode = node.op.border_mode
2126    subsample = node.op.subsample
2127    filter_dilation = node.op.filter_dilation
2128    num_groups = node.op.num_groups
2129
2130    if(border_mode == 'valid' and subsample == (1, 1, 1) and
2131       num_groups == 1):
2132        if not node.op.filter_flip:
2133            kern = kern[:, :, ::-1, ::-1, ::-1]
2134        rval = GpuCorr3dMM(border_mode='full',
2135                           subsample=subsample,
2136                           filter_dilation=filter_dilation)(
2137            gpu_contiguous(topgrad),
2138            gpu_contiguous(kern.dimshuffle(1, 0, 2, 3, 4)))
2139        return [rval]
2140    else:
2141        return None
2142
2143
2144class ConvMetaOptimizer(LocalMetaOptimizer):
2145
2146    def __init__(self):
2147        super(ConvMetaOptimizer, self).__init__()
2148
2149    def time_call(self, fn):
2150        start = time.time()
2151        fn()[0].sync()
2152        return time.time() - start
2153
2154    def provide_inputs(self, node, inputs):
2155        result = {}
2156
2157        shapes = (node.op.imshp, node.op.kshp)
2158        if(node.op.imshp is None or node.op.kshp is None or
2159                any([s is None for shape in shapes for s in shape])):
2160            return result
2161
2162        if type(node.op) in [AbstractConv2d, AbstractConv3d]:
2163            img, kern = node.inputs
2164            for(var, shape) in zip((img, kern), shapes):
2165                result[var] = theano.shared(np.random.random(shape).astype(var.dtype),
2166                                            var.name,
2167                                            broadcastable=var.broadcastable,
2168                                            borrow=True)
2169
2170        if type(node.op) in [AbstractConv2d_gradWeights, AbstractConv3d_gradWeights]:
2171            img, top, kshape = node.inputs
2172
2173            tshp = get_conv_output_shape(node.op.imshp,
2174                                         node.op.kshp,
2175                                         node.op.border_mode,
2176                                         node.op.subsample,
2177                                         node.op.filter_dilation)
2178            convdim = img.ndim - 2
2179
2180            result[kshape] = theano.tensor.as_tensor_variable(node.op.kshp[-convdim:])
2181
2182            for(var, shape) in zip((img, top), (node.op.imshp, tshp)):
2183                result[var] = theano.shared(np.random.random(shape).astype(var.dtype),
2184                                            var.name,
2185                                            broadcastable=var.broadcastable,
2186                                            borrow=True)
2187
2188        if type(node.op) in [AbstractConv2d_gradInputs, AbstractConv3d_gradInputs]:
2189            kern, top, ishape = node.inputs
2190
2191            tshp = get_conv_output_shape(node.op.imshp,
2192                                         node.op.kshp,
2193                                         node.op.border_mode,
2194                                         node.op.subsample,
2195                                         node.op.filter_dilation)
2196
2197            result[ishape] = theano.tensor.as_tensor_variable(node.op.imshp[2:])
2198
2199            for(var, shape) in zip((kern, top), (node.op.kshp, tshp)):
2200                result[var] = theano.shared(np.random.random(shape).astype(var.dtype),
2201                                            var.name,
2202                                            broadcastable=var.broadcastable,
2203                                            borrow=True)
2204
2205        return result
2206
2207    def get_opts(self, node):
2208        opts = Counter([opt for opt in self.track_dict[type(node.op)]
2209                       if opt in self.tag_dict['default']])
2210        include_tags = config.metaopt.optimizer_including.split(':')
2211        exclude_tags = config.metaopt.optimizer_excluding.split(':')
2212
2213        for in_opt in include_tags:
2214            opts.update([opt for opt in self.track_dict[type(node.op)]
2215                        if opt in self.tag_dict[in_opt]])
2216
2217        for ex_opt in exclude_tags:
2218            opts.subtract([opt for opt in self.track_dict[type(node.op)]
2219                          if opt in self.tag_dict[ex_opt]])
2220
2221        opts = list(opts + Counter())
2222        return opts
2223
2224
2225# This deals with any abstract convs that have a transfer somewhere
2226@register_opt('fast_compile', 'conv_dnn', 'cudnn')
2227@op_lifter([AbstractConv2d,
2228            AbstractConv2d_gradWeights,
2229            AbstractConv2d_gradInputs,
2230            AbstractConv3d,
2231            AbstractConv3d_gradWeights,
2232            AbstractConv3d_gradInputs])
2233def local_gpua_abstractconv(op, context_name, inputs, outputs):
2234    if isinstance(outputs[0].type, GpuArrayType):
2235        # Don't handle this node here, it's already on the GPU.
2236        return
2237    return local_gpua_lift_abstractconv_graph(op, context_name, inputs, outputs)
2238
2239
2240@register_opt2([AbstractConv2d,
2241                AbstractConv2d_gradWeights,
2242                AbstractConv2d_gradInputs,
2243                AbstractConv3d,
2244                AbstractConv3d_gradWeights,
2245                AbstractConv3d_gradInputs], 'fast_compile')
2246def local_gpua_lift_abstractconv_graph(op, context_name, inputs, outputs):
2247    inps = list(inputs)
2248    inps[0] = as_gpuarray_variable(inputs[0],
2249                                   context_name=context_name)
2250    inps[1] = as_gpuarray_variable(inputs[1],
2251                                   context_name=context_name)
2252    return [op(*inps)]
2253
2254
2255def local_gpu_pool(op, ctx_name, inputs, outputs):
2256    assert op.__props__ == ('ignore_border', 'mode', 'ndim')
2257    inp, ws, stride, pad = inputs
2258    nd = op.ndim
2259    if nd not in (2, 3):
2260        return
2261    inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name))
2262
2263    op = GpuPool(op.ignore_border, op.mode, op.ndim)
2264    if inp.ndim == nd + 2:
2265        return op(inp, ws, stride, pad)
2266    else:
2267        # reshape to 4D or 5D with 2 non-pooling dimensions
2268        inp_padded = pad_dims(inp, 2, nd)
2269        ret_padded = op(inp_padded, ws, stride, pad)
2270        return unpad_dims(ret_padded, inp, 2, nd)
2271
2272
2273pool_db = LocalGroupDB()
2274pool_db2 = LocalGroupDB(local_opt=theano.gof.opt.GraphToGPULocalOptGroup)
2275pool_db2.__name__ = "pool_db2"
2276lifter = op_lifter([pool.Pool])(local_gpu_pool)
2277pool_db.register("local_gpu_pool", lifter,
2278                 'gpuarray', 'fast_compile', 'fast_run',
2279                 position=1)
2280pool_db2.register("local_gpu_pool",
2281                  local_optimizer([pool.Pool])(local_gpu_pool),
2282                  'gpuarray', 'fast_compile', 'fast_run',
2283                  position=1)
2284register_opt('fast_compile', name='pool_db')(pool_db)
2285register_opt2([pool.Pool], 'fast_compile', name='pool_db2')(pool_db2)
2286
2287
2288def local_gpu_max_pool_grad(op, ctx_name, inputs, outputs):
2289    assert op.__props__ == ('ignore_border', 'mode', 'ndim')
2290
2291    inp, out, out_grad, ws, stride, pad = inputs
2292    nd = op.ndim
2293    if nd not in (2, 3):
2294        return
2295    inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name))
2296    out = gpu_contiguous(as_gpuarray_variable(out, ctx_name))
2297    out_grad = gpu_contiguous(as_gpuarray_variable(out_grad, ctx_name))
2298
2299    op = GpuMaxPoolGrad(op.ignore_border, op.mode, op.ndim)
2300    if inp.ndim == nd + 2:
2301        return op(inp, out, out_grad, ws, stride, pad)
2302    else:
2303        # reshape to 4D or 5D with 2 non-pooling dimensions
2304        inp_padded = pad_dims(inp, 2, nd)
2305        out_padded = pad_dims(out, 2, nd)
2306        out_grad_padded = pad_dims(out_grad, 2, nd)
2307        ret_padded = op(inp_padded, out_padded, out_grad_padded,
2308                        ws, stride, pad)
2309        return unpad_dims(ret_padded, inp, 2, nd)
2310
2311
2312lifter = op_lifter([pool.MaxPoolGrad])(local_gpu_max_pool_grad)
2313pool_db.register("local_gpu_max_pool_grad", lifter,
2314                 'gpuarray', 'fast_compile', 'fast_run',
2315                 position=1)
2316pool_db2.register("local_gpu_max_pool_grad",
2317                  local_optimizer([pool.MaxPoolGrad])(local_gpu_max_pool_grad),
2318                  'gpuarray', 'fast_compile', 'fast_run',
2319                  position=1)
2320
2321
2322def local_gpu_average_pool_grad(op, ctx_name, inputs, outputs):
2323    assert op.__props__ == ('ignore_border', 'mode', 'ndim')
2324
2325    inp, out_grad, ws, stride, pad = inputs
2326    nd = op.ndim
2327    if nd not in (2, 3):
2328        return
2329    inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name))
2330    out_grad = gpu_contiguous(as_gpuarray_variable(out_grad, ctx_name))
2331
2332    op = GpuAveragePoolGrad(op.ignore_border, op.mode, op.ndim)
2333    if inp.ndim == nd + 2:
2334        return op(inp, out_grad, ws, stride, pad)
2335    else:
2336        # reshape to 4D or 5D with 2 non-pooling dimensions
2337        inp_padded = pad_dims(inp, 2, nd)
2338        out_grad_padded = pad_dims(out_grad, 2, nd)
2339        ret_padded = op(inp_padded, out_grad_padded,
2340                        ws, stride, pad)
2341        return unpad_dims(ret_padded, inp, 2, nd)
2342
2343
2344lifter = op_lifter([pool.AveragePoolGrad])(local_gpu_average_pool_grad)
2345pool_db.register("local_gpu_average_pool_grad", lifter,
2346                 'gpuarray', 'fast_compile', 'fast_run',
2347                 position=1)
2348pool_db2.register("local_gpu_average_pool_grad",
2349                  local_optimizer([pool.AveragePoolGrad])(local_gpu_average_pool_grad),
2350                  'gpuarray', 'fast_compile', 'fast_run',
2351                  position=1)
2352
2353
2354@register_opt()
2355@op_lifter([pool.DownsampleFactorMaxGradGrad])
2356@register_opt2([pool.DownsampleFactorMaxGradGrad])
2357def local_gpu_downsample_factor_max_grad_grad(op, ctx_name, inputs, outputs):
2358    assert op.__props__ == ('ignore_border', 'mode', 'ndim')
2359    inp, out, out_grad, ws, stride, pad = inputs
2360    nd = op.ndim
2361    if nd not in (2, 3):
2362        return
2363    inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name))
2364    out = gpu_contiguous(as_gpuarray_variable(out, ctx_name))
2365    out_grad = gpu_contiguous(as_gpuarray_variable(out_grad, ctx_name))
2366
2367    op = GpuDownsampleFactorMaxGradGrad(op.ignore_border, op.mode, op.ndim)
2368    if inp.ndim == nd + 2:
2369        return op(inp, out, out_grad, ws, stride, pad)
2370    else:
2371        # reshape to 4D or 5D with 2 non-pooling dimensions
2372        inp_padded = pad_dims(inp, 2, nd)
2373        out_padded = pad_dims(out, 2, nd)
2374        out_grad_padded = pad_dims(out_grad, 2, nd)
2375        ret_padded = op(inp_padded, out_padded, out_grad_padded,
2376                        ws, stride, pad)
2377        return unpad_dims(ret_padded, inp, 2, nd)
2378
2379
2380@register_opt()
2381@op_lifter([pool.MaxPoolRop])
2382@register_opt2([pool.MaxPoolRop])
2383def local_gpu_max_pool_rop(op, ctx_name, inputs, outputs):
2384    assert op.__props__ == ('ignore_border', 'mode', 'ndim')
2385    inp, eval_inp, ws, stride, pad = inputs
2386    nd = op.ndim
2387    if nd not in (2, 3):
2388        return
2389    inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name))
2390    eval_inp = gpu_contiguous(as_gpuarray_variable(eval_inp, ctx_name))
2391
2392    op = GpuMaxPoolRop(op.ignore_border, op.mode, op.ndim)
2393    if inp.ndim == nd + 2:
2394        return op(inp, eval_inp, ws, stride, pad)
2395    else:
2396        # reshape to 4D or 5D with 2 non-pooling dimensions
2397        inp_padded = pad_dims(inp, 2, nd)
2398        eval_inp_padded = pad_dims(eval_inp, 2, nd)
2399        ret_padded = op(inp_padded, eval_inp_padded, ws, stride, pad)
2400        return unpad_dims(ret_padded, inp, 2, nd)
2401
2402
2403@register_opt("low_memory")
2404@local_optimizer([GpuCAReduceCuda])
2405def local_gpu_elemwise_careduce(node):
2406    """
2407    Merge some GpuCAReduceCuda and GPUElemwise.
2408    Currently merged:
2409     - SUM(X^2)
2410     - SUM(ABS(X))
2411
2412    """
2413    if (isinstance(node.op, GpuCAReduceCuda) and
2414            node.op.pre_scalar_op is None and
2415            node.inputs[0].owner and
2416            isinstance(node.inputs[0].owner.op, GpuElemwise) and
2417            # The Op support all scalar with 1 inputs.  We don't
2418            # automatically add more case, as some like trigonometic
2419            # operation with some reduction pattern will probably results
2420            # in slow down.
2421            isinstance(node.inputs[0].owner.op.scalar_op, (scalar.basic.Sqr,
2422                                                           scalar.basic.Abs))):
2423        inp = node.inputs[0].owner.inputs[0]
2424        props = node.op._props_dict()
2425        props["pre_scalar_op"] = node.inputs[0].owner.op.scalar_op
2426        with inherit_stack_trace(node.outputs):
2427            out = GpuCAReduceCuda(**props)(inp)
2428            return [out]
2429
2430
2431@local_optimizer(None)
2432def local_assert_no_cpu_op(node):
2433    if (all([var.owner and isinstance(var.owner.op, HostFromGpu)
2434             for var in node.inputs]) and
2435        any([[c for c in var.clients if isinstance(c[0].op, GpuFromHost)]
2436             for var in node.outputs])):
2437
2438            if config.assert_no_cpu_op == "warn":
2439                _logger.warning(("CPU Op %s is detected in the computation "
2440                                 "graph") % node)
2441            elif config.assert_no_cpu_op == "raise":
2442                raise AssertionError("The Op %s is on CPU." % node)
2443            elif config.assert_no_cpu_op == "pdb":
2444                pdb.set_trace()
2445
2446
2447# Register the local_assert_no_cpu_op:
2448assert_no_cpu_op = theano.tensor.opt.in2out(local_assert_no_cpu_op,
2449                                            name='assert_no_cpu_op')
2450# 49.2 is after device specialization & fusion optimizations for last transfers
2451optdb.register('gpua_assert_no_cpu_op', assert_no_cpu_op, 49.2,
2452               'assert_no_cpu_op')
2453
2454
2455def tensor_to_gpu(x, context_name):
2456    if isinstance(x.type, tensor.TensorType):
2457        y = GpuArrayType(broadcastable=x.type.broadcastable,
2458                         context_name=context_name,
2459                         dtype=x.type.dtype)()
2460        if x.name:
2461            y.name = x.name + '[Gpua]'
2462        return y
2463    else:
2464        return x
2465
2466
2467def gpu_safe_new(x, tag=''):
2468    """
2469    Internal function that constructs a new variable from x with the same
2470    type, but with a different name (old name + tag). This function is used
2471    by gradient, or the R-op to construct new variables for the inputs of
2472    the inner graph such that there is no interference between the original
2473    graph and the newly constructed graph.
2474
2475    """
2476    if hasattr(x, 'name') and x.name is not None:
2477        nw_name = x.name + tag
2478    else:
2479        nw_name = None
2480
2481    if isinstance(x, theano.Constant):
2482        return x.clone()
2483
2484    nw_x = x.type()
2485    nw_x.name = nw_name
2486    return nw_x
2487
2488
2489def gpu_reconstruct_graph(inputs, outputs, tag=None):
2490    """
2491    Different interface to clone, that allows you to pass inputs.
2492    Compared to clone, this method always replaces the inputs with
2493    new variables of the same type, and returns those (in the same
2494    order as the original inputs).
2495
2496    """
2497    if tag is None:
2498        tag = ''
2499    nw_inputs = [gpu_safe_new(x, tag) for x in inputs]
2500    givens = {}
2501    for nw_x, x in zip(nw_inputs, inputs):
2502        givens[x] = nw_x
2503    nw_outputs = scan_utils.clone(outputs, replace=givens)
2504    return (nw_inputs, nw_outputs)
2505
2506
2507@register_opt('scan', 'fast_compile')
2508@op_lifter([scan_op.Scan])
2509@register_opt2([scan_op.Scan], 'fast_compile')
2510def local_gpua_scan_to_gpua(op, context_name, inputs, outputs):
2511    info = copy.deepcopy(op.info)
2512    if info.get('gpua', False):
2513        return
2514    info['gpua'] = True
2515    nw_ins = [inputs[0]]
2516    e = (1 +
2517         op.n_seqs +
2518         op.n_mit_mot +
2519         op.n_mit_sot +
2520         op.n_sit_sot +
2521         op.n_shared_outs)
2522    nw_ins += [safe_to_gpu(x, context_name) for x in inputs[1:e]]
2523    b = e
2524    e = e + op.n_nit_sot
2525    nw_ins += inputs[b:e]
2526    nw_ins += [safe_to_gpu(x, context_name) for x in inputs[e:]]
2527    scan_ins = [tensor_to_gpu(x, context_name) for x in op.inputs]
2528
2529    # The inner output corresponding to the looping condition should not be
2530    # moved to the gpu
2531    if op.info['as_while']:
2532        scan_outs = [safe_to_gpu(x, context_name) for x in op.outputs[:-1]]
2533        scan_outs += [op.outputs[-1]]
2534    else:
2535        scan_outs = [safe_to_gpu(x, context_name) for x in op.outputs]
2536    scan_outs = scan_utils.clone(
2537        scan_outs,
2538        replace=list(zip(op.inputs,
2539                         (safe_to_cpu(x) for x in scan_ins))))
2540
2541    # We need to construct the hash here, because scan
2542    # __init__ does not know about the gpu and can not
2543    # handle graphs with inputs being on the gpu
2544    tmp_in, tmp_out = gpu_reconstruct_graph(scan_ins, scan_outs)
2545    local_fgraph = gof.FunctionGraph(tmp_in, tmp_out, clone=True)
2546    _cmodule_key = gof.CLinker().cmodule_key_(local_fgraph, [])
2547    info['gpu_hash'] = hash(_cmodule_key)
2548
2549    def typebuild(dtype, broadcastable, context_name=context_name):
2550        return GpuArrayType(dtype=dtype, broadcastable=broadcastable,
2551                            context_name=context_name)
2552
2553    nw_op = scan_op.Scan(scan_ins, scan_outs, info,
2554                         typeConstructor=typebuild).make_node(*nw_ins)
2555    return nw_op.outputs
2556
2557
2558def _scan_type_infer(node):
2559    context_name = infer_context_name(*node.inputs)
2560
2561    def typebuild(dtype, broadcastable, context_name=context_name):
2562        return GpuArrayType(dtype=dtype, broadcastable=broadcastable,
2563                            context_name=context_name)
2564    return typebuild
2565
2566
2567# Add optimization : maxandargmax (CPU -> GPU)
2568@register_opt('fast_compile')
2569@op_lifter([tensor.MaxAndArgmax])
2570@register_opt2([tensor.MaxAndArgmax], 'fast_compile')
2571def local_gpu_maxandargmax(op, context_name, inputs, outputs):
2572    op = GpuMaxAndArgmax(op.get_params(None))
2573    if inputs[0].dtype == "float16":
2574        # For now it is better to copy/cast on the GPU then transfer to the CPU
2575        casted_inputs = inputs[0].astype('float32')
2576        ret = op(casted_inputs)
2577        return [ret[0].astype('float16'), ret[1]]
2578    return op
2579
2580
2581@register_opt('fast_compile')
2582@op_lifter([Images2Neibs])
2583@register_opt2([Images2Neibs], 'fast_compile')
2584def local_gpua_images2neibs(op, context_name, inputs, outputs):
2585    if op.mode in ['valid', 'half', 'full', 'ignore_borders', 'wrap_centered']:
2586        return GpuImages2Neibs(op.mode)
2587
2588
2589# solve
2590@register_opt('fast_compile')
2591@op_lifter([slinalg.Solve])
2592@register_opt2([theano.tensor.slinalg.Solve], 'fast_compile')
2593def local_gpu_solve(op, context_name, inputs, outputs):
2594    if inputs[0].dtype not in ['float16', 'float32', 'float64']:
2595        return
2596    if op.A_structure not in MATRIX_STRUCTURES_SOLVE:
2597        return
2598
2599    if op.A_structure in ['lower_triangular', 'upper_triangular']:
2600        if not cublas_available:
2601            return
2602        lower = op.A_structure == 'lower_triangular'
2603        op = GpuCublasTriangularSolve(lower)
2604    else:
2605        if not cusolver_available:
2606            return
2607        op = GpuCusolverSolve(A_structure=op.A_structure)
2608
2609    if inputs[0].dtype == 'float16':
2610        return op(inputs[0].astype('float32'),
2611                  inputs[1].astype('float32')).astype('float16')
2612    return op
2613
2614
2615@register_inplace()
2616@local_optimizer([GpuCusolverSolve], inplace=True)
2617def local_inplace_gpu_solve(node):
2618    if isinstance(node.op, GpuCusolverSolve) and not node.op.inplace:
2619        with inherit_stack_trace(node.outputs):
2620            return [GpuCusolverSolve(A_structure=node.op.A_structure,
2621                                     trans=node.op.trans,
2622                                     inplace=True)(*node.inputs)]
2623
2624
2625# Cholesky decomposition
2626def local_gpu_cholesky(op, context_name, inputs, outputs):
2627    if not cusolver_available:
2628        return
2629    if inputs[0].dtype not in ['float16', 'float32', 'float64']:
2630        return
2631    op = GpuCholesky(lower=op.lower, inplace=op.destructive)
2632    if inputs[0].dtype == 'float16':
2633        return op(inputs[0].astype('float32')).astype('float16')
2634
2635    return op
2636matrix_ops_db = LocalGroupDB()
2637matrix_ops_db2 = LocalGroupDB(local_opt=theano.gof.opt.GraphToGPULocalOptGroup)
2638matrix_ops_db2.__name__ = "matrix_ops_db2"
2639
2640# For Cholesky decomposition, magma 2.2 is slower than cusolver 8 (tested for
2641# matrices of size 1000). Thus, cusolver is prioritized during graph
2642# optimizations. To explicitly use magma, you should disable cusolver using
2643# `optimizer_excluding=cusolver` in Theano config.
2644lifter = op_lifter([slinalg.Cholesky])(local_gpu_cholesky)
2645matrix_ops_db.register("local_gpu_cholesky", lifter,
2646                       'gpuarray', 'fast_compile', 'fast_run', 'cusolver',
2647                       position=0)
2648matrix_ops_db2.register("local_gpu_cholesky",
2649                        local_optimizer([slinalg.Cholesky])(local_gpu_cholesky),
2650                        'gpuarray', 'fast_compile', 'fast_run', 'cusolver',
2651                        position=0)
2652register_opt('fast_compile', name='matrix_ops_db')(matrix_ops_db)
2653register_opt2([slinalg.Solve], 'fast_compile', name='matrix_ops_db2')(matrix_ops_db2)
2654
2655
2656@register_inplace()
2657@local_optimizer([GpuCholesky], inplace=True)
2658def local_inplace_gpu_cholesky(node):
2659    if isinstance(node.op, GpuCholesky) and not node.op.inplace:
2660        with inherit_stack_trace(node.outputs):
2661            return [node.op.clone_inplace()(*node.inputs)]
2662
2663
2664def local_gpu_magma_cholesky(op, context_name, inputs, outputs):
2665    if not config.magma.enabled:
2666        return
2667    if inputs[0].dtype not in ['float16', 'float32']:
2668        return
2669    op = GpuMagmaCholesky(lower=op.lower, inplace=op.destructive)
2670    if inputs[0].dtype == 'float16':
2671        return op(inputs[0].astype('float32')).astype('float16')
2672    return op
2673lifter = op_lifter([slinalg.Cholesky])(local_gpu_magma_cholesky)
2674matrix_ops_db.register("local_gpu_magma_cholesky", lifter,
2675                       'gpuarray', 'fast_compile', 'fast_run', 'magma',
2676                       position=1)
2677matrix_ops_db2.register("local_gpu_magma_cholesky",
2678                        local_optimizer([slinalg.Cholesky])(local_gpu_magma_cholesky),
2679                        'gpuarray', 'fast_compile', 'fast_run', 'magma',
2680                        position=1)
2681
2682
2683@register_inplace()
2684@local_optimizer([GpuMagmaCholesky], inplace=True)
2685def local_inplace_gpu_magma_cholesky(node):
2686    if isinstance(node.op, GpuMagmaCholesky) and not node.op.inplace:
2687        return [node.op.clone_inplace()(*node.inputs)]
2688
2689
2690# QR decomposition
2691@register_opt('magma', 'fast_compile')
2692@op_lifter([nlinalg.QRFull])
2693@register_opt2([theano.tensor.nlinalg.QRFull], 'magma', 'fast_compile')
2694def local_gpu_magma_qr(op, context_name, inputs, outputs):
2695    if not config.magma.enabled or op.mode != 'reduced':
2696        return
2697    if inputs[0].dtype not in ['float16', 'float32']:
2698        return
2699    x = inputs[0]
2700    if inputs[0].dtype == 'float16':
2701        x = inputs[0].astype('float32')
2702    out = gpu_qr(x, complete=True)
2703    if inputs[0].dtype == 'float16':
2704        return [o.astype('float16') for o in out]
2705    return out
2706
2707
2708@register_opt('magma', 'fast_compile')
2709@op_lifter([nlinalg.QRIncomplete])
2710@register_opt2([theano.tensor.nlinalg.QRIncomplete], 'magma', 'fast_compile')
2711def local_gpu_magma_qr_incomplete(op, context_name, inputs, outputs):
2712    if not config.magma.enabled:
2713        return
2714    if inputs[0].dtype not in ['float16', 'float32']:
2715        return
2716    x = inputs[0]
2717    if inputs[0].dtype == 'float16':
2718        x = inputs[0].astype('float32')
2719    out = gpu_qr(x, complete=False)
2720    if inputs[0].dtype == 'float16':
2721        return [out.astype('float16')]
2722    return out
2723
2724
2725# Matrix inverse
2726@register_opt('magma', 'fast_compile')
2727@op_lifter([nlinalg.MatrixInverse])
2728@register_opt2([theano.tensor.nlinalg.MatrixInverse], 'magma', 'fast_compile')
2729def local_gpu_magma_matrix_inverse(op, context_name, inputs, outputs):
2730    if not config.magma.enabled:
2731        return
2732    if inputs[0].dtype not in ['float16', 'float32']:
2733        return
2734    op = GpuMagmaMatrixInverse()
2735    if inputs[0].dtype == 'float16':
2736        return op(inputs[0].astype('float32')).astype('float16')
2737    return op
2738
2739
2740@register_inplace()
2741@local_optimizer([GpuMagmaMatrixInverse])
2742def local_inplace_gpu_magma_matrix_inverse(node):
2743    if isinstance(node.op, GpuMagmaMatrixInverse) and not node.op.inplace:
2744        with inherit_stack_trace(node.outputs):
2745            return [node.op.clone_inplace()(*node.inputs)]
2746
2747
2748# Eigen decomposition of a symmetric matrix
2749@register_opt('magma', 'fast_compile')
2750@op_lifter([nlinalg.Eigh])
2751@register_opt2([theano.tensor.nlinalg.Eigh], 'magma', 'fast_compile')
2752def local_gpu_magma_eigh(op, context_name, inputs, outputs):
2753    if not config.magma.enabled:
2754        return
2755    if inputs[0].dtype not in ['float16', 'float32']:
2756        return
2757    op = GpuMagmaEigh(UPLO=op.UPLO, compute_v=True)
2758    if inputs[0].dtype == 'float16':
2759        return op(inputs[0].astype('float32')).astype('float16')
2760    return op
2761
2762
2763# Singular Value Decomposition
2764@register_opt('magma', 'fast_compile')
2765@op_lifter([nlinalg.SVD])
2766@register_opt2([theano.tensor.nlinalg.SVD], 'magma', 'fast_compile')
2767def local_gpu_magma_svd(op, context_name, inputs, outputs):
2768    if not config.magma.enabled:
2769        return
2770    if inputs[0].dtype not in ['float16', 'float32']:
2771        return
2772    x = inputs[0]
2773    if inputs[0].dtype == 'float16':
2774        x = inputs[0].astype('float32')
2775    out = gpu_svd(x, compute_uv=op.compute_uv, full_matrices=op.full_matrices)
2776    if inputs[0].dtype == 'float16':
2777        if op.compute_uv:
2778            out = [o.astype('float16') for o in out]
2779        else:
2780            out = [out.astype('float16')]
2781    return out
2782
2783
2784@register_opt('ctc', 'fast_compile')
2785@op_lifter([theano.tensor.nnet.ctc.ConnectionistTemporalClassification])
2786@register_opt2([ConnectionistTemporalClassification], 'ctc', 'fast_compile')
2787def local_gpu_ctc(op, context_name, inputs, outputs):
2788    op = GpuConnectionistTemporalClassification(compute_grad=op.compute_grad)
2789    return op.make_node(*inputs).outputs
2790
2791
2792# Do not register in fast_run or fast_compile.
2793# It will be added to fast_run if the GPU is enabled.
2794optdb.register('gpua_scanOp_make_inplace',
2795               scan_opt.ScanInplaceOptimizer(typeInfer=_scan_type_infer,
2796                                             gpua_flag=True),
2797               75,
2798               'gpuarray',
2799               'inplace',
2800               'scan')
2801
2802
2803# Register GPU convolution implementation
2804# They are tried in a specific order so we can control
2805# which ones take precedence over others.
2806abstractconv_groupopt = theano.gof.optdb.LocalGroupDB()
2807abstractconv_groupopt.__name__ = "gpuarray_abstractconv_opts"
2808register_opt('fast_compile')(abstractconv_groupopt)
2809
2810# We import these opts here instead of at the top of this file
2811# to avoid a circular dependency problem with dnn
2812from .dnn import (local_abstractconv_cudnn,
2813                  local_abstractconv_gw_cudnn,
2814                  local_abstractconv_gi_cudnn,     # noqa: 402
2815                  local_abstractconv_cudnn_alt,
2816                  local_abstractconv3d_cudnn_alt)
2817
2818abstractconv_groupopt.register('local_abstractconv_dnn',
2819                               local_abstractconv_cudnn, 20,
2820                               'conv_dnn',
2821                               'gpuarray', 'fast_compile', 'fast_run', 'cudnn')
2822abstractconv_groupopt.register('local_abstractconv_gw_dnn',
2823                               local_abstractconv_gw_cudnn, 20,
2824                               'conv_dnn',
2825                               'gpuarray', 'fast_compile', 'fast_run', 'cudnn')
2826abstractconv_groupopt.register('local_abstractconv_gi_dnn',
2827                               local_abstractconv_gi_cudnn, 20,
2828                               'conv_dnn',
2829                               'gpuarray', 'fast_compile', 'fast_run', 'cudnn')
2830# The GEMM-based convolution comes last to catch all remaining cases.
2831# It can be disabled by excluding 'conv_gemm'.
2832abstractconv_groupopt.register('local_abstractconv_gemm', local_abstractconv_gemm, 30,
2833                               'conv_gemm',
2834                               'gpuarray', 'fast_compile', 'fast_run')
2835abstractconv_groupopt.register('local_abstractconv3d_gemm', local_abstractconv3d_gemm, 30,
2836                               'conv_gemm',
2837                               'gpuarray', 'fast_compile', 'fast_run')
2838abstractconv_groupopt.register('local_abstractconv_gradweights_gemm',
2839                               local_abstractconv_gradweights_gemm, 30,
2840                               'conv_gemm',
2841                               'gpuarray', 'fast_compile', 'fast_run')
2842abstractconv_groupopt.register('local_abstractconv3d_gradweights_gemm',
2843                               local_abstractconv3d_gradweights_gemm, 30,
2844                               'conv_gemm',
2845                               'gpuarray', 'fast_compile', 'fast_run')
2846abstractconv_groupopt.register('local_abstractconv_gradinputs',
2847                               local_abstractconv_gradinputs_gemm, 30,
2848                               'conv_gemm',
2849                               'gpuarray', 'fast_compile', 'fast_run')
2850abstractconv_groupopt.register('local_abstractconv3d_gradinputs',
2851                               local_abstractconv3d_gradinputs_gemm, 30,
2852                               'conv_gemm',
2853                               'gpuarray', 'fast_compile', 'fast_run')
2854
2855conv_metaopt = ConvMetaOptimizer()
2856
2857conv_metaopt.register(local_abstractconv_cudnn,
2858                      ['default', 'cudnn', 'conv_dnn'])
2859conv_metaopt.register(local_abstractconv_gw_cudnn,
2860                      ['default', 'cudnn', 'conv_dnn'])
2861conv_metaopt.register(local_abstractconv_gi_cudnn,
2862                      ['default', 'cudnn', 'conv_dnn'])
2863conv_metaopt.register(local_abstractconv_gemm_def,
2864                      ['default', 'conv_gemm'])
2865conv_metaopt.register(local_abstractconv3d_gemm_def,
2866                      ['default', 'conv_gemm'])
2867conv_metaopt.register(local_abstractconv_gradweights_gemm,
2868                      ['default', 'conv_gemm'])
2869conv_metaopt.register(local_abstractconv3d_gradweights_gemm,
2870                      ['default', 'conv_gemm'])
2871conv_metaopt.register(local_abstractconv_gradinputs_gemm,
2872                      ['default', 'conv_gemm'])
2873conv_metaopt.register(local_abstractconv3d_gradinputs_gemm,
2874                      ['default', 'conv_gemm'])
2875conv_metaopt.register(local_abstractconv_gemm_alt,
2876                      ['default', 'alternative', 'conv_gemm'])
2877conv_metaopt.register(local_abstractconv_gemm_gradweights_alt,
2878                      ['default', 'alternative', 'conv_gemm'])
2879conv_metaopt.register(local_abstractconv_gradinputs_gemm_alt,
2880                      ['default', 'alternative', 'conv_gemm'])
2881conv_metaopt.register(local_abstractconv_cudnn_alt,
2882                      ['default', 'alternative', 'cudnn', 'conv_dnn'])
2883conv_metaopt.register(local_abstractconv3d_cudnn_alt,
2884                      ['default', 'alternative', 'cudnn', 'conv_dnn'])
2885conv_metaopt.register(local_abstractconv3d_alt,
2886                      ['default', 'alternative', 'conv_gemm'])
2887conv_metaopt.register(local_abstractconv3d_gemm_gradweights_alt,
2888                      ['default', 'alternative', 'conv_gemm'])
2889conv_metaopt.register(local_abstractconv3d_gradinputs_gemm_alt,
2890                      ['default', 'alternative', 'conv_gemm'])
2891conv_metaopt.register(local_abstractconv3d2d,
2892                      ['alternative', 'conv3d2d'])
2893
2894abstractconv_groupopt.register('conv_metaopt', conv_metaopt, 'conv_meta', position=0)
2895
2896# Register cuDNN batch normalization implementation
2897
2898# We import these opts here instead of at the top of this file
2899# to avoid a circular dependency problem with dnn
2900from .dnn import (local_abstract_batch_norm_train_cudnn,
2901                  local_abstract_batch_norm_train_grad_cudnn,
2902                  local_abstract_batch_norm_inference_cudnn)     # noqa: 402
2903
2904abstract_batch_norm_groupopt = theano.gof.optdb.LocalGroupDB()
2905abstract_batch_norm_groupopt.__name__ = "gpuarray_batchnorm_opts"
2906register_opt('fast_compile')(abstract_batch_norm_groupopt)
2907
2908abstract_batch_norm_db = LocalGroupDB()
2909abstract_batch_norm_db2 = LocalGroupDB(
2910    local_opt=theano.gof.opt.GraphToGPULocalOptGroup)
2911abstract_batch_norm_db2.__name__ = "abstract_batch_norm_db2"
2912register_opt('fast_compile', name='abstract_batch_norm_db')(
2913    abstract_batch_norm_db)
2914register_opt2([bn.AbstractBatchNormTrain,
2915               bn.AbstractBatchNormTrainGrad,
2916               bn.AbstractBatchNormInference],
2917              'fast_compile', name='abstract_batch_norm_db2')(
2918    abstract_batch_norm_db2)
2919
2920for op, fct, cpu in [(bn.AbstractBatchNormTrain,
2921                      local_abstract_batch_norm_train_cudnn,
2922                      bn.local_abstract_batch_norm_train),
2923                     (bn.AbstractBatchNormTrainGrad,
2924                      local_abstract_batch_norm_train_grad_cudnn,
2925                      bn.local_abstract_batch_norm_train_grad),
2926                     (bn.AbstractBatchNormInference,
2927                      local_abstract_batch_norm_inference_cudnn,
2928                      bn.local_abstract_batch_norm_inference)]:
2929    lifter = op_lifter([op])(fct)
2930    abstract_batch_norm_db.register(fct.__name__,
2931                                    lifter,
2932                                    'gpuarray', 'fast_compile', 'fast_run',
2933                                    'cudnn', 'batchnorm_dnn',
2934                                    position=1)
2935    abstract_batch_norm_db2.register(fct.__name__,
2936                                     local_optimizer([op])(fct),
2937                                     'gpuarray', 'fast_compile', 'fast_run',
2938                                     'cudnn', 'batchnorm_dnn',
2939                                     position=1)
2940    # cpu is a normal optimization. We can't register it in
2941    # GraphToGPU.  So for now, only add it to the slower EQ phase.  If
2942    # there is no cuDNN, we still want to move it to the GPU now with
2943    # a Theano graph so to have this graph on the GPU.
2944    abstract_batch_norm_db.register(cpu.__name__, cpu,
2945                                    'gpuarray', 'fast_compile', 'fast_run',
2946                                    position='last')
2947