1from __future__ import absolute_import, print_function, division 2import copy 3import numpy as np 4import logging 5import pdb 6import time 7from six import iteritems 8from six.moves import xrange 9import sys 10 11import theano 12from theano import tensor, scalar, gof, config 13from theano.compile import optdb 14from theano.compile.ops import shape_i 15from theano.gof import (local_optimizer, EquilibriumDB, TopoOptimizer, 16 LocalGroupDB, 17 SequenceDB, Optimizer, DB, toolbox, graph) 18from theano.gof.opt import (LocalMetaOptimizer, copy_stack_trace, 19 inherit_stack_trace) 20from theano.ifelse import IfElse 21from theano.misc.ordered_set import OrderedSet 22 23from theano.scalar.basic import Scalar, Pow, Cast 24from theano.scalar.basic import log, neg, true_div 25from theano.scalar.basic_scipy import Erfinv, Erfcinv 26from theano.scan_module import scan_utils, scan_op, scan_opt 27 28from theano.tensor.nnet import bn, conv3d2d 29from theano.tensor.nnet.conv import ConvOp 30from theano.tensor.nnet.blocksparse import SparseBlockGemv, SparseBlockOuter 31from theano.tensor.nnet.abstract_conv import (BaseAbstractConv, 32 AbstractConv2d, 33 AbstractConv2d_gradWeights, 34 AbstractConv2d_gradInputs, 35 AbstractConv3d, 36 AbstractConv3d_gradWeights, 37 AbstractConv3d_gradInputs, 38 get_conv_output_shape) 39from theano.tensor.nnet.neighbours import Images2Neibs 40from theano.tensor.nnet.ctc import ConnectionistTemporalClassification 41import theano.tensor.nlinalg as nlinalg 42import theano.tensor.signal.pool as pool 43import theano.tensor.slinalg as slinalg 44from collections import Counter 45 46from theano.tests.breakpoint import PdbBreakpoint 47 48from .type import (GpuArrayType, GpuArrayConstant, get_context, 49 ContextNotDefined, move_to_gpu) 50from .basic_ops import (as_gpuarray_variable, infer_context_name, 51 host_from_gpu, GpuToGpu, 52 HostFromGpu, GpuFromHost, 53 GpuSplit, GpuContiguous, gpu_contiguous, 54 GpuAlloc, GpuAllocEmpty, GpuReshape, 55 GpuEye, GpuTri, gpu_join, GpuJoin) 56from .blas import (gpu_dot22, GpuGemm, GpuGer, GpuGemmBatch, 57 gpugemm_no_inplace, gpugemm_inplace, 58 gpugemmbatch_no_inplace, 59 gpugemv_no_inplace, gpugemv_inplace, 60 GpuCorrMM, GpuCorrMM_gradInputs, GpuCorrMM_gradWeights, 61 GpuCorr3dMM, GpuCorr3dMM_gradInputs, GpuCorr3dMM_gradWeights) 62from .pool import (GpuPool, GpuMaxPoolGrad, GpuAveragePoolGrad, GpuMaxPoolRop, 63 GpuDownsampleFactorMaxGradGrad) 64from .blocksparse import (GpuSparseBlockGemv, GpuSparseBlockOuter, 65 gpu_sparse_block_outer, 66 gpu_sparse_block_outer_inplace, 67 gpu_sparse_block_gemv, gpu_sparse_block_gemv_inplace) 68from .nnet import (gpu_crossentropy_softmax_1hot_with_bias_dx, 69 gpu_crossentropy_softmax_argmax_1hot_with_bias, 70 gpu_softmax_with_bias, gpu_softmax) 71from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda, 72 GpuCAReduceCPY, gpu_erfinv, gpu_erfcinv, 73 max_inputs_to_GpuElemwise) 74from .subtensor import (GpuIncSubtensor, GpuSubtensor, 75 GpuAdvancedSubtensor, 76 GpuAdvancedSubtensor1, 77 GpuAdvancedBooleanSubtensor, 78 GpuAdvancedIncSubtensor, 79 GpuAdvancedIncSubtensor1, 80 GpuAdvancedIncSubtensor1_dev20, 81 GpuAdvancedBooleanIncSubtensor, 82 GpuAllocDiag, GpuExtractDiag) 83from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims 84from .reduction import GpuMaxAndArgmax 85from .linalg import (GpuCusolverSolve, MATRIX_STRUCTURES_SOLVE, GpuCholesky, 86 cusolver_available, GpuMagmaMatrixInverse, gpu_svd, 87 GpuMagmaCholesky, gpu_qr, GpuMagmaEigh, 88 GpuCublasTriangularSolve, cublas_available) 
89from .neighbours import GpuImages2Neibs 90from .ctc import GpuConnectionistTemporalClassification 91 92_logger = logging.getLogger("theano.gpuarray.opt") 93 94 95gpu_optimizer = EquilibriumDB() 96gpu_cut_copies = EquilibriumDB() 97 98# Not used for an EquilibriumOptimizer. It has the "tracks" that we need for GraphToGPUDB. 99gpu_optimizer2 = EquilibriumDB() 100 101 102class GraphToGPUDB(DB): 103 """ 104 Retrieves the list local optimizers based on the optimizer flag's value 105 from EquilibriumOptimizer by calling the method query. 106 107 """ 108 109 def query(self, *tags, **kwtags): 110 opt = gpu_optimizer2.query(*tags, **kwtags) 111 return GraphToGPU(opt.local_optimizers_all, opt.local_optimizers_map) 112 113 114gpu_seqopt = SequenceDB() 115 116gpu_seqopt.register('gpuarray_graph_optimization', GraphToGPUDB(), -0.5, 117 'fast_compile', 'fast_run', 'gpuarray') 118 119gpu_seqopt.register('gpuarray_local_optimizations', gpu_optimizer, 1, 120 'fast_compile', 'fast_run', 'gpuarray', 'gpuarray_local_optimiziations') 121gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2, 122 'fast_compile', 'fast_run', 'gpuarray') 123 124# do not add 'fast_run' to these two as this would always enable gpuarray mode 125optdb.register('gpuarray_opt', gpu_seqopt, 126 optdb.__position__.get('add_destroy_handler', 49.5) - 1, 127 'gpuarray') 128 129 130def register_opt(*tags, **kwargs): 131 def f(local_opt): 132 name = (kwargs and kwargs.pop('name')) or local_opt.__name__ 133 gpu_optimizer.register(name, local_opt, 'fast_run', 'gpuarray', *tags) 134 return local_opt 135 return f 136 137 138def register_opt2(tracks, *tags, **kwargs): 139 ''' 140 Decorator for the new GraphToGPU optimizer. 141 Takes an extra parameter(Op) compared to register_opt decorator. 142 143 Parameters 144 ---------- 145 tracks : List of Op class Or Op instance or None 146 The Node's Op to which optimization is being applied. 147 148 tags : String 149 The optimization tag to which the optimizer will be registered. 150 151 ''' 152 def f(local_opt): 153 name = (kwargs and kwargs.pop('name')) or local_opt.__name__ 154 if isinstance(local_opt, theano.gof.DB): 155 opt = local_opt 156 else: 157 opt = theano.gof.local_optimizer(tracks)(local_opt) 158 gpu_optimizer2.register(name, opt, 'fast_run', 'gpuarray', *tags) 159 return local_opt 160 return f 161 162 163def register_inplace(*tags, **kwargs): 164 def f(local_opt): 165 name = (kwargs and kwargs.pop('name')) or local_opt.__name__ 166 optdb.register( 167 name, TopoOptimizer( 168 local_opt, failure_callback=TopoOptimizer.warn_inplace), 169 60, 'fast_run', 'inplace', 'gpuarray', *tags) 170 return local_opt 171 return f 172 173 174register_opt('fast_compile')(theano.tensor.opt.local_track_shape_i) 175register_opt(final_opt=True, name='gpua_constant_folding')( 176 tensor.opt.constant_folding) 177gpu_optimizer.register('local_remove_all_assert', 178 theano.tensor.opt.local_remove_all_assert, 179 'unsafe') 180 181 182# Define a few operations to use in optimizations, 183# in order to avoid introducin new CPU Ops, or useless ones. 184def safe_to_gpu(x, ctx_name): 185 if isinstance(x.type, tensor.TensorType): 186 return GpuFromHost(ctx_name)(x) 187 else: 188 return x 189 190 191def safe_to_cpu(x): 192 if isinstance(x.type, GpuArrayType): 193 return x.transfer('cpu') 194 else: 195 return x 196 197gpu_log = GpuElemwise(log) 198gpu_neg = GpuElemwise(neg) 199gpu_true_div = GpuElemwise(true_div) 200 201 202def op_lifter(OP, cuda_only=False): 203 """ 204 OP(..., host_from_gpu(), ...) 
-> host_from_gpu(GpuOP(...)) 205 206 gpu_from_host(OP(inp0, ...)) -> GpuOP(inp0, ...) 207 208 """ 209 def f(maker): 210 def local_opt(node): 211 if type(node.op) in OP: 212 # Either one of our inputs is on the gpu or 213 # all of our clients are on the gpu 214 replace = False 215 # TODO: Maybe set context_name with infer_context_name()? 216 context_name = None 217 # We replace if any input is a host_from_gpu 218 for i in node.inputs: 219 if (i.owner and i.owner.op == host_from_gpu and 220 move_to_gpu(i)): 221 context_name = i.owner.inputs[0].type.context_name 222 replace = True 223 break 224 225 if not replace: 226 # We replace if *all* clients are on the GPU 227 clients = [c for o in node.outputs for c in o.clients] 228 replace = len(clients) != 0 229 for c, idx in clients: 230 if (c == 'output' or 231 not isinstance(c.op, GpuFromHost)): 232 replace = False 233 # TODO: check that the clients want the same context? 234 if replace: 235 # All clients are GpuFromHost and we have at least one 236 context_name = clients[0][0].op.context_name 237 238 # Check if we should replace 239 if (not replace or 240 (cuda_only and 241 get_context(context_name).kind != b'cuda') or 242 any(["complex" in getattr(i, 'dtype', "") 243 for i in node.inputs])): 244 return False 245 246 # tag the inputs with the context in case 247 # the context was derived from the outputs 248 for i in node.inputs: 249 i.tag.context_name = context_name 250 251 new_op = maker(node.op, context_name, node.inputs, node.outputs) 252 253 # This is needed as sometimes new_op inherits from OP. 254 if new_op and new_op != node.op: 255 if isinstance(new_op, theano.Op): 256 new_outputs = new_op(*node.inputs, return_list=True) 257 to_cpu_fn = safe_to_cpu 258 elif isinstance(new_op, (tuple, list)): 259 new_outputs = new_op 260 to_cpu_fn = safe_to_cpu 261 else: # suppose it is a variable on the GPU 262 new_outputs = [new_op] 263 264 def to_cpu_fn(x): 265 return x.transfer('cpu') 266 # copy stack traces onto gpu outputs 267 # also copy the stack traces onto HostFromGpu outputs 268 on_cpu = [] 269 for old_output, new_output in zip(node.outputs, new_outputs): 270 copy_stack_trace(old_output, new_output) 271 cpu = to_cpu_fn(new_output) 272 on_cpu.append(cpu) 273 copy_stack_trace(old_output, cpu) 274 return on_cpu 275 return False 276 local_opt.__name__ = maker.__name__ 277 return local_optimizer(OP)(local_opt) 278 return f 279 280 281class InputToGpuOptimizer(Optimizer): 282 """ 283 Transfer the input to the gpu to start the rolling wave. 284 285 """ 286 def add_requirements(self, fgraph): 287 fgraph.attach_feature(toolbox.ReplaceValidate()) 288 289 def apply(self, fgraph): 290 for input in fgraph.inputs: 291 if isinstance(input.type, GpuArrayType): 292 continue 293 294 # If all clients are outputs or transfers don't do anything. 
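            # (Illustrative note: such an input is either returned as-is or
            # already transferred by a GpuFromHost client, so adding another
            # transfer here would not help.)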
295 if (all(cl[0] == 'output' or isinstance(cl[0].op, GpuFromHost) 296 for cl in input.clients)): 297 continue 298 299 target = getattr(input.tag, 'target', None) 300 if target == 'cpu': 301 continue 302 if (isinstance(input.type, tensor.TensorType) and 303 not move_to_gpu(input)): 304 continue 305 306 try: 307 new_input = GpuFromHost(target)(input).transfer('cpu') 308 fgraph.replace_validate(input, new_input, 309 "InputToGpuOptimizer") 310 except TypeError: 311 # This could fail if the inputs are not TensorTypes 312 pass 313 except ContextNotDefined: 314 if hasattr(input.tag, 'target'): 315 raise 316 # If there is no context tag and no default context 317 # then it stays on the CPU 318 pass 319 320 321gpu_seqopt.register('InputToGpuArrayOptimizer', InputToGpuOptimizer(), 322 0, 'fast_run', 'fast_compile', 'merge') 323 324 325class GraphToGPU(Optimizer): 326 """ 327 Transfer the graph as a whole to GPU instead of transferring node by node. 328 329 Parameters 330 ---------- 331 local_optimizers_all : List or SortedSet 332 The local optimizations to apply to a node. 333 local_optimizers_map : Dict 334 Dictionary object containing the mapping of Op to list of 335 LocalOptimizers. 336 """ 337 338 def __init__(self, local_optimizers_all, local_optimizers_map): 339 self.local_optimizers_all = local_optimizers_all 340 self.local_optimizers_map = local_optimizers_map 341 342 def add_requirements(self, fgraph): 343 fgraph.attach_feature(toolbox.ReplaceValidate()) 344 345 def apply(self, fgraph): 346 mapping = {} 347 time_opts = {} 348 node_created = {} 349 process_count = {} 350 t_topo = time.time() 351 topo = fgraph.toposort() 352 time_topo = time.time() 353 toposort_timing = time_topo - t_topo 354 355 # Building a new graph 356 # Iterating through inputs of graph 357 target = infer_context_name(*fgraph.inputs) 358 for i in fgraph.inputs: 359 if isinstance(i.type, tensor.TensorType) and move_to_gpu(i): 360 mapping[i] = i.transfer(getattr(i.tag, 'target', target)) 361 else: 362 mapping[i] = i 363 for i in fgraph.variables: 364 if isinstance(i, theano.Constant): 365 mapping[i] = i 366 for node in topo: 367 for lopt in (self.local_optimizers_map.get(node.op, []) + 368 self.local_optimizers_map.get(type(node.op), []) + 369 self.local_optimizers_all): 370 process_count.setdefault(lopt, 0) 371 time_opts.setdefault(lopt, 0) 372 node_created.setdefault(lopt, 0) 373 374 for node in topo: 375 376 if isinstance(node.op, HostFromGpu): 377 mapping[node.outputs[0]] = mapping[node.inputs[0]] 378 continue 379 380 # Move only if any of the inputs are on the GPU. 381 move_to_GPU = False 382 383 context_name = None 384 for i in [mapping[i] for i in node.inputs]: 385 if isinstance(i.type, GpuArrayType): 386 context_name = i.type.context_name 387 move_to_GPU = True 388 break 389 if (not move_to_GPU and 390 isinstance(node.op, (theano.tensor.Alloc, 391 theano.tensor.AllocEmpty, 392 theano.tensor.basic.Eye, 393 theano.tensor.basic.Tri))): 394 # If the Alloc[Empty] have a client that will be moved 395 # to the GPU, we should move the Alloc* on the GPU. 396 397 # We approximate this by supposing that if we have an 398 # optimization for one of the clients op, then we will 399 # move the client to the GPU. 
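                # For example, an Alloc feeding an Elemwise that has a
                # registered lifter is assumed to end up on the GPU, so the
                # Alloc itself is lifted as well and the intermediate
                # host-to-GPU transfer is avoided.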
400 for c, _ in node.outputs[0].clients: 401 if (c != 'output' and 402 (self.local_optimizers_map.get(c.op, []) + 403 self.local_optimizers_map.get(type(c.op), []))): 404 move_to_GPU = True 405 new_ops = None 406 if move_to_GPU and any(["complex" in getattr(i, 'dtype', "") 407 for i in node.inputs]): 408 move_to_GPU = False 409 410 # Apply the lifter 411 if move_to_GPU: 412 for lopt in (self.local_optimizers_map.get(node.op, []) + 413 self.local_optimizers_map.get(type(node.op), []) + 414 self.local_optimizers_all): 415 t_opt = time.time() 416 new_ops = lopt.transform(node.op, context_name, 417 [mapping[i] for i in node.inputs], 418 node.outputs) 419 t_opt2 = time.time() 420 time_opts[lopt] += t_opt2 - t_opt 421 422 if new_ops: 423 process_count[lopt] += 1 424 break 425 outputs = [] 426 427 if isinstance(new_ops, theano.Op): 428 with inherit_stack_trace(node.outputs): 429 outputs = new_ops(*[mapping[i] for i in node.inputs], return_list=True) 430 elif not new_ops: 431 newnode = node.clone_with_new_inputs([mapping.get(i) for i in node.inputs]) 432 outputs = newnode.outputs 433 elif isinstance(new_ops, (tuple, list)): 434 outputs = new_ops 435 elif isinstance(new_ops, theano.Variable): 436 outputs = [new_ops] 437 438 for old_output, new_output in zip(node.outputs, outputs): 439 copy_stack_trace(old_output, new_output) 440 441 if new_ops: 442 node_created[lopt] += len(graph.ops([mapping[i] for i in node.inputs], outputs)) 443 if any([getattr(old_o, 'dtype', None) != getattr(new_o, 'dtype', None) 444 for old_o, new_o in zip(outputs, node.outputs)]): 445 _logger.warning( 446 "The optimization %s returned bad dtype. Skipping it." 447 " Write to theano-dev mailing list about this." % 448 str(lopt)) 449 newnode = node.clone_with_new_inputs([mapping.get(i) for i in node.inputs]) 450 outputs = newnode.outputs 451 452 for new_o, old_o in zip(outputs, node.outputs): 453 assert len(outputs) == len(node.outputs) 454 mapping[old_o] = new_o 455 456 new_nodes = [] 457 for o in fgraph.outputs: 458 new_o = mapping[o] 459 if new_o.type != o.type: 460 assert isinstance(o.type, tensor.TensorType) 461 assert isinstance(new_o.type, GpuArrayType) 462 463 # This condition is needed in the case one input is an 464 # output of the graph. Without this, it would 465 # introduce cycle as we don't replace correctly that 466 # case. It would also add extra transfer to/from the 467 # gpu. 
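                # Concretely (illustrative): if an input x is also returned
                # as an output, mapping[x] is gpu_from_host(x); taking that
                # node's own input below gives back x directly instead of
                # building host_from_gpu(gpu_from_host(x)) around it.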
                if (new_o.owner and
                        isinstance(new_o.owner.op, GpuFromHost) and
                        new_o.owner.inputs[0].type == o.type):
                    new_o = new_o.owner.inputs[0]
                else:
                    new_o = copy_stack_trace(o, safe_to_cpu(new_o))
            new_nodes.append(new_o)
        fgraph.replace_all_validate(zip(fgraph.outputs, new_nodes),
                                    reason=self.__class__.__name__)

        return (self, toposort_timing, time_opts, node_created, process_count)

    @staticmethod
    def print_profile(stream, prof, level=0):
        (opt, toposort_timing, time_opts, node_created, process_count) = prof
        blanc = (' ' * level)
        print(blanc, "GraphToGPUOptimizer", end=' ', file=stream)

        print(blanc, getattr(opt, "name",
                             getattr(opt, "__name__", "")), file=stream)

        print(blanc, " time io_toposort %.3fs" % toposort_timing, file=stream)

        s = sum(time_opts.values())
        print(blanc, "Total time taken by local optimizers %.3fs " % s, file=stream)

        count_opt = []
        not_used = []
        not_used_time = 0

        for o, count in iteritems(process_count):
            if count > 0:
                count_opt.append((time_opts[o], count,
                                  node_created[o], o))
            else:
                not_used.append((time_opts[o], o))
                not_used_time += time_opts[o]

        if count_opt:
            print(blanc,
                  ' times - times applied - Node created - name:',
                  file=stream)
            count_opt.sort()
            for (t, count, n_created, o) in count_opt[::-1]:
                print(blanc, ' %.3fs - %d - %d - %s' % (
                    t, count, n_created, o), file=stream)
            print(blanc, ' %.3fs - in %d optimizations that were not used (displaying only those with a runtime > 0)' % (
                not_used_time, len(not_used)), file=stream)
            not_used.sort(key=lambda nu: (nu[0], str(nu[1])))
            for (t, o) in not_used[::-1]:
                if t > 0:
                    # Skip opts that took 0 time; they probably weren't even tried.
                    print(blanc + " ", ' %.3fs - %s' % (t, o), file=stream)
            print(file=stream)

    @staticmethod
    def merge_profile(prof1, prof2):
        # (opt, toposort_timing, time_opts, node_created, process_count) = prof1
        local_optimizers = OrderedSet(prof1[0].local_optimizers_all).union(
            prof2[0].local_optimizers_all)

        def merge_dict(d1, d2):
            """
            Merge two dicts by adding the values.
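
            For example (illustrative):
            merge_dict({'a': 1.0}, {'a': 2.0, 'b': 3.0})
            returns {'a': 3.0, 'b': 3.0}.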
532 """ 533 d = d1.copy() 534 for k, v in iteritems(d2): 535 if k in d: 536 d[k] += v 537 else: 538 d[k] = v 539 return d 540 541 local_optimizers_map = merge_dict(prof1[0].local_optimizers_map, 542 prof2[0].local_optimizers_map) 543 new_opt = GraphToGPU(local_optimizers, local_optimizers_map) 544 545 toposort_timing = prof1[1] + prof2[1] 546 time_opts = merge_dict(prof1[2], prof2[2]) 547 node_created = merge_dict(prof1[3], prof2[3]) 548 process_count = merge_dict(prof1[4], prof2[4]) 549 return (new_opt, 550 toposort_timing, 551 time_opts, 552 node_created, 553 process_count) 554 555 def print_summary(self, stream=sys.stdout, level=0, depth=-1): 556 print("%s%s (%i)" % ( 557 (' ' * level), self.__class__.__name__, id(self)), file=stream) 558 if depth != 0: 559 map_values = [] 560 for opts in self.local_optimizers_map.values(): 561 map_values += opts 562 for opt in self.local_optimizers_all + map_values: 563 opt.print_summary(stream, level=(level + 2), depth=(depth - 1)) 564 565 566@local_optimizer([GpuFromHost, GpuToGpu, HostFromGpu]) 567def local_cut_gpu_transfers(node): 568 # gpu[ab] -> host -> gpub 569 if (isinstance(node.op, GpuFromHost) and 570 node.inputs[0].owner and 571 isinstance(node.inputs[0].owner.op, HostFromGpu)): 572 other = node.inputs[0].owner.inputs[0] 573 if node.op.context_name == other.type.context_name: 574 return [other] 575 else: 576 return [GpuToGpu(node.op.context_name)(other)] 577 578 # ? -> gpua -> host 579 elif (isinstance(node.op, HostFromGpu) and 580 node.inputs[0].owner): 581 n2 = node.inputs[0].owner 582 583 # host -> 584 if isinstance(n2.op, GpuFromHost): 585 return [n2.inputs[0]] 586 587 # gpub -> 588 if isinstance(n2.op, GpuToGpu): 589 return [n2.inputs[0].transfer('cpu')] 590 591 # ? -> gpua -> gpub 592 elif isinstance(node.op, GpuToGpu): 593 # Transfer within same context 594 if node.inputs[0].type.context_name == node.op.context_name: 595 return [node.inputs[0]] 596 597 if node.inputs[0].owner: 598 n2 = node.inputs[0].owner 599 600 # host -> 601 if isinstance(n2.op, GpuFromHost): 602 return [as_gpuarray_variable(n2.inputs[0], 603 node.op.context_name)] 604 605 # gpuc -> 606 if isinstance(n2.op, GpuToGpu): 607 if node.op.context_name == n2.inputs[0].type.context_name: 608 return [n2.inputs[0]] 609 else: 610 return [node.op(n2.inputs[0])] 611 612 613gpu_cut_copies.register('cut_gpua_host_transfers', local_cut_gpu_transfers, 614 'fast_compile', 'fast_run', 'gpuarray') 615gpu_cut_copies.register('cut_gpua_constant_transfers', 616 tensor.opt.constant_folding, 617 'fast_compile', 'fast_run', 'gpuarray') 618optdb['canonicalize'].register('local_cut_gpua_host_gpua', 619 local_cut_gpu_transfers, 620 'fast_compile', 'fast_run', 'gpuarray') 621 622 623@register_opt('fast_compile') 624@local_optimizer([tensor.Alloc]) 625def local_gpua_alloc2(node): 626 """ 627 Join(axis, {Alloc or HostFromGPU}, ...) -> Join(axis, GpuAlloc, Alloc, ...) 628 629 Moves an alloc that is an input to join to the gpu. 630 631 """ 632 try: 633 get_context(None) 634 except ContextNotDefined: 635 # If there is no default context then we do not perform the move here. 
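        # (GpuAlloc(None) below targets the default context; without a
        # default context there is nowhere to allocate, so the graph is
        # left untouched.)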
636 return 637 if (isinstance(node.op, tensor.Alloc) and 638 all(c != 'output' and 639 isinstance(c.op, tensor.Join) and 640 all(i.owner and 641 i.owner.op in [host_from_gpu, tensor.alloc] 642 for i in c.inputs[1:]) 643 for c, idx in node.outputs[0].clients)): 644 return [GpuAlloc(None)(*node.inputs).transfer('cpu')] 645 646 647@register_opt('fast_compile') 648@op_lifter([tensor.Alloc]) 649@register_opt2([tensor.Alloc], 'fast_compile') 650def local_gpuaalloc(op, context_name, inputs, outputs): 651 return GpuAlloc(context_name)(*inputs) 652 653 654@register_opt('fast_compile') 655@op_lifter([tensor.AllocEmpty]) 656@register_opt2([tensor.AllocEmpty], 'fast_compile') 657def local_gpua_alloc_empty(op, context_name, inputs, outputs): 658 # We use _props_dict() to make sure that the GPU op know all the 659 # CPU op props. 660 return GpuAllocEmpty(context_name=context_name, **op._props_dict())(*inputs) 661 662 663@register_opt() 664@local_optimizer([GpuAlloc]) 665def local_gpualloc_memset_0(node): 666 if isinstance(node.op, GpuAlloc) and not node.op.memset_0: 667 inp = node.inputs[0] 668 if (isinstance(inp, GpuArrayConstant) and 669 inp.data.size == 1 and 670 (np.asarray(inp.data) == 0).all()): 671 new_op = GpuAlloc(node.op.context_name, memset_0=True) 672 with inherit_stack_trace(node.outputs): 673 return new_op(*node.inputs, return_list=True) 674 675 676# Don't register by default. 677@gof.local_optimizer([GpuAllocEmpty]) 678def local_gpua_alloc_empty_to_zeros(node): 679 if isinstance(node.op, GpuAllocEmpty): 680 context_name = infer_context_name(*node.inputs) 681 z = np.asarray(0, dtype=node.outputs[0].dtype) 682 with inherit_stack_trace(node.outputs): 683 return [GpuAlloc(context_name)( 684 as_gpuarray_variable(z, context_name), *node.inputs)] 685optdb.register('local_gpua_alloc_empty_to_zeros', 686 theano.tensor.opt.in2out(local_gpua_alloc_empty_to_zeros), 687 # After move to gpu and merge2, before inplace. 
688 49.3, 689 'alloc_empty_to_zeros',) 690 691 692@register_opt() 693@local_optimizer([GpuContiguous]) 694def local_gpu_contiguous_gpu_contiguous(node): 695 """ 696 gpu_contiguous(gpu_contiguous(x)) -> gpu_contiguous(x) 697 698 """ 699 if isinstance(node.op, GpuContiguous): 700 inp = node.inputs[0] 701 if inp.owner and isinstance(inp.owner.op, GpuContiguous): 702 return [inp] 703 704 705@register_opt('fast_compile') 706@op_lifter([tensor.extra_ops.CpuContiguous]) 707@register_opt2([tensor.extra_ops.CpuContiguous], 'fast_compile') 708def local_gpua_contiguous(op, context_name, inputs, outputs): 709 return gpu_contiguous 710 711 712@register_opt('fast_compile') 713@op_lifter([tensor.Reshape]) 714@register_opt2([tensor.Reshape], 'fast_compile') 715def local_gpua_reshape(op, context_name, inputs, outputs): 716 res = GpuReshape(op.ndim) 717 return res 718 719 720@register_opt('fast_compile') 721@op_lifter([tensor.Rebroadcast]) 722@register_opt2([tensor.Rebroadcast], 'fast_compile') 723def local_gpua_rebroadcast(op, context_name, inputs, outputs): 724 return op(as_gpuarray_variable(inputs[0], context_name)) 725 726 727@register_opt('fast_compile') 728@op_lifter([tensor.Flatten]) 729@register_opt2([tensor.Flatten], 'fast_compile') 730def local_gpua_flatten(op, context_name, inputs, outputs): 731 shp = [] 732 if op.outdim != 1: 733 shp = [inputs[0].shape[i] for i in range(op.outdim - 1)] 734 shp += [-1] 735 res = GpuReshape(op.outdim) 736 o = res(inputs[0], theano.tensor.as_tensor_variable(shp)) 737 return o 738 739 740@register_opt('fast_compile') 741@op_lifter([tensor.Elemwise]) 742@register_opt2([tensor.Elemwise], 'fast_compile') 743def local_gpua_elemwise(op, context_name, inputs, outputs): 744 scal_op = op.scalar_op 745 name = op.name 746 if name: 747 name = 'Gpu' + name 748 if len(outputs) > 1: 749 return 750 751 have_cuda = False 752 have_opencl = False 753 if inputs and isinstance(inputs[0].type, GpuArrayType): 754 kind = inputs[0].type.context.kind 755 if kind.startswith(b'opencl'): 756 have_opencl = True 757 elif kind.startswith(b'cuda'): 758 have_cuda = True 759 convert = {Erfinv: gpu_erfinv, 760 Erfcinv: gpu_erfcinv} 761 762 if scal_op.__class__ in convert: 763 scal_op = convert[scal_op.__class__] 764 if have_opencl: 765 _logger.warning( 766 'Function "%s" is not supported with OpenCL. Use "device=cuda" instead.' % 767 scal_op) 768 if not have_cuda: 769 return None 770 if not scal_op.supports_c_code(inputs, outputs): 771 return 772 res = GpuElemwise(scal_op, name=name, 773 inplace_pattern=copy.copy(op.inplace_pattern), 774 nfunc_spec=op.nfunc_spec) 775 776 # If the elemwise operation is a pow, casts might be required on the 777 # inputs and or outputs because only the (float, float)->float and 778 # (double, double)->double cases are implemented at the moment. 779 if isinstance(op.scalar_op, Pow): 780 781 # Only transfer the computation on the gpu if the output dtype is 782 # floating point. Else, give up on the transfer to the gpu. 783 out_dtype = outputs[0].dtype 784 if out_dtype not in ['float16', 'float32', 'float64']: 785 return 786 787 # Transfer the inputs on the GPU and cast them to the right dtype. 788 new_inputs = [] 789 for inp in inputs: 790 if inp.dtype != out_dtype: 791 gpu_cast_op = GpuElemwise(Cast(Scalar(out_dtype))) 792 new_inputs.append(gpu_cast_op(as_gpuarray_variable(inp, context_name))) 793 else: 794 new_inputs.append(as_gpuarray_variable(inp, context_name)) 795 796 # Perform the exponent on the gpu and transfer the output back to the 797 # cpu. 
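        # E.g. for x ** 2 with x in float32 and the exponent stored as int8,
        # the int8 input was cast to float32 above so that the supported
        # (float, float) -> float kernel can be used.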
        gpu_output = res(*new_inputs)
        return [gpu_output]
    elif op.scalar_op in (scalar.add, scalar.mul):
        try:
            return [split_inputs(inputs, max_inputs_to_GpuElemwise(outputs), res)]
        except ValueError:
            return False
    else:
        return res


def split_inputs(inputs, max_nb_inputs, op):
    """
    For some ops like add and mul, a large number of inputs can make nvcc fail
    compilation of our current code. We don't want nodes in the graph that
    can't execute, as that breaks DebugMode.

    This should not happen for other GpuElemwise ops, as only the fusion
    optimization can generate ops with too many inputs, and it checks for that.

    Parameters
    ----------
    inputs: List of theano variables.
        List of inputs to node.
    max_nb_inputs: int
        Maximum number of inputs the node can handle without
        compilation failing.
    op : Theano operator instance.
        Operator that should be used to rebuild the computation graph with a
        smaller number of inputs per node.
    """
    if max_nb_inputs <= 1 and len(inputs) > 1:
        raise ValueError("Cannot split nodes because the inputs' dimensionality and/or"
                         " number of outputs is too large")

    while len(inputs) > max_nb_inputs:
        inner_ops = []
        for i in range(0, len(inputs), max_nb_inputs):
            inner_ops.append(op(*inputs[i: i + max_nb_inputs]))
        inputs = inner_ops

    return op(*inputs)


gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(
    GpuElemwise,
    max_inputs_to_GpuElemwise)
optdb.register('gpua_elemwise_fusion',
               # 48.5 move to gpu
               # 48.6 specialize
               # 49 cpu fusion
               # 49.5 add destroy handler
               tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion), 49,
               'fast_run', 'fusion', 'local_elemwise_fusion', 'gpuarray')

inplace_gpu_elemwise_opt = tensor.opt.InplaceElemwiseOptimizer(
    GpuElemwise)
optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75,
               'inplace_elemwise_optimizer', 'fast_run', 'inplace', 'gpuarray')

register_opt(tensor.opt.local_useless_elemwise)


@register_opt('fast_compile')
@op_lifter([tensor.DimShuffle])
@register_opt2([tensor.DimShuffle], 'fast_compile')
def local_gpua_dimshuffle(op, context_name, inputs, outputs):
    return GpuDimShuffle(op.input_broadcastable,
                         op.new_order)


@register_opt('fast_compile')
@op_lifter([tensor.SpecifyShape])
@register_opt2([tensor.SpecifyShape], 'fast_compile')
def local_gpua_specifyShape(op, context_name, inputs, outputs):
    if isinstance(inputs[0].type, GpuArrayType):
        return
    return local_gpua_specifyShape_graph(op, context_name, inputs, outputs)


@register_opt2([tensor.SpecifyShape], 'fast_compile')
def local_gpua_specifyShape_graph(op, context_name, inputs, outputs):
    inp = [as_gpuarray_variable(inputs[0], context_name)]
    inp += inputs[1:]
    return tensor.specify_shape(*inp)


@register_opt('fast_compile')
@op_lifter([theano.compile.ops.Shape])
def local_gpua_shape(op, context_name, inputs, outputs):
    # op_lifter will call this opt too frequently as the output is
    # always on the CPU.
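    # When the input is already a GpuArrayType there is nothing left to
    # lift; otherwise the work is delegated to local_gpua_shape_graph
    # below, which reads the shape from the GPU variable directly.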
890 if isinstance(inputs[0].type, GpuArrayType): 891 return 892 return local_gpua_shape_graph(op, context_name, inputs, outputs) 893 894 895@register_opt2([tensor.compile.ops.Shape], 'fast_compile') 896def local_gpua_shape_graph(op, context_name, inputs, outputs): 897 return [as_gpuarray_variable(inputs[0], context_name).shape] 898 899 900def gpu_print_wrapper(op, cnda): 901 op.old_op.global_fn(op.old_op, np.asarray(cnda)) 902 903 904@register_opt('fast_compile') 905@op_lifter([tensor.printing.Print]) 906@register_opt2([tensor.printing.Print], 'fast_compile') 907def local_gpua_print_op(op, context_name, inputs, outputs): 908 x, = inputs 909 with inherit_stack_trace(outputs): 910 gpu_x = as_gpuarray_variable(x, context_name=context_name) 911 new_op = op.__class__(global_fn=gpu_print_wrapper) 912 new_op.old_op = op 913 return new_op(gpu_x) 914 915 916@register_opt('fast_compile') 917@local_optimizer([PdbBreakpoint]) 918def local_gpu_pdbbreakpoint_op(node): 919 if isinstance(node.op, PdbBreakpoint): 920 921 old_inputs = node.inputs 922 old_outputs = node.outputs 923 924 new_inputs = node.inputs[:1] 925 input_transfered = [] 926 927 # Go through the monitored variables, only transferring on GPU those 928 # for which the input comes from the GPU or the output will be 929 # transferred on the GPU. 930 nb_monitored_vars = len(node.outputs) 931 for i in range(nb_monitored_vars): 932 933 inp = old_inputs[i + 1] 934 out = old_outputs[i] 935 936 input_is_from_gpu = (inp.owner and 937 isinstance(inp.owner.op, HostFromGpu)) 938 output_goes_to_gpu = False 939 for c in out.clients: 940 if c == 'output': 941 continue 942 if isinstance(c[0].op, GpuFromHost): 943 output_goes_to_gpu = True 944 context_name = c[0].op.context_name 945 break 946 947 if input_is_from_gpu: 948 # The op should be applied on the GPU version of the input 949 new_inputs.append(inp.owner.inputs[0]) 950 input_transfered.append(True) 951 952 elif output_goes_to_gpu: 953 # The input should be transferred to the gpu 954 new_inputs.append(as_gpuarray_variable(inp, context_name)) 955 input_transfered.append(True) 956 957 else: 958 # No transfer is required. 959 new_inputs.append(inp) 960 input_transfered.append(False) 961 962 # Only continue the optimization if at least one input has been 963 # transferred to the gpu 964 if not any(input_transfered): 965 return False 966 967 # Apply the op on the new inputs 968 with inherit_stack_trace(node.outputs): 969 new_op_outputs = node.op(*new_inputs, return_list=True) 970 971 # Propagate the transfer to the gpu through the outputs that require 972 # it 973 new_outputs = [] 974 for i in range(len(new_op_outputs)): 975 if input_transfered[i]: 976 new_outputs.append(new_op_outputs[i].transfer('cpu')) 977 else: 978 new_outputs.append(new_op_outputs[i]) 979 980 return new_outputs 981 982 return False 983 984 985@register_opt('fast_compile') 986@op_lifter([IfElse]) 987@register_opt2([IfElse], 'fast_compile') 988def local_gpua_lazy_ifelse(op, context_name, inputs, outputs): 989 if op.gpu: 990 return 991 c = inputs[0] 992 inps = [] 993 falses = [] 994 # ifelse need corresponding true/false inputs variables to be of the same type. 995 # But we can't rely on inputs to respect that, as GraphToGPU don't enforce that. 996 # So we need to take care of this here. 
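    # E.g. if a "true" branch value is already a GpuArrayType but the
    # matching "false" value is still a CPU tensor, both are lifted below
    # so the two branches of the new GPU IfElse have the same type.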
    for v1, v2 in zip(inputs[1:1 + op.n_outs], inputs[1 + op.n_outs:]):
        if ((isinstance(v1.type, tensor.TensorType) and move_to_gpu(v1)) or
                isinstance(v1.type, GpuArrayType) or
                isinstance(v2.type, GpuArrayType)):
            inps.append(as_gpuarray_variable(v1, context_name))
            falses.append(as_gpuarray_variable(v2, context_name))
        else:
            inps.append(v1)
            falses.append(v2)
    inps.extend(falses)
    return IfElse(op.n_outs, gpu=True)(c, *inps, return_list=True)


@register_opt('fast_compile')
@op_lifter([tensor.Join])
@register_opt2([tensor.Join], 'fast_compile')
def local_gpua_join(op, context_name, inputs, outputs):
    return gpu_join


@register_opt('fast_compile')
@local_optimizer([GpuJoin])
def local_gpua_join_1(node):
    # join of a single element
    if (isinstance(node.op, GpuJoin) and
            len(node.inputs) == 2):
        return [node.inputs[1]]


@register_opt('fast_compile')
@op_lifter([tensor.Split])
@register_opt2([tensor.Split], 'fast_compile')
def local_gpua_split(op, context_name, inputs, outputs):
    # TODO use props
    return GpuSplit(op.len_splits)


@register_opt('fast_compile')
@op_lifter([tensor.Subtensor])
def local_gpua_subtensor(op, context_name, inputs, outputs):
    x = inputs[0]
    if (x.owner and isinstance(x.owner.op, HostFromGpu)):
        gpu_x = x.owner.inputs[0]
        if (gpu_x.owner and
                isinstance(gpu_x.owner.op, GpuFromHost) and
                # And it is a shared var or an input of the graph.
                not gpu_x.owner.inputs[0].owner):
            if len(x.clients) == 1:
                if any([n == 'output' or any([isinstance(v.type, GpuArrayType)
                                              for v in n.inputs + n.outputs])
                        for n, _ in outputs[0].clients]):
                    return
                else:
                    return [gpu_x.owner.op(outputs[0]).transfer('cpu')]

    return GpuSubtensor(op.idx_list)


@register_opt2([tensor.Subtensor], 'fast_compile')
def local_gpua_subtensor_graph(op, context_name, inputs, outputs):
    # We need different code here than in local_gpua_subtensor, as the
    # condition differs: the inputs aren't the same.
    x = inputs[0]
    # We don't want to move the subtensor to the GPU if the input is
    # on the CPU and the only client of the CPU node is this
    # subtensor. This allows a smaller transfer.

    if (x.owner and isinstance(x.owner.op, GpuFromHost)):
        cpu_x = x.owner.inputs[0]
        # And it is a shared var or an input of the graph,
        # and is used by only 1 node.
        # x is in the new graph, so we can't test its number of clients.
        if not cpu_x.owner and len(cpu_x.clients) == 1:
            c = outputs[0].clients
            # If the subtensor has only 1 client, do it on the CPU.
            # We let the other optimizations take care of moving the
            # next node or not.
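            # E.g. for x[0:5] where x is a graph input used only by this
            # subtensor, slicing on the CPU and transferring just the slice
            # is cheaper than transferring all of x to the GPU first.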
1074 if len(c) == 1: 1075 return 1076 return GpuSubtensor(op.idx_list) 1077 1078 1079@register_opt('fast_compile') 1080@op_lifter([tensor.IncSubtensor]) 1081@register_opt2([tensor.IncSubtensor], 'fast_compile') 1082def local_gpua_inc_subtensor(op, context_name, inputs, outputs): 1083 op = GpuIncSubtensor(op.idx_list, op.inplace, 1084 op.set_instead_of_inc, 1085 op.destroyhandler_tolerate_aliased) 1086 ret = op(*inputs) 1087 val = getattr(outputs[0].tag, 'nan_guard_mode_check', True) 1088 ret.tag.nan_guard_mode_check = val 1089 return ret 1090 1091 1092@register_opt('fast_compile') 1093@op_lifter([tensor.AdvancedSubtensor1]) 1094@register_opt2([tensor.AdvancedSubtensor1], 'fast_compile') 1095def local_gpua_advanced_subtensor1(op, context_name, inputs, outputs): 1096 return GpuAdvancedSubtensor1() 1097 1098 1099@register_opt('fast_compile') 1100@op_lifter([tensor.AdvancedSubtensor]) 1101@register_opt2([tensor.AdvancedSubtensor], 'fast_compile') 1102def local_gpua_advanced_subtensor(op, context_name, inputs, outputs): 1103 return GpuAdvancedSubtensor() 1104 1105 1106@register_opt('fast_compile') 1107@op_lifter([tensor.AdvancedBooleanSubtensor]) 1108@register_opt2([tensor.AdvancedBooleanSubtensor], 'fast_compile') 1109def local_gpua_advanced_boolean_subtensor(op, context_name, inputs, outputs): 1110 return GpuAdvancedBooleanSubtensor() 1111 1112 1113@register_opt('fast_compile') 1114@op_lifter([tensor.AdvancedIncSubtensor1]) 1115@register_opt2([tensor.AdvancedIncSubtensor1], 'fast_compile') 1116def local_gpua_advanced_incsubtensor1(op, context_name, inputs, outputs): 1117 x, y, ilist = inputs 1118 1119 set_instead_of_inc = op.set_instead_of_inc 1120 1121 if (x.ndim == 1 and y.ndim == 0 and 1122 config.deterministic == 'default' and 1123 x.dtype not in ('int8', 'int16')): 1124 x = x.dimshuffle(0, 'x') 1125 y = y.dimshuffle('x', 'x') 1126 ret = GpuAdvancedIncSubtensor1_dev20( 1127 set_instead_of_inc=set_instead_of_inc)(x, y, ilist) 1128 ret = GpuDimShuffle(ret.type.broadcastable, [0])(ret) 1129 return ret 1130 elif (x.ndim != 2 or y.ndim != 2 or 1131 config.deterministic == 'more' or 1132 x.dtype in ('int8', 'int16')): 1133 return GpuAdvancedIncSubtensor1( 1134 set_instead_of_inc=set_instead_of_inc) 1135 else: 1136 return GpuAdvancedIncSubtensor1_dev20( 1137 set_instead_of_inc=set_instead_of_inc) 1138 1139 1140# Do not register this optimization for now, as it slows down the 1141# execution by a lot in important cases. 1142# @register_opt('fast_compile') 1143# @op_lifter([tensor.AdvancedIncSubtensor]) 1144# @register_opt2([tensor.AdvancedIncSubtensor], 'fast_compile') 1145def local_gpua_advanced_incsubtensor(op, context_name, inputs, outputs): 1146 if not op.set_instead_of_inc: 1147 return GpuAdvancedIncSubtensor() 1148 else: 1149 return False 1150 1151 1152# Do not register this optimization for now, as it slows down the 1153# execution by a lot in important cases. 1154# @register_opt('fast_compile') 1155# @op_lifter([tensor.AdvancedBooleanIncSubtensor]) 1156# @register_opt2([tensor.AdvancedBooleanIncSubtensor], 'fast_compile') 1157def local_gpua_advanced_boolean_incsubtensor(op, context_name, inputs, outputs): 1158 # GpuAdvancedIncSubtensor only works with a single boolean mask, 1159 # but not with fancy combinations. 
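    # With a single boolean mask the node's inputs are exactly
    # (x, y, mask), hence the len(inputs) == 3 check below.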
1160 if not op.set_instead_of_inc and len(inputs) == 3: 1161 return GpuAdvancedBooleanIncSubtensor() 1162 else: 1163 return False 1164 1165 1166@register_inplace() 1167@local_optimizer([GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1_dev20]) 1168def local_advincsub1_gpua_inplace(node): 1169 if isinstance(node.op, (GpuAdvancedIncSubtensor1, 1170 GpuAdvancedIncSubtensor1_dev20)): 1171 if not node.op.inplace: 1172 return [node.op.clone_inplace()(*node.inputs)] 1173 1174 1175# AllocDiag 1176@register_opt('fast_compile') 1177@op_lifter([tensor.AllocDiag]) 1178@register_opt2([theano.tensor.AllocDiag], 'fast_compile') 1179def local_gpu_alloc_diag(op, context_name, inputs, outputs): 1180 if outputs[0].ndim != 2: 1181 # AllocDiag only supports 2d output 1182 return False 1183 return GpuAllocDiag(offset=op.offset) 1184 1185 1186# ExtractDiag 1187@register_opt('fast_compile') 1188@op_lifter([tensor.ExtractDiag]) 1189@register_opt2([theano.tensor.ExtractDiag], 'fast_compile') 1190def local_gpu_extract_diag(op, context_name, inputs, outputs): 1191 return GpuExtractDiag(offset=op.offset, axis1=op.axis1, axis2=op.axis2, view=op.view) 1192 1193 1194@register_opt('fast_compile') 1195@op_lifter([tensor.CAReduce, tensor.Sum, tensor.elemwise.Prod]) 1196@register_opt2([tensor.CAReduce, tensor.Sum, tensor.elemwise.Prod], 'fast_compile') 1197def local_gpua_careduce(op, context_name, inputs, outputs): 1198 if isinstance(op.scalar_op, (scalar.Add, scalar.Mul, 1199 scalar.Maximum, scalar.Minimum)): 1200 1201 ctx = get_context(context_name) 1202 if ctx.kind == b'opencl': 1203 op2 = GpuCAReduceCPY 1204 if op.scalar_op not in [scalar.add, scalar.mul]: 1205 # We don't support yet all reduction with cpy code. 1206 return 1207 elif ctx.kind == b'cuda': 1208 op2 = GpuCAReduceCuda 1209 else: 1210 return False 1211 x, = inputs 1212 idtype = x.dtype 1213 adtype = getattr(op, 'acc_dtype', None) 1214 odtype = getattr(op, 'dtype', outputs[0].dtype) 1215 1216 # Force accumulator to float32 for float32 inputs since tree 1217 # reduction will not loose as much precision as linear 1218 # accumulation and float64 is much slower on GPU. 1219 if idtype == 'float32' and odtype == 'float32': 1220 adtype = 'float32' 1221 1222 greduce = op2( 1223 op.scalar_op, axis=op.axis, 1224 dtype=odtype, 1225 acc_dtype=adtype) 1226 with inherit_stack_trace(outputs): 1227 gvar = greduce(x) 1228 # We need to have the make node called, otherwise the mask can 1229 # be None 1230 if (op2 is GpuCAReduceCPY or 1231 gvar.owner.op.supports_c_code([ 1232 as_gpuarray_variable(x, context_name)])): 1233 return greduce 1234 else: 1235 # Try to make a simpler pattern based on reshaping 1236 # The principle is that if two adjacent dimensions have 1237 # the same value in the reduce_mask, then we can reshape 1238 # to make them a single dimension, do the reduction, and 1239 # then reshape to get them back. 
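            # E.g. reducing a 4d input over axes (1, 2) gives
            # reduce_mask == [0, 1, 1, 0]; the two reduced middle dimensions
            # are merged by the reshape below, so the kernel only has to
            # handle the simpler [0, 1, 0] pattern.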
1240 1241 if op.axis is None: 1242 reduce_mask = [1] * x.type.ndim 1243 else: 1244 reduce_mask = [0] * x.type.ndim 1245 for a in op.axis: 1246 assert reduce_mask[a] == 0 1247 reduce_mask[a] = 1 1248 1249 new_in_shp = [shape_i(x, 0)] 1250 new_mask = [reduce_mask[0]] 1251 for i in xrange(1, x.type.ndim): 1252 if reduce_mask[i] == reduce_mask[i - 1]: 1253 new_in_shp[-1] *= shape_i(x, i) 1254 else: 1255 new_mask.append(reduce_mask[i]) 1256 new_in_shp.append(shape_i(x, i)) 1257 new_axis = [] 1258 for idx, m in enumerate(new_mask): 1259 if m == 1: 1260 new_axis.append(idx) 1261 greduce = op2( 1262 op.scalar_op, 1263 axis=new_axis, reduce_mask=new_mask, 1264 dtype=odtype, 1265 acc_dtype=adtype) 1266 with inherit_stack_trace(outputs): 1267 reshaped_x = x.reshape(tensor.stack(new_in_shp)) 1268 gpu_reshaped_x = as_gpuarray_variable(reshaped_x, context_name) 1269 # We need to have the make node called, otherwise the mask can 1270 # be None 1271 gvar = greduce(gpu_reshaped_x) 1272 reshaped_gpu_inputs = [gpu_reshaped_x] 1273 if greduce.supports_c_code(reshaped_gpu_inputs): 1274 reduce_reshaped_x = greduce(gpu_reshaped_x) 1275 1276 if reduce_reshaped_x.ndim != outputs[0].ndim: 1277 out_shp = [] 1278 for i in range(x.ndim): 1279 if i not in op.axis: 1280 out_shp.append(shape_i(x, i)) 1281 unreshaped_reduce = GpuReshape(len(out_shp))( 1282 reduce_reshaped_x, 1283 tensor.stack(out_shp)) 1284 else: 1285 unreshaped_reduce = reduce_reshaped_x 1286 return [unreshaped_reduce] 1287 1288 1289@register_opt('fast_compile') 1290@op_lifter([tensor.blas.Gemv, tensor.blas_c.CGemv]) 1291@register_opt2([tensor.blas.Gemv], 'fast_compile') 1292def local_gpua_gemv(op, context_name, inputs, outputs): 1293 if inputs[0].dtype == 'float16': 1294 # Use gemm implementation as cublas gemv don't support float16 1295 return gpugemm_no_inplace(inputs[0][:, None], 1296 inputs[1], 1297 inputs[2], 1298 inputs[3][:, None], 1299 inputs[4]).dimshuffle(0) 1300 1301 if inputs[0].dtype not in ['float32', 'float64']: 1302 return 1303 if op.inplace: 1304 return gpugemv_inplace 1305 else: 1306 return gpugemv_no_inplace 1307 1308 1309@register_opt('fast_compile') 1310@op_lifter([tensor.blas.Gemm]) 1311@register_opt2([tensor.blas.Gemm], 'fast_compile') 1312def local_gpua_gemm(op, context_name, inputs, outputs): 1313 if inputs[0].dtype not in ['float16', 'float32', 'float64']: 1314 return 1315 if op.inplace: 1316 return gpugemm_inplace 1317 else: 1318 return gpugemm_no_inplace 1319 1320 1321@register_opt('fast_compile') 1322@op_lifter([tensor.blas.BatchedDot]) 1323@register_opt2([tensor.blas.BatchedDot], 'fast_compile') 1324def local_gpua_gemmbatch(op, context_name, inputs, outputs): 1325 if inputs[0].dtype not in ['float16', 'float32', 'float64']: 1326 return 1327 with inherit_stack_trace(outputs): 1328 a, b = inputs 1329 # Since GpuGemmBatch only supports 3D inputs and output, 1330 # we need to add broadcastable dims to the inputs, and drop 1331 # them from outputs 1332 output_dims = [0, 1, 2] 1333 if a.ndim == 2: 1334 a = GpuDimShuffle(a.broadcastable, (0, 'x', 1))(a) 1335 del output_dims[1] 1336 if b.ndim == 2: 1337 b = GpuDimShuffle(b.broadcastable, (0, 1, 'x'))(b) 1338 del output_dims[-1] 1339 # In case of mismatched dtypes, we also have to upcast 1340 out_dtype = outputs[0].dtype 1341 if a.dtype != out_dtype or b.dtype != out_dtype: 1342 gpu_cast_op = GpuElemwise(Cast(Scalar(out_dtype))) 1343 if a.dtype != out_dtype: 1344 a = gpu_cast_op(a) 1345 if b.dtype != out_dtype: 1346 b = gpu_cast_op(b) 1347 1348 c = GpuAllocEmpty(out_dtype, 
context_name)( 1349 a.shape[0], a.shape[1], b.shape[2]) 1350 out = gpugemmbatch_no_inplace(c, np.asarray(1.0, dtype=out_dtype), 1351 a, b, np.asarray(0.0, dtype=out_dtype)) 1352 if len(output_dims) != 3: 1353 out = GpuDimShuffle(out.broadcastable, output_dims)(out) 1354 return out 1355 1356 1357@register_opt() 1358@alpha_merge(GpuGemm, alpha_in=1, beta_in=4) 1359def local_gpua_gemm_alpha_merge(node, *inputs): 1360 return [gpugemm_no_inplace(*inputs)] 1361 1362 1363@register_opt() 1364@output_merge(GpuGemm, alpha_in=1, beta_in=4, out_in=0) 1365def local_gpua_gemm_output_merge(node, *inputs): 1366 return [gpugemm_no_inplace(*inputs)] 1367 1368 1369@register_opt() 1370@alpha_merge(GpuGemmBatch, alpha_in=1, beta_in=4) 1371def local_gpua_gemmbatch_alpha_merge(node, *inputs): 1372 return [gpugemmbatch_no_inplace(*inputs)] 1373 1374 1375@register_opt() 1376@output_merge(GpuGemmBatch, alpha_in=1, beta_in=4, out_in=0) 1377def local_gpua_gemmbatch_output_merge(node, *inputs): 1378 return [gpugemmbatch_no_inplace(*inputs)] 1379 1380 1381@register_opt('fast_compile') 1382@op_lifter([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer]) 1383@register_opt2([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer], 'fast_compile') 1384def local_gpua_ger(op, context_name, inputs, outputs): 1385 if inputs[0].dtype not in ['float32', 'float64']: 1386 return 1387 return GpuGer(inplace=op.destructive) 1388 1389 1390@register_opt('fast_compile') 1391@op_lifter([tensor.blas.Dot22]) 1392@register_opt2([tensor.blas.Dot22], 'fast_compile') 1393def local_gpua_dot22(op, context_name, inputs, outputs): 1394 return gpu_dot22 1395 1396 1397@register_opt('fast_compile') 1398@op_lifter([tensor.blas.Dot22Scalar]) 1399@register_opt2([tensor.blas.Dot22Scalar], 'fast_compile') 1400def local_gpua_dot22scalar(op, context_name, inputs, outputs): 1401 with inherit_stack_trace(outputs): 1402 x, y, a = inputs 1403 x = as_gpuarray_variable(x, context_name) 1404 y = as_gpuarray_variable(y, context_name) 1405 z = GpuAllocEmpty(x.dtype, context_name)(x.shape[0], y.shape[1]) 1406 return [gpugemm_no_inplace(z, a, x, y, 0)] 1407 1408 1409@register_opt('fast_compile') 1410@op_lifter([tensor.basic.Eye]) 1411@register_opt2([tensor.basic.Eye], 'fast_compile') 1412def local_gpua_eye(op, context_name, inputs, outputs): 1413 return GpuEye(dtype=op.dtype, context_name=context_name) 1414 1415 1416@register_opt('fast_compile') 1417@op_lifter([tensor.basic.Tri]) 1418@register_opt2([tensor.basic.Tri], 'fast_compile') 1419def local_gpua_tri(op, context_name, inputs, outputs): 1420 return GpuTri(dtype=op.dtype, context_name=context_name) 1421 1422 1423@register_opt('fast_compile') 1424@op_lifter([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias]) 1425@register_opt2([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias], 'fast_compile') 1426def local_gpua_crossentropysoftmaxargmax1hotwithbias(op, context_name, inputs, outputs): 1427 return gpu_crossentropy_softmax_argmax_1hot_with_bias 1428 1429 1430@register_opt('fast_compile') 1431@op_lifter([tensor.nnet.CrossentropySoftmax1HotWithBiasDx]) 1432@register_opt2([tensor.nnet.CrossentropySoftmax1HotWithBiasDx], 'fast_compile') 1433def local_gpua_crossentropysoftmax1hotwithbiasdx(op, context_name, inputs, outputs): 1434 return gpu_crossentropy_softmax_1hot_with_bias_dx 1435 1436 1437@register_opt('fast_compile') 1438@op_lifter([tensor.nnet.Softmax]) 1439@register_opt2([tensor.nnet.Softmax], 'fast_compile') 1440def local_gpua_softmax(op, context_name, inputs, outputs): 1441 return 
gpu_softmax 1442 1443 1444@register_opt('fast_compile') 1445@op_lifter([tensor.nnet.SoftmaxWithBias]) 1446@register_opt2([tensor.nnet.SoftmaxWithBias], 'fast_compile') 1447def local_gpua_softmaxwithbias(op, context_name, inputs, outputs): 1448 return gpu_softmax_with_bias 1449 1450 1451@register_opt('fast_compile') 1452@op_lifter([tensor.nnet.CrossentropyCategorical1Hot]) 1453@register_opt2([tensor.nnet.CrossentropyCategorical1Hot], 'fast_compile') 1454def local_gpu_crossentropycategorical1hot(op, context_name, inputs, outputs): 1455 # There is no corresponding GPU Op, but we can express it as: 1456 # coding, one_of_n = inputs 1457 # -log(coding[arange(coding.shape[0]), one_of_n]) 1458 coding, one_of_n = inputs 1459 idx0 = theano.tensor.arange(shape_i(coding, 0)) 1460 return [gpu_neg(gpu_log(coding[idx0, one_of_n]))] 1461 1462 1463@register_opt('fast_compile') 1464@op_lifter([tensor.nnet.CrossentropyCategorical1HotGrad]) 1465@register_opt2([tensor.nnet.CrossentropyCategorical1HotGrad], 'fast_compile') 1466def local_gpu_crossentropycategorical1hotgrad(op, context_name, inputs, outputs): 1467 # There is no corresponding GPU Op, but we can express it as: 1468 # gy, coding, one_of_n = inputs 1469 # gcoding = zeros_like(coding) 1470 # gcoding[arange(coding.shape[0]), one_of_n] = -g / ( 1471 # coding[arange(coding.shape[0]), one_of_n]) 1472 gy, coding, one_of_n = inputs 1473 idx0 = theano.tensor.arange(shape_i(coding, 0)) 1474 z = GpuAlloc(context_name, memset_0=True)( 1475 as_gpuarray_variable(np.zeros((), dtype=coding.dtype), context_name), 1476 *[shape_i(coding, i) for i in xrange(coding.ndim)]) 1477 gcoding = tensor.set_subtensor( 1478 z[idx0, one_of_n], 1479 gpu_neg(gpu_true_div(gy, coding[idx0, one_of_n]))) 1480 return [gcoding.transfer(context_name)] 1481 1482 1483@register_opt('fast_compile') 1484@op_lifter([theano.tensor.opt.Assert]) 1485def local_gpua_assert(op, context_name, inputs, outputs): 1486 if isinstance(inputs[0].type, GpuArrayType): 1487 return 1488 return local_gpua_assert_graph(op, context_name, inputs, outputs) 1489 1490 1491@register_opt2([theano.tensor.opt.Assert], 'fast_compile') 1492def local_gpua_assert_graph(op, context_name, inputs, outputs): 1493 return [op(as_gpuarray_variable(inputs[0], context_name), 1494 *inputs[1:])] 1495 1496 1497@register_opt('fast_compile') 1498@op_lifter([ConvOp]) 1499@register_opt2([ConvOp], 'fast_compile') 1500def local_gpua_error_convop(op, context_name, inputs, outputs): 1501 assert False, """ 1502ConvOp does not work with the gpuarray backend. 
1503 1504Use the new convolution interface to have GPU convolution working: 1505theano.tensor.nnet.conv2d() 1506""" 1507 1508 1509@register_opt('fast_compile') 1510@op_lifter([SparseBlockGemv]) 1511@register_opt2([SparseBlockGemv], 'fast_compile') 1512def local_gpua_sparseblockgemv(op, context_name, inputs, outputs): 1513 if inputs[0].dtype == 'float16': 1514 return 1515 if op.inplace: 1516 return gpu_sparse_block_gemv_inplace 1517 else: 1518 return gpu_sparse_block_gemv 1519 1520 1521@register_opt('fast_compile') 1522@op_lifter([SparseBlockOuter]) 1523@register_opt2([SparseBlockOuter], 'fast_compile') 1524def local_gpua_sparseblockouter(op, context_name, inputs, outputs): 1525 if inputs[0].dtype == 'float16': 1526 return 1527 if op.inplace: 1528 return gpu_sparse_block_outer_inplace 1529 else: 1530 return gpu_sparse_block_outer 1531 1532 1533@register_inplace() 1534@local_optimizer([GpuSparseBlockGemv], inplace=True) 1535def local_inplace_sparseblockgemv(node): 1536 if isinstance(node.op, GpuSparseBlockGemv) and not node.op.inplace: 1537 return [gpu_sparse_block_gemv_inplace(*node.inputs)] 1538 1539 1540@register_inplace() 1541@local_optimizer([GpuSparseBlockOuter], inplace=True) 1542def local_inplace_sparseblockouter(node): 1543 if isinstance(node.op, GpuSparseBlockOuter) and not node.op.inplace: 1544 return [GpuSparseBlockOuter(inplace=True)(*node.inputs)] 1545 1546 1547# Move to Gpu optimization 1548@local_optimizer([GpuFromHost, 1549 AbstractConv2d, 1550 AbstractConv2d_gradWeights, 1551 AbstractConv2d_gradInputs, 1552 AbstractConv3d, 1553 AbstractConv3d_gradWeights, 1554 AbstractConv3d_gradInputs]) 1555def local_conv_gpu_conv(node): 1556 """ 1557 gpu_from_host(AbstractConv) -> AbstractConv(gpu_from_host) 1558 1559 AbstractConv(host_from_gpu) -> host_from_gpu(AbstractConv) 1560 """ 1561 if isinstance(node.op, GpuFromHost): 1562 host_input = node.inputs[0] 1563 if host_input.owner and isinstance(host_input.owner.op, 1564 BaseAbstractConv): 1565 1566 conv = host_input.owner.op 1567 inps = list(host_input.owner.inputs) 1568 ctx = infer_context_name(*inps) 1569 inps[0] = as_gpuarray_variable(inps[0], context_name=ctx) 1570 inps[1] = as_gpuarray_variable(inps[1], context_name=ctx) 1571 out = conv(*inps) 1572 # out is on the GPU because both inputs are. 1573 out = theano.tensor.patternbroadcast(out, 1574 node.outputs[0].broadcastable) 1575 return [out] 1576 1577 if isinstance(node.op, BaseAbstractConv): 1578 # conv(host_from_gpu) -> host_from_gpu(gpu_conv) 1579 inp1 = node.inputs[0] 1580 inp2 = node.inputs[1] 1581 if ((isinstance(inp1.type, GpuArrayType) and 1582 isinstance(inp2.type, GpuArrayType))): 1583 # Both inputs are already directly on the GPU, nothing to do 1584 return 1585 1586 inp1_on_gpu = (isinstance(inp1.type, GpuArrayType) or 1587 (inp1.owner and isinstance(inp1.owner.op, HostFromGpu))) 1588 inp2_on_gpu = (isinstance(inp2.type, GpuArrayType) or 1589 (inp2.owner and isinstance(inp2.owner.op, HostFromGpu))) 1590 1591 if inp1_on_gpu or inp2_on_gpu: 1592 conv = node.op 1593 inps = list(node.inputs) 1594 ctx = infer_context_name(*inps) 1595 inps[0] = as_gpuarray_variable(inps[0], context_name=ctx) 1596 inps[1] = as_gpuarray_variable(inps[1], context_name=ctx) 1597 out = conv(*inps) 1598 # out is on the GPU because both inputs are. 
1599 out = theano.tensor.patternbroadcast( 1600 out, 1601 node.outputs[0].broadcastable) 1602 # If the original output was on CPU, we have to transfer it 1603 if isinstance(node.outputs[0].type, tensor.TensorType): 1604 return [tensor.as_tensor_variable(out)] 1605 else: 1606 return [out] 1607 1608 1609register_opt()(local_conv_gpu_conv) 1610 1611 1612# CorrMM opt 1613@local_optimizer([AbstractConv2d]) 1614def local_abstractconv_gemm(node): 1615 if not isinstance(node.op, AbstractConv2d): 1616 return None 1617 img, kern = node.inputs 1618 if (not isinstance(img.type, GpuArrayType) or 1619 not isinstance(kern.type, GpuArrayType)): 1620 return None 1621 ctx = infer_context_name(img, kern) 1622 1623 border_mode = node.op.border_mode 1624 subsample = node.op.subsample 1625 filter_dilation = node.op.filter_dilation 1626 num_groups = node.op.num_groups 1627 unshared = node.op.unshared 1628 1629 flip = (slice(None),) * (kern.ndim - 2) + \ 1630 (slice(None, None, -1),) * 2 1631 kern_axes = (1, 0) + tuple(i for i in range(2, kern.ndim)) 1632 if ((border_mode == 'full') and (subsample == (1, 1)) and num_groups == 1 and not unshared): 1633 if not node.op.filter_flip: 1634 kern = kern[flip] 1635 # need to dimshuffle the kernel for full convolution 1636 kern = kern.dimshuffle(kern_axes) 1637 # call GpuCorrMM_gradInputs 1638 rval = GpuCorrMM_gradInputs('valid', 1639 subsample, 1640 filter_dilation)( 1641 gpu_contiguous(kern), gpu_contiguous(img)) 1642 else: 1643 # need to flip the kernel if necessary 1644 if node.op.filter_flip: 1645 kern = kern[flip] 1646 # By default use GpuCorrMM 1647 rval = GpuCorrMM(border_mode, 1648 subsample, 1649 filter_dilation, 1650 num_groups, 1651 unshared)(gpu_contiguous(img), 1652 gpu_contiguous(kern)) 1653 1654 # call GpuCorrMM_gradWeights if good 1655 # (the latter is faster if batchsize * kernelHeight * kernelWidth 1656 # is larger than inputChannels * outputHeight * outputWidth. 1657 # GpuConv does not always store information on the batchsize and 1658 # channels, though, so we only use what information we have.) 
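        # (Illustrative: roughly, a large batch with a small output map
        # favours the gradWeights kernel, while few images with large
        # outputs keep the default GpuCorrMM call above.)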
1659 if ((subsample == (1, 1)) and (filter_dilation == (1, 1)) and 1660 (node.op.imshp is not None) and 1661 (None not in node.op.imshp[-2:]) and 1662 (node.op.kshp is not None) and 1663 (None not in node.op.kshp) and 1664 border_mode != "half" and 1665 num_groups == 1 and 1666 not unshared): 1667 # we know the kernel and output size 1668 prod1 = node.op.kshp[0] * node.op.kshp[-3] 1669 prod2 = ((node.op.imshp[-2] - node.op.kshp[0] + 1) * 1670 (node.op.imshp[-1] - node.op.kshp[-3] + 1)) 1671 if (None not in node.op.imshp[:1]): 1672 # we also know batchsize and input channels 1673 prod1 *= node.op.imshp[0] 1674 prod2 *= node.op.imshp[1] 1675 # compare to decide 1676 if prod1 > prod2: 1677 rval = GpuCorrMM_gradWeights(border_mode, 1678 subsample, 1679 filter_dilation)( 1680 gpu_contiguous(img.dimshuffle(1, 0, 2, 3)), 1681 gpu_contiguous(kern.dimshuffle(1, 0, 2, 3))) 1682 # (we need to wrap the result in as_gpuarray_variable, 1683 # because we are not allowed to replace a GpuArray with 1684 # a DimShuffle instance in a graph optimization) 1685 rval = as_gpuarray_variable( 1686 rval.dimshuffle(1, 0, 2, 3), 1687 context_name=ctx) 1688 return [rval] 1689 1690 1691# CorrMM opt used for Meta-optimizer 1692@local_optimizer([AbstractConv2d]) 1693def local_abstractconv_gemm_def(node): 1694 if not isinstance(node.op, AbstractConv2d): 1695 return None 1696 img, kern = node.inputs 1697 if (not isinstance(img.type, GpuArrayType) or 1698 not isinstance(kern.type, GpuArrayType)): 1699 return None 1700 1701 border_mode = node.op.border_mode 1702 subsample = node.op.subsample 1703 filter_dilation = node.op.filter_dilation 1704 num_groups = node.op.num_groups 1705 unshared = node.op.unshared 1706 1707 if node.op.filter_flip: 1708 flip = (slice(None),) * (kern.ndim - 2) + \ 1709 (slice(None, None, -1),) * 2 1710 kern = kern[flip] 1711 rval = GpuCorrMM(border_mode, 1712 subsample, 1713 filter_dilation, 1714 num_groups, 1715 unshared)(gpu_contiguous(img), 1716 gpu_contiguous(kern)) 1717 return [rval] 1718 1719 1720@local_optimizer([AbstractConv2d]) 1721def local_abstractconv_gemm_alt(node): 1722 if not isinstance(node.op, AbstractConv2d): 1723 return None 1724 img, kern = node.inputs 1725 if (not isinstance(img.type, GpuArrayType) or 1726 not isinstance(kern.type, GpuArrayType)): 1727 return None 1728 ctx = infer_context_name(img, kern) 1729 1730 border_mode = node.op.border_mode 1731 subsample = node.op.subsample 1732 filter_dilation = node.op.filter_dilation 1733 num_groups = node.op.num_groups 1734 unshared = node.op.unshared 1735 1736 if border_mode == 'full' and subsample == (1, 1) and num_groups == 1 and not unshared: 1737 if not node.op.filter_flip: 1738 kern = kern[:, :, ::-1, ::-1] 1739 1740 kern = kern.dimshuffle(1, 0, 2, 3) 1741 rval = GpuCorrMM_gradInputs('valid', 1742 subsample, 1743 filter_dilation)( 1744 gpu_contiguous(kern), gpu_contiguous(img)) 1745 1746 elif (border_mode == 'valid' and subsample == (1, 1) and filter_dilation == (1, 1) and 1747 num_groups == 1 and not unshared): 1748 if node.op.filter_flip: 1749 kern = kern[:, :, ::-1, ::-1] 1750 1751 rval = GpuCorrMM_gradWeights(border_mode, 1752 subsample, 1753 filter_dilation)( 1754 gpu_contiguous(img.dimshuffle(1, 0, 2, 3)), 1755 gpu_contiguous(kern.dimshuffle(1, 0, 2, 3))) 1756 rval = as_gpuarray_variable(rval.dimshuffle(1, 0, 2, 3), 1757 context_name=ctx) 1758 else: 1759 return None 1760 1761 return [rval] 1762 1763 1764@local_optimizer([AbstractConv3d]) 1765def local_abstractconv3d_gemm(node): 1766 if not isinstance(node.op, 
                      AbstractConv3d):
        return None
    img, kern = node.inputs
    if (not isinstance(img.type, GpuArrayType) or
            not isinstance(kern.type, GpuArrayType)):
        return None
    ctx = infer_context_name(img, kern)

    border_mode = node.op.border_mode
    subsample = node.op.subsample
    filter_dilation = node.op.filter_dilation
    num_groups = node.op.num_groups
    if ((border_mode == 'full') and (subsample == (1, 1, 1)) and num_groups == 1):
        if not node.op.filter_flip:
            kern = kern[:, :, ::-1, ::-1, ::-1]
        # need to dimshuffle the kernel for full convolution
        kern = kern.dimshuffle(1, 0, 2, 3, 4)
        # call GpuCorr3dMM_gradInputs
        rval = GpuCorr3dMM_gradInputs('valid',
                                      subsample,
                                      filter_dilation)(
            gpu_contiguous(kern), gpu_contiguous(img))
    else:
        # need to flip the kernel if necessary
        if node.op.filter_flip:
            kern = kern[:, :, ::-1, ::-1, ::-1]
        # By default use GpuCorr3dMM
        rval = GpuCorr3dMM(border_mode,
                           subsample,
                           filter_dilation,
                           num_groups)(gpu_contiguous(img),
                                       gpu_contiguous(kern))

        # call GpuCorr3dMM_gradWeights if good
        # (the latter is faster if batchsize * kernelHeight * kernelWidth * kernelDepth
        # is larger than inputChannels * outputHeight * outputWidth * outputDepth.
        # GpuConv does not always store information on the batchsize and
        # channels, though, so we only use what information we have.)
        if ((subsample == (1, 1, 1)) and (filter_dilation == (1, 1, 1)) and
                (node.op.imshp is not None) and
                (None not in node.op.imshp[-3:]) and
                (node.op.kshp is not None) and
                (None not in node.op.kshp) and
                border_mode != "half" and
                num_groups == 1):
            # we know the kernel and output size
            prod1 = node.op.kshp[0] * node.op.kshp[1] * node.op.kshp[2]
            prod2 = ((node.op.imshp[-3] - node.op.kshp[0] + 1) *
                     (node.op.imshp[-2] - node.op.kshp[1] + 1) *
                     (node.op.imshp[-1] - node.op.kshp[2] + 1))
            if (None not in node.op.imshp[:1]):
                # we also know batchsize and input channels
                prod1 *= node.op.imshp[0]
                prod2 *= node.op.imshp[1]
            # compare to decide
            if prod1 > prod2:
                rval = GpuCorr3dMM_gradWeights(border_mode,
                                               subsample,
                                               filter_dilation)(
                    gpu_contiguous(img.dimshuffle(1, 0, 2, 3, 4)),
                    gpu_contiguous(kern.dimshuffle(1, 0, 2, 3, 4)))
                # (we need to wrap the result in as_gpuarray_variable,
                # because we are not allowed to replace a GpuArray with
                # a DimShuffle instance in a graph optimization)
                rval = as_gpuarray_variable(
                    rval.dimshuffle(1, 0, 2, 3, 4),
                    context_name=ctx)
    return [rval]


# Corr3dMM opt used for Meta-optimizer
@local_optimizer([AbstractConv3d])
def local_abstractconv3d_gemm_def(node):
    if not isinstance(node.op, AbstractConv3d):
        return None
    img, kern = node.inputs
    if (not isinstance(img.type, GpuArrayType) or
            not isinstance(kern.type, GpuArrayType)):
        return None

    border_mode = node.op.border_mode
    subsample = node.op.subsample
    filter_dilation = node.op.filter_dilation
    if node.op.filter_flip:
        kern = kern[:, :, ::-1, ::-1, ::-1]
    # By default use GpuCorr3dMM
    rval = GpuCorr3dMM(border_mode,
                       subsample,
                       filter_dilation,
                       node.op.num_groups)(gpu_contiguous(img),
                                           gpu_contiguous(kern))
    return [rval]


@local_optimizer([AbstractConv3d])
def local_abstractconv3d_alt(node):
    if not isinstance(node.op,
                      AbstractConv3d):
        return None
    img, kern = node.inputs
    if (not isinstance(img.type, GpuArrayType) or
            not isinstance(kern.type, GpuArrayType)):
        return None
    ctx = infer_context_name(img, kern)

    border_mode = node.op.border_mode
    subsample = node.op.subsample
    filter_dilation = node.op.filter_dilation
    num_groups = node.op.num_groups

    if ((border_mode == 'full') and (subsample == (1, 1, 1)) and
            (num_groups == 1)):
        if not node.op.filter_flip:
            kern = kern[:, :, ::-1, ::-1, ::-1]
        kern = kern.dimshuffle(1, 0, 2, 3, 4)
        rval = GpuCorr3dMM_gradInputs('valid',
                                      subsample,
                                      filter_dilation)(
            gpu_contiguous(kern), gpu_contiguous(img))

    elif (subsample == (1, 1, 1) and filter_dilation == (1, 1, 1) and
            border_mode == 'valid' and num_groups == 1):
        if node.op.filter_flip:
            kern = kern[:, :, ::-1, ::-1, ::-1]
        rval = GpuCorr3dMM_gradWeights(border_mode,
                                       subsample,
                                       filter_dilation)(
            gpu_contiguous(img.dimshuffle(1, 0, 2, 3, 4)),
            gpu_contiguous(kern.dimshuffle(1, 0, 2, 3, 4)))
        rval = as_gpuarray_variable(rval.dimshuffle(1, 0, 2, 3, 4),
                                    context_name=ctx)
    else:
        return None
    return [rval]


@local_optimizer([AbstractConv3d])
def local_abstractconv3d2d(node):
    if not isinstance(node.op, AbstractConv3d):
        return None
    img, kern = node.inputs
    if (not isinstance(img.type, GpuArrayType) or
            not isinstance(kern.type, GpuArrayType)):
        return None

    ctx = infer_context_name(img, kern)
    border_mode = node.op.border_mode
    subsample = node.op.subsample
    filter_dilation = node.op.filter_dilation
    num_groups = node.op.num_groups

    if (subsample == (1, 1, 1) and filter_dilation == (1, 1, 1) and
            num_groups == 1):
        reorder_array = [0, 2, 1, 3, 4]
        rval = conv3d2d.conv3d(gpu_contiguous(img.dimshuffle(*reorder_array)),
                               gpu_contiguous(kern.dimshuffle(*reorder_array)),
                               [node.op.imshp[i] for i in reorder_array],
                               [node.op.kshp[i] for i in reorder_array],
                               border_mode=border_mode)
        rval = as_gpuarray_variable(rval.dimshuffle(*reorder_array),
                                    context_name=ctx)

        return [rval]
    else:
        return None


@local_optimizer([AbstractConv2d_gradWeights])
def local_abstractconv_gradweights_gemm(node):
    if not isinstance(node.op, AbstractConv2d_gradWeights):
        return None
    img, topgrad, shape = node.inputs
    if not isinstance(img.type, GpuArrayType) or \
            not isinstance(topgrad.type, GpuArrayType):
        return None
    ctx = infer_context_name(img, topgrad)

    rval = GpuCorrMM_gradWeights(border_mode=node.op.border_mode,
                                 subsample=node.op.subsample,
                                 filter_dilation=node.op.filter_dilation,
                                 num_groups=node.op.num_groups,
                                 unshared=node.op.unshared)(
        gpu_contiguous(img), gpu_contiguous(topgrad), shape)
    flip = (slice(None),) * (rval.ndim - 2) + \
        (slice(None, None, -1),) * 2
    if node.op.filter_flip:
        rval = rval[flip]
    rval = tensor.patternbroadcast(rval, node.outputs[0].broadcastable)
    rval = as_gpuarray_variable(rval, context_name=ctx)
    return [rval]


@local_optimizer([AbstractConv2d_gradWeights])
def local_abstractconv_gemm_gradweights_alt(node):
    if not isinstance(node.op, AbstractConv2d_gradWeights):
        return None
    img, topgrad, shape = node.inputs
    if not isinstance(img.type, GpuArrayType) or \
            not isinstance(topgrad.type,
                               GpuArrayType):
        return None
    ctx = infer_context_name(img, topgrad)
    border_mode = node.op.border_mode
    subsample = node.op.subsample
    filter_dilation = node.op.filter_dilation
    num_groups = node.op.num_groups
    unshared = node.op.unshared

    if (border_mode == 'valid' and subsample == (1, 1) and filter_dilation == (1, 1) and
            num_groups == 1 and not unshared):
        rval = GpuCorrMM(border_mode,
                         subsample,
                         filter_dilation)(
            gpu_contiguous(img.dimshuffle(1, 0, 2, 3)),
            gpu_contiguous(topgrad.dimshuffle(1, 0, 2, 3)))

        if node.op.filter_flip:
            rval = rval[:, :, ::-1, ::-1]

        rval = rval.dimshuffle(1, 0, 2, 3)
        rval = tensor.patternbroadcast(rval, node.outputs[0].broadcastable)
        rval = as_gpuarray_variable(rval, context_name=ctx)
        return [rval]
    else:
        return None


@local_optimizer([AbstractConv3d_gradWeights])
def local_abstractconv3d_gemm_gradweights_alt(node):
    if not isinstance(node.op, AbstractConv3d_gradWeights):
        return None
    img, topgrad, shape = node.inputs
    if not isinstance(img.type, GpuArrayType) or \
            not isinstance(topgrad.type, GpuArrayType):
        return None
    ctx = infer_context_name(img, topgrad)
    border_mode = node.op.border_mode
    subsample = node.op.subsample
    filter_dilation = node.op.filter_dilation
    num_groups = node.op.num_groups

    if (border_mode == 'valid' and subsample == (1, 1, 1) and
            filter_dilation == (1, 1, 1) and num_groups == 1):
        rval = GpuCorr3dMM(border_mode,
                           subsample,
                           filter_dilation)(
            gpu_contiguous(img.dimshuffle(1, 0, 2, 3, 4)),
            gpu_contiguous(topgrad.dimshuffle(1, 0, 2, 3, 4)))

        if node.op.filter_flip:
            rval = rval[:, :, ::-1, ::-1, ::-1]

        rval = rval.dimshuffle(1, 0, 2, 3, 4)
        rval = tensor.patternbroadcast(rval, node.outputs[0].broadcastable)
        rval = as_gpuarray_variable(rval, context_name=ctx)
        return [rval]
    else:
        return None


@local_optimizer([AbstractConv3d_gradWeights])
def local_abstractconv3d_gradweights_gemm(node):
    if not isinstance(node.op, AbstractConv3d_gradWeights):
        return None
    img, topgrad, shape = node.inputs
    if not isinstance(img.type, GpuArrayType) or \
            not isinstance(topgrad.type, GpuArrayType):
        return None
    ctx = infer_context_name(img, topgrad)

    rval = GpuCorr3dMM_gradWeights(border_mode=node.op.border_mode,
                                   subsample=node.op.subsample,
                                   filter_dilation=node.op.filter_dilation,
                                   num_groups=node.op.num_groups)(
        gpu_contiguous(img), gpu_contiguous(topgrad), shape)
    if node.op.filter_flip:
        rval = rval[:, :, ::-1, ::-1, ::-1]
    rval = tensor.patternbroadcast(rval, node.outputs[0].broadcastable)
    rval = as_gpuarray_variable(rval, context_name=ctx)
    return [rval]


@local_optimizer([AbstractConv2d_gradInputs])
def local_abstractconv_gradinputs_gemm(node):
    if not isinstance(node.op, AbstractConv2d_gradInputs):
        return None
    kern, topgrad, shape = node.inputs
    if not isinstance(kern.type, GpuArrayType) or \
            not isinstance(topgrad.type, GpuArrayType):
        return None

    if node.op.filter_flip:
        flip = (slice(None),) * (kern.ndim - 2) + \
            (slice(None, None, -1),) * 2
        kern = kern[flip]

    rval = GpuCorrMM_gradInputs(border_mode=node.op.border_mode,
                                subsample=node.op.subsample,
                                filter_dilation=node.op.filter_dilation,
                                num_groups=node.op.num_groups,
                                unshared=node.op.unshared)(
        gpu_contiguous(kern), gpu_contiguous(topgrad), shape)
    return [rval]


@local_optimizer([AbstractConv2d_gradInputs])
def local_abstractconv_gradinputs_gemm_alt(node):
    if not isinstance(node.op, AbstractConv2d_gradInputs):
        return None
    kern, topgrad, shape = node.inputs
    if not isinstance(kern.type, GpuArrayType) or \
            not isinstance(topgrad.type, GpuArrayType):
        return None
    border_mode = node.op.border_mode
    subsample = node.op.subsample
    filter_dilation = node.op.filter_dilation
    num_groups = node.op.num_groups
    unshared = node.op.unshared

    if border_mode == 'valid' and subsample == (1, 1) and num_groups == 1 and not unshared:
        if not node.op.filter_flip:
            kern = kern[:, :, ::-1, ::-1]

        rval = GpuCorrMM(border_mode='full',
                         subsample=subsample,
                         filter_dilation=filter_dilation)(
            gpu_contiguous(topgrad),
            gpu_contiguous(kern.dimshuffle(1, 0, 2, 3)))
        return [rval]
    else:
        return None


@local_optimizer([AbstractConv3d_gradInputs])
def local_abstractconv3d_gradinputs_gemm(node):
    if not isinstance(node.op, AbstractConv3d_gradInputs):
        return None
    kern, topgrad, shape = node.inputs
    if not isinstance(kern.type, GpuArrayType) or \
            not isinstance(topgrad.type, GpuArrayType):
        return None

    if node.op.filter_flip:
        kern = kern[:, :, ::-1, ::-1, ::-1]

    rval = GpuCorr3dMM_gradInputs(border_mode=node.op.border_mode,
                                  subsample=node.op.subsample,
                                  filter_dilation=node.op.filter_dilation,
                                  num_groups=node.op.num_groups)(
        gpu_contiguous(kern), gpu_contiguous(topgrad), shape)
    return [rval]


@local_optimizer([AbstractConv3d_gradInputs])
def local_abstractconv3d_gradinputs_gemm_alt(node):
    if not isinstance(node.op, AbstractConv3d_gradInputs):
        return None
    kern, topgrad, shape = node.inputs
    if not isinstance(kern.type, GpuArrayType) or \
            not isinstance(topgrad.type, GpuArrayType):
        return None
    border_mode = node.op.border_mode
    subsample = node.op.subsample
    filter_dilation = node.op.filter_dilation
    num_groups = node.op.num_groups

    if (border_mode == 'valid' and subsample == (1, 1, 1) and
            num_groups == 1):
        if not node.op.filter_flip:
            kern = kern[:, :, ::-1, ::-1, ::-1]
        rval = GpuCorr3dMM(border_mode='full',
                           subsample=subsample,
                           filter_dilation=filter_dilation)(
            gpu_contiguous(topgrad),
            gpu_contiguous(kern.dimshuffle(1, 0, 2, 3, 4)))
        return [rval]
    else:
        return None


class ConvMetaOptimizer(LocalMetaOptimizer):

    def __init__(self):
        super(ConvMetaOptimizer, self).__init__()

    def time_call(self, fn):
        start = time.time()
        fn()[0].sync()
        return time.time() - start

    def provide_inputs(self, node, inputs):
        result = {}

        shapes = (node.op.imshp, node.op.kshp)
        if (node.op.imshp is None or node.op.kshp is None or
                any([s is None for shape in shapes for s in shape])):
            return result

        if type(node.op) in [AbstractConv2d, AbstractConv3d]:
            img, kern = node.inputs
            for (var, shape) in zip((img, kern), shapes):
                result[var] = theano.shared(np.random.random(shape).astype(var.dtype),
                                            var.name,
                                            broadcastable=var.broadcastable,
                                            borrow=True)

        if type(node.op) in [AbstractConv2d_gradWeights,
                             AbstractConv3d_gradWeights]:
            img, top, kshape = node.inputs

            tshp = get_conv_output_shape(node.op.imshp,
                                         node.op.kshp,
                                         node.op.border_mode,
                                         node.op.subsample,
                                         node.op.filter_dilation)
            convdim = img.ndim - 2

            result[kshape] = theano.tensor.as_tensor_variable(node.op.kshp[-convdim:])

            for (var, shape) in zip((img, top), (node.op.imshp, tshp)):
                result[var] = theano.shared(np.random.random(shape).astype(var.dtype),
                                            var.name,
                                            broadcastable=var.broadcastable,
                                            borrow=True)

        if type(node.op) in [AbstractConv2d_gradInputs, AbstractConv3d_gradInputs]:
            kern, top, ishape = node.inputs

            tshp = get_conv_output_shape(node.op.imshp,
                                         node.op.kshp,
                                         node.op.border_mode,
                                         node.op.subsample,
                                         node.op.filter_dilation)

            result[ishape] = theano.tensor.as_tensor_variable(node.op.imshp[2:])

            for (var, shape) in zip((kern, top), (node.op.kshp, tshp)):
                result[var] = theano.shared(np.random.random(shape).astype(var.dtype),
                                            var.name,
                                            broadcastable=var.broadcastable,
                                            borrow=True)

        return result

    def get_opts(self, node):
        opts = Counter([opt for opt in self.track_dict[type(node.op)]
                        if opt in self.tag_dict['default']])
        include_tags = config.metaopt.optimizer_including.split(':')
        exclude_tags = config.metaopt.optimizer_excluding.split(':')

        for in_opt in include_tags:
            opts.update([opt for opt in self.track_dict[type(node.op)]
                         if opt in self.tag_dict[in_opt]])

        for ex_opt in exclude_tags:
            opts.subtract([opt for opt in self.track_dict[type(node.op)]
                           if opt in self.tag_dict[ex_opt]])

        opts = list(opts + Counter())
        return opts


# This deals with any abstract convs that have a transfer somewhere
@register_opt('fast_compile', 'conv_dnn', 'cudnn')
@op_lifter([AbstractConv2d,
            AbstractConv2d_gradWeights,
            AbstractConv2d_gradInputs,
            AbstractConv3d,
            AbstractConv3d_gradWeights,
            AbstractConv3d_gradInputs])
def local_gpua_abstractconv(op, context_name, inputs, outputs):
    if isinstance(outputs[0].type, GpuArrayType):
        # Don't handle this node here, it's already on the GPU.
        return
    return local_gpua_lift_abstractconv_graph(op, context_name, inputs, outputs)


@register_opt2([AbstractConv2d,
                AbstractConv2d_gradWeights,
                AbstractConv2d_gradInputs,
                AbstractConv3d,
                AbstractConv3d_gradWeights,
                AbstractConv3d_gradInputs], 'fast_compile')
def local_gpua_lift_abstractconv_graph(op, context_name, inputs, outputs):
    inps = list(inputs)
    inps[0] = as_gpuarray_variable(inputs[0],
                                   context_name=context_name)
    inps[1] = as_gpuarray_variable(inputs[1],
                                   context_name=context_name)
    return [op(*inps)]


def local_gpu_pool(op, ctx_name, inputs, outputs):
    assert op.__props__ == ('ignore_border', 'mode', 'ndim')
    inp, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        return
    inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name))

    op = GpuPool(op.ignore_border, op.mode, op.ndim)
    if inp.ndim == nd + 2:
        return op(inp, ws, stride, pad)
    else:
        # reshape to 4D or 5D with 2 non-pooling dimensions
        inp_padded = pad_dims(inp, 2, nd)
        ret_padded = op(inp_padded, ws, stride, pad)
        return unpad_dims(ret_padded, inp, 2, nd)


pool_db = LocalGroupDB()
pool_db2 = LocalGroupDB(local_opt=theano.gof.opt.GraphToGPULocalOptGroup)
pool_db2.__name__ = "pool_db2"
lifter = op_lifter([pool.Pool])(local_gpu_pool)
pool_db.register("local_gpu_pool", lifter,
                 'gpuarray', 'fast_compile', 'fast_run',
                 position=1)
pool_db2.register("local_gpu_pool",
                  local_optimizer([pool.Pool])(local_gpu_pool),
                  'gpuarray', 'fast_compile', 'fast_run',
                  position=1)
register_opt('fast_compile', name='pool_db')(pool_db)
register_opt2([pool.Pool], 'fast_compile', name='pool_db2')(pool_db2)


def local_gpu_max_pool_grad(op, ctx_name, inputs, outputs):
    assert op.__props__ == ('ignore_border', 'mode', 'ndim')

    inp, out, out_grad, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        return
    inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name))
    out = gpu_contiguous(as_gpuarray_variable(out, ctx_name))
    out_grad = gpu_contiguous(as_gpuarray_variable(out_grad, ctx_name))

    op = GpuMaxPoolGrad(op.ignore_border, op.mode, op.ndim)
    if inp.ndim == nd + 2:
        return op(inp, out, out_grad, ws, stride, pad)
    else:
        # reshape to 4D or 5D with 2 non-pooling dimensions
        inp_padded = pad_dims(inp, 2, nd)
        out_padded = pad_dims(out, 2, nd)
        out_grad_padded = pad_dims(out_grad, 2, nd)
        ret_padded = op(inp_padded, out_padded, out_grad_padded,
                        ws, stride, pad)
        return unpad_dims(ret_padded, inp, 2, nd)


lifter = op_lifter([pool.MaxPoolGrad])(local_gpu_max_pool_grad)
pool_db.register("local_gpu_max_pool_grad", lifter,
                 'gpuarray', 'fast_compile', 'fast_run',
                 position=1)
pool_db2.register("local_gpu_max_pool_grad",
                  local_optimizer([pool.MaxPoolGrad])(local_gpu_max_pool_grad),
                  'gpuarray', 'fast_compile', 'fast_run',
                  position=1)


def local_gpu_average_pool_grad(op, ctx_name, inputs, outputs):
    assert op.__props__ == ('ignore_border', 'mode', 'ndim')

    inp, out_grad, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        return
    inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name))
    out_grad = gpu_contiguous(as_gpuarray_variable(out_grad, ctx_name))

    op = GpuAveragePoolGrad(op.ignore_border, op.mode, op.ndim)
    if inp.ndim == nd + 2:
        return op(inp, out_grad, ws, stride, pad)
    else:
        # reshape to 4D or 5D with 2 non-pooling dimensions
        inp_padded = pad_dims(inp, 2, nd)
        out_grad_padded = pad_dims(out_grad, 2, nd)
        ret_padded = op(inp_padded, out_grad_padded,
                        ws, stride, pad)
        return unpad_dims(ret_padded, inp, 2, nd)


lifter = op_lifter([pool.AveragePoolGrad])(local_gpu_average_pool_grad)
pool_db.register("local_gpu_average_pool_grad", lifter,
                 'gpuarray', 'fast_compile', 'fast_run',
                 position=1)
pool_db2.register("local_gpu_average_pool_grad",
                  local_optimizer([pool.AveragePoolGrad])(local_gpu_average_pool_grad),
                  'gpuarray', 'fast_compile', 'fast_run',
                  position=1)


@register_opt()
@op_lifter([pool.DownsampleFactorMaxGradGrad])
@register_opt2([pool.DownsampleFactorMaxGradGrad])
def local_gpu_downsample_factor_max_grad_grad(op, ctx_name, inputs, outputs):
    assert op.__props__ == ('ignore_border', 'mode', 'ndim')
    inp, out, out_grad, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        return
    inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name))
    out = gpu_contiguous(as_gpuarray_variable(out, ctx_name))
    out_grad = gpu_contiguous(as_gpuarray_variable(out_grad, ctx_name))

    op = GpuDownsampleFactorMaxGradGrad(op.ignore_border, op.mode, op.ndim)
    if inp.ndim == nd + 2:
        return op(inp, out, out_grad, ws, stride, pad)
    else:
        # reshape to 4D or 5D with 2 non-pooling dimensions
        inp_padded = pad_dims(inp, 2, nd)
        out_padded = pad_dims(out, 2, nd)
        out_grad_padded = pad_dims(out_grad, 2, nd)
        ret_padded = op(inp_padded, out_padded, out_grad_padded,
                        ws, stride, pad)
        return unpad_dims(ret_padded, inp, 2, nd)


@register_opt()
@op_lifter([pool.MaxPoolRop])
@register_opt2([pool.MaxPoolRop])
def local_gpu_max_pool_rop(op, ctx_name, inputs, outputs):
    assert op.__props__ == ('ignore_border', 'mode', 'ndim')
    inp, eval_inp, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        return
    inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name))
    eval_inp = gpu_contiguous(as_gpuarray_variable(eval_inp, ctx_name))

    op = GpuMaxPoolRop(op.ignore_border, op.mode, op.ndim)
    if inp.ndim == nd + 2:
        return op(inp, eval_inp, ws, stride, pad)
    else:
        # reshape to 4D or 5D with 2 non-pooling dimensions
        inp_padded = pad_dims(inp, 2, nd)
        eval_inp_padded = pad_dims(eval_inp, 2, nd)
        ret_padded = op(inp_padded, eval_inp_padded, ws, stride, pad)
        return unpad_dims(ret_padded, inp, 2, nd)


@register_opt("low_memory")
@local_optimizer([GpuCAReduceCuda])
def local_gpu_elemwise_careduce(node):
    """
    Merge some GpuCAReduceCuda and GpuElemwise.
    Currently merged:
     - SUM(X^2)
     - SUM(ABS(X))

    """
    if (isinstance(node.op, GpuCAReduceCuda) and
            node.op.pre_scalar_op is None and
            node.inputs[0].owner and
            isinstance(node.inputs[0].owner.op, GpuElemwise) and
            # The op supports all scalar ops with 1 input. We don't
            # automatically add more cases, as some of them (e.g.
            # trigonometric operations) combined with some reduction
            # patterns would probably result in a slowdown.
            isinstance(node.inputs[0].owner.op.scalar_op, (scalar.basic.Sqr,
                                                           scalar.basic.Abs))):
        inp = node.inputs[0].owner.inputs[0]
        props = node.op._props_dict()
        props["pre_scalar_op"] = node.inputs[0].owner.op.scalar_op
        with inherit_stack_trace(node.outputs):
            out = GpuCAReduceCuda(**props)(inp)
            return [out]


@local_optimizer(None)
def local_assert_no_cpu_op(node):
    if (all([var.owner and isinstance(var.owner.op, HostFromGpu)
             for var in node.inputs]) and
            any([[c for c in var.clients if isinstance(c[0].op, GpuFromHost)]
                 for var in node.outputs])):

        if config.assert_no_cpu_op == "warn":
            _logger.warning(("CPU Op %s is detected in the computation "
                             "graph") % node)
        elif config.assert_no_cpu_op == "raise":
            raise AssertionError("The Op %s is on CPU." % node)
        elif config.assert_no_cpu_op == "pdb":
            pdb.set_trace()


# Register the local_assert_no_cpu_op:
assert_no_cpu_op = theano.tensor.opt.in2out(local_assert_no_cpu_op,
                                            name='assert_no_cpu_op')
# 49.2 is after device specialization & fusion optimizations for last transfers
optdb.register('gpua_assert_no_cpu_op', assert_no_cpu_op, 49.2,
               'assert_no_cpu_op')


def tensor_to_gpu(x, context_name):
    if isinstance(x.type, tensor.TensorType):
        y = GpuArrayType(broadcastable=x.type.broadcastable,
                         context_name=context_name,
                         dtype=x.type.dtype)()
        if x.name:
            y.name = x.name + '[Gpua]'
        return y
    else:
        return x


def gpu_safe_new(x, tag=''):
    """
    Internal function that constructs a new variable from x with the same
    type, but with a different name (old name + tag). This function is used
    by the gradient or the R-op to construct new variables for the inputs of
    the inner graph such that there is no interference between the original
    graph and the newly constructed graph.

    """
    if hasattr(x, 'name') and x.name is not None:
        nw_name = x.name + tag
    else:
        nw_name = None

    if isinstance(x, theano.Constant):
        return x.clone()

    nw_x = x.type()
    nw_x.name = nw_name
    return nw_x


def gpu_reconstruct_graph(inputs, outputs, tag=None):
    """
    A different interface to clone that allows you to pass inputs.
    Compared to clone, this method always replaces the inputs with
    new variables of the same type, and returns those (in the same
    order as the original inputs).

    """
    if tag is None:
        tag = ''
    nw_inputs = [gpu_safe_new(x, tag) for x in inputs]
    givens = {}
    for nw_x, x in zip(nw_inputs, inputs):
        givens[x] = nw_x
    nw_outputs = scan_utils.clone(outputs, replace=givens)
    return (nw_inputs, nw_outputs)


@register_opt('scan', 'fast_compile')
@op_lifter([scan_op.Scan])
@register_opt2([scan_op.Scan], 'fast_compile')
def local_gpua_scan_to_gpua(op, context_name, inputs, outputs):
    info = copy.deepcopy(op.info)
    if info.get('gpua', False):
        return
    info['gpua'] = True
    nw_ins = [inputs[0]]
    e = (1 +
         op.n_seqs +
         op.n_mit_mot +
         op.n_mit_sot +
         op.n_sit_sot +
         op.n_shared_outs)
    nw_ins += [safe_to_gpu(x, context_name) for x in inputs[1:e]]
    b = e
    e = e + op.n_nit_sot
    nw_ins += inputs[b:e]
    nw_ins += [safe_to_gpu(x, context_name) for x in inputs[e:]]
    scan_ins = [tensor_to_gpu(x, context_name) for x in op.inputs]

    # The inner output corresponding to the looping condition should not be
    # moved to the GPU
    if op.info['as_while']:
        scan_outs = [safe_to_gpu(x, context_name) for x in op.outputs[:-1]]
        scan_outs += [op.outputs[-1]]
    else:
        scan_outs = [safe_to_gpu(x, context_name) for x in op.outputs]
    scan_outs = scan_utils.clone(
        scan_outs,
        replace=list(zip(op.inputs,
                         (safe_to_cpu(x) for x in scan_ins))))

    # We need to construct the hash here, because scan
    # __init__ does not know about the GPU and cannot
    # handle graphs with inputs being on the GPU
    tmp_in, tmp_out = gpu_reconstruct_graph(scan_ins, scan_outs)
    local_fgraph = gof.FunctionGraph(tmp_in, tmp_out, clone=True)
    _cmodule_key = gof.CLinker().cmodule_key_(local_fgraph, [])
    info['gpu_hash'] = hash(_cmodule_key)

    def typebuild(dtype, broadcastable, context_name=context_name):
        return GpuArrayType(dtype=dtype, broadcastable=broadcastable,
                            context_name=context_name)

    nw_op = scan_op.Scan(scan_ins, scan_outs, info,
                         typeConstructor=typebuild).make_node(*nw_ins)
    return nw_op.outputs


def _scan_type_infer(node):
    context_name = infer_context_name(*node.inputs)

    def typebuild(dtype, broadcastable, context_name=context_name):
        return GpuArrayType(dtype=dtype, broadcastable=broadcastable,
                            context_name=context_name)
    return typebuild


# Add optimization: MaxAndArgmax (CPU -> GPU)
@register_opt('fast_compile')
@op_lifter([tensor.MaxAndArgmax])
@register_opt2([tensor.MaxAndArgmax], 'fast_compile')
def local_gpu_maxandargmax(op, context_name, inputs, outputs):
    op = GpuMaxAndArgmax(op.get_params(None))
    if inputs[0].dtype == "float16":
        # For now it is better to copy/cast on the GPU, then transfer to the CPU
        casted_inputs = inputs[0].astype('float32')
        ret = op(casted_inputs)
        return [ret[0].astype('float16'), ret[1]]
    return op


@register_opt('fast_compile')
@op_lifter([Images2Neibs])
@register_opt2([Images2Neibs], 'fast_compile')
def local_gpua_images2neibs(op, context_name, inputs, outputs):
    if op.mode in ['valid', 'half', 'full', 'ignore_borders', 'wrap_centered']:
        return GpuImages2Neibs(op.mode)


# Solve
@register_opt('fast_compile')
@op_lifter([slinalg.Solve])
@register_opt2([theano.tensor.slinalg.Solve], 'fast_compile')
def local_gpu_solve(op, context_name, inputs, outputs):
    if inputs[0].dtype not in ['float16', 'float32', 'float64']:
        return
    if op.A_structure not in MATRIX_STRUCTURES_SOLVE:
        return

    if op.A_structure in ['lower_triangular', 'upper_triangular']:
        if not cublas_available:
            return
        lower = op.A_structure == 'lower_triangular'
        op = GpuCublasTriangularSolve(lower)
    else:
        if not cusolver_available:
            return
        op = GpuCusolverSolve(A_structure=op.A_structure)

    if inputs[0].dtype == 'float16':
        return op(inputs[0].astype('float32'),
                  inputs[1].astype('float32')).astype('float16')
    return op


@register_inplace()
@local_optimizer([GpuCusolverSolve], inplace=True)
def local_inplace_gpu_solve(node):
    if isinstance(node.op, GpuCusolverSolve) and not node.op.inplace:
        with inherit_stack_trace(node.outputs):
            return [GpuCusolverSolve(A_structure=node.op.A_structure,
                                     trans=node.op.trans,
                                     inplace=True)(*node.inputs)]


# Cholesky decomposition
def local_gpu_cholesky(op, context_name, inputs, outputs):
    if not cusolver_available:
        return
    if inputs[0].dtype not in ['float16', 'float32', 'float64']:
        return
    op = GpuCholesky(lower=op.lower, inplace=op.destructive)
    if inputs[0].dtype == 'float16':
        return op(inputs[0].astype('float32')).astype('float16')

    return op


matrix_ops_db = LocalGroupDB()
matrix_ops_db2 = LocalGroupDB(local_opt=theano.gof.opt.GraphToGPULocalOptGroup)
matrix_ops_db2.__name__ = "matrix_ops_db2"

# For Cholesky decomposition, magma 2.2 is slower than cusolver 8 (tested for
# matrices of size 1000). Thus, cusolver is prioritized during graph
# optimizations. To explicitly use magma, you should disable cusolver using
# `optimizer_excluding=cusolver` in Theano config.
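# (`optimizer_excluding` / `optimizer_including` are the regular Theano flags
# for switching such tagged optimizations off or on; for instance, running
# with THEANO_FLAGS='optimizer_excluding=cusolver' drops the cusolver-tagged
# Cholesky lifter registered just below and leaves the magma-tagged one as
# the remaining GPU candidate.)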
lifter = op_lifter([slinalg.Cholesky])(local_gpu_cholesky)
matrix_ops_db.register("local_gpu_cholesky", lifter,
                       'gpuarray', 'fast_compile', 'fast_run', 'cusolver',
                       position=0)
matrix_ops_db2.register("local_gpu_cholesky",
                        local_optimizer([slinalg.Cholesky])(local_gpu_cholesky),
                        'gpuarray', 'fast_compile', 'fast_run', 'cusolver',
                        position=0)
register_opt('fast_compile', name='matrix_ops_db')(matrix_ops_db)
register_opt2([slinalg.Solve], 'fast_compile', name='matrix_ops_db2')(matrix_ops_db2)


@register_inplace()
@local_optimizer([GpuCholesky], inplace=True)
def local_inplace_gpu_cholesky(node):
    if isinstance(node.op, GpuCholesky) and not node.op.inplace:
        with inherit_stack_trace(node.outputs):
            return [node.op.clone_inplace()(*node.inputs)]


def local_gpu_magma_cholesky(op, context_name, inputs, outputs):
    if not config.magma.enabled:
        return
    if inputs[0].dtype not in ['float16', 'float32']:
        return
    op = GpuMagmaCholesky(lower=op.lower, inplace=op.destructive)
    if inputs[0].dtype == 'float16':
        return op(inputs[0].astype('float32')).astype('float16')
    return op


lifter = op_lifter([slinalg.Cholesky])(local_gpu_magma_cholesky)
matrix_ops_db.register("local_gpu_magma_cholesky", lifter,
                       'gpuarray', 'fast_compile', 'fast_run', 'magma',
                       position=1)
matrix_ops_db2.register("local_gpu_magma_cholesky",
                        local_optimizer([slinalg.Cholesky])(local_gpu_magma_cholesky),
                        'gpuarray', 'fast_compile', 'fast_run', 'magma',
                        position=1)


@register_inplace()
@local_optimizer([GpuMagmaCholesky], inplace=True)
def local_inplace_gpu_magma_cholesky(node):
    if isinstance(node.op, GpuMagmaCholesky) and not node.op.inplace:
        return [node.op.clone_inplace()(*node.inputs)]


# QR decomposition
@register_opt('magma', 'fast_compile')
@op_lifter([nlinalg.QRFull])
@register_opt2([theano.tensor.nlinalg.QRFull], 'magma', 'fast_compile')
def local_gpu_magma_qr(op, context_name, inputs, outputs):
    if not config.magma.enabled or op.mode != 'reduced':
        return
    if inputs[0].dtype not in ['float16', 'float32']:
        return
    x = inputs[0]
    if inputs[0].dtype == 'float16':
        x = inputs[0].astype('float32')
    out = gpu_qr(x, complete=True)
    if inputs[0].dtype == 'float16':
        return [o.astype('float16') for o in out]
    return out


@register_opt('magma', 'fast_compile')
@op_lifter([nlinalg.QRIncomplete])
@register_opt2([theano.tensor.nlinalg.QRIncomplete], 'magma', 'fast_compile')
def local_gpu_magma_qr_incomplete(op, context_name, inputs, outputs):
    if not config.magma.enabled:
        return
    if inputs[0].dtype not in ['float16', 'float32']:
        return
    x = inputs[0]
    if inputs[0].dtype == 'float16':
        x = inputs[0].astype('float32')
    out = gpu_qr(x, complete=False)
    if inputs[0].dtype == 'float16':
        return [out.astype('float16')]
    return out


# Matrix inverse
@register_opt('magma', 'fast_compile')
@op_lifter([nlinalg.MatrixInverse])
@register_opt2([theano.tensor.nlinalg.MatrixInverse], 'magma', 'fast_compile')
def local_gpu_magma_matrix_inverse(op, context_name, inputs, outputs):
    if not config.magma.enabled:
        return
    if inputs[0].dtype not in ['float16', 'float32']:
        return
    op = GpuMagmaMatrixInverse()
    if inputs[0].dtype == 'float16':
        return op(inputs[0].astype('float32')).astype('float16')
    return op


@register_inplace()
@local_optimizer([GpuMagmaMatrixInverse])
def local_inplace_gpu_magma_matrix_inverse(node):
    if isinstance(node.op, GpuMagmaMatrixInverse) and not node.op.inplace:
        with inherit_stack_trace(node.outputs):
            return [node.op.clone_inplace()(*node.inputs)]


# Eigen decomposition of a symmetric matrix
@register_opt('magma', 'fast_compile')
@op_lifter([nlinalg.Eigh])
@register_opt2([theano.tensor.nlinalg.Eigh], 'magma', 'fast_compile')
def local_gpu_magma_eigh(op, context_name, inputs, outputs):
    if not config.magma.enabled:
        return
    if inputs[0].dtype not in ['float16', 'float32']:
        return
    op = GpuMagmaEigh(UPLO=op.UPLO, compute_v=True)
    if inputs[0].dtype == 'float16':
        return op(inputs[0].astype('float32')).astype('float16')
    return op


# Singular Value Decomposition
@register_opt('magma', 'fast_compile')
@op_lifter([nlinalg.SVD])
@register_opt2([theano.tensor.nlinalg.SVD], 'magma', 'fast_compile')
def local_gpu_magma_svd(op, context_name, inputs, outputs):
    if not config.magma.enabled:
        return
    if inputs[0].dtype not in ['float16', 'float32']:
        return
    x = inputs[0]
    if inputs[0].dtype == 'float16':
        x = inputs[0].astype('float32')
    out = gpu_svd(x, compute_uv=op.compute_uv, full_matrices=op.full_matrices)
    if inputs[0].dtype == 'float16':
        if op.compute_uv:
            out = [o.astype('float16') for o in out]
        else:
            out = [out.astype('float16')]
    return out


@register_opt('ctc', 'fast_compile')
@op_lifter([theano.tensor.nnet.ctc.ConnectionistTemporalClassification])
@register_opt2([ConnectionistTemporalClassification], 'ctc', 'fast_compile')
def local_gpu_ctc(op, context_name, inputs, outputs):
    op = GpuConnectionistTemporalClassification(compute_grad=op.compute_grad)
    return op.make_node(*inputs).outputs


# Do not register in fast_run or fast_compile.
# It will be added to fast_run if the GPU is enabled.
optdb.register('gpua_scanOp_make_inplace',
               scan_opt.ScanInplaceOptimizer(typeInfer=_scan_type_infer,
                                             gpua_flag=True),
               75,
               'gpuarray',
               'inplace',
               'scan')


# Register GPU convolution implementation
# They are tried in a specific order so we can control
# which ones take precedence over others.
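# As a rough illustration of how this ordering interacts with the usual
# Theano optimizer flags: excluding 'conv_dnn' falls back to the GEMM-based
# GpuCorrMM registrations below, excluding 'conv_gemm' disables those, and
# including 'conv_meta' turns on the timing-based ConvMetaOptimizer that is
# registered at position 0 at the end of this group.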
abstractconv_groupopt = theano.gof.optdb.LocalGroupDB()
abstractconv_groupopt.__name__ = "gpuarray_abstractconv_opts"
register_opt('fast_compile')(abstractconv_groupopt)

# We import these opts here instead of at the top of this file
# to avoid a circular dependency problem with dnn
from .dnn import (local_abstractconv_cudnn,
                  local_abstractconv_gw_cudnn,
                  local_abstractconv_gi_cudnn,  # noqa: 402
                  local_abstractconv_cudnn_alt,
                  local_abstractconv3d_cudnn_alt)

abstractconv_groupopt.register('local_abstractconv_dnn',
                               local_abstractconv_cudnn, 20,
                               'conv_dnn',
                               'gpuarray', 'fast_compile', 'fast_run', 'cudnn')
abstractconv_groupopt.register('local_abstractconv_gw_dnn',
                               local_abstractconv_gw_cudnn, 20,
                               'conv_dnn',
                               'gpuarray', 'fast_compile', 'fast_run', 'cudnn')
abstractconv_groupopt.register('local_abstractconv_gi_dnn',
                               local_abstractconv_gi_cudnn, 20,
                               'conv_dnn',
                               'gpuarray', 'fast_compile', 'fast_run', 'cudnn')
# The GEMM-based convolution comes last to catch all remaining cases.
# It can be disabled by excluding 'conv_gemm'.
abstractconv_groupopt.register('local_abstractconv_gemm', local_abstractconv_gemm, 30,
                               'conv_gemm',
                               'gpuarray', 'fast_compile', 'fast_run')
abstractconv_groupopt.register('local_abstractconv3d_gemm', local_abstractconv3d_gemm, 30,
                               'conv_gemm',
                               'gpuarray', 'fast_compile', 'fast_run')
abstractconv_groupopt.register('local_abstractconv_gradweights_gemm',
                               local_abstractconv_gradweights_gemm, 30,
                               'conv_gemm',
                               'gpuarray', 'fast_compile', 'fast_run')
abstractconv_groupopt.register('local_abstractconv3d_gradweights_gemm',
                               local_abstractconv3d_gradweights_gemm, 30,
                               'conv_gemm',
                               'gpuarray', 'fast_compile', 'fast_run')
abstractconv_groupopt.register('local_abstractconv_gradinputs',
                               local_abstractconv_gradinputs_gemm, 30,
                               'conv_gemm',
                               'gpuarray', 'fast_compile', 'fast_run')
abstractconv_groupopt.register('local_abstractconv3d_gradinputs',
                               local_abstractconv3d_gradinputs_gemm, 30,
                               'conv_gemm',
                               'gpuarray', 'fast_compile', 'fast_run')

conv_metaopt = ConvMetaOptimizer()

conv_metaopt.register(local_abstractconv_cudnn,
                      ['default', 'cudnn', 'conv_dnn'])
conv_metaopt.register(local_abstractconv_gw_cudnn,
                      ['default', 'cudnn', 'conv_dnn'])
conv_metaopt.register(local_abstractconv_gi_cudnn,
                      ['default', 'cudnn', 'conv_dnn'])
conv_metaopt.register(local_abstractconv_gemm_def,
                      ['default', 'conv_gemm'])
conv_metaopt.register(local_abstractconv3d_gemm_def,
                      ['default', 'conv_gemm'])
conv_metaopt.register(local_abstractconv_gradweights_gemm,
                      ['default', 'conv_gemm'])
conv_metaopt.register(local_abstractconv3d_gradweights_gemm,
                      ['default', 'conv_gemm'])
conv_metaopt.register(local_abstractconv_gradinputs_gemm,
                      ['default', 'conv_gemm'])
conv_metaopt.register(local_abstractconv3d_gradinputs_gemm,
                      ['default', 'conv_gemm'])
conv_metaopt.register(local_abstractconv_gemm_alt,
                      ['default', 'alternative', 'conv_gemm'])
conv_metaopt.register(local_abstractconv_gemm_gradweights_alt,
                      ['default', 'alternative', 'conv_gemm'])
conv_metaopt.register(local_abstractconv_gradinputs_gemm_alt,
                      ['default', 'alternative', 'conv_gemm'])
conv_metaopt.register(local_abstractconv_cudnn_alt,
                      ['default', 'alternative', 'cudnn', 'conv_dnn'])
conv_metaopt.register(local_abstractconv3d_cudnn_alt,
                      ['default', 'alternative', 'cudnn', 'conv_dnn'])
conv_metaopt.register(local_abstractconv3d_alt,
                      ['default', 'alternative', 'conv_gemm'])
conv_metaopt.register(local_abstractconv3d_gemm_gradweights_alt,
                      ['default', 'alternative', 'conv_gemm'])
conv_metaopt.register(local_abstractconv3d_gradinputs_gemm_alt,
                      ['default', 'alternative', 'conv_gemm'])
conv_metaopt.register(local_abstractconv3d2d,
                      ['alternative', 'conv3d2d'])

abstractconv_groupopt.register('conv_metaopt', conv_metaopt, 'conv_meta', position=0)

# Register cuDNN batch normalization implementation

# We import these opts here instead of at the top of this file
# to avoid a circular dependency problem with dnn
from .dnn import (local_abstract_batch_norm_train_cudnn,
                  local_abstract_batch_norm_train_grad_cudnn,
                  local_abstract_batch_norm_inference_cudnn)  # noqa: 402

abstract_batch_norm_groupopt = theano.gof.optdb.LocalGroupDB()
abstract_batch_norm_groupopt.__name__ = "gpuarray_batchnorm_opts"
register_opt('fast_compile')(abstract_batch_norm_groupopt)

abstract_batch_norm_db = LocalGroupDB()
abstract_batch_norm_db2 = LocalGroupDB(
    local_opt=theano.gof.opt.GraphToGPULocalOptGroup)
abstract_batch_norm_db2.__name__ = "abstract_batch_norm_db2"
register_opt('fast_compile', name='abstract_batch_norm_db')(
    abstract_batch_norm_db)
register_opt2([bn.AbstractBatchNormTrain,
               bn.AbstractBatchNormTrainGrad,
               bn.AbstractBatchNormInference],
              'fast_compile', name='abstract_batch_norm_db2')(
    abstract_batch_norm_db2)

for op, fct, cpu in [(bn.AbstractBatchNormTrain,
                      local_abstract_batch_norm_train_cudnn,
                      bn.local_abstract_batch_norm_train),
                     (bn.AbstractBatchNormTrainGrad,
                      local_abstract_batch_norm_train_grad_cudnn,
                      bn.local_abstract_batch_norm_train_grad),
                     (bn.AbstractBatchNormInference,
                      local_abstract_batch_norm_inference_cudnn,
                      bn.local_abstract_batch_norm_inference)]:
    lifter = op_lifter([op])(fct)
    abstract_batch_norm_db.register(fct.__name__,
                                    lifter,
                                    'gpuarray', 'fast_compile', 'fast_run',
                                    'cudnn', 'batchnorm_dnn',
                                    position=1)
    abstract_batch_norm_db2.register(fct.__name__,
                                     local_optimizer([op])(fct),
                                     'gpuarray', 'fast_compile', 'fast_run',
                                     'cudnn', 'batchnorm_dnn',
                                     position=1)
    # cpu is a normal optimization. We can't register it in
    # GraphToGPU, so for now we only add it to the slower equilibrium
    # phase. If there is no cuDNN, we still want to expand it to a
    # plain Theano graph now, so that this graph ends up on the GPU.
    abstract_batch_norm_db.register(cpu.__name__, cpu,
                                    'gpuarray', 'fast_compile', 'fast_run',
                                    position='last')
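
# The cuDNN batch normalization lifters above are tagged 'batchnorm_dnn';
# excluding that tag (or running without cuDNN) leaves only the plain Theano
# implementations registered at position='last', which are then moved to the
# GPU by the usual transfer optimizations.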