#section support_code_struct

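/* cuDNN descriptors reused across calls: tensor descriptors for the
   input and output arrays, and a pooling descriptor holding the window,
   padding, stride and pooling mode. They are created in init_code_struct
   and destroyed in cleanup_code_struct. */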
cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
cudnnPoolingDescriptor_t APPLY_SPECIFIC(pool);


#section init_code_struct

cudnnStatus_t APPLY_SPECIFIC(err);
APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(output) = NULL;
APPLY_SPECIFIC(pool) = NULL;

if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
               "(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
               "(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreatePoolingDescriptor(&APPLY_SPECIFIC(pool))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate pooling descriptor "
               "(pool): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}

#section cleanup_code_struct

if (APPLY_SPECIFIC(input) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input)); }
if (APPLY_SPECIFIC(output) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output)); }
if (APPLY_SPECIFIC(pool) != NULL) { cudnnDestroyPoolingDescriptor(APPLY_SPECIFIC(pool)); }


#section support_code_struct

int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
                             PyArrayObject *ws,
                             PyArrayObject *stride,
                             PyArrayObject *pad,
                             PyGpuArrayObject **out,
                             PARAMS_TYPE* params) {
  PyGpuContextObject *c = img->context;
  size_t dims[5];
  cudnnStatus_t err;

  if (!GpuArray_IS_C_CONTIGUOUS(&img->ga)) {
    PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
    return 1;
  }

  int w[3];
  int p[3];
  int s[3];
  int ndims = PyArray_DIM(ws, 0);  /* equal to PyGpuArray_NDIM(img) - 2 */

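  /* Copy the window, padding and stride sizes from the host-side numpy
     arrays into plain int buffers, since cuDNN expects int arrays. */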
  for (int i = 0; i < ndims; i++) {
    w[i] = *((npy_intp*)PyArray_GETPTR1(ws, i));
  }
  for (int i = 0; i < ndims; i++) {
    p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i));
  }
  for (int i = 0; i < ndims; i++) {
    s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
  }

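  /* Output shape: batch and channel dims are copied from the input; each
     spatial dim follows the usual pooling formula
     out = (in + 2*pad - window) / stride + 1 (integer division). */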
  dims[0] = PyGpuArray_DIM(img, 0);
  dims[1] = PyGpuArray_DIM(img, 1);
  dims[2] = (PyGpuArray_DIM(img, 2) + (p[0]*2) - w[0]) / s[0] + 1;
  dims[3] = (PyGpuArray_DIM(img, 3) + (p[1]*2) - w[1]) / s[1] + 1;
  if (ndims == 3)
    dims[4] = (PyGpuArray_DIM(img, 4) + (p[2]*2) - w[2]) / s[2] + 1;

  if (theano_prep_output(out, ndims+2, dims, img->ga.typecode,
                         GA_C_ORDER, c) != 0)
    return 1;

  // if input batch is empty, we return the empty output without calling cuDNN
  // (which will fail on zero batch size).
  if (PyGpuArray_DIM(*out, 0) == 0)
    return 0;

  if (c_set_tensorNd(img, APPLY_SPECIFIC(input)) != 0)
    return 1;

  if (c_set_tensorNd(*out, APPLY_SPECIFIC(output)) != 0)
    return 1;

  err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), params->mode, CUDNN_PROPAGATE_NAN, ndims, w, p, s);

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: %s", cudnnGetErrorString(err));
    return 1;
  }

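  /* cuDNN scaling semantics: out = alpha * pool(in) + beta * out. The
     scaling factors must match the compute precision: double pointers for
     double data, float pointers for float and half data (half accumulates
     in float). */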
  {
    const float alphaf = 1;
    const float betaf = 0;
    const double alphad = 1;
    const double betad = 0;
    void *alpha, *beta;

    switch (img->ga.typecode) {
    case GA_DOUBLE:
      alpha = (void *)&alphad;
      beta = (void *)&betad;
      break;
    case GA_FLOAT:
    case GA_HALF:
      alpha = (void *)&alphaf;
      beta = (void *)&betaf;
      break;
    default:
      PyErr_SetString(PyExc_TypeError, "Unsupported type in pooling");
      return 1;
    }

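    /* Synchronize with other streams touching these buffers: wait for
       pending writes to the input and pending accesses to the output,
       run the kernel, then record our read/write so later ops can wait
       on us. */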
    cuda_enter(c->ctx);

    cuda_wait(img->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_wait((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);

    err = cudnnPoolingForward(
      params->handle, APPLY_SPECIFIC(pool),
      alpha,
      APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(img),
      beta,
      APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*out));

    cuda_record(img->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_record((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);

    cuda_exit(c->ctx);
  }
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "GpuDnnPool: error doing cudnnPoolingForward operation: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
  return 0;
}