#include <cuda_runtime.h>
#include <curand.h>
#include <cublas_v2.h>
#include <float.h>  // FLT_MAX, used by forward_maxpool_depth_layer_kernel

#include "maxpool_layer.h"
#include "convolutional_layer.h"
#include "blas.h"
#include "dark_cuda.h"
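// Channel-wise (depth) max pooling, forward pass: for every spatial position,
// output channel g takes the max over input channels g, g + out_c, g + 2*out_c, ...
// and records the flat index of the winning input element for the backward pass.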
__global__ void forward_maxpool_depth_layer_kernel(int n, int w, int h, int c, int out_c, int batch, float *input, float *output, int *indexes)
{
    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if (id >= n) return;

    int j = id % w;
    id = id / w;
    int i = id % h;
    id = id / h;
    //int g = id % out_c;
    //id = id / out_c;
    int b = id % batch;

    int k;
    for (int g = 0; g < out_c; ++g)
    {
        int out_index = j + w*(i + h*(g + out_c*b));
        float max = -FLT_MAX;
        int max_i = -1;

        for (k = g; k < c; k += out_c)
        {
            int in_index = j + w*(i + h*(k + c*b));
            float val = input[in_index];

            max_i = (val > max) ? in_index : max_i;
            max = (val > max) ? val : max;
        }
        output[out_index] = max;
        if (indexes) indexes[out_index] = max_i;
    }
}

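// Channel-wise (depth) max pooling, backward pass: each output gradient is routed
// back to the single input element that produced the max.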
__global__ void backward_maxpool_depth_layer_kernel(int n, int w, int h, int c, int batch, float *delta, float *prev_delta, int *indexes)
{
    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if (id >= n) return;

    int index = indexes[id];
    prev_delta[index] += delta[id];
}

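// Spatial max pooling, forward pass: one thread per output element. Each thread
// scans its size x size input window (shifted by -pad/2) and writes the max value,
// plus the flat input index of the winner when indexes is non-NULL.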
__global__ void forward_maxpool_layer_kernel(int n, int in_h, int in_w, int in_c, int stride_x, int stride_y, int size, int pad, float *input, float *output, int *indexes)
{
    int h = (in_h + pad - size) / stride_y + 1;
    int w = (in_w + pad - size) / stride_x + 1;
    int c = in_c;

    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if (id >= n) return;

    int j = id % w;
    id /= w;
    int i = id % h;
    id /= h;
    int k = id % c;
    id /= c;
    int b = id;

    int w_offset = -pad / 2;
    int h_offset = -pad / 2;

    int out_index = j + w*(i + h*(k + c*b));
    float max = -INFINITY;
    int max_i = -1;
    int l, m;
    for (l = 0; l < size; ++l) {
        for (m = 0; m < size; ++m) {
            int cur_h = h_offset + i*stride_y + l;
            int cur_w = w_offset + j*stride_x + m;
            int index = cur_w + in_w*(cur_h + in_h*(k + b*in_c));
            int valid = (cur_h >= 0 && cur_h < in_h &&
                cur_w >= 0 && cur_w < in_w);
            float val = (valid != 0) ? input[index] : -INFINITY;
            max_i = (val > max) ? index : max_i;
            max = (val > max) ? val : max;
        }
    }
    output[out_index] = max;
    if (indexes) indexes[out_index] = max_i;
}

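// Spatial max pooling, backward pass: one thread per input element. Each thread
// visits every output window that could contain it (area_x/area_y bound the overlap)
// and accumulates the deltas of the windows where it was the recorded maximum.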
__global__ void backward_maxpool_layer_kernel(int n, int in_h, int in_w, int in_c, int stride_x, int stride_y, int size, int pad, float *delta, float *prev_delta, int *indexes)
{
    int h = (in_h + pad - size) / stride_y + 1;
    int w = (in_w + pad - size) / stride_x + 1;
    int c = in_c;
    int area_x = (size - 1) / stride_x;
    int area_y = (size - 1) / stride_y;

    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if (id >= n) return;

    int index = id;
    int j = id % in_w;
    id /= in_w;
    int i = id % in_h;
    id /= in_h;
    int k = id % in_c;
    id /= in_c;
    int b = id;

    int w_offset = -pad / 2;
    int h_offset = -pad / 2;

    float d = 0;
    int l, m;
    for (l = -area_y; l < area_y+1; ++l) {
        for (m = -area_x; m < area_x+1; ++m) {
            int out_w = (j-w_offset)/stride_x + m;
            int out_h = (i-h_offset)/stride_y + l;
            int out_index = out_w + w*(out_h + h*(k + c*b));
            int valid = (out_w >= 0 && out_w < w &&
                out_h >= 0 && out_h < h);
            d += (valid && indexes[out_index] == index) ? delta[out_index] : 0;
        }
    }
    prev_delta[index] += d;
}

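// Host-side forward pass: dispatches to the depth-pooling kernel, the (disabled)
// cuDNN path, or the custom spatial kernel, then optionally runs the anti-aliasing
// convolution over the pooled output.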
extern "C" void forward_maxpool_layer_gpu(maxpool_layer layer, network_state state)
{
    if (layer.maxpool_depth) {
        int h = layer.out_h;
        int w = layer.out_w;
        int c = 1;  // one thread per spatial position; the kernel loops over out_c itself

        size_t n = h*w*c*layer.batch;

        forward_maxpool_depth_layer_kernel<<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream()>>>(
            n, layer.w, layer.h, layer.c, layer.out_c, layer.batch, state.input, layer.output_gpu, layer.indexes_gpu);
        CHECK_CUDA(cudaPeekAtLastError());

        return;
    }

#ifdef CUDNN_DISABLED
    if (!state.train && layer.stride == layer.size) {
        // cudnnPoolingForward
        cudnnStatus_t maxpool_status;

        float alpha = 1, beta = 0;
        maxpool_status = cudnnPoolingForward(
            cudnn_handle(),
            layer.poolingDesc,
            &alpha,
            layer.srcTensorDesc,
            state.input,
            &beta,
            layer.dstTensorDesc,
            layer.output_gpu);

        //maxpool_status = cudnnDestroyPoolingDescriptor(poolingDesc);
        //cudnnDestroyTensorDescriptor(layer.srcTensorDesc);
        //cudnnDestroyTensorDescriptor(layer.dstTensorDesc);
    }
    else
#endif
    {
        int h = layer.out_h;
        int w = layer.out_w;
        int c = layer.out_c;

        size_t n = h*w*c*layer.batch;

        forward_maxpool_layer_kernel<<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream()>>>(n, layer.h, layer.w, layer.c, layer.stride_x, layer.stride_y, layer.size, layer.pad, state.input, layer.output_gpu, layer.indexes_gpu);
        CHECK_CUDA(cudaPeekAtLastError());
    }

    if (layer.antialiasing) {
        network_state s = { 0 };
        s.train = state.train;
        s.workspace = state.workspace;
        s.net = state.net;
        if (!state.train) s.index = state.index;  // don't use Tensor Cores for training (especially without cuda_convert_f32_to_f16())
        s.input = layer.output_gpu;
        forward_convolutional_layer_gpu(*(layer.input_layer), s);
        simple_copy_ongpu(layer.outputs*layer.batch, layer.output_gpu, layer.input_antialiasing_gpu);
        simple_copy_ongpu(layer.input_layer->outputs*layer.input_layer->batch, layer.input_layer->output_gpu, layer.output_gpu);
    }
}

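// Host-side backward pass: undoes the optional anti-aliasing convolution first,
// then routes gradients through the matching pooling kernel.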
extern "C" void backward_maxpool_layer_gpu(maxpool_layer layer, network_state state)
{
    if (layer.antialiasing) {
        network_state s = { 0 };
        s.train = state.train;
        s.workspace = state.workspace;
        s.net = state.net;
        s.delta = layer.delta_gpu;  // s.delta will be returned to l.delta_gpu
        s.input = layer.input_antialiasing_gpu;
        //if (!state.train) s.index = state.index;  // don't use Tensor Cores for training (especially without cuda_convert_f32_to_f16())
        simple_copy_ongpu(layer.input_layer->outputs*layer.input_layer->batch, layer.delta_gpu, layer.input_layer->delta_gpu);
        backward_convolutional_layer_gpu(*(layer.input_layer), s);

        //simple_copy_ongpu(layer.outputs*layer.batch, layer.input_antialiasing_gpu, layer.output_gpu);
    }

    if (layer.maxpool_depth) {
        int h = layer.out_h;
        int w = layer.out_w;
        int c = layer.out_c;

        size_t n = h*w*c*layer.batch;

        backward_maxpool_depth_layer_kernel<<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream()>>>(n, layer.w, layer.h, layer.c, layer.batch, layer.delta_gpu, state.delta, layer.indexes_gpu);
        CHECK_CUDA(cudaPeekAtLastError());
        return;
    }

    size_t n = layer.h*layer.w*layer.c*layer.batch;

    backward_maxpool_layer_kernel<<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream()>>>(n, layer.h, layer.w, layer.c, layer.stride_x, layer.stride_y, layer.size, layer.pad, layer.delta_gpu, state.delta, layer.indexes_gpu);
    CHECK_CUDA(cudaPeekAtLastError());
}

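// Local average pooling, forward pass: one thread per output element. Averages only
// the in-bounds part of the window, as CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING does.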
__global__ void forward_local_avgpool_layer_kernel(int n, int in_h, int in_w, int in_c, int stride_x, int stride_y, int size, int pad, float *input, float *output)
{
    int h = (in_h + pad - size) / stride_y + 1;
    int w = (in_w + pad - size) / stride_x + 1;
    int c = in_c;

    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if (id >= n) return;

    int j = id % w;
    id /= w;
    int i = id % h;
    id /= h;
    int k = id % c;
    id /= c;
    int b = id;

    int w_offset = -pad / 2;
    int h_offset = -pad / 2;

    int out_index = j + w*(i + h*(k + c*b));
    float avg = 0;
    int counter = 0;
    int l, m;
    for (l = 0; l < size; ++l) {
        for (m = 0; m < size; ++m) {
            int cur_h = h_offset + i*stride_y + l;
            int cur_w = w_offset + j*stride_x + m;
            int index = cur_w + in_w*(cur_h + in_h*(k + b*in_c));
            int valid = (cur_h >= 0 && cur_h < in_h &&
                cur_w >= 0 && cur_w < in_w);
            if (valid) {
                counter++;
                avg += input[index];
            }
        }
    }
    output[out_index] = avg / counter;  // as CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING
}

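// Local average pooling, backward pass: one thread per input element. Each thread
// sums the deltas of the output windows that cover it and scales by the number of
// contributing windows.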
__global__ void backward_local_avgpool_layer_kernel(int n, int in_h, int in_w, int in_c, int stride_x, int stride_y, int size, int pad, float *delta, float *prev_delta)
{
    int h = (in_h + pad - size) / stride_y + 1;
    int w = (in_w + pad - size) / stride_x + 1;
    int c = in_c;
    int area_x = (size - 1) / stride_x;
    int area_y = (size - 1) / stride_y;

    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if (id >= n) return;

    int index = id;
    int j = id % in_w;
    id /= in_w;
    int i = id % in_h;
    id /= in_h;
    int k = id % in_c;
    id /= in_c;
    int b = id;

    int w_offset = -pad / 2;
    int h_offset = -pad / 2;

    int counter = 0;
    float d = 0;
    int l, m;
    for (l = -area_y; l < area_y + 1; ++l) {
        for (m = -area_x; m < area_x + 1; ++m) {
            int out_w = (j - w_offset) / stride_x + m;
            int out_h = (i - h_offset) / stride_y + l;
            int out_index = out_w + w*(out_h + h*(k + c*b));
            int valid = (out_w >= 0 && out_w < w && out_h >= 0 && out_h < h);
            if (valid) {
                counter++;
                d += delta[out_index];
            }
        }
    }
    if (counter > 0) prev_delta[index] += d / counter;
}

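// Host-side forward pass for local average pooling (the cuDNN branch below is
// compiled out unless CUDNN_DISABLED is defined).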
extern "C" void forward_local_avgpool_layer_gpu(maxpool_layer layer, network_state state)
{
#ifdef CUDNN_DISABLED
    if (!state.train && layer.stride == layer.size) {
        // cudnnPoolingForward
        cudnnStatus_t maxpool_status;

        float alpha = 1, beta = 0;
        maxpool_status = cudnnPoolingForward(
            cudnn_handle(),
            layer.poolingDesc,
            &alpha,
            layer.srcTensorDesc,
            state.input,
            &beta,
            layer.dstTensorDesc,
            layer.output_gpu);

        //maxpool_status = cudnnDestroyPoolingDescriptor(poolingDesc);
        //cudnnDestroyTensorDescriptor(layer.srcTensorDesc);
        //cudnnDestroyTensorDescriptor(layer.dstTensorDesc);
    }
    else
#endif
    {
        int h = layer.out_h;
        int w = layer.out_w;
        int c = layer.out_c;

        size_t n = h*w*c*layer.batch;

        forward_local_avgpool_layer_kernel<<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream()>>>(n, layer.h, layer.w, layer.c, layer.stride_x, layer.stride_y, layer.size, layer.pad, state.input, layer.output_gpu);
        CHECK_CUDA(cudaPeekAtLastError());
    }
}

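// Host-side backward pass for local average pooling.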
extern "C" void backward_local_avgpool_layer_gpu(maxpool_layer layer, network_state state)
{
    size_t n = layer.h*layer.w*layer.c*layer.batch;

    backward_local_avgpool_layer_kernel<<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream()>>>(n, layer.h, layer.w, layer.c, layer.stride_x, layer.stride_y, layer.size, layer.pad, layer.delta_gpu, state.delta);
    CHECK_CUDA(cudaPeekAtLastError());
}