#include <cuda_runtime.h>
#include <curand.h>
#include <cublas_v2.h>
#include <float.h>   // FLT_MAX

#include "maxpool_layer.h"
#include "convolutional_layer.h"
#include "blas.h"
#include "dark_cuda.h"

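// Depth-wise max pooling (maxpool_depth): one thread per (x, y, batch) position.
// For each output channel g it takes the max over input channels g, g+out_c, g+2*out_c, ...
// and records the winning input index in indexes[] for the backward pass.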
__global__ void forward_maxpool_depth_layer_kernel(int n, int w, int h, int c, int out_c, int batch, float *input, float *output, int *indexes)
{
    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if (id >= n) return;

    int j = id % w;
    id = id / w;
    int i = id % h;
    id = id / h;
    //int g = id % out_c;
    //id = id / out_c;
    int b = id % batch;

    int k;
    for (int g = 0; g < out_c; ++g)
    {
        int out_index = j + w*(i + h*(g + out_c*b));
        float max = -FLT_MAX;
        int max_i = -1;

        for (k = g; k < c; k += out_c)
        {
            int in_index = j + w*(i + h*(k + c*b));
            float val = input[in_index];

            max_i = (val > max) ? in_index : max_i;
            max = (val > max) ? val : max;
        }
        output[out_index] = max;
        if (indexes) indexes[out_index] = max_i;
    }
}

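// Backward pass of depth-wise max pooling: each output gradient is routed back
// to the input element that won the max, looked up via indexes[].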
__global__ void backward_maxpool_depth_layer_kernel(int n, int w, int h, int c, int batch, float *delta, float *prev_delta, int *indexes)
{
    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if (id >= n) return;

    int index = indexes[id];
    prev_delta[index] += delta[id];
}

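// Spatial max pooling over NCHW data: one thread per output element. Each thread
// scans its size x size window (shifted by -pad/2) and writes the maximum value
// together with the flat index of the winning input element.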
__global__ void forward_maxpool_layer_kernel(int n, int in_h, int in_w, int in_c, int stride_x, int stride_y, int size, int pad, float *input, float *output, int *indexes)
{
    int h = (in_h + pad - size) / stride_y + 1;
    int w = (in_w + pad - size) / stride_x + 1;
    int c = in_c;

    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(id >= n) return;

    int j = id % w;
    id /= w;
    int i = id % h;
    id /= h;
    int k = id % c;
    id /= c;
    int b = id;

    int w_offset = -pad / 2;
    int h_offset = -pad / 2;

    int out_index = j + w*(i + h*(k + c*b));
    float max = -INFINITY;
    int max_i = -1;
    int l, m;
    for(l = 0; l < size; ++l){
        for(m = 0; m < size; ++m){
            int cur_h = h_offset + i*stride_y + l;
            int cur_w = w_offset + j*stride_x + m;
            int index = cur_w + in_w*(cur_h + in_h*(k + b*in_c));
            int valid = (cur_h >= 0 && cur_h < in_h &&
                    cur_w >= 0 && cur_w < in_w);
            float val = (valid != 0) ? input[index] : -INFINITY;
            max_i = (val > max) ? index : max_i;
            max   = (val > max) ? val   : max;
        }
    }
    output[out_index] = max;
    if (indexes) indexes[out_index] = max_i;
}

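// Backward pass of spatial max pooling: one thread per input element. Each thread
// visits the output cells whose windows can contain it and accumulates delta from
// those where it was recorded as the maximum.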
__global__ void backward_maxpool_layer_kernel(int n, int in_h, int in_w, int in_c, int stride_x, int stride_y, int size, int pad, float *delta, float *prev_delta, int *indexes)
{
    int h = (in_h + pad - size) / stride_y + 1;
    int w = (in_w + pad - size) / stride_x + 1;
    int c = in_c;
    int area_x = (size - 1) / stride_x;
    int area_y = (size - 1) / stride_y;

    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(id >= n) return;

    int index = id;
    int j = id % in_w;
    id /= in_w;
    int i = id % in_h;
    id /= in_h;
    int k = id % in_c;
    id /= in_c;
    int b = id;

    int w_offset = -pad / 2;
    int h_offset = -pad / 2;

    float d = 0;
    int l, m;
    for(l = -area_y; l < area_y+1; ++l){
        for(m = -area_x; m < area_x+1; ++m){
            int out_w = (j-w_offset)/stride_x + m;
            int out_h = (i-h_offset)/stride_y + l;
            int out_index = out_w + w*(out_h + h*(k + c*b));
            int valid = (out_w >= 0 && out_w < w &&
                     out_h >= 0 && out_h < h);
            d += (valid && indexes[out_index] == index) ? delta[out_index] : 0;
        }
    }
    prev_delta[index] += d;
}

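// Forward entry point: dispatches to the depth-pooling kernel when layer.maxpool_depth
// is set, otherwise runs spatial max pooling, then optionally applies the
// anti-aliasing convolution (layer.input_layer) on top of the pooled output.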
extern "C" void forward_maxpool_layer_gpu(maxpool_layer layer, network_state state)
{
    if (layer.maxpool_depth) {
        int h = layer.out_h;
        int w = layer.out_w;
        int c = 1;  // layer.out_c;

        size_t n = h*w*c*layer.batch;

        forward_maxpool_depth_layer_kernel<<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream()>>>(
            n, layer.w, layer.h, layer.c, layer.out_c, layer.batch, state.input, layer.output_gpu, layer.indexes_gpu);
        CHECK_CUDA(cudaPeekAtLastError());

        return;
    }

#ifdef CUDNN_DISABLED
    if (!state.train && layer.stride == layer.size) {
        // cudnnPoolingForward
        cudnnStatus_t maxpool_status;

        float alpha = 1, beta = 0;
        maxpool_status = cudnnPoolingForward(
            cudnn_handle(),
            layer.poolingDesc,
            &alpha,
            layer.srcTensorDesc,
            state.input,
            &beta,
            layer.dstTensorDesc,
            layer.output_gpu);

        //maxpool_status = cudnnDestroyPoolingDescriptor(poolingDesc);
        //cudnnDestroyTensorDescriptor(layer.srcTensorDesc);
        //cudnnDestroyTensorDescriptor(layer.dstTensorDesc);

    }
    else
#endif
    {
        int h = layer.out_h;
        int w = layer.out_w;
        int c = layer.out_c;

        size_t n = h*w*c*layer.batch;

        forward_maxpool_layer_kernel<<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream()>>>(n, layer.h, layer.w, layer.c, layer.stride_x, layer.stride_y, layer.size, layer.pad, state.input, layer.output_gpu, layer.indexes_gpu);
        CHECK_CUDA(cudaPeekAtLastError());
    }

    if (layer.antialiasing) {
        network_state s = { 0 };
        s.train = state.train;
        s.workspace = state.workspace;
        s.net = state.net;
        if (!state.train) s.index = state.index;  // don't use TC for training (especially without cuda_convert_f32_to_f16() )
        s.input = layer.output_gpu;
        forward_convolutional_layer_gpu(*(layer.input_layer), s);
        simple_copy_ongpu(layer.outputs*layer.batch, layer.output_gpu, layer.input_antialiasing_gpu);
        simple_copy_ongpu(layer.input_layer->outputs*layer.input_layer->batch, layer.input_layer->output_gpu, layer.output_gpu);
    }
}

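// Backward entry point: backpropagates through the optional anti-aliasing
// convolution first, then through the pooling itself (depth-wise or spatial).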
extern "C" void backward_maxpool_layer_gpu(maxpool_layer layer, network_state state)
{
    if (layer.antialiasing) {
        network_state s = { 0 };
        s.train = state.train;
        s.workspace = state.workspace;
        s.net = state.net;
        s.delta = layer.delta_gpu;  // s.delta will be returned to l.delta_gpu
        s.input = layer.input_antialiasing_gpu;
        //if (!state.train) s.index = state.index;  // don't use TC for training (especially without cuda_convert_f32_to_f16() )
        simple_copy_ongpu(layer.input_layer->outputs*layer.input_layer->batch, layer.delta_gpu, layer.input_layer->delta_gpu);
        backward_convolutional_layer_gpu(*(layer.input_layer), s);

        //simple_copy_ongpu(layer.outputs*layer.batch, layer.input_antialiasing_gpu, layer.output_gpu);
    }

    if (layer.maxpool_depth) {
        int h = layer.out_h;
        int w = layer.out_w;
        int c = layer.out_c;

        size_t n = h * w * c * layer.batch;

        backward_maxpool_depth_layer_kernel<<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream()>>>(n, layer.w, layer.h, layer.c, layer.batch, layer.delta_gpu, state.delta, layer.indexes_gpu);
        CHECK_CUDA(cudaPeekAtLastError());
        return;
    }

    size_t n = layer.h*layer.w*layer.c*layer.batch;

    backward_maxpool_layer_kernel<<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream()>>>(n, layer.h, layer.w, layer.c, layer.stride_x, layer.stride_y, layer.size, layer.pad, layer.delta_gpu, state.delta, layer.indexes_gpu);
    CHECK_CUDA(cudaPeekAtLastError());
}

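// Local average pooling: same thread layout as the max-pooling kernel above, but
// averages the in-bounds elements of each window; padded positions are excluded
// from the count (CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING behaviour).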
__global__ void forward_local_avgpool_layer_kernel(int n, int in_h, int in_w, int in_c, int stride_x, int stride_y, int size, int pad, float *input, float *output)
{
    int h = (in_h + pad - size) / stride_y + 1;
    int w = (in_w + pad - size) / stride_x + 1;
    int c = in_c;

    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if (id >= n) return;

    int j = id % w;
    id /= w;
    int i = id % h;
    id /= h;
    int k = id % c;
    id /= c;
    int b = id;

    int w_offset = -pad / 2;
    int h_offset = -pad / 2;

    int out_index = j + w*(i + h*(k + c*b));
    float avg = 0;
    int counter = 0;
    int l, m;
    for (l = 0; l < size; ++l) {
        for (m = 0; m < size; ++m) {
            int cur_h = h_offset + i*stride_y + l;
            int cur_w = w_offset + j*stride_x + m;
            int index = cur_w + in_w*(cur_h + in_h*(k + b*in_c));
            int valid = (cur_h >= 0 && cur_h < in_h &&
                cur_w >= 0 && cur_w < in_w);
            if (valid) {
                counter++;
                avg += input[index];
            }
        }
    }
    output[out_index] = avg / counter;  // as CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING
}

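// Backward pass of local average pooling: one thread per input element, summing
// the deltas of the nearby output cells whose windows may cover it and dividing
// by their count.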
__global__ void backward_local_avgpool_layer_kernel(int n, int in_h, int in_w, int in_c, int stride_x, int stride_y, int size, int pad, float *delta, float *prev_delta)
{
    int h = (in_h + pad - size) / stride_y + 1;
    int w = (in_w + pad - size) / stride_x + 1;
    int c = in_c;
    int area_x = (size - 1) / stride_x;
    int area_y = (size - 1) / stride_y;

    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if (id >= n) return;

    int index = id;
    int j = id % in_w;
    id /= in_w;
    int i = id % in_h;
    id /= in_h;
    int k = id % in_c;
    id /= in_c;
    int b = id;

    int w_offset = -pad / 2;
    int h_offset = -pad / 2;

    int counter = 0;
    float d = 0;
    int l, m;
    for (l = -area_y; l < area_y + 1; ++l) {
        for (m = -area_x; m < area_x + 1; ++m) {
            int out_w = (j - w_offset) / stride_x + m;
            int out_h = (i - h_offset) / stride_y + l;
            int out_index = out_w + w*(out_h + h*(k + c*b));
            int valid = (out_w >= 0 && out_w < w && out_h >= 0 && out_h < h);
            if (valid) {
                counter++;
                d += delta[out_index];
            }
        }
    }
    if(counter > 0) prev_delta[index] += d / counter;
}

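// Forward entry point for local average pooling; the cuDNN path below is guarded
// by CUDNN_DISABLED and normally not compiled.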
extern "C" void forward_local_avgpool_layer_gpu(maxpool_layer layer, network_state state)
{

#ifdef CUDNN_DISABLED
    if (!state.train && layer.stride == layer.size) {
        // cudnnPoolingForward
        cudnnStatus_t maxpool_status;

        float alpha = 1, beta = 0;
        maxpool_status = cudnnPoolingForward(
            cudnn_handle(),
            layer.poolingDesc,
            &alpha,
            layer.srcTensorDesc,
            state.input,
            &beta,
            layer.dstTensorDesc,
            layer.output_gpu);

        //maxpool_status = cudnnDestroyPoolingDescriptor(poolingDesc);
        //cudnnDestroyTensorDescriptor(layer.srcTensorDesc);
        //cudnnDestroyTensorDescriptor(layer.dstTensorDesc);

    }
    else
#endif
    {
        int h = layer.out_h;
        int w = layer.out_w;
        int c = layer.out_c;

        size_t n = h*w*c*layer.batch;

        forward_local_avgpool_layer_kernel<<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream()>>>(n, layer.h, layer.w, layer.c, layer.stride_x, layer.stride_y, layer.size, layer.pad, state.input, layer.output_gpu);
        CHECK_CUDA(cudaPeekAtLastError());
    }
}

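// Backward entry point for local average pooling.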
extern "C" void backward_local_avgpool_layer_gpu(maxpool_layer layer, network_state state)
{
    size_t n = layer.h*layer.w*layer.c*layer.batch;

    backward_local_avgpool_layer_kernel<<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream()>>>(n, layer.h, layer.w, layer.c, layer.stride_x, layer.stride_y, layer.size, layer.pad, layer.delta_gpu, state.delta);
    CHECK_CUDA(cudaPeekAtLastError());
}