#include "maxpool_layer.h"
#include "convolutional_layer.h"
#include "dark_cuda.h"
#include "utils.h"
#include "gemm.h"

#include <float.h>
#include <stdio.h>
#include <string.h>
7 
get_maxpool_image(maxpool_layer l)8 image get_maxpool_image(maxpool_layer l)
9 {
10     int h = l.out_h;
11     int w = l.out_w;
12     int c = l.c;
13     return float_to_image(w,h,c,l.output);
14 }
15 
get_maxpool_delta(maxpool_layer l)16 image get_maxpool_delta(maxpool_layer l)
17 {
18     int h = l.out_h;
19     int w = l.out_w;
20     int c = l.c;
21     return float_to_image(w,h,c,l.delta);
22 }
23 
// Allocates the cuDNN descriptors used by this pooling layer: one pooling
// descriptor plus source/destination tensor descriptors. The descriptors are
// configured later by cudnn_maxpool_setup()/cudnn_local_avgpool_setup().
// No-op when built without cuDNN.
void create_maxpool_cudnn_tensors(layer *l)
{
#ifdef CUDNN
    CHECK_CUDNN(cudnnCreatePoolingDescriptor(&l->poolingDesc));
    CHECK_CUDNN(cudnnCreateTensorDescriptor(&l->srcTensorDesc));
    CHECK_CUDNN(cudnnCreateTensorDescriptor(&l->dstTensorDesc));
#endif // CUDNN
}
32 
// Configures the cuDNN pooling and tensor descriptors for MAX pooling.
// cudnnSetPooling2dDescriptor takes (windowH, windowW, padH, padW,
// verticalStride, horizontalStride): vertical (height/y) arguments come
// first. In this codebase stride_y is the vertical stride (rows advance by
// i*stride_y in the CPU path), so stride_y must precede stride_x here —
// the previous order was swapped whenever stride_x != stride_y.
void cudnn_maxpool_setup(layer *l)
{
#ifdef CUDNN
    CHECK_CUDNN(cudnnSetPooling2dDescriptor(
        l->poolingDesc,
        CUDNN_POOLING_MAX,
        CUDNN_NOT_PROPAGATE_NAN,    // CUDNN_PROPAGATE_NAN, CUDNN_NOT_PROPAGATE_NAN
        l->size,      // window height
        l->size,      // window width
        l->pad/2,     // vertical padding //0, //l.pad,
        l->pad/2,     // horizontal padding //0, //l.pad,
        l->stride_y,  // vertical stride
        l->stride_x)); // horizontal stride

    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w));
    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w));
#endif // CUDNN
}
51 
52 
// Configures the cuDNN pooling and tensor descriptors for local AVERAGE
// pooling (padding excluded from the divisor, matching the CPU path in
// forward_local_avgpool_layer which averages over valid cells only).
// cudnnSetPooling2dDescriptor takes vertical-then-horizontal stride, so
// stride_y (vertical) must precede stride_x — the previous order was
// swapped whenever stride_x != stride_y.
void cudnn_local_avgpool_setup(layer *l)
{
#ifdef CUDNN
    CHECK_CUDNN(cudnnSetPooling2dDescriptor(
        l->poolingDesc,
        CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING,
        CUDNN_NOT_PROPAGATE_NAN,    // CUDNN_PROPAGATE_NAN, CUDNN_NOT_PROPAGATE_NAN
        l->size,       // window height
        l->size,       // window width
        l->pad / 2,    // vertical padding //0, //l.pad,
        l->pad / 2,    // horizontal padding //0, //l.pad,
        l->stride_y,   // vertical stride
        l->stride_x)); // horizontal stride

    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w));
    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w));
#endif // CUDNN
}
71 
make_maxpool_layer(int batch,int h,int w,int c,int size,int stride_x,int stride_y,int padding,int maxpool_depth,int out_channels,int antialiasing,int avgpool,int train)72 maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int stride_x, int stride_y, int padding, int maxpool_depth, int out_channels, int antialiasing, int avgpool, int train)
73 {
74     maxpool_layer l = { (LAYER_TYPE)0 };
75     l.avgpool = avgpool;
76     if (avgpool) l.type = LOCAL_AVGPOOL;
77     else l.type = MAXPOOL;
78     l.train = train;
79 
80     const int blur_stride_x = stride_x;
81     const int blur_stride_y = stride_y;
82     l.antialiasing = antialiasing;
83     if (antialiasing) {
84         stride_x = stride_y = l.stride = l.stride_x = l.stride_y = 1; // use stride=1 in host-layer
85     }
86 
87     l.batch = batch;
88     l.h = h;
89     l.w = w;
90     l.c = c;
91     l.pad = padding;
92     l.maxpool_depth = maxpool_depth;
93     l.out_channels = out_channels;
94     if (maxpool_depth) {
95         l.out_c = out_channels;
96         l.out_w = l.w;
97         l.out_h = l.h;
98     }
99     else {
100         l.out_w = (w + padding - size) / stride_x + 1;
101         l.out_h = (h + padding - size) / stride_y + 1;
102         l.out_c = c;
103     }
104     l.outputs = l.out_h * l.out_w * l.out_c;
105     l.inputs = h*w*c;
106     l.size = size;
107     l.stride = stride_x;
108     l.stride_x = stride_x;
109     l.stride_y = stride_y;
110     int output_size = l.out_h * l.out_w * l.out_c * batch;
111 
112     if (train) {
113         if (!avgpool) l.indexes = (int*)xcalloc(output_size, sizeof(int));
114         l.delta = (float*)xcalloc(output_size, sizeof(float));
115     }
116     l.output = (float*)xcalloc(output_size, sizeof(float));
117     if (avgpool) {
118         l.forward = forward_local_avgpool_layer;
119         l.backward = backward_local_avgpool_layer;
120     }
121     else {
122         l.forward = forward_maxpool_layer;
123         l.backward = backward_maxpool_layer;
124     }
125 #ifdef GPU
126     if (avgpool) {
127         l.forward_gpu = forward_local_avgpool_layer_gpu;
128         l.backward_gpu = backward_local_avgpool_layer_gpu;
129     }
130     else {
131         l.forward_gpu = forward_maxpool_layer_gpu;
132         l.backward_gpu = backward_maxpool_layer_gpu;
133     }
134 
135     if (train) {
136         if (!avgpool) l.indexes_gpu = cuda_make_int_array(output_size);
137         l.delta_gpu = cuda_make_array(l.delta, output_size);
138     }
139     l.output_gpu  = cuda_make_array(l.output, output_size);
140     create_maxpool_cudnn_tensors(&l);
141     if (avgpool) cudnn_local_avgpool_setup(&l);
142     else cudnn_maxpool_setup(&l);
143 
144 #endif  // GPU
145 	l.bflops = (l.size*l.size*l.c * l.out_h*l.out_w) / 1000000000.;
146     if (avgpool) {
147         if (stride_x == stride_y)
148             fprintf(stderr, "avg               %2dx%2d/%2d   %4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", size, size, stride_x, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops);
149         else
150             fprintf(stderr, "avg              %2dx%2d/%2dx%2d %4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", size, size, stride_x, stride_y, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops);
151     }
152     else {
153         if (maxpool_depth)
154             fprintf(stderr, "max-depth         %2dx%2d/%2d   %4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", size, size, stride_x, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops);
155         else if (stride_x == stride_y)
156             fprintf(stderr, "max               %2dx%2d/%2d   %4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", size, size, stride_x, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops);
157         else
158             fprintf(stderr, "max              %2dx%2d/%2dx%2d %4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", size, size, stride_x, stride_y, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops);
159     }
160 
161     if (l.antialiasing) {
162         printf("AA:  ");
163         l.input_layer = (layer*)calloc(1, sizeof(layer));
164         int blur_size = 3;
165         int blur_pad = blur_size / 2;
166         if (l.antialiasing == 2) {
167             blur_size = 2;
168             blur_pad = 0;
169         }
170         *(l.input_layer) = make_convolutional_layer(batch, 1, l.out_h, l.out_w, l.out_c, l.out_c, l.out_c, blur_size, blur_stride_x, blur_stride_y, 1, blur_pad, LINEAR, 0, 0, 0, 0, 0, 1, 0, NULL, 0, 0, train);
171         const int blur_nweights = l.out_c * blur_size * blur_size;  // (n / n) * n * blur_size * blur_size;
172         int i;
173         if (blur_size == 2) {
174             for (i = 0; i < blur_nweights; i += (blur_size*blur_size)) {
175                 l.input_layer->weights[i + 0] = 1 / 4.f;
176                 l.input_layer->weights[i + 1] = 1 / 4.f;
177                 l.input_layer->weights[i + 2] = 1 / 4.f;
178                 l.input_layer->weights[i + 3] = 1 / 4.f;
179             }
180         }
181         else {
182             for (i = 0; i < blur_nweights; i += (blur_size*blur_size)) {
183                 l.input_layer->weights[i + 0] = 1 / 16.f;
184                 l.input_layer->weights[i + 1] = 2 / 16.f;
185                 l.input_layer->weights[i + 2] = 1 / 16.f;
186 
187                 l.input_layer->weights[i + 3] = 2 / 16.f;
188                 l.input_layer->weights[i + 4] = 4 / 16.f;
189                 l.input_layer->weights[i + 5] = 2 / 16.f;
190 
191                 l.input_layer->weights[i + 6] = 1 / 16.f;
192                 l.input_layer->weights[i + 7] = 2 / 16.f;
193                 l.input_layer->weights[i + 8] = 1 / 16.f;
194             }
195         }
196         for (i = 0; i < l.out_c; ++i) l.input_layer->biases[i] = 0;
197 #ifdef GPU
198         if (gpu_index >= 0) {
199             if (l.antialiasing) l.input_antialiasing_gpu = cuda_make_array(NULL, l.batch*l.outputs);
200             push_convolutional_layer(*(l.input_layer));
201         }
202 #endif  // GPU
203     }
204 
205     return l;
206 }
207 
// Resizes an existing maxpool/local-avgpool layer to a new input w x h,
// reallocating host (and GPU) buffers and refreshing the cuDNN descriptors.
// Depth-wise maxpool keeps spatial dimensions, mirroring make_maxpool_layer;
// the previous code always applied the spatial pooling formula and produced
// wrong out_w/out_h for maxpool_depth layers.
void resize_maxpool_layer(maxpool_layer *l, int w, int h)
{
    l->h = h;
    l->w = w;
    l->inputs = h*w*l->c;

    if (l->maxpool_depth) {
        // channel pooling: spatial dims pass through unchanged
        l->out_w = w;
        l->out_h = h;
    }
    else {
        l->out_w = (w + l->pad - l->size) / l->stride_x + 1;
        l->out_h = (h + l->pad - l->size) / l->stride_y + 1;
    }
    l->outputs = l->out_w * l->out_h * l->out_c;
    int output_size = l->outputs * l->batch;

    if (l->train) {
        if (!l->avgpool) l->indexes = (int*)xrealloc(l->indexes, output_size * sizeof(int));
        l->delta = (float*)xrealloc(l->delta, output_size * sizeof(float));
    }
    l->output = (float*)xrealloc(l->output, output_size * sizeof(float));

#ifdef GPU
    CHECK_CUDA(cudaFree(l->output_gpu));
    l->output_gpu  = cuda_make_array(l->output, output_size);

    if (l->train) {
        if (!l->avgpool) {
            CHECK_CUDA(cudaFree((float *)l->indexes_gpu));
            l->indexes_gpu = cuda_make_int_array(output_size);
        }
        CHECK_CUDA(cudaFree(l->delta_gpu));
        l->delta_gpu = cuda_make_array(l->delta, output_size);
    }

    if(l->avgpool) cudnn_local_avgpool_setup(l);
    else cudnn_maxpool_setup(l);
#endif
}
242 
// CPU forward pass for maxpool. Three paths:
//  - maxpool_depth: max over channel groups at each spatial location;
//  - inference with square stride: vectorized forward_maxpool_layer_avx;
//  - generic: scalar spatial max with padding handled as -FLT_MAX.
// Argmax input indices are recorded in l.indexes (when allocated) for backprop.
// With antialiasing, the pooled output is blurred by l.input_layer and
// copied back into l.output.
void forward_maxpool_layer(const maxpool_layer l, network_state state)
{
    if (l.maxpool_depth)
    {
        int b, i, j, k, g;
        for (b = 0; b < l.batch; ++b) {
            // j, k, g are declared outside the parallel loop and would
            // otherwise be shared across OpenMP threads (data race); only
            // the loop variable i is private by default.
            #pragma omp parallel for private(j, k, g)
            for (i = 0; i < l.h; ++i) {
                for (j = 0; j < l.w; ++j) {
                    for (g = 0; g < l.out_c; ++g)
                    {
                        int out_index = j + l.w*(i + l.h*(g + l.out_c*b));
                        float max = -FLT_MAX;
                        int max_i = -1;

                        // stride through channels g, g+out_c, g+2*out_c, ...
                        for (k = g; k < l.c; k += l.out_c)
                        {
                            int in_index = j + l.w*(i + l.h*(k + l.c*b));
                            float val = state.input[in_index];

                            max_i = (val > max) ? in_index : max_i;
                            max = (val > max) ? val : max;
                        }
                        l.output[out_index] = max;
                        if (l.indexes) l.indexes[out_index] = max_i;
                    }
                }
            }
        }
        return;
    }


    if (!state.train && l.stride_x == l.stride_y) {
        // fast vectorized path (inference only, square stride)
        forward_maxpool_layer_avx(state.input, l.output, l.indexes, l.size, l.w, l.h, l.out_w, l.out_h, l.c, l.pad, l.stride, l.batch);
    }
    else
    {

        int b, i, j, k, m, n;
        // padding is split evenly: pad/2 before the first window
        int w_offset = -l.pad / 2;
        int h_offset = -l.pad / 2;

        int h = l.out_h;
        int w = l.out_w;
        int c = l.c;

        for (b = 0; b < l.batch; ++b) {
            for (k = 0; k < c; ++k) {
                for (i = 0; i < h; ++i) {
                    for (j = 0; j < w; ++j) {
                        int out_index = j + w*(i + h*(k + c*b));
                        float max = -FLT_MAX;
                        int max_i = -1;
                        for (n = 0; n < l.size; ++n) {
                            for (m = 0; m < l.size; ++m) {
                                int cur_h = h_offset + i*l.stride_y + n;
                                int cur_w = w_offset + j*l.stride_x + m;
                                int index = cur_w + l.w*(cur_h + l.h*(k + b*l.c));
                                int valid = (cur_h >= 0 && cur_h < l.h &&
                                    cur_w >= 0 && cur_w < l.w);
                                // out-of-bounds cells never win the max
                                float val = (valid != 0) ? state.input[index] : -FLT_MAX;
                                max_i = (val > max) ? index : max_i;
                                max = (val > max) ? val : max;
                            }
                        }
                        l.output[out_index] = max;
                        if (l.indexes) l.indexes[out_index] = max_i;
                    }
                }
            }
        }
    }

    if (l.antialiasing) {
        network_state s = { 0 };
        s.train = state.train;
        s.workspace = state.workspace;
        s.net = state.net;
        s.input = l.output;
        forward_convolutional_layer(*(l.input_layer), s);
        //simple_copy_ongpu(l.outputs*l.batch, l.output, l.input_antialiasing);
        memcpy(l.output, l.input_layer->output, l.input_layer->outputs * l.input_layer->batch * sizeof(float));
    }
}
328 
// CPU backward pass for maxpool: route each output gradient to the input
// cell that won the max (recorded in l.indexes during forward).
// When pooling windows overlap (stride < size), different output cells can
// point at the SAME input index, so concurrent OpenMP iterations may update
// the same state.delta slot — the accumulation must be atomic.
void backward_maxpool_layer(const maxpool_layer l, network_state state)
{
    int i;
    int h = l.out_h;
    int w = l.out_w;
    int c = l.out_c;
    #pragma omp parallel for
    for(i = 0; i < h*w*c*l.batch; ++i){
        int index = l.indexes[i];
        #pragma omp atomic
        state.delta[index] += l.delta[i];
    }
}
341 
342 
// CPU forward pass for local average pooling: each output cell is the mean
// of the VALID (in-bounds) input cells in its window — padding is excluded
// from the divisor, matching CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING.
// Guards against a fully out-of-bounds window (counter == 0), which would
// otherwise divide by zero and write NaN/inf.
void forward_local_avgpool_layer(const maxpool_layer l, network_state state)
{
    int b, i, j, k, m, n;
    // padding is split evenly: pad/2 before the first window
    int w_offset = -l.pad / 2;
    int h_offset = -l.pad / 2;

    int h = l.out_h;
    int w = l.out_w;
    int c = l.c;

    for (b = 0; b < l.batch; ++b) {
        for (k = 0; k < c; ++k) {
            for (i = 0; i < h; ++i) {
                for (j = 0; j < w; ++j) {
                    int out_index = j + w*(i + h*(k + c*b));
                    float avg = 0;
                    int counter = 0;
                    for (n = 0; n < l.size; ++n) {
                        for (m = 0; m < l.size; ++m) {
                            int cur_h = h_offset + i*l.stride_y + n;
                            int cur_w = w_offset + j*l.stride_x + m;
                            int index = cur_w + l.w*(cur_h + l.h*(k + b*l.c));
                            int valid = (cur_h >= 0 && cur_h < l.h &&
                                cur_w >= 0 && cur_w < l.w);
                            if (valid) {
                                counter++;
                                avg += state.input[index];
                            }

                        }
                    }
                    // empty window (possible when pad >= 2*size) -> output 0
                    l.output[out_index] = (counter > 0) ? (avg / counter) : 0;
                }
            }
        }
    }
}
380 
// CPU backward pass for local average pooling. The forward pass averages
// over the number of VALID cells in each window (exclude-padding), so the
// correct gradient per valid input cell is delta / counter — the previous
// code divided by l.size*l.size, which understates border gradients.
// The window is scanned twice: once to count valid cells, once to spread
// the (loop-invariant) quotient.
void backward_local_avgpool_layer(const maxpool_layer l, network_state state)
{

    int b, i, j, k, m, n;
    // padding is split evenly: pad/2 before the first window
    int w_offset = -l.pad / 2;
    int h_offset = -l.pad / 2;

    int h = l.out_h;
    int w = l.out_w;
    int c = l.c;

    for (b = 0; b < l.batch; ++b) {
        for (k = 0; k < c; ++k) {
            for (i = 0; i < h; ++i) {
                for (j = 0; j < w; ++j) {
                    int out_index = j + w*(i + h*(k + c*b));
                    // pass 1: count in-bounds cells (forward's divisor)
                    int counter = 0;
                    for (n = 0; n < l.size; ++n) {
                        for (m = 0; m < l.size; ++m) {
                            int cur_h = h_offset + i*l.stride_y + n;
                            int cur_w = w_offset + j*l.stride_x + m;
                            if (cur_h >= 0 && cur_h < l.h &&
                                cur_w >= 0 && cur_w < l.w) counter++;
                        }
                    }
                    if (counter == 0) continue;  // fully out-of-bounds window
                    const float grad = l.delta[out_index] / counter;
                    // pass 2: distribute the gradient to the valid cells
                    for (n = 0; n < l.size; ++n) {
                        for (m = 0; m < l.size; ++m) {
                            int cur_h = h_offset + i*l.stride_y + n;
                            int cur_w = w_offset + j*l.stride_x + m;
                            int index = cur_w + l.w*(cur_h + l.h*(k + b*l.c));
                            int valid = (cur_h >= 0 && cur_h < l.h &&
                                cur_w >= 0 && cur_w < l.w);

                            if (valid) state.delta[index] += grad;
                        }
                    }

                }
            }
        }
    }

}