1 /*!
2  ******************* BEGIN Caffe Copyright Notice and Disclaimer ****************
3  *
4  * COPYRIGHT
5  *
6  * All contributions by the University of California:
7  * Copyright (c) 2014-2017 The Regents of the University of California (Regents)
8  * All rights reserved.
9  *
10  * All other contributions:
11  * Copyright (c) 2014-2017, the respective contributors
12  * All rights reserved.
13  *
14  * Caffe uses a shared copyright model: each contributor holds copyright over
15  * their contributions to Caffe. The project versioning records all such
16  * contribution and copyright details. If a contributor wants to further mark
17  * their specific copyright on a particular contribution, they should indicate
18  * their copyright solely in the commit message of the change when it is
19  * committed.
20  *
21  * LICENSE
22  *
23  * Redistribution and use in source and binary forms, with or without
24  * modification, are permitted provided that the following conditions are met:
25  *
26  * 1. Redistributions of source code must retain the above copyright notice, this
27  * list of conditions and the following disclaimer.
28  * 2. Redistributions in binary form must reproduce the above copyright notice,
29  * this list of conditions and the following disclaimer in the documentation
30  * and/or other materials provided with the distribution.
31  *
32  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
33  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
34  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
35  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
36  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
37  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
38  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
39  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
40  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
41  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
42  *
43  * CONTRIBUTION AGREEMENT
44  *
45  * By contributing to the BVLC/caffe repository through pull-request, comment,
46  * or otherwise, the contributor releases their content to the
47  * license and copyright terms herein.
48  *
49  ***************** END Caffe Copyright Notice and Disclaimer ********************
50  *
51  * \file pool.h
52  * \brief Function definitions of pooling 1/2/3-D images.
53  * We adopted looping 2-D image pixels from Caffe and extended it to 1-D and 3-D cases.
54  * \ref https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cpp
55  * \author Jun Wu
56  */
57 
58 #ifndef MXNET_OPERATOR_NN_POOL_H_
59 #define MXNET_OPERATOR_NN_POOL_H_
60 
61 #include <mxnet/base.h>
62 #include <mxnet/operator.h>
63 #include <vector>
64 #include <algorithm>
65 #include "./pool_utils.h"
66 #include "../mxnet_op.h"
67 #include "../mshadow_op.h"
68 
69 namespace mxnet {
70 namespace op {
71 
namespace pool_enum {
// Input tensor indices for the pooling operator.
enum PoolingOpInputs {kData};
// Output tensor indices. kMask is presumably the max-pooling index mask
// used for backprop -- confirm against the operator implementation.
enum PoolingOpOutputs {kOut, kMask};
// Supported pooling reductions (max, average, sum, Lp-norm).
enum PoolingOpType {kMaxPooling, kAvgPooling, kSumPooling, kLpPooling};
// Output-shape padding conventions.
enum PoolingOpPadConventionType {kValid, kFull, kSame};
}  // namespace pool_enum
78 
79 /*!
80  * \brief max pooling cpu function for 1-D images in 'ncw' layout.
81  * Do not call this kernel directly. Use the interface pool().
82  */
83 template<typename DType>
pool_max_1d_ncw_cpu(const DType * in_data,const mxnet::TShape & ishape,const mxnet::TShape & oshape,const mxnet::TShape & kernel,const mxnet::TShape & pad,const mxnet::TShape & stride,DType * out_data)84 inline void pool_max_1d_ncw_cpu(const DType *in_data, const mxnet::TShape &ishape,
85                                 const mxnet::TShape &oshape, const mxnet::TShape &kernel,
86                                 const mxnet::TShape &pad, const mxnet::TShape &stride,
87                                 DType *out_data) {
88   using mshadow::red::limits::MinValue;
89   const int width = ishape[2];
90   const int pooled_width = oshape[2];
91   const int kernel_w = kernel[0];
92   const int pad_w = pad[0];
93   const int stride_w = stride[0];
94   const index_t in_data_offset = ishape[2];
95   const index_t out_data_offset = oshape[2];
96   for (index_t n = 0; n < oshape[0]; ++n) {
97     for (index_t c = 0; c < oshape[1]; ++c) {
98       for (int pw = 0; pw < pooled_width; ++pw) {
99         int wstart = pw * stride_w - pad_w;
100         int wend = std::min(wstart + kernel_w, width);
101         wstart = std::max(wstart, 0);
102         DType max_val = MinValue<DType>();
103         for (int w = wstart; w < wend; ++w) {
104           if (in_data[w] > max_val) {
105             max_val = in_data[w];
106           }
107         }
108         out_data[pw] = max_val;
109       }
110       in_data += in_data_offset;
111       out_data += out_data_offset;
112     }
113   }
114 }
115 
116 /*!
117  * \brief max pooling cpu function for 1-D images in 'nwc' layout.
118  * Do not call this kernel directly. Use the interface pool().
119  */
120 template<typename DType>
pool_max_1d_nwc_cpu(const DType * in_data,const mxnet::TShape & ishape,const mxnet::TShape & oshape,const mxnet::TShape & kernel,const mxnet::TShape & pad,const mxnet::TShape & stride,DType * out_data)121 inline void pool_max_1d_nwc_cpu(const DType* in_data, const mxnet::TShape& ishape,
122                                 const mxnet::TShape& oshape, const mxnet::TShape& kernel,
123                                 const mxnet::TShape& pad, const mxnet::TShape& stride,
124                                 DType* out_data) {
125   using mshadow::red::limits::MinValue;
126   const int width = ishape[1];
127   const int pooled_width = oshape[1];
128   const int kernel_w = kernel[0];
129   const int pad_w = pad[0];
130   const int stride_w = stride[0];
131   const int features = oshape[2];
132   const index_t in_data_offset = ishape[1] * features;
133   const index_t out_data_offset = oshape[1] * features;
134   std::vector<DType> max_vals(features);
135   for (index_t n = 0; n < oshape[0]; ++n) {
136     for (int pw = 0; pw < pooled_width; ++pw) {
137       int wstart = pw * stride_w - pad_w;
138       int wend = std::min(wstart + kernel_w, width);
139       wstart = std::max(wstart, 0);
140       std::fill(max_vals.begin(), max_vals.end(), MinValue<DType>());
141       for (int w = wstart; w < wend; ++w) {
142         for (index_t c = 0; c < features; ++c) {
143           if (in_data[w * features + c] > max_vals[c]) {
144             max_vals[c] = in_data[w * features + c];
145           }
146         }
147       }
148       for (index_t c = 0; c < features; ++c)
149         out_data[pw * features + c] = max_vals[c];
150     }
151     in_data += in_data_offset;
152     out_data += out_data_offset;
153   }
154 }
155 
156 /*!
157  * \brief max pooling cpu function for 2-D images in 'nchw' layout.
158  * Do not call this kernel directly. Use the interface pool().
159  */
160 template<typename DType>
pool_max_2d_nchw_cpu(const DType * in_data,const mxnet::TShape & ishape,const mxnet::TShape & oshape,const mxnet::TShape & kernel,const mxnet::TShape & pad,const mxnet::TShape & stride,DType * out_data)161 inline void pool_max_2d_nchw_cpu(const DType *in_data, const mxnet::TShape &ishape,
162                                  const mxnet::TShape &oshape, const mxnet::TShape &kernel,
163                                  const mxnet::TShape &pad, const mxnet::TShape &stride,
164                                  DType *out_data) {
165   using mshadow::red::limits::MinValue;
166   const int height = ishape[2], width = ishape[3];
167   const int pooled_height = oshape[2], pooled_width = oshape[3];
168   const int kernel_h = kernel[0], kernel_w = kernel[1];
169   const int pad_h = pad[0], pad_w = pad[1];
170   const int stride_h = stride[0], stride_w = stride[1];
171   const index_t in_data_offset = ishape[2] * ishape[3];
172   const index_t out_data_offset = oshape[2] * oshape[3];
173   for (index_t n = 0; n < oshape[0]; ++n) {
174     for (index_t c = 0; c < oshape[1]; ++c) {
175       for (int ph = 0; ph < pooled_height; ++ph) {
176         for (int pw = 0; pw < pooled_width; ++pw) {
177           int hstart = ph * stride_h - pad_h;
178           int wstart = pw * stride_w - pad_w;
179           int hend = std::min(hstart + kernel_h, height);
180           int wend = std::min(wstart + kernel_w, width);
181           hstart = std::max(hstart, 0);
182           wstart = std::max(wstart, 0);
183           const int pool_index = ph * pooled_width + pw;
184           DType max_val = MinValue<DType>();
185           for (int h = hstart; h < hend; ++h) {
186             for (int w = wstart; w < wend; ++w) {
187               const int in_index = h * width + w;
188               if (in_data[in_index] > max_val) {
189                 max_val = in_data[in_index];
190               }
191             }
192           }
193           out_data[pool_index] = max_val;
194         }
195       }
196       in_data += in_data_offset;
197       out_data += out_data_offset;
198     }
199   }
200 }
201 
202 /*!
203  * \brief max pooling cpu function for 2-D images in 'nhwc' layout.
204  * Do not call this kernel directly. Use the interface pool().
205  */
206 template<typename DType>
pool_max_2d_nhwc_cpu(const DType * in_data,const mxnet::TShape & ishape,const mxnet::TShape & oshape,const mxnet::TShape & kernel,const mxnet::TShape & pad,const mxnet::TShape & stride,DType * out_data)207 inline void pool_max_2d_nhwc_cpu(const DType* in_data, const mxnet::TShape& ishape,
208                                  const mxnet::TShape& oshape, const mxnet::TShape& kernel,
209                                  const mxnet::TShape& pad, const mxnet::TShape& stride,
210                                  DType* out_data) {
211   using mshadow::red::limits::MinValue;
212   const int height = ishape[1], width = ishape[2];
213   const int pooled_height = oshape[1], pooled_width = oshape[2];
214   const int kernel_h = kernel[0], kernel_w = kernel[1];
215   const int pad_h = pad[0], pad_w = pad[1];
216   const int stride_h = stride[0], stride_w = stride[1];
217   const int features = oshape[3];
218   const index_t in_data_offset = ishape[1] * ishape[2] * features;
219   const index_t out_data_offset = oshape[1] * oshape[2] * features;
220   std::vector<DType> max_vals(features);
221   for (index_t n = 0; n < oshape[0]; ++n) {
222     for (int ph = 0; ph < pooled_height; ++ph) {
223       for (int pw = 0; pw < pooled_width; ++pw) {
224         int hstart = ph * stride_h - pad_h;
225         int wstart = pw * stride_w - pad_w;
226         int hend = std::min(hstart + kernel_h, height);
227         int wend = std::min(wstart + kernel_w, width);
228         hstart = std::max(hstart, 0);
229         wstart = std::max(wstart, 0);
230         const int pool_index = ph * pooled_width + pw;
231         std::fill(max_vals.begin(), max_vals.end(), MinValue<DType>());
232         for (int h = hstart; h < hend; ++h) {
233           for (int w = wstart; w < wend; ++w) {
234             const int in_index = h * width + w;
235             for (index_t c = 0; c < features; ++c) {
236               if (in_data[in_index * features + c] > max_vals[c]) {
237                 max_vals[c] = in_data[in_index * features + c];
238               }
239             }
240           }
241         }
242         for (index_t c = 0; c < features; ++c)
243           out_data[pool_index * features + c] = max_vals[c];
244       }
245     }
246     in_data += in_data_offset;
247     out_data += out_data_offset;
248   }
249 }
250 
251 /*!
252  * \brief max pooling cpu function for 3-D images in 'ncdhw' layout.
253  * Do not call this kernel directly. Use the interface pool().
254  */
255 template<typename DType>
pool_max_3d_ncdhw_cpu(const DType * in_data,const mxnet::TShape & ishape,const mxnet::TShape & oshape,const mxnet::TShape & kernel,const mxnet::TShape & pad,const mxnet::TShape & stride,DType * out_data)256 inline void pool_max_3d_ncdhw_cpu(const DType *in_data, const mxnet::TShape &ishape,
257                                   const mxnet::TShape &oshape, const mxnet::TShape &kernel,
258                                   const mxnet::TShape &pad, const mxnet::TShape &stride,
259                                   DType *out_data) {
260   using mshadow::red::limits::MinValue;
261   const int depth = ishape[2], height = ishape[3], width = ishape[4];
262   const int pooled_depth = oshape[2], pooled_height = oshape[3], pooled_width = oshape[4];
263   const int kernel_d = kernel[0], kernel_h = kernel[1], kernel_w = kernel[2];
264   const int pad_d = pad[0], pad_h = pad[1], pad_w = pad[2];
265   const int stride_d = stride[0], stride_h = stride[1], stride_w = stride[2];
266   const index_t in_data_offset = ishape[2] * ishape[3] * ishape[4];
267   const index_t out_data_offset = oshape[2] * oshape[3] * oshape[4];
268   for (index_t n = 0; n < oshape[0]; ++n) {
269     for (index_t c = 0; c < oshape[1]; ++c) {
270       for (int pd = 0; pd < pooled_depth; ++pd) {
271         for (int ph = 0; ph < pooled_height; ++ph) {
272           for (int pw = 0; pw < pooled_width; ++pw) {
273             int dstart = pd * stride_d - pad_d;
274             int hstart = ph * stride_h - pad_h;
275             int wstart = pw * stride_w - pad_w;
276             int dend = std::min(dstart + kernel_d, depth);
277             int hend = std::min(hstart + kernel_h, height);
278             int wend = std::min(wstart + kernel_w, width);
279             dstart = std::max(dstart, 0);
280             hstart = std::max(hstart, 0);
281             wstart = std::max(wstart, 0);
282             const int pool_index = (pd * pooled_height + ph) * pooled_width + pw;
283             DType max_val = MinValue<DType>();
284             for (int d = dstart; d < dend; ++d) {
285               for (int h = hstart; h < hend; ++h) {
286                 for (int w = wstart; w < wend; ++w) {
287                   const int in_index = (d * height + h) * width + w;
288                   if (in_data[in_index] > max_val) {
289                     max_val = in_data[in_index];
290                   }
291                 }
292               }
293             }
294             out_data[pool_index] = max_val;
295           }
296         }
297       }
298       in_data += in_data_offset;
299       out_data += out_data_offset;
300     }
301   }
302 }
303 
304 /*!
305  * \brief max pooling cpu function for 3-D images in 'ndhwc' layout.
306  * Do not call this kernel directly. Use the interface pool().
307  */
308 template<typename DType>
pool_max_3d_ndhwc_cpu(const DType * in_data,const mxnet::TShape & ishape,const mxnet::TShape & oshape,const mxnet::TShape & kernel,const mxnet::TShape & pad,const mxnet::TShape & stride,DType * out_data)309 inline void pool_max_3d_ndhwc_cpu(const DType* in_data, const mxnet::TShape& ishape,
310                                   const mxnet::TShape& oshape, const mxnet::TShape& kernel,
311                                   const mxnet::TShape& pad, const mxnet::TShape& stride,
312                                   DType* out_data) {
313   using mshadow::red::limits::MinValue;
314   const int depth = ishape[1], height = ishape[2], width = ishape[3];
315   const int pooled_depth = oshape[1], pooled_height = oshape[2], pooled_width = oshape[3];
316   const int kernel_d = kernel[0], kernel_h = kernel[1], kernel_w = kernel[2];
317   const int pad_d = pad[0], pad_h = pad[1], pad_w = pad[2];
318   const int stride_d = stride[0], stride_h = stride[1], stride_w = stride[2];
319   const int features = oshape[4];
320   const index_t in_data_offset = ishape[1] * ishape[2] * ishape[3] * features;
321   const index_t out_data_offset = oshape[1] * oshape[2] * oshape[3] * features;
322   std::vector<DType> max_vals(features);
323   for (index_t n = 0; n < oshape[0]; ++n) {
324     for (int pd = 0; pd < pooled_depth; ++pd) {
325       for (int ph = 0; ph < pooled_height; ++ph) {
326         for (int pw = 0; pw < pooled_width; ++pw) {
327           int dstart = pd * stride_d - pad_d;
328           int hstart = ph * stride_h - pad_h;
329           int wstart = pw * stride_w - pad_w;
330           int dend = std::min(dstart + kernel_d, depth);
331           int hend = std::min(hstart + kernel_h, height);
332           int wend = std::min(wstart + kernel_w, width);
333           dstart = std::max(dstart, 0);
334           hstart = std::max(hstart, 0);
335           wstart = std::max(wstart, 0);
336           const int pool_index = (pd * pooled_height + ph) * pooled_width + pw;
337           std::fill(max_vals.begin(), max_vals.end(), MinValue<DType>());
338           for (int d = dstart; d < dend; ++d) {
339             for (int h = hstart; h < hend; ++h) {
340               for (int w = wstart; w < wend; ++w) {
341                 const int in_index = (d * height + h) * width + w;
342                 for (index_t c = 0; c < features; ++c) {
343                   if (in_data[in_index * features + c] > max_vals[c]) {
344                     max_vals[c] = in_data[in_index * features + c];
345                   }
346                 }
347               }
348             }
349           }
350           for (index_t c = 0; c < features; ++c)
351             out_data[pool_index * features + c] = max_vals[c];
352         }
353       }
354     }
355     in_data += in_data_offset;
356     out_data += out_data_offset;
357   }
358 }
359 
360 /*!
361  * \brief avg/sum pooling cpu function for 1-D images in 'ncw' layout.
362  * Do not call this kernel directly. Use the interface pool().
363  */
364 template<typename DType, int p = 1>
365 inline void pool_sum_1d_ncw_cpu(const DType *in_data, const mxnet::TShape &ishape,
366                                 const mxnet::TShape &oshape, const mxnet::TShape &kernel,
367                                 const mxnet::TShape &pad, const mxnet::TShape &stride,
368                                 DType *out_data,
369                                 const bool get_avg = false, const bool count_include_pad = true) {
370   using AccType = typename PoolingTypes<DType>::AccType;
371   const int width = ishape[2];
372   const int pooled_width = oshape[2];
373   const int kernel_w = kernel[0];
374   const int pad_w = pad[0];
375   const int stride_w = stride[0];
376   const index_t in_data_offset = ishape[2];
377   const index_t out_data_offset = oshape[2];
378   for (index_t n = 0; n < oshape[0]; ++n) {
379     for (index_t c = 0; c < oshape[1]; ++c) {
380       for (int pw = 0; pw < pooled_width; ++pw) {
381         int wstart = pw * stride_w - pad_w;
382         int wend = std::min(wstart + kernel_w, width + pad_w);
383         int pool_size = (get_avg ? (wend - wstart) : 1);
384         wstart = std::max(wstart, 0);
385         wend = std::min(wend, width);
386         if (get_avg && !count_include_pad) {
387           pool_size = (wend - wstart);
388         }
389         AccType sum = 0;
390         for (int w = wstart; w < wend; ++w) {
391           sum += a_pow_p<AccType, p>::Map(in_data[w]) / pool_size;
392         }
393         out_data[pw] = a_root_p<AccType, p>::Map(sum);
394       }
395       in_data += in_data_offset;
396       out_data += out_data_offset;
397     }
398   }
399 }
400 
401 /*!
402  * \brief avg/sum pooling cpu function for 1-D images in 'nwc' layout.
403  * Do not call this kernel directly. Use the interface pool().
404  */
405 template<typename DType, int p = 1>
406 inline void pool_sum_1d_nwc_cpu(const DType* in_data, const mxnet::TShape& ishape,
407                                 const mxnet::TShape& oshape, const mxnet::TShape& kernel,
408                                 const mxnet::TShape& pad, const mxnet::TShape& stride,
409                                 DType* out_data,
410                                 const bool get_avg = false, const bool count_include_pad = true) {
411   using AccType = typename PoolingTypes<DType>::AccType;
412   const int width = ishape[1];
413   const int pooled_width = oshape[1];
414   const int kernel_w = kernel[0];
415   const int pad_w = pad[0];
416   const int stride_w = stride[0];
417   const int features = oshape[2];
418   const index_t in_data_offset = ishape[1] * features;
419   const index_t out_data_offset = oshape[1] * features;
420   std::vector<AccType> sums(features);
421   for (index_t n = 0; n < oshape[0]; ++n) {
422     for (int pw = 0; pw < pooled_width; ++pw) {
423       int wstart = pw * stride_w - pad_w;
424       int wend = std::min(wstart + kernel_w, width + pad_w);
425       int pool_size = (get_avg ? (wend - wstart) : 1);
426       wstart = std::max(wstart, 0);
427       wend = std::min(wend, width);
428       if (get_avg && !count_include_pad) {
429         pool_size = (wend - wstart);
430       }
431       std::fill(sums.begin(), sums.end(), 0);
432       for (int w = wstart; w < wend; ++w) {
433         for (index_t c = 0; c < features; ++c) {
434           sums[c] += a_pow_p<AccType, p>::Map(in_data[w * features + c]) / pool_size;
435         }
436       }
437       for (index_t c = 0; c < features; ++c)
438         out_data[pw * features + c] = a_root_p<AccType, p>::Map(sums[c]);
439     }
440     in_data += in_data_offset;
441     out_data += out_data_offset;
442   }
443 }
444 
445 /*!
446  * \brief avg/sum pooling cpu function for 2-D images in 'nchw' layout.
447  * Do not call this kernel directly. Use the interface pool().
448  */
449 template<typename DType, int p = 1>
450 inline void pool_sum_2d_nchw_cpu(const DType *in_data, const mxnet::TShape &ishape,
451                                  const mxnet::TShape &oshape, const mxnet::TShape &kernel,
452                                  const mxnet::TShape &pad, const mxnet::TShape &stride,
453                                  DType *out_data,
454                                  const bool get_avg = false, const bool count_include_pad = true) {
455   using AccType = typename PoolingTypes<DType>::AccType;
456   const int height = ishape[2], width = ishape[3];
457   const int pooled_height = oshape[2], pooled_width = oshape[3];
458   const int kernel_h = kernel[0], kernel_w = kernel[1];
459   const int pad_h = pad[0], pad_w = pad[1];
460   const int stride_h = stride[0], stride_w = stride[1];
461   const index_t in_data_offset = ishape[2] * ishape[3];
462   const index_t out_data_offset = oshape[2] * oshape[3];
463   for (index_t n = 0; n < oshape[0]; ++n) {
464     for (index_t c = 0; c < oshape[1]; ++c) {
465       for (int ph = 0; ph < pooled_height; ++ph) {
466         for (int pw = 0; pw < pooled_width; ++pw) {
467           int hstart = ph * stride_h - pad_h;
468           int wstart = pw * stride_w - pad_w;
469           int hend = std::min(hstart + kernel_h, height + pad_h);
470           int wend = std::min(wstart + kernel_w, width + pad_w);
471           int pool_size = (get_avg ? (hend - hstart) * (wend - wstart) : 1);
472           hstart = std::max(hstart, 0);
473           wstart = std::max(wstart, 0);
474           hend = std::min(hend, height);
475           wend = std::min(wend, width);
476           if (get_avg && !count_include_pad) {
477             pool_size = (hend - hstart) * (wend - wstart);
478           }
479           AccType sum = 0;
480           for (int h = hstart; h < hend; ++h) {
481             for (int w = wstart; w < wend; ++w) {
482               sum += a_pow_p<AccType, p>::Map(in_data[h*width+w]) / pool_size;
483             }
484           }
485           out_data[ph*pooled_width+pw] = a_root_p<AccType, p>::Map(sum);
486         }
487       }
488       in_data += in_data_offset;
489       out_data += out_data_offset;
490     }
491   }
492 }
493 
494 /*!
495  * \brief avg/sum pooling cpu function for 2-D images in 'nhwc' layout.
496  * Do not call this kernel directly. Use the interface pool().
497  */
498 template<typename DType, int p = 1>
499 inline void pool_sum_2d_nhwc_cpu(const DType* in_data, const mxnet::TShape& ishape,
500                                  const mxnet::TShape& oshape, const mxnet::TShape& kernel,
501                                  const mxnet::TShape& pad, const mxnet::TShape& stride,
502                                  DType* out_data,
503                                  const bool get_avg = false, const bool count_include_pad = true) {
504   using AccType = typename PoolingTypes<DType>::AccType;
505   const int height = ishape[1], width = ishape[2];
506   const int pooled_height = oshape[1], pooled_width = oshape[2];
507   const int kernel_h = kernel[0], kernel_w = kernel[1];
508   const int pad_h = pad[0], pad_w = pad[1];
509   const int stride_h = stride[0], stride_w = stride[1];
510   const int features = oshape[3];
511   const index_t in_data_offset = ishape[1] * ishape[2] * features;
512   const index_t out_data_offset = oshape[1] * oshape[2] * features;
513   std::vector<AccType> sums(features);
514   for (index_t n = 0; n < oshape[0]; ++n) {
515     for (int ph = 0; ph < pooled_height; ++ph) {
516       for (int pw = 0; pw < pooled_width; ++pw) {
517         int hstart = ph * stride_h - pad_h;
518         int wstart = pw * stride_w - pad_w;
519         int hend = std::min(hstart + kernel_h, height + pad_h);
520         int wend = std::min(wstart + kernel_w, width + pad_w);
521         int pool_size = (get_avg ? (hend - hstart) * (wend - wstart) : 1);
522         hstart = std::max(hstart, 0);
523         wstart = std::max(wstart, 0);
524         hend = std::min(hend, height);
525         wend = std::min(wend, width);
526         if (get_avg && !count_include_pad) {
527           pool_size = (hend - hstart) * (wend - wstart);
528         }
529         const int pool_index = ph * pooled_width + pw;
530         std::fill(sums.begin(), sums.end(), 0);
531         for (int h = hstart; h < hend; ++h) {
532           for (int w = wstart; w < wend; ++w) {
533             const int in_index = h * width + w;
534             for (index_t c = 0; c < features; ++c) {
535               sums[c] += a_pow_p<AccType, p>::Map(in_data[in_index * features + c]) / pool_size;
536             }
537           }
538         }
539         for (index_t c = 0; c < features; ++c)
540           out_data[pool_index * features + c] = a_root_p<AccType, p>::Map(sums[c]);
541       }
542     }
543     in_data += in_data_offset;
544     out_data += out_data_offset;
545   }
546 }
547 
548 /*!
549  * \brief avg/sum pooling cpu function for 3-D images in 'ncdhw' layout.
550  * Do not call this kernel directly. Use the interface pool().
551  */
552 template<typename DType, int p = 1>
553 inline void pool_sum_3d_ncdhw_cpu(const DType *in_data, const mxnet::TShape &ishape,
554                                   const mxnet::TShape &oshape, const mxnet::TShape &kernel,
555                                   const mxnet::TShape &pad, const mxnet::TShape &stride,
556                                   DType *out_data,
557                                   const bool get_avg = false, const bool count_include_pad = true) {
558   using AccType = typename PoolingTypes<DType>::AccType;
559   const int depth = ishape[2], height = ishape[3], width = ishape[4];
560   const int pooled_depth = oshape[2], pooled_height = oshape[3], pooled_width = oshape[4];
561   const int kernel_d = kernel[0], kernel_h = kernel[1], kernel_w = kernel[2];
562   const int pad_d = pad[0], pad_h = pad[1], pad_w = pad[2];
563   const int stride_d = stride[0], stride_h = stride[1], stride_w = stride[2];
564   const index_t in_data_offset = ishape[2] * ishape[3] * ishape[4];
565   const index_t out_data_offset = oshape[2] * oshape[3] * oshape[4];
566   for (index_t n = 0; n < oshape[0]; ++n) {
567     for (index_t c = 0; c < oshape[1]; ++c) {
568       for (int pd = 0; pd < pooled_depth; ++pd) {
569         for (int ph = 0; ph < pooled_height; ++ph) {
570           for (int pw = 0; pw < pooled_width; ++pw) {
571             int dstart = pd * stride_d - pad_d;
572             int hstart = ph * stride_h - pad_h;
573             int wstart = pw * stride_w - pad_w;
574             int dend = std::min(dstart + kernel_d, depth + pad_d);
575             int hend = std::min(hstart + kernel_h, height + pad_h);
576             int wend = std::min(wstart + kernel_w, width + pad_w);
577             int pool_size = (get_avg ? (dend - dstart) * (hend - hstart) * (wend - wstart) : 1);
578             dstart = std::max(dstart, 0);
579             hstart = std::max(hstart, 0);
580             wstart = std::max(wstart, 0);
581             dend = std::min(dend, depth);
582             hend = std::min(hend, height);
583             wend = std::min(wend, width);
584             if (get_avg && !count_include_pad) {
585               pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
586             }
587             AccType sum = 0;
588             for (int d = dstart; d < dend; ++d) {
589               for (int h = hstart; h < hend; ++h) {
590                 for (int w = wstart; w < wend; ++w) {
591                   sum += a_pow_p<AccType, p>::Map(in_data[(d*height+h)*width+w]) / pool_size;
592                 }
593               }
594             }
595             out_data[(pd*pooled_height+ph)*pooled_width+pw] = (pool_size == 0) ?
596                                                               AccType(nanf("")) :
597                                                               a_root_p<AccType, p>::Map(sum);
598           }
599         }
600       }
601       in_data += in_data_offset;
602       out_data += out_data_offset;
603     }
604   }
605 }
606 
607 /*!
608  * \brief avg/sum pooling cpu function for 3-D images in 'ndhwc' layout.
609  * Do not call this kernel directly. Use the interface pool().
610  */
611 template<typename DType, int p = 1>
612 inline void pool_sum_3d_ndhwc_cpu(const DType* in_data, const mxnet::TShape& ishape,
613                                   const mxnet::TShape& oshape, const mxnet::TShape& kernel,
614                                   const mxnet::TShape& pad, const mxnet::TShape& stride,
615                                   DType* out_data,
616                                   const bool get_avg = false, const bool count_include_pad = true) {
617   using AccType = typename PoolingTypes<DType>::AccType;
618   const int depth = ishape[1], height = ishape[2], width = ishape[3];
619   const int pooled_depth = oshape[1], pooled_height = oshape[2], pooled_width = oshape[3];
620   const int kernel_d = kernel[0], kernel_h = kernel[1], kernel_w = kernel[2];
621   const int pad_d = pad[0], pad_h = pad[1], pad_w = pad[2];
622   const int stride_d = stride[0], stride_h = stride[1], stride_w = stride[2];
623   const int features = oshape[4];
624   const index_t in_data_offset = ishape[1] * ishape[2] * ishape[3] * features;
625   const index_t out_data_offset = oshape[1] * oshape[2] * oshape[3] * features;
626   std::vector<AccType> sums(features);
627   for (index_t n = 0; n < oshape[0]; ++n) {
628     for (int pd = 0; pd < pooled_depth; ++pd) {
629       for (int ph = 0; ph < pooled_height; ++ph) {
630         for (int pw = 0; pw < pooled_width; ++pw) {
631           int dstart = pd * stride_d - pad_d;
632           int hstart = ph * stride_h - pad_h;
633           int wstart = pw * stride_w - pad_w;
634           int dend = std::min(dstart + kernel_d, depth + pad_d);
635           int hend = std::min(hstart + kernel_h, height + pad_h);
636           int wend = std::min(wstart + kernel_w, width + pad_w);
637           int pool_size = (get_avg ? (dend - dstart) * (hend - hstart) * (wend - wstart) : 1);
638           dstart = std::max(dstart, 0);
639           hstart = std::max(hstart, 0);
640           wstart = std::max(wstart, 0);
641           dend = std::min(dend, depth);
642           hend = std::min(hend, height);
643           wend = std::min(wend, width);
644           if (get_avg && !count_include_pad) {
645             pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
646           }
647           const int pool_index = (pd * pooled_height + ph) * pooled_width + pw;
648           std::fill(sums.begin(), sums.end(), 0);
649           for (int d = dstart; d < dend; ++d) {
650             for (int h = hstart; h < hend; ++h) {
651               for (int w = wstart; w < wend; ++w) {
652                 const int in_index = (d * height + h) * width + w;
653                 for (index_t c = 0; c < features; ++c) {
654                   sums[c] += a_pow_p<AccType, p>::Map(in_data[in_index * features + c]) / pool_size;
655                 }
656               }
657             }
658           }
659           for (index_t c = 0; c < features; ++c)
660             out_data[pool_index * features + c] = (pool_size == 0) ?
661                                                             AccType(nanf("")) :
662                                                             a_root_p<AccType, p>::Map(sums[c]);
663         }
664       }
665     }
666     in_data += in_data_offset;
667     out_data += out_data_offset;
668   }
669 }
670 
671 /*!
672  * \brief max unpooling cpu function for 1-D images in 'ncw' layout.
673  * Do not call this kernel directly. Use the interface unpool().
674  */
675 template<typename DType>
unpool_max_1d_ncw_cpu(const DType * out_grad,const DType * in_data,const DType * out_data,const mxnet::TShape & ishape,const mxnet::TShape & oshape,const mxnet::TShape & kernel,const mxnet::TShape & pad,const mxnet::TShape & stride,DType * in_grad)676 inline void unpool_max_1d_ncw_cpu(const DType *out_grad, const DType *in_data,
677                                   const DType *out_data, const mxnet::TShape &ishape,
678                                   const mxnet::TShape &oshape, const mxnet::TShape &kernel,
679                                   const mxnet::TShape &pad, const mxnet::TShape &stride,
680                                   DType *in_grad) {
681   const int width = ishape[2];
682   const int pooled_width = oshape[2];
683   const int kernel_w = kernel[0];
684   const int pad_w = pad[0];
685   const int stride_w = stride[0];
686   const index_t in_offset = ishape[2];
687   const index_t out_offset = oshape[2];
688   for (index_t n = 0; n < oshape[0]; ++n) {
689     for (index_t c = 0; c < oshape[1]; ++c) {
690       for (int pw = 0; pw < pooled_width; ++pw) {
691         int wstart = pw * stride_w - pad_w;
692         int wend = std::min(wstart + kernel_w, width);
693         wstart = std::max(wstart, 0);
694         int max_idx = -1;
695         for (int w = wstart; w < wend; ++w) {
696           if (in_data[w] == out_data[pw]) {
697             max_idx = w;
698             break;
699           }
700         }
701         // In the case where pad > 0 and kernel = 1, for example,
702         // max_idx can be -1 reaching this step.
703         if (max_idx >= 0) {
704           in_grad[max_idx] += out_grad[pw];
705         }
706       }
707       in_data += in_offset;
708       in_grad += in_offset;
709       out_data += out_offset;
710       out_grad += out_offset;
711     }
712   }
713 }
714 
715 /*!
716  * \brief max unpooling cpu function for 1-D images in 'nwc' layout.
717  * Do not call this kernel directly. Use the interface unpool().
718  */
719 template<typename DType>
unpool_max_1d_nwc_cpu(const DType * out_grad,const DType * in_data,const DType * out_data,const mxnet::TShape & ishape,const mxnet::TShape & oshape,const mxnet::TShape & kernel,const mxnet::TShape & pad,const mxnet::TShape & stride,DType * in_grad)720 inline void unpool_max_1d_nwc_cpu(const DType* out_grad, const DType* in_data,
721                               const DType* out_data, const mxnet::TShape& ishape,
722                               const mxnet::TShape& oshape, const mxnet::TShape& kernel,
723                               const mxnet::TShape& pad, const mxnet::TShape& stride,
724                               DType* in_grad) {
725   const int width = ishape[1];
726   const int pooled_width = oshape[1];
727   const int kernel_w = kernel[0];
728   const int pad_w = pad[0];
729   const int stride_w = stride[0];
730   const int features = oshape[2];
731   const index_t in_offset = ishape[1] * features;
732   const index_t out_offset = oshape[1] * features;
733   std::vector<int> max_idxs(features);
734   for (index_t n = 0; n < oshape[0]; ++n) {
735     for (int pw = 0; pw < pooled_width; ++pw) {
736       int wstart = pw * stride_w - pad_w;
737       int wend = std::min(wstart + kernel_w, width);
738       wstart = std::max(wstart, 0);
739       std::fill(max_idxs.begin(), max_idxs.end(), -1);
740       for (index_t c = 0; c < features; ++c) {
741         for (int w = wstart; w < wend; ++w) {
742           if (in_data[w * features + c] == out_data[pw * features + c]) {
743             max_idxs[c] = w;
744             break;
745           }
746         }
747       }
748       // In the case where pad > 0 and kernel = 1, for example,
749       // max_idx can be -1 reaching this step.
750       for (index_t c = 0; c < features; ++c) {
751         if (max_idxs[c] >= 0) {
752           in_grad[max_idxs[c] * features + c] += out_grad[pw * features + c];
753         }
754       }
755     }
756     in_data += in_offset;
757     in_grad += in_offset;
758     out_data += out_offset;
759     out_grad += out_offset;
760   }
761 }
762 
763 /*!
764  * \brief max unpooling cpu function for 2-D images in 'nchw' layout.
765  * Do not call this kernel directly. Use the interface unpool().
766  */
767 template<typename DType>
unpool_max_2d_nchw_cpu(const DType * out_grad,const DType * in_data,const DType * out_data,const mxnet::TShape & ishape,const mxnet::TShape & oshape,const mxnet::TShape & kernel,const mxnet::TShape & pad,const mxnet::TShape & stride,DType * in_grad)768 inline void unpool_max_2d_nchw_cpu(const DType *out_grad, const DType *in_data,
769                                    const DType *out_data, const mxnet::TShape &ishape,
770                                    const mxnet::TShape &oshape, const mxnet::TShape &kernel,
771                                    const mxnet::TShape &pad, const mxnet::TShape &stride,
772                                    DType *in_grad) {
773   const int height = ishape[2], width = ishape[3];
774   const int pooled_height = oshape[2], pooled_width = oshape[3];
775   const int kernel_h = kernel[0], kernel_w = kernel[1];
776   const int pad_h = pad[0], pad_w = pad[1];
777   const int stride_h = stride[0], stride_w = stride[1];
778   const index_t in_offset = ishape[2] * ishape[3];
779   const index_t out_offset = oshape[2] * oshape[3];
780   for (index_t n = 0; n < oshape[0]; ++n) {
781     for (index_t c = 0; c < oshape[1]; ++c) {
782       for (int ph = 0; ph < pooled_height; ++ph) {
783         for (int pw = 0; pw < pooled_width; ++pw) {
784           int hstart = ph * stride_h - pad_h;
785           int wstart = pw * stride_w - pad_w;
786           int hend = std::min(hstart + kernel_h, height);
787           int wend = std::min(wstart + kernel_w, width);
788           hstart = std::max(hstart, 0);
789           wstart = std::max(wstart, 0);
790           const int pool_index = ph * pooled_width + pw;
791           int max_idx = -1;
792           bool found = false;
793           for (int h = hstart; h < hend; ++h) {
794             for (int w = wstart; w < wend; ++w) {
795               const int idx = h * width + w;
796               if (in_data[idx] == out_data[pool_index]) {
797                 max_idx = idx;
798                 found = true;
799                 break;
800               }
801             }
802             if (found) break;
803           }
804           // In the case where pad > 0 and kernel = 1, for example,
805           // max_idx can be -1 reaching this step.
806           if (max_idx >= 0) {
807             in_grad[max_idx] += out_grad[pool_index];
808           }
809         }
810       }
811       in_data += in_offset;
812       in_grad += in_offset;
813       out_data += out_offset;
814       out_grad += out_offset;
815     }
816   }
817 }
818 
819 /*!
820  * \brief max unpooling cpu function for 2-D images in 'nhwc' layout.
821  * Do not call this kernel directly. Use the interface unpool().
822  */
823 template<typename DType>
unpool_max_2d_nhwc_cpu(const DType * out_grad,const DType * in_data,const DType * out_data,const mxnet::TShape & ishape,const mxnet::TShape & oshape,const mxnet::TShape & kernel,const mxnet::TShape & pad,const mxnet::TShape & stride,DType * in_grad)824 inline void unpool_max_2d_nhwc_cpu(const DType* out_grad, const DType* in_data,
825                               const DType* out_data, const mxnet::TShape& ishape,
826                               const mxnet::TShape& oshape, const mxnet::TShape& kernel,
827                               const mxnet::TShape& pad, const mxnet::TShape& stride,
828                               DType* in_grad) {
829   const int height = ishape[1], width = ishape[2];
830   const int pooled_height = oshape[1], pooled_width = oshape[2];
831   const int kernel_h = kernel[0], kernel_w = kernel[1];
832   const int pad_h = pad[0], pad_w = pad[1];
833   const int stride_h = stride[0], stride_w = stride[1];
834   const int features = oshape[3];
835   const index_t in_offset = ishape[1] * ishape[2] * features;
836   const index_t out_offset = oshape[1] * oshape[2] * features;
837   std::vector<int> max_idxs(features);
838   for (index_t n = 0; n < oshape[0]; ++n) {
839     for (int ph = 0; ph < pooled_height; ++ph) {
840       for (int pw = 0; pw < pooled_width; ++pw) {
841         int hstart = ph * stride_h - pad_h;
842         int wstart = pw * stride_w - pad_w;
843         int hend = std::min(hstart + kernel_h, height);
844         int wend = std::min(wstart + kernel_w, width);
845         hstart = std::max(hstart, 0);
846         wstart = std::max(wstart, 0);
847         const int pool_index = ph * pooled_width + pw;
848         std::fill(max_idxs.begin(), max_idxs.end(), -1);
849         for (index_t c = 0; c < features; ++c) {
850           bool found = false;
851           for (int h = hstart; h < hend; ++h) {
852             for (int w = wstart; w < wend; ++w) {
853               const int idx = h * width + w;
854               if (in_data[idx * features + c] == out_data[pool_index * features + c]) {
855                 max_idxs[c] = idx;
856                 found = true;
857                 break;
858               }
859             }
860             if (found) break;
861           }
862         }
863         // In the case where pad > 0 and kernel = 1, for example,
864         // max_idx can be -1 reaching this step.
865         for (index_t c = 0; c < features; ++c) {
866           if (max_idxs[c] >= 0) {
867             in_grad[max_idxs[c] * features + c] += out_grad[pool_index * features + c];
868           }
869         }
870       }
871     }
872     in_data += in_offset;
873     in_grad += in_offset;
874     out_data += out_offset;
875     out_grad += out_offset;
876   }
877 }
878 
879 /*!
880  * \brief max unpooling cpu function for 3-D images in 'ncdhw' layout.
881  * Do not call this kernel directly. Use the interface unpool().
882  */
883 template<typename DType>
unpool_max_3d_ncdhw_cpu(const DType * out_grad,const DType * in_data,const DType * out_data,const mxnet::TShape & ishape,const mxnet::TShape & oshape,const mxnet::TShape & kernel,const mxnet::TShape & pad,const mxnet::TShape & stride,DType * in_grad)884 inline void unpool_max_3d_ncdhw_cpu(const DType *out_grad, const DType *in_data,
885                                     const DType *out_data, const mxnet::TShape &ishape,
886                                     const mxnet::TShape &oshape, const mxnet::TShape &kernel,
887                                     const mxnet::TShape &pad, const mxnet::TShape &stride,
888                                     DType *in_grad) {
889   const int depth = ishape[2], height = ishape[3], width = ishape[4];
890   const int pooled_depth = oshape[2], pooled_height = oshape[3], pooled_width = oshape[4];
891   const int kernel_d = kernel[0], kernel_h = kernel[1], kernel_w = kernel[2];
892   const int pad_d = pad[0], pad_h = pad[1], pad_w = pad[2];
893   const int stride_d = stride[0], stride_h = stride[1], stride_w = stride[2];
894   const index_t in_offset = ishape[2] * ishape[3] * ishape[4];
895   const index_t out_offset = oshape[2] * oshape[3] * oshape[4];
896   for (index_t n = 0; n < oshape[0]; ++n) {
897     for (index_t c = 0; c < oshape[1]; ++c) {
898       for (int pd = 0; pd < pooled_depth; ++pd) {
899         for (int ph = 0; ph < pooled_height; ++ph) {
900           for (int pw = 0; pw < pooled_width; ++pw) {
901             int dstart = pd * stride_d - pad_d;
902             int hstart = ph * stride_h - pad_h;
903             int wstart = pw * stride_w - pad_w;
904             int dend = std::min(dstart + kernel_d, depth);
905             int hend = std::min(hstart + kernel_h, height);
906             int wend = std::min(wstart + kernel_w, width);
907             dstart = std::max(dstart, 0);
908             hstart = std::max(hstart, 0);
909             wstart = std::max(wstart, 0);
910             const int pool_index = (pd * pooled_height + ph) * pooled_width + pw;
911             int max_idx = -1;
912             bool found = false;
913             for (int d = dstart; d < dend; ++d) {
914               for (int h = hstart; h < hend; ++h) {
915                 for (int w = wstart; w < wend; ++w) {
916                   const int idx = (d * height + h) * width + w;
917                   if (in_data[idx] == out_data[pool_index]) {
918                     max_idx = idx;
919                     found = true;
920                     break;
921                   }
922                 }
923                 if (found) break;
924               }
925               if (found) break;
926             }
927             // In the case where pad > 0 and kernel = 1, for example,
928             // max_idx can be -1 reaching this step.
929             if (max_idx >= 0) {
930               in_grad[max_idx] += out_grad[pool_index];
931             }
932           }
933         }
934       }
935       in_data += in_offset;
936       in_grad += in_offset;
937       out_data += out_offset;
938       out_grad += out_offset;
939     }
940   }
941 }
942 
943 /*!
944  * \brief max unpooling cpu function for 3-D images in 'ndhwc' layout.
945  * Do not call this kernel directly. Use the interface unpool().
946  */
947 template<typename DType>
unpool_max_3d_ndhwc_cpu(const DType * out_grad,const DType * in_data,const DType * out_data,const mxnet::TShape & ishape,const mxnet::TShape & oshape,const mxnet::TShape & kernel,const mxnet::TShape & pad,const mxnet::TShape & stride,DType * in_grad)948 inline void unpool_max_3d_ndhwc_cpu(const DType* out_grad, const DType* in_data,
949                               const DType* out_data, const mxnet::TShape& ishape,
950                               const mxnet::TShape& oshape, const mxnet::TShape& kernel,
951                               const mxnet::TShape& pad, const mxnet::TShape& stride,
952                               DType* in_grad) {
953   const int depth = ishape[1], height = ishape[2], width = ishape[3];
954   const int pooled_depth = oshape[1], pooled_height = oshape[2], pooled_width = oshape[3];
955   const int kernel_d = kernel[0], kernel_h = kernel[1], kernel_w = kernel[2];
956   const int pad_d = pad[0], pad_h = pad[1], pad_w = pad[2];
957   const int stride_d = stride[0], stride_h = stride[1], stride_w = stride[2];
958   const int features = oshape[4];
959   const index_t in_offset = ishape[1] * ishape[2] * ishape[3] * features;
960   const index_t out_offset = oshape[1] * oshape[2] * oshape[3] * features;
961   std::vector<int> max_idxs(features);
962   for (index_t n = 0; n < oshape[0]; ++n) {
963     for (int pd = 0; pd < pooled_depth; ++pd) {
964       for (int ph = 0; ph < pooled_height; ++ph) {
965         for (int pw = 0; pw < pooled_width; ++pw) {
966           int dstart = pd * stride_d - pad_d;
967           int hstart = ph * stride_h - pad_h;
968           int wstart = pw * stride_w - pad_w;
969           int dend = std::min(dstart + kernel_d, depth);
970           int hend = std::min(hstart + kernel_h, height);
971           int wend = std::min(wstart + kernel_w, width);
972           dstart = std::max(dstart, 0);
973           hstart = std::max(hstart, 0);
974           wstart = std::max(wstart, 0);
975           const int pool_index = (pd * pooled_height + ph) * pooled_width + pw;
976           std::fill(max_idxs.begin(), max_idxs.end(), -1);
977           for (index_t c = 0; c < features; ++c) {
978             bool found = false;
979             for (int d = dstart; d < dend; ++d) {
980               for (int h = hstart; h < hend; ++h) {
981                 for (int w = wstart; w < wend; ++w) {
982                   const int idx = (d * height + h) * width + w;
983                   if (in_data[idx * features + c] == out_data[pool_index * features + c]) {
984                     max_idxs[c] = idx;
985                     found = true;
986                     break;
987                   }
988                 }
989                 if (found) break;
990               }
991               if (found) break;
992             }
993           }
994           // In the case where pad > 0 and kernel = 1, for example,
995           // max_idx can be -1 reaching this step.
996           for (index_t c = 0; c < features; ++c) {
997             if (max_idxs[c] >= 0) {
998               in_grad[max_idxs[c] * features + c] += out_grad[pool_index * features + c];
999             }
1000           }
1001         }
1002       }
1003     }
1004     in_data += in_offset;
1005     in_grad += in_offset;
1006     out_data += out_offset;
1007     out_grad += out_offset;
1008   }
1009 }
1010 
1011 /*!
1012  * \brief avg/sum unpooling cpu function for 1-D images in 'ncw' layout.
1013  * Do not call this kernel directly. Use the interface unpool().
1014  */
1015 template<typename DType, int p = 1>
1016 inline void unpool_sum_1d_ncw_cpu(const DType *out_grad, const DType *in_data,
1017                                   const DType *out_data,
1018                                   const mxnet::TShape &ishape, const mxnet::TShape &oshape,
1019                                   const mxnet::TShape &kernel, const mxnet::TShape &pad,
1020                                   const mxnet::TShape &stride, DType *in_grad,
1021                                   const bool is_avg = false, const bool count_include_pad = true) {
1022   const int width = ishape[2];
1023   const int pooled_width = oshape[2];
1024   const int kernel_w = kernel[0];
1025   const int pad_w = pad[0];
1026   const int stride_w = stride[0];
1027   const index_t in_grad_offset = ishape[2];
1028   const index_t out_grad_offset = oshape[2];
1029   for (index_t n = 0; n < oshape[0]; ++n) {
1030     for (index_t c = 0; c < oshape[1]; ++c) {
1031       for (int pw = 0; pw < pooled_width; ++pw) {
1032         int wstart = pw * stride_w - pad_w;
1033         int wend = std::min(wstart + kernel_w, width + pad_w);
1034         int pool_size = (is_avg ? (wend - wstart) : 1);
1035         wstart = std::max(wstart, 0);
1036         wend = std::min(wend, width);
1037         if (is_avg && !count_include_pad) {
1038           pool_size = (wend - wstart);
1039         }
1040         for (int w = wstart; w < wend; ++w) {
1041           in_grad[w] += lp_grad<DType, p>::Map(out_grad[pw], in_data[w], out_data[pw]) / pool_size;
1042         }
1043       }
1044       in_grad += in_grad_offset;
1045       in_data += in_grad_offset;
1046       out_grad += out_grad_offset;
1047       out_data += out_grad_offset;
1048     }
1049   }
1050 }
1051 
1052 /*!
1053  * \brief avg/sum unpooling cpu function for 1-D images in 'nwc' layout.
1054  * Do not call this kernel directly. Use the interface unpool().
1055  */
1056 template<typename DType, int p = 1>
1057 inline void unpool_sum_1d_nwc_cpu(const DType* out_grad, const DType* in_data,
1058                                   const DType *out_data, const mxnet::TShape &ishape,
1059                                   const mxnet::TShape &oshape, const mxnet::TShape &kernel,
1060                                   const mxnet::TShape &pad, const mxnet::TShape &stride,
1061                                   DType *in_grad, const bool is_avg = false,
1062                                   const bool count_include_pad = true) {
1063   const int width = ishape[1];
1064   const int pooled_width = oshape[1];
1065   const int kernel_w = kernel[0];
1066   const int pad_w = pad[0];
1067   const int stride_w = stride[0];
1068   const int features = oshape[2];
1069   const index_t in_grad_offset = ishape[1] * features;
1070   const index_t out_grad_offset = oshape[1] * features;
1071   for (index_t n = 0; n < oshape[0]; ++n) {
1072     for (int pw = 0; pw < pooled_width; ++pw) {
1073       int wstart = pw * stride_w - pad_w;
1074       int wend = std::min(wstart + kernel_w, width + pad_w);
1075       int pool_size = (is_avg ? (wend - wstart) : 1);
1076       wstart = std::max(wstart, 0);
1077       wend = std::min(wend, width);
1078       if (is_avg && !count_include_pad) {
1079         pool_size = (wend - wstart);
1080       }
1081       for (int w = wstart; w < wend; ++w) {
1082         for (index_t c = 0; c < features; ++c) {
1083           in_grad[w * features + c] +=
1084               lp_grad<DType, p>::Map(out_grad[pw * features + c],
1085                                      in_data[w * features + c],
1086                                      out_data[pw * features + c]) / pool_size;
1087         }
1088       }
1089     }
1090     in_grad += in_grad_offset;
1091     in_data += in_grad_offset;
1092     out_grad += out_grad_offset;
1093     out_data += out_grad_offset;
1094   }
1095 }
1096 
1097 /*!
1098  * \brief avg/sum unpooling cpu function for 2-D images in 'nchw' layout.
1099  * Do not call this kernel directly. Use the interface unpool().
1100  */
1101 template<typename DType, int p = 1>
1102 inline void unpool_sum_2d_nchw_cpu(const DType *out_grad, const DType *in_data,
1103                                    const DType *out_data, const mxnet::TShape &ishape,
1104                                    const mxnet::TShape &oshape, const mxnet::TShape &kernel,
1105                                    const mxnet::TShape &pad, const mxnet::TShape &stride,
1106                                    DType *in_grad, const bool is_avg = false,
1107                                    const bool count_include_pad = true) {
1108   const int height = ishape[2], width = ishape[3];
1109   const int pooled_height = oshape[2], pooled_width = oshape[3];
1110   const int kernel_h = kernel[0], kernel_w = kernel[1];
1111   const int pad_h = pad[0], pad_w = pad[1];
1112   const int stride_h = stride[0], stride_w = stride[1];
1113   const index_t in_grad_offset = ishape[2] * ishape[3];
1114   const index_t out_grad_offset = oshape[2] * oshape[3];
1115   for (index_t n = 0; n < oshape[0]; ++n) {
1116     for (index_t c = 0; c < oshape[1]; ++c) {
1117       for (int ph = 0; ph < pooled_height; ++ph) {
1118         for (int pw = 0; pw < pooled_width; ++pw) {
1119           int hstart = ph * stride_h - pad_h;
1120           int wstart = pw * stride_w - pad_w;
1121           int hend = std::min(hstart + kernel_h, height + pad_h);
1122           int wend = std::min(wstart + kernel_w, width + pad_w);
1123           int pool_size = (is_avg ? (hend - hstart) * (wend - wstart) : 1);
1124           hstart = std::max(hstart, 0);
1125           wstart = std::max(wstart, 0);
1126           hend = std::min(hend, height);
1127           wend = std::min(wend, width);
1128           if (is_avg && !count_include_pad) {
1129             pool_size = (hend - hstart) * (wend - wstart);
1130           }
1131           const int pool_index = ph * pooled_width + pw;
1132           for (int h = hstart; h < hend; ++h) {
1133             for (int w = wstart; w < wend; ++w) {
1134               in_grad[h*width+w] +=
1135                 lp_grad<DType, p>::Map(out_grad[pool_index],
1136                                        in_data[h*width+w],
1137                                        out_data[pool_index]) / pool_size;
1138             }
1139           }
1140         }
1141       }
1142       in_grad += in_grad_offset;
1143       in_data += in_grad_offset;
1144       out_grad += out_grad_offset;
1145       out_data += out_grad_offset;
1146     }
1147   }
1148 }
1149 
1150 /*!
1151  * \brief avg/sum unpooling cpu function for 2-D images in 'nhwc' layout.
1152  * Do not call this kernel directly. Use the interface unpool().
1153  */
1154 template<typename DType, int p = 1>
1155 inline void unpool_sum_2d_nhwc_cpu(const DType* out_grad, const DType* in_data,
1156                                    const DType *out_data, const mxnet::TShape &ishape,
1157                                    const mxnet::TShape &oshape, const mxnet::TShape &kernel,
1158                                    const mxnet::TShape &pad, const mxnet::TShape &stride,
1159                                    DType *in_grad, const bool is_avg = false,
1160                                    const bool count_include_pad = true) {
1161   const int height = ishape[1], width = ishape[2];
1162   const int pooled_height = oshape[1], pooled_width = oshape[2];
1163   const int kernel_h = kernel[0], kernel_w = kernel[1];
1164   const int pad_h = pad[0], pad_w = pad[1];
1165   const int features = oshape[3];
1166   const int stride_h = stride[0], stride_w = stride[1];
1167   const index_t in_grad_offset = ishape[1] * ishape[2] * features;
1168   const index_t out_grad_offset = oshape[1] * oshape[2] * features;
1169   for (index_t n = 0; n < oshape[0]; ++n) {
1170     for (int ph = 0; ph < pooled_height; ++ph) {
1171       for (int pw = 0; pw < pooled_width; ++pw) {
1172         int hstart = ph * stride_h - pad_h;
1173         int wstart = pw * stride_w - pad_w;
1174         int hend = std::min(hstart + kernel_h, height + pad_h);
1175         int wend = std::min(wstart + kernel_w, width + pad_w);
1176         int pool_size = (is_avg ? (hend - hstart) * (wend - wstart) : 1);
1177         hstart = std::max(hstart, 0);
1178         wstart = std::max(wstart, 0);
1179         hend = std::min(hend, height);
1180         wend = std::min(wend, width);
1181         if (is_avg && !count_include_pad) {
1182           pool_size = (hend - hstart) * (wend - wstart);
1183         }
1184         const int pool_index = ph * pooled_width + pw;
1185         for (int h = hstart; h < hend; ++h) {
1186           for (int w = wstart; w < wend; ++w) {
1187             const int in_index = h * width + w;
1188             for (index_t c = 0; c < features; ++c) {
1189               in_grad[in_index * features + c] +=
1190                   lp_grad<DType, p>::Map(out_grad[pool_index * features + c],
1191                                          in_data[in_index * features + c],
1192                                          out_data[pool_index * features + c]) / pool_size;
1193             }
1194           }
1195         }
1196       }
1197     }
1198     in_grad += in_grad_offset;
1199     in_data += in_grad_offset;
1200     out_grad += out_grad_offset;
1201     out_data += out_grad_offset;
1202   }
1203 }
1204 
1205 /*!
1206  * \brief avg/sum unpooling cpu function for 3-D images in 'ncdhw' layout.
1207  * Do not call this kernel directly. Use the interface unpool().
1208  */
1209 template<typename DType, int p = 1>
1210 inline void unpool_sum_3d_ncdhw_cpu(const DType *out_grad, const DType *in_data,
1211                                     const DType *out_data, const mxnet::TShape &ishape,
1212                                     const mxnet::TShape &oshape, const mxnet::TShape &kernel,
1213                                     const mxnet::TShape &pad, const mxnet::TShape &stride,
1214                                     DType *in_grad, const bool is_avg = false,
1215                                     const bool count_include_pad = true) {
1216   const int depth = ishape[2], height = ishape[3], width = ishape[4];
1217   const int pooled_depth = oshape[2], pooled_height = oshape[3], pooled_width = oshape[4];
1218   const int kernel_d = kernel[0], kernel_h = kernel[1], kernel_w = kernel[2];
1219   const int pad_d = pad[0], pad_h = pad[1], pad_w = pad[2];
1220   const int stride_d = stride[0], stride_h = stride[1], stride_w = stride[2];
1221   const index_t in_grad_offset = ishape[2] * ishape[3] * ishape[4];
1222   const index_t out_grad_offset = oshape[2] * oshape[3] * oshape[4];
1223   for (index_t n = 0; n < oshape[0]; ++n) {
1224     for (index_t c = 0; c < oshape[1]; ++c) {
1225       for (int pd = 0; pd < pooled_depth; ++pd) {
1226         for (int ph = 0; ph < pooled_height; ++ph) {
1227           for (int pw = 0; pw < pooled_width; ++pw) {
1228             int dstart = pd * stride_d - pad_d;
1229             int hstart = ph * stride_h - pad_h;
1230             int wstart = pw * stride_w - pad_w;
1231             int dend = std::min(dstart + kernel_d, depth + pad_d);
1232             int hend = std::min(hstart + kernel_h, height + pad_h);
1233             int wend = std::min(wstart + kernel_w, width + pad_w);
1234             int pool_size = (is_avg ? (dend - dstart) * (hend - hstart) * (wend - wstart) : 1);
1235             dstart = std::max(dstart, 0);
1236             hstart = std::max(hstart, 0);
1237             wstart = std::max(wstart, 0);
1238             dend = std::min(dend, depth);
1239             hend = std::min(hend, height);
1240             wend = std::min(wend, width);
1241             if (is_avg && !count_include_pad) {
1242               pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
1243             }
1244             const int pool_index = (pd * pooled_height + ph) * pooled_width + pw;
1245             for (int d = dstart; d < dend; ++d) {
1246               for (int h = hstart; h < hend; ++h) {
1247                 for (int w = wstart; w < wend; ++w) {
1248                   in_grad[(d*height+h)*width+w] +=
1249                     lp_grad<DType, p>::Map(out_grad[pool_index],
1250                                            in_data[(d*height+h)*width+w],
1251                                            out_data[pool_index]) / pool_size;
1252                 }
1253               }
1254             }
1255           }
1256         }
1257       }
1258       in_grad += in_grad_offset;
1259       in_data += in_grad_offset;
1260       out_grad += out_grad_offset;
1261       out_data += out_grad_offset;
1262     }
1263   }
1264 }
1265 
1266 /*!
1267  * \brief avg/sum unpooling cpu function for 3-D images in 'ndhwc' layout.
1268  * Do not call this kernel directly. Use the interface unpool().
1269  */
1270 template<typename DType, int p = 1>
1271 inline void unpool_sum_3d_ndhwc_cpu(const DType* out_grad, const DType* in_data,
1272                                     const DType *out_data, const mxnet::TShape &ishape,
1273                                     const mxnet::TShape &oshape, const mxnet::TShape &kernel,
1274                                     const mxnet::TShape &pad, const mxnet::TShape &stride,
1275                                     DType *in_grad, const bool is_avg = false,
1276                                     const bool count_include_pad = true) {
1277   const int depth = ishape[1], height = ishape[2], width = ishape[3];
1278   const int pooled_depth = oshape[1], pooled_height = oshape[2], pooled_width = oshape[3];
1279   const int kernel_d = kernel[0], kernel_h = kernel[1], kernel_w = kernel[2];
1280   const int pad_d = pad[0], pad_h = pad[1], pad_w = pad[2];
1281   const int stride_d = stride[0], stride_h = stride[1], stride_w = stride[2];
1282   const int features = oshape[4];
1283   const index_t in_grad_offset = ishape[1] * ishape[2] * ishape[3] * features;
1284   const index_t out_grad_offset = oshape[1] * oshape[2] * oshape[3] * features;
1285   for (index_t n = 0; n < oshape[0]; ++n) {
1286     for (int pd = 0; pd < pooled_depth; ++pd) {
1287       for (int ph = 0; ph < pooled_height; ++ph) {
1288         for (int pw = 0; pw < pooled_width; ++pw) {
1289           int dstart = pd * stride_d - pad_d;
1290           int hstart = ph * stride_h - pad_h;
1291           int wstart = pw * stride_w - pad_w;
1292           int dend = std::min(dstart + kernel_d, depth + pad_d);
1293           int hend = std::min(hstart + kernel_h, height + pad_h);
1294           int wend = std::min(wstart + kernel_w, width + pad_w);
1295           int pool_size = (is_avg ? (dend - dstart) * (hend - hstart) * (wend - wstart) : 1);
1296           dstart = std::max(dstart, 0);
1297           hstart = std::max(hstart, 0);
1298           wstart = std::max(wstart, 0);
1299           dend = std::min(dend, depth);
1300           hend = std::min(hend, height);
1301           wend = std::min(wend, width);
1302           if (is_avg && !count_include_pad) {
1303             pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
1304           }
1305           const int pool_index = (pd * pooled_height + ph) * pooled_width + pw;
1306           for (int d = dstart; d < dend; ++d) {
1307             for (int h = hstart; h < hend; ++h) {
1308               for (int w = wstart; w < wend; ++w) {
1309                 const int in_index = (d * height + h) * width + w;
1310                 for (index_t c = 0; c < features; ++c) {
1311                   in_grad[in_index * features + c] +=
1312                       lp_grad<DType, p>::Map(out_grad[pool_index * features + c],
1313                                              in_data[in_index * features + c],
1314                                              out_data[pool_index * features + c]) / pool_size;
1315                 }
1316               }
1317             }
1318           }
1319         }
1320       }
1321     }
1322     in_grad += in_grad_offset;
1323     in_data += in_grad_offset;
1324     out_grad += out_grad_offset;
1325     out_data += out_grad_offset;
1326   }
1327 }
1328 
/*!
 * \brief This function serves as an interface for 1/2/3-D pooling operations.
 * \param s context stream defining the device in use is cpu
 * \param in_data pointer of the input tensor data
 * \param ishape input tensor shape
 * \param oshape output tensor shape
 * \param kernel kernel shape
 * \param pad pad shape
 * \param stride stride shape
 * \param pool_type supported pooling type: max, avg, sum, lp
 * \param req_type operator request type, only support kWriteTo for now
 * \param out_data pointer of the output tensor data
 * \param count_include_pad whether padded values count toward the avg-pooling divisor
 * \param layout tensor layout: kNCW/kNWC, kNCHW/kNHWC, or kNCDHW/kNDHWC
 * \tparam p value of p for Lp pooling
 */
1343 template<typename DType, int p>
pool(mshadow::Stream<cpu> * s,const DType * in_data,const mxnet::TShape & ishape,const mxnet::TShape & oshape,const mxnet::TShape & kernel,const mxnet::TShape & pad,const mxnet::TShape & stride,const int pool_type,OpReqType req_type,DType * out_data,const bool count_include_pad,int layout)1344 inline void pool(mshadow::Stream<cpu>* s, const DType* in_data, const mxnet::TShape& ishape,
1345                  const mxnet::TShape& oshape, const mxnet::TShape& kernel, const mxnet::TShape& pad,
1346                  const mxnet::TShape& stride, const int pool_type, OpReqType req_type,
1347                  DType* out_data, const bool count_include_pad, int layout) {
1348   CHECK_EQ(req_type, kWriteTo) << "Only support req=kWriteTo in pooling operations";
1349   if (kernel.ndim() == 1) {
1350     if (layout == mshadow::kNWC) {
1351       if (pool_enum::kMaxPooling == pool_type) {
1352         pool_max_1d_nwc_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data);
1353       } else if (pool_enum::kAvgPooling == pool_type) {
1354         pool_sum_1d_nwc_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data,
1355                         true, count_include_pad);
1356       } else if (pool_enum::kSumPooling == pool_type) {
1357         pool_sum_1d_nwc_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data);
1358       } else if (pool_enum::kLpPooling == pool_type) {
1359         pool_sum_1d_nwc_cpu<DType, p>(in_data, ishape, oshape, kernel, pad, stride, out_data);
1360       } else {
1361         LOG(FATAL) << "Unknown pooling type " << pool_type;
1362       }
1363     } else if (layout == mshadow::kNCW) {
1364       if (pool_enum::kMaxPooling == pool_type) {
1365         pool_max_1d_ncw_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data);
1366       } else if (pool_enum::kAvgPooling == pool_type) {
1367         pool_sum_1d_ncw_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data,
1368                             true, count_include_pad);
1369       } else if (pool_enum::kSumPooling == pool_type) {
1370         pool_sum_1d_ncw_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data);
1371       } else if (pool_enum::kLpPooling == pool_type) {
1372         pool_sum_1d_ncw_cpu<DType, p>(in_data, ishape, oshape, kernel, pad, stride, out_data);
1373       } else {
1374         LOG(FATAL) << "Unknown pooling type " << pool_type;
1375       }
1376     } else {
1377       LOG(FATAL) << "Unsupported layout, expecting kNCW or kNWC, saw: " << layout;
1378     }
1379   } else if (kernel.ndim() == 2) {
1380     if (layout == mshadow::kNHWC) {
1381       if (pool_enum::kMaxPooling == pool_type) {
1382         pool_max_2d_nhwc_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data);
1383       } else if (pool_enum::kAvgPooling == pool_type) {
1384         pool_sum_2d_nhwc_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data,
1385                         true, count_include_pad);
1386       } else if (pool_enum::kSumPooling == pool_type) {
1387         pool_sum_2d_nhwc_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data);
1388       } else if (pool_enum::kLpPooling == pool_type) {
1389         pool_sum_2d_nhwc_cpu<DType, p>(in_data, ishape, oshape, kernel, pad, stride, out_data);
1390       } else {
1391         LOG(FATAL) << "Unknown pooling type " << pool_type;
1392       }
1393     } else if (layout == mshadow::kNCHW) {
1394       if (pool_enum::kMaxPooling == pool_type) {
1395         pool_max_2d_nchw_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data);
1396       } else if (pool_enum::kAvgPooling == pool_type) {
1397         pool_sum_2d_nchw_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data,
1398                              true, count_include_pad);
1399       } else if (pool_enum::kSumPooling == pool_type) {
1400         pool_sum_2d_nchw_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data);
1401       } else if (pool_enum::kLpPooling == pool_type) {
1402         pool_sum_2d_nchw_cpu<DType, p>(in_data, ishape, oshape, kernel, pad, stride, out_data);
1403       } else {
1404         LOG(FATAL) << "Unknown pooling type " << pool_type;
1405       }
1406     } else {
1407       LOG(FATAL) << "Unsupported layout, expecting kNCHW or kNHWC, saw: " << layout;
1408     }
1409   } else if (kernel.ndim() == 3) {
1410     if (layout == mshadow::kNDHWC) {
1411       if (pool_enum::kMaxPooling == pool_type) {
1412         pool_max_3d_ndhwc_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data);
1413       } else if (pool_enum::kAvgPooling == pool_type) {
1414         pool_sum_3d_ndhwc_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data,
1415                         true, count_include_pad);
1416       } else if (pool_enum::kSumPooling == pool_type) {
1417         pool_sum_3d_ndhwc_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data);
1418       } else if (pool_enum::kLpPooling == pool_type) {
1419         pool_sum_3d_ndhwc_cpu<DType, p>(in_data, ishape, oshape, kernel, pad, stride, out_data);
1420       } else {
1421         LOG(FATAL) << "Unknown pooling type " << pool_type;
1422       }
1423     } else if (layout == mshadow::kNCDHW) {
1424       if (pool_enum::kMaxPooling == pool_type) {
1425         pool_max_3d_ncdhw_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data);
1426       } else if (pool_enum::kAvgPooling == pool_type) {
1427         pool_sum_3d_ncdhw_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data,
1428                               true, count_include_pad);
1429       } else if (pool_enum::kSumPooling == pool_type) {
1430         pool_sum_3d_ncdhw_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data);
1431       } else if (pool_enum::kLpPooling == pool_type) {
1432         pool_sum_3d_ncdhw_cpu<DType, p>(in_data, ishape, oshape, kernel, pad, stride, out_data);
1433       } else {
1434         LOG(FATAL) << "Unknown pooling type " << pool_type;
1435       }
1436     } else {
1437       LOG(FATAL) << "Unsupported layout, expecting kNCDHW or kNDHWC, saw: " << layout;
1438     }
1439   } else {
1440     LOG(FATAL) << "Unsupported " << kernel.ndim() << "-D pooling";
1441   }
1442 }
1443 
/*!
 * \brief This function serves as an interface for 1/2/3-D unpooling operations.
 * \param s context stream defining the device in use is cpu
 * \param out_grad pointer of the gradient of operator's output tensor
 * \param in_data pointer of the input tensor
 * \param out_data pointer of the output tensor
 * \param ishape input tensor shape
 * \param oshape output tensor shape
 * \param kernel kernel shape
 * \param pad pad shape
 * \param stride stride shape
 * \param pool_type supported pooling type: max, avg, sum, lp
 * \param req_type operator request type: kNullOp, kWriteInplace, kWriteTo, kAddTo
 * \param in_grad pointer of the gradient of the operator's input tensor
 * \param count_include_pad whether padded values count toward the avg-pooling divisor
 * \param layout tensor layout: kNCW/kNWC, kNCHW/kNHWC, or kNCDHW/kNDHWC
 * \tparam p value of p for Lp pooling
 */
1460 template<typename DType, int p>
unpool(mshadow::Stream<cpu> * s,const DType * out_grad,const DType * in_data,const DType * out_data,const mxnet::TShape & ishape,const mxnet::TShape & oshape,const mxnet::TShape & kernel,const mxnet::TShape & pad,const mxnet::TShape & stride,const int pool_type,OpReqType req_type,DType * in_grad,const bool count_include_pad,int layout)1461 inline void unpool(mshadow::Stream<cpu>* s, const DType* out_grad, const DType* in_data,
1462                    const DType* out_data, const mxnet::TShape& ishape,
1463                    const mxnet::TShape& oshape, const mxnet::TShape& kernel,
1464                    const mxnet::TShape& pad, const mxnet::TShape& stride,
1465                    const int pool_type, OpReqType req_type, DType* in_grad,
1466                    const bool count_include_pad, int layout) {
1467   if (mxnet::kNullOp == req_type) return;
1468   if (mxnet::kAddTo != req_type) {
1469     mxnet_op::Kernel<mxnet_op::set_zero, cpu>::Launch(s, ishape.Size(), in_grad);
1470   }
1471   if (kernel.ndim() == 1) {
1472     if (layout == mshadow::kNWC) {
1473       if (pool_enum::kMaxPooling == pool_type) {
1474         unpool_max_1d_nwc_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride,
1475                           in_grad);
1476       } else if (pool_enum::kAvgPooling == pool_type) {
1477         unpool_sum_1d_nwc_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride,
1478                               in_grad, true, count_include_pad);
1479       } else if (pool_enum::kSumPooling == pool_type) {
1480         unpool_sum_1d_nwc_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride,
1481                           in_grad);
1482       } else if (pool_enum::kLpPooling == pool_type) {
1483         unpool_sum_1d_nwc_cpu<DType, p>(out_grad, in_data, out_data, ishape, oshape, kernel, pad,
1484                                     stride,
1485                                     in_grad);
1486       } else {
1487         LOG(FATAL) << "Unknown pooling type " << pool_type;
1488       }
1489     } else if (layout == mshadow::kNCW) {
1490       if (pool_enum::kMaxPooling == pool_type) {
1491         unpool_max_1d_ncw_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride,
1492                               in_grad);
1493       } else if (pool_enum::kAvgPooling == pool_type) {
1494         unpool_sum_1d_ncw_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride,
1495                               in_grad,
1496                               true, count_include_pad);
1497       } else if (pool_enum::kSumPooling == pool_type) {
1498         unpool_sum_1d_ncw_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride,
1499                               in_grad);
1500       } else if (pool_enum::kLpPooling == pool_type) {
1501         unpool_sum_1d_ncw_cpu<DType, p>(out_grad, in_data, out_data, ishape, oshape, kernel, pad,
1502                                         stride,
1503                                         in_grad);
1504       } else {
1505         LOG(FATAL) << "Unknown pooling type " << pool_type;
1506       }
1507     } else {
1508       LOG(FATAL) << "Unsupported layout, expecting kNCW or kNWC, saw: " << layout;
1509     }
1510   } else if (kernel.ndim() == 2) {
1511     if (layout == mshadow::kNHWC) {
1512       if (pool_enum::kMaxPooling == pool_type) {
1513         unpool_max_2d_nhwc_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride,
1514                           in_grad);
1515       } else if (pool_enum::kAvgPooling == pool_type) {
1516         unpool_sum_2d_nhwc_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride,
1517                           in_grad,
1518                           true, count_include_pad);
1519       } else if (pool_enum::kSumPooling == pool_type) {
1520         unpool_sum_2d_nhwc_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride,
1521                           in_grad);
1522       } else if (pool_enum::kLpPooling == pool_type) {
1523         unpool_sum_2d_nhwc_cpu<DType, p>(out_grad, in_data, out_data, ishape, oshape, kernel, pad,
1524                                     stride,
1525                                     in_grad);
1526       } else {
1527         LOG(FATAL) << "Unknown pooling type " << pool_type;
1528       }
1529     } else if (layout == mshadow::kNCHW) {
1530       if (pool_enum::kMaxPooling == pool_type) {
1531         unpool_max_2d_nchw_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride,
1532                                in_grad);
1533       } else if (pool_enum::kAvgPooling == pool_type) {
1534         unpool_sum_2d_nchw_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride,
1535                                in_grad,
1536                                true, count_include_pad);
1537       } else if (pool_enum::kSumPooling == pool_type) {
1538         unpool_sum_2d_nchw_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride,
1539                                in_grad);
1540       } else if (pool_enum::kLpPooling == pool_type) {
1541         unpool_sum_2d_nchw_cpu<DType, p>(out_grad, in_data, out_data, ishape, oshape, kernel, pad,
1542                                          stride,
1543                                          in_grad);
1544       } else {
1545         LOG(FATAL) << "Unknown pooling type " << pool_type;
1546       }
1547     } else {
1548       LOG(FATAL) << "Unsupported layout, expecting kNCHW or kNHWC, saw: " << layout;
1549     }
1550   } else if (kernel.ndim() == 3) {
1551     if (layout == mshadow::kNDHWC) {
1552       if (pool_enum::kMaxPooling == pool_type) {
1553         unpool_max_3d_ndhwc_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride,
1554                           in_grad);
1555       } else if (pool_enum::kAvgPooling == pool_type) {
1556         unpool_sum_3d_ndhwc_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride,
1557                                 in_grad, true, count_include_pad);
1558       } else if (pool_enum::kSumPooling == pool_type) {
1559         unpool_sum_3d_ndhwc_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride,
1560                           in_grad);
1561       } else if (pool_enum::kLpPooling == pool_type) {
1562         unpool_sum_3d_ndhwc_cpu<DType, p>(out_grad, in_data, out_data, ishape, oshape, kernel, pad,
1563                                     stride,
1564                                     in_grad);
1565       } else {
1566         LOG(FATAL) << "Unknown pooling type " << pool_type;
1567       }
1568     } else if (layout == mshadow::kNCDHW) {
1569       if (pool_enum::kMaxPooling == pool_type) {
1570         unpool_max_3d_ncdhw_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride,
1571                                 in_grad);
1572       } else if (pool_enum::kAvgPooling == pool_type) {
1573         unpool_sum_3d_ncdhw_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride,
1574                                 in_grad,
1575                                 true, count_include_pad);
1576       } else if (pool_enum::kSumPooling == pool_type) {
1577         unpool_sum_3d_ncdhw_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride,
1578                                 in_grad);
1579       } else if (pool_enum::kLpPooling == pool_type) {
1580         unpool_sum_3d_ncdhw_cpu<DType, p>(out_grad, in_data, out_data, ishape, oshape, kernel, pad,
1581                                           stride,
1582                                           in_grad);
1583       } else {
1584         LOG(FATAL) << "Unknown pooling type " << pool_type;
1585       }
1586     } else {
1587       LOG(FATAL) << "Unsupported layout, expecting kNCDHW or kNDHWC, saw: " << layout;
1588     }
1589   } else {
1590     LOG(FATAL) << "Unsupported " << kernel.ndim() << "-D unpooling";
1591   }
1592 }
1593 
1594 }  // namespace op
1595 }  // namespace mxnet
1596 #ifdef __CUDACC__
1597 #include "./pool.cuh"
1598 #endif
1599 
1600 #endif  // MXNET_OPERATOR_NN_POOL_H_
1601