// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "pooling_x86.h"

#if __SSE2__
#if __AVX__
#include <immintrin.h>
#endif
#endif // __SSE2__

#include <float.h>

namespace ncnn {

#if __SSE2__
#if __AVX__
#include "pooling_2x2.h"
#include "pooling_2x2_pack8.h"
#include "pooling_3x3_pack8.h"
#endif
#endif // __SSE2__

// Constructor: advertise packed-layout (elempack) support when the build
// targets SSE2, so the framework may deliver pack8 input that forward()
// handles with its AVX kernels.
Pooling_x86::Pooling_x86()
{
#if __SSE2__
    support_packing = true;
#endif // __SSE2__
}
create_pipeline(const Option &)42 int Pooling_x86::create_pipeline(const Option& /*opt*/)
43 {
44     if (adaptive_pooling)
45     {
46         support_packing = false;
47 
48         support_bf16_storage = false;
49         support_fp16_storage = false;
50         support_int8_storage = false;
51         support_image_storage = false;
52         support_tensor_storage = false;
53 
54         support_weight_fp16_storage = false;
55     }
56     return 0;
57 }
// Forward pass: max / average pooling over an NxN window with x86 fast paths.
// - elempack==8 (AVX): __m256 kernels for global pooling and generic
//   kernel/stride combinations, plus 2x2s2 and 3x3s2 max specializations.
// - elempack==4: unpacked to elempack==1 and re-dispatched through forward().
// - scalar path: AVX 2x2s2 max specialization, otherwise the reference layer.
// Returns 0 on success, -100 on allocation failure.
int Pooling_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    // max value in NxN window
    // avg value in NxN window

    if (adaptive_pooling)
    {
        // no optimized adaptive path - defer to the reference implementation
        return Pooling::forward(bottom_blob, top_blob, opt);
    }

#if __SSE2__
    int elempack = bottom_blob.elempack;

#if __AVX__
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;

    //     NCNN_LOGE("Pooling     input %d x %d  pad = %d %d %d %d  ksize=%d %d  stride=%d %d", w, h, pad_left, pad_right, pad_top, pad_bottom, kernel_w, kernel_h, stride_w, stride_h);
    if (elempack == 8)
    {
        if (global_pooling)
        {
            // one 8-wide packed value per channel
            top_blob.create(channels, elemsize, elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            int size = w * h;

            if (pooling_type == PoolMethod_MAX)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const float* ptr = bottom_blob.channel(q);

                    // seed with the first element; the loop re-reads it, which
                    // is harmless for max
                    __m256 _max = _mm256_loadu_ps(ptr);
                    for (int i = 0; i < size; i++)
                    {
                        __m256 _val = _mm256_loadu_ps(ptr);
                        _max = _mm256_max_ps(_max, _val);
                        ptr += 8;
                    }

                    // each thread writes a distinct 8-float slot q
                    float* outptr = top_blob;
                    _mm256_storeu_ps(outptr + q * 8, _max);
                }
            }
            else if (pooling_type == PoolMethod_AVE)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const float* ptr = bottom_blob.channel(q);

                    __m256 _sum = _mm256_set1_ps(0.f);
                    for (int i = 0; i < size; i++)
                    {
                        __m256 _val = _mm256_loadu_ps(ptr);
                        _sum = _mm256_add_ps(_sum, _val);
                        ptr += 8;
                    }

                    // multiply by reciprocal rather than dividing per lane
                    __m256 _inv_size = _mm256_set1_ps(1.f / size);
                    __m256 _avg = _mm256_mul_ps(_sum, _inv_size);

                    float* outptr = top_blob;
                    _mm256_storeu_ps(outptr + q * 8, _avg);
                }
            }

            return 0;
        }

        Mat bottom_blob_bordered;
        make_padding(bottom_blob, bottom_blob_bordered, opt);
        if (bottom_blob_bordered.empty())
            return -100;

        w = bottom_blob_bordered.w;
        h = bottom_blob_bordered.h;

        int outw = (w - kernel_w) / stride_w + 1;
        int outh = (h - kernel_h) / stride_h + 1;

        top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        const int maxk = kernel_w * kernel_h;

        // kernel offsets: flat element offsets (in pack8 elements; the *8 is
        // applied at load time) of each kernel tap relative to the window's
        // top-left element
        std::vector<int> _space_ofs(maxk);
        int* space_ofs = &_space_ofs[0];
        {
            int p1 = 0;
            int p2 = 0;
            int gap = w - kernel_w; // jump from the end of one kernel row to the next
            for (int i = 0; i < kernel_h; i++)
            {
                for (int j = 0; j < kernel_w; j++)
                {
                    space_ofs[p1] = p2;
                    p1++;
                    p2++;
                }
                p2 += gap;
            }
        }
        if (pooling_type == PoolMethod_MAX)
        {
            // specialized kernels for the two most common configurations
            if (kernel_w == 2 && kernel_h == 2 && stride_w == 2 && stride_h == 2)
            {
                pooling2x2s2_max_pack8_avx(bottom_blob_bordered, top_blob, opt);

                return 0;
            }
            if (kernel_w == 3 && kernel_h == 3 && stride_w == 2 && stride_h == 2)
            {
                pooling3x3s2_max_pack8_avx(bottom_blob_bordered, top_blob, opt);

                return 0;
            }

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat m = bottom_blob_bordered.channel(q);
                float* outptr = top_blob.channel(q);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        // top-left of the pooling window (8 floats per element)
                        const float* sptr = m.row(i * stride_h) + j * stride_w * 8;

                        __m256 _max = _mm256_loadu_ps(sptr);

                        for (int k = 0; k < maxk; k++)
                        {
                            __m256 _val = _mm256_loadu_ps(sptr + space_ofs[k] * 8);
                            _max = _mm256_max_ps(_max, _val);
                        }

                        _mm256_storeu_ps(outptr + j * 8, _max);
                    }

                    outptr += outw * 8;
                }
            }
        }
        else if (pooling_type == PoolMethod_AVE)
        {
            if (avgpool_count_include_pad == 0)
            {
                // average over valid (non-padding) samples only, so count the
                // actual contributing taps per output position
                int wtailpad = 0;
                int htailpad = 0;

                if (pad_mode == 0) // full padding
                {
                    // extra tail padding make_padding added beyond the declared
                    // pad_right / pad_bottom to make the output size come out
                    wtailpad = bottom_blob_bordered.w - bottom_blob.w - pad_left - pad_right;
                    htailpad = bottom_blob_bordered.h - bottom_blob.h - pad_top - pad_bottom;
                }

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    float* outptr = top_blob.channel(q);

                    for (int i = 0; i < outh; i++)
                    {
                        int sy0 = i * stride_h;

                        for (int j = 0; j < outw; j++)
                        {
                            int sx0 = j * stride_w;

                            __m256 _sum = _mm256_set1_ps(0.f);
                            int area = 0; // in-bounds taps actually summed

                            for (int ki = 0; ki < kernel_h; ki++)
                            {
                                int sy = sy0 + ki;

                                // skip rows that fall in the top padding
                                if (sy < pad_top)
                                    continue;

                                // stop once past the valid rows (bottom padding)
                                if (sy >= h - pad_bottom - htailpad)
                                    break;

                                for (int kj = 0; kj < kernel_w; kj++)
                                {
                                    int sx = sx0 + kj;

                                    if (sx < pad_left)
                                        continue;

                                    if (sx >= w - pad_right - wtailpad)
                                        break;

                                    __m256 _val = _mm256_loadu_ps(m.row(sy) + sx * 8);
                                    _sum = _mm256_add_ps(_sum, _val);
                                    area += 1;
                                }
                            }

                            // NOTE(review): assumes area >= 1, i.e. every output
                            // window overlaps the valid region — confirm against
                            // make_padding's guarantees, else 1.f/area is inf
                            __m256 _inv_area = _mm256_set1_ps(1.f / area);
                            __m256 _avg = _mm256_mul_ps(_sum, _inv_area);
                            _mm256_storeu_ps(outptr + j * 8, _avg);
                        }

                        outptr += outw * 8;
                    }
                }
            }
            else // if (avgpool_count_include_pad == 1)
            {
                // padding counts toward the divisor, so it is a constant 1/maxk
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    float* outptr = top_blob.channel(q);

                    __m256 _inv_maxk = _mm256_set1_ps(1.f / maxk);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            const float* sptr = m.row(i * stride_h) + j * stride_w * 8;

                            __m256 _sum = _mm256_set1_ps(0.f);

                            for (int k = 0; k < maxk; k++)
                            {
                                __m256 _val = _mm256_loadu_ps(sptr + space_ofs[k] * 8);
                                _sum = _mm256_add_ps(_sum, _val);
                            }

                            __m256 _avg = _mm256_mul_ps(_sum, _inv_maxk);
                            _mm256_storeu_ps(outptr + j * 8, _avg);
                        }

                        outptr += outw * 8;
                    }
                }
            }
        }

        return 0;
    }
#endif // __AVX__

    if (elempack == 4)
    {
        // TODO implement pack4
        // no pack4 kernels yet - flatten to elempack 1 and re-enter forward()
        Mat bottom_blob_unpacked;

        Option opt_pack = opt;
        opt_pack.blob_allocator = opt.workspace_allocator;
        convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack);

        return forward(bottom_blob_unpacked, top_blob, opt);
    }
#endif // __SSE2__

    // scalar (elempack 1) path: only square-kernel 2x2s2 max pooling has an
    // optimized kernel; everything else uses the reference implementation
    if (kernel_w != kernel_h || stride_w != stride_h)
    {
        return Pooling::forward(bottom_blob, top_blob, opt);
    }

    const int stride = stride_w;

    if (pooling_type != PoolMethod_MAX || stride != 2 || global_pooling == 1)
    {
        return Pooling::forward(bottom_blob, top_blob, opt);
    }

#if __AVX__
    const int kernel_size = kernel_w;

    if (kernel_size != 2)
    {
        return Pooling::forward(bottom_blob, top_blob, opt);
    }

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int outw = (w - kernel_w) / stride_w + 1;
    int outh = (h - kernel_h) / stride_h + 1;

    // elempack is 1 here, so create without a packing argument
    top_blob.create(outw, outh, channels, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    if (kernel_size == 2)
        pooling2x2s2_max_avx(bottom_blob_bordered, top_blob, opt);

    return 0;
#else
    return Pooling::forward(bottom_blob, top_blob, opt);
#endif
}

} // namespace ncnn