1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14
15 #include "pooling_x86.h"
16
17 #if __SSE2__
18 #if __AVX__
19 #include <immintrin.h>
20 #endif
21 #endif // __SSE2__
22
23 #include <float.h>
24
25 namespace ncnn {
26
27 #if __SSE2__
28 #if __AVX__
29 #include "pooling_2x2.h"
30 #include "pooling_2x2_pack8.h"
31 #include "pooling_3x3_pack8.h"
32 #endif
33 #endif // __SSE2__
34
// Pooling_x86: x86-optimized pooling layer.
// On SSE2-capable builds the layer accepts packed (elempack > 1) blobs;
// forward() supplies AVX pack8 kernels and falls back to unpacking for pack4.
Pooling_x86::Pooling_x86()
{
#if __SSE2__
    support_packing = true;
#endif // __SSE2__
}
41
create_pipeline(const Option &)42 int Pooling_x86::create_pipeline(const Option& /*opt*/)
43 {
44 if (adaptive_pooling)
45 {
46 support_packing = false;
47
48 support_bf16_storage = false;
49 support_fp16_storage = false;
50 support_int8_storage = false;
51 support_image_storage = false;
52 support_tensor_storage = false;
53
54 support_weight_fp16_storage = false;
55 }
56 return 0;
57 }
58
// Forward pass: max or average pooling over an NxN window.
//
// Dispatch order:
//   1. adaptive pooling            -> reference Pooling::forward
//   2. SSE2 + AVX, elempack == 8   -> vectorized pack8 kernels below
//   3. SSE2, elempack == 4         -> unpack to elempack 1 and recurse
//   4. elempack == 1               -> AVX 2x2 stride-2 max special case,
//                                     everything else -> reference impl
int Pooling_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    // max value in NxN window
    // avg value in NxN window

    if (adaptive_pooling)
    {
        // no optimized adaptive path (see create_pipeline)
        return Pooling::forward(bottom_blob, top_blob, opt);
    }

#if __SSE2__
    int elempack = bottom_blob.elempack;

#if __AVX__
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;

    // NCNN_LOGE("Pooling input %d x %d pad = %d %d %d %d ksize=%d %d stride=%d %d", w, h, pad_left, pad_right, pad_top, pad_bottom, kernel_w, kernel_h, stride_w, stride_h);
    if (elempack == 8)
    {
        // pack8 layout: each "pixel" is 8 consecutive floats, i.e. one __m256 lane
        if (global_pooling)
        {
            // global pooling reduces each channel to a single 8-wide element
            top_blob.create(channels, elemsize, elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            int size = w * h;

            if (pooling_type == PoolMethod_MAX)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const float* ptr = bottom_blob.channel(q);

                    // seed with the first element; the loop re-reads it, which is harmless for max
                    __m256 _max = _mm256_loadu_ps(ptr);
                    for (int i = 0; i < size; i++)
                    {
                        __m256 _val = _mm256_loadu_ps(ptr);
                        _max = _mm256_max_ps(_max, _val);
                        ptr += 8;
                    }

                    float* outptr = top_blob;
                    _mm256_storeu_ps(outptr + q * 8, _max);
                }
            }
            else if (pooling_type == PoolMethod_AVE)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const float* ptr = bottom_blob.channel(q);

                    __m256 _sum = _mm256_set1_ps(0.f);
                    for (int i = 0; i < size; i++)
                    {
                        __m256 _val = _mm256_loadu_ps(ptr);
                        _sum = _mm256_add_ps(_sum, _val);
                        ptr += 8;
                    }

                    // multiply by reciprocal instead of dividing per lane
                    __m256 _inv_size = _mm256_set1_ps(1.f / size);
                    __m256 _avg = _mm256_mul_ps(_sum, _inv_size);

                    float* outptr = top_blob;
                    _mm256_storeu_ps(outptr + q * 8, _avg);
                }
            }

            return 0;
        }

        // non-global: apply the layer's padding policy first
        Mat bottom_blob_bordered;
        make_padding(bottom_blob, bottom_blob_bordered, opt);
        if (bottom_blob_bordered.empty())
            return -100;

        w = bottom_blob_bordered.w;
        h = bottom_blob_bordered.h;

        int outw = (w - kernel_w) / stride_w + 1;
        int outh = (h - kernel_h) / stride_h + 1;

        top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        const int maxk = kernel_w * kernel_h;

        // kernel offsets: precomputed element (not float) offsets of each tap
        // relative to the window's top-left, laid out row by row
        std::vector<int> _space_ofs(maxk);
        int* space_ofs = &_space_ofs[0];
        {
            int p1 = 0;
            int p2 = 0;
            int gap = w - kernel_w; // jump from the end of one kernel row to the start of the next
            for (int i = 0; i < kernel_h; i++)
            {
                for (int j = 0; j < kernel_w; j++)
                {
                    space_ofs[p1] = p2;
                    p1++;
                    p2++;
                }
                p2 += gap;
            }
        }
        if (pooling_type == PoolMethod_MAX)
        {
            // hand-tuned kernels for the two common max-pool shapes
            if (kernel_w == 2 && kernel_h == 2 && stride_w == 2 && stride_h == 2)
            {
                pooling2x2s2_max_pack8_avx(bottom_blob_bordered, top_blob, opt);

                return 0;
            }
            if (kernel_w == 3 && kernel_h == 3 && stride_w == 2 && stride_h == 2)
            {
                pooling3x3s2_max_pack8_avx(bottom_blob_bordered, top_blob, opt);

                return 0;
            }

            // generic pack8 max pooling
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat m = bottom_blob_bordered.channel(q);
                float* outptr = top_blob.channel(q);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        // window top-left; * 8 converts element index to float index
                        const float* sptr = m.row(i * stride_h) + j * stride_w * 8;

                        __m256 _max = _mm256_loadu_ps(sptr);

                        for (int k = 0; k < maxk; k++)
                        {
                            __m256 _val = _mm256_loadu_ps(sptr + space_ofs[k] * 8);
                            _max = _mm256_max_ps(_max, _val);
                        }

                        _mm256_storeu_ps(outptr + j * 8, _max);
                    }

                    outptr += outw * 8;
                }
            }
        }
        else if (pooling_type == PoolMethod_AVE)
        {
            if (avgpool_count_include_pad == 0)
            {
                // exclude padded pixels from the average: walk the window
                // explicitly and count only in-bounds taps
                int wtailpad = 0;
                int htailpad = 0;

                if (pad_mode == 0) // full padding
                {
                    // extra right/bottom padding added by make_padding beyond
                    // the declared pads, which must also be excluded
                    wtailpad = bottom_blob_bordered.w - bottom_blob.w - pad_left - pad_right;
                    htailpad = bottom_blob_bordered.h - bottom_blob.h - pad_top - pad_bottom;
                }

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    float* outptr = top_blob.channel(q);

                    for (int i = 0; i < outh; i++)
                    {
                        int sy0 = i * stride_h;

                        for (int j = 0; j < outw; j++)
                        {
                            int sx0 = j * stride_w;

                            __m256 _sum = _mm256_set1_ps(0.f);
                            int area = 0; // number of non-padding taps actually summed

                            for (int ki = 0; ki < kernel_h; ki++)
                            {
                                int sy = sy0 + ki;

                                if (sy < pad_top)
                                    continue;

                                if (sy >= h - pad_bottom - htailpad)
                                    break;

                                for (int kj = 0; kj < kernel_w; kj++)
                                {
                                    int sx = sx0 + kj;

                                    if (sx < pad_left)
                                        continue;

                                    if (sx >= w - pad_right - wtailpad)
                                        break;

                                    __m256 _val = _mm256_loadu_ps(m.row(sy) + sx * 8);
                                    _sum = _mm256_add_ps(_sum, _val);
                                    area += 1;
                                }
                            }

                            // area > 0 here: every output window overlaps the valid region
                            __m256 _inv_area = _mm256_set1_ps(1.f / area);
                            __m256 _avg = _mm256_mul_ps(_sum, _inv_area);
                            _mm256_storeu_ps(outptr + j * 8, _avg);
                        }

                        outptr += outw * 8;
                    }
                }
            }
            else // if (avgpool_count_include_pad == 1)
            {
                // padding counts toward the divisor, so every window divides by maxk
                // and the precomputed offset table can be used directly
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    float* outptr = top_blob.channel(q);

                    __m256 _inv_maxk = _mm256_set1_ps(1.f / maxk);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            const float* sptr = m.row(i * stride_h) + j * stride_w * 8;

                            __m256 _sum = _mm256_set1_ps(0.f);

                            for (int k = 0; k < maxk; k++)
                            {
                                __m256 _val = _mm256_loadu_ps(sptr + space_ofs[k] * 8);
                                _sum = _mm256_add_ps(_sum, _val);
                            }

                            __m256 _avg = _mm256_mul_ps(_sum, _inv_maxk);
                            _mm256_storeu_ps(outptr + j * 8, _avg);
                        }

                        outptr += outw * 8;
                    }
                }
            }
        }

        return 0;
    }
#endif // __AVX__

    if (elempack == 4)
    {
        // TODO implement pack4
        // fall back: unpack to elempack 1 and recurse into this function
        Mat bottom_blob_unpacked;

        Option opt_pack = opt;
        opt_pack.blob_allocator = opt.workspace_allocator;
        convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack);

        return forward(bottom_blob_unpacked, top_blob, opt);
    }
#endif // __SSE2__

    // elempack == 1 from here on; only a square 2x2 stride-2 max pool has an
    // optimized path, everything else defers to the reference implementation
    if (kernel_w != kernel_h || stride_w != stride_h)
    {
        return Pooling::forward(bottom_blob, top_blob, opt);
    }

    const int stride = stride_w;

    if (pooling_type != PoolMethod_MAX || stride != 2 || global_pooling == 1)
    {
        return Pooling::forward(bottom_blob, top_blob, opt);
    }

#if __AVX__
    const int kernel_size = kernel_w;

    if (kernel_size != 2)
    {
        return Pooling::forward(bottom_blob, top_blob, opt);
    }

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int outw = (w - kernel_w) / stride_w + 1;
    int outh = (h - kernel_h) / stride_h + 1;

    top_blob.create(outw, outh, channels, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    if (kernel_size == 2)
        pooling2x2s2_max_avx(bottom_blob_bordered, top_blob, opt);

    return 0;
#else
    return Pooling::forward(bottom_blob, top_blob, opt);
#endif
}
370
371 } // namespace ncnn
372