1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14
15 #include "pooling_arm.h"
16
17 #include <float.h>
18
19 #if __ARM_NEON
20 #include <arm_neon.h>
21 #endif // __ARM_NEON
22
23 namespace ncnn {
24
25 #include "pooling_2x2.h"
26 #include "pooling_3x3.h"
27
28 #if __ARM_NEON
29 #include "pooling_2x2_pack4.h"
30 #include "pooling_3x3_pack4.h"
31 #endif
32
// Constructor: advertise which storage/packing layouts this ARM-optimized
// pooling layer can consume, based on compile-time CPU feature availability.
Pooling_arm::Pooling_arm()
{
#if __ARM_NEON
    // NEON builds handle packed (elempack=4) blobs directly.
    support_packing = true;
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    // armv8.2-a fp16 vector arithmetic enables the fp16 storage kernels.
    support_fp16_storage = true;
#endif
#endif // __ARM_NEON

#if NCNN_BF16
    // bf16 storage path is available regardless of NEON (scalar fallback exists).
    support_bf16_storage = true;
#endif
}
46
create_pipeline(const Option &)47 int Pooling_arm::create_pipeline(const Option& /*opt*/)
48 {
49 if (adaptive_pooling)
50 {
51 support_packing = false;
52
53 support_bf16_storage = false;
54 support_fp16_storage = false;
55 support_int8_storage = false;
56 support_tensor_storage = false;
57
58 support_weight_fp16_storage = false;
59 }
60 return 0;
61 }
62
// fp32 forward pass.
//
// Dispatch order:
//   1. adaptive pooling -> reference implementation
//   2. fp16 storage blobs -> forward_fp16s / forward_fp16sa (armv8.2 only)
//   3. bf16 storage blobs -> forward_bf16s
//   4. elempack=4 NEON path (global pooling, tuned 2x2s2/3x3s2 max kernels,
//      then generic windowed max/avg)
//   5. elempack=1 fallthrough: tuned NEON 2x2s2/3x3s2 max kernels, otherwise
//      reference Pooling::forward
int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    if (adaptive_pooling)
    {
        return Pooling::forward(bottom_blob, top_blob, opt);
    }

    int elembits = bottom_blob.elembits();

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    if (opt.use_fp16_storage && elembits == 16)
    {
        if (opt.use_fp16_arithmetic)
            return forward_fp16sa(bottom_blob, top_blob, opt);
        else
            return forward_fp16s(bottom_blob, top_blob, opt);
    }
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s(bottom_blob, top_blob, opt);
#endif

    // max value in NxN window
    // avg value in NxN window

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

#if __ARM_NEON
    // NCNN_LOGE("Pooling input %d x %d pad = %d %d %d %d ksize=%d %d stride=%d %d", w, h, pad_left, pad_right, pad_top, pad_bottom, kernel_w, kernel_h, stride_w, stride_h);

    if (elempack == 4)
    {
        if (global_pooling)
        {
            // One 4-lane output per channel.
            top_blob.create(channels, elemsize, elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            int size = w * h;

            if (pooling_type == PoolMethod_MAX)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const float* ptr = bottom_blob.channel(q);

                    // Seed with the first element; the loop re-reads it,
                    // which is harmless for a max reduction.
                    float32x4_t _max = vld1q_f32(ptr);
                    for (int i = 0; i < size; i++)
                    {
                        float32x4_t _val = vld1q_f32(ptr);
                        _max = vmaxq_f32(_max, _val);
                        ptr += 4;
                    }

                    float* outptr = top_blob;
                    vst1q_f32(outptr + q * 4, _max);
                }
            }
            else if (pooling_type == PoolMethod_AVE)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const float* ptr = bottom_blob.channel(q);

                    float32x4_t _sum = vdupq_n_f32(0.f);
                    for (int i = 0; i < size; i++)
                    {
                        float32x4_t _val = vld1q_f32(ptr);
                        _sum = vaddq_f32(_sum, _val);
                        ptr += 4;
                    }

                    // Multiply by reciprocal instead of dividing per lane.
                    float32x4_t _inv_size = vdupq_n_f32(1.f / size);
                    float32x4_t _avg = vmulq_f32(_sum, _inv_size);

                    float* outptr = top_blob;
                    vst1q_f32(outptr + q * 4, _avg);
                }
            }

            return 0;
        }

        Mat bottom_blob_bordered;
        make_padding(bottom_blob, bottom_blob_bordered, opt);
        if (bottom_blob_bordered.empty())
            return -100;

        w = bottom_blob_bordered.w;
        h = bottom_blob_bordered.h;

        int outw = (w - kernel_w) / stride_w + 1;
        int outh = (h - kernel_h) / stride_h + 1;

        top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        const int maxk = kernel_w * kernel_h;

        // kernel offsets
        // Precomputed element offsets (in elempack units) of each kernel tap
        // relative to the window's top-left element, row-major with a gap of
        // (w - kernel_w) between rows.
        std::vector<int> _space_ofs(maxk);
        int* space_ofs = &_space_ofs[0];
        {
            int p1 = 0;
            int p2 = 0;
            int gap = w - kernel_w;
            for (int i = 0; i < kernel_h; i++)
            {
                for (int j = 0; j < kernel_w; j++)
                {
                    space_ofs[p1] = p2;
                    p1++;
                    p2++;
                }
                p2 += gap;
            }
        }

        if (pooling_type == PoolMethod_MAX)
        {
            // Hand-tuned kernels for the two most common max-pool shapes.
            if (kernel_w == 2 && kernel_h == 2 && stride_w == 2 && stride_h == 2)
            {
                pooling2x2s2_max_pack4_neon(bottom_blob_bordered, top_blob, opt);

                return 0;
            }

            if (kernel_w == 3 && kernel_h == 3 && stride_w == 2 && stride_h == 2)
            {
                pooling3x3s2_max_pack4_neon(bottom_blob_bordered, top_blob, opt);

                return 0;
            }

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat m = bottom_blob_bordered.channel(q);
                float* outptr = top_blob.channel(q);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        const float* sptr = m.row(i * stride_h) + j * stride_w * 4;

                        float32x4_t _max = vld1q_f32(sptr);

                        for (int k = 0; k < maxk; k++)
                        {
                            float32x4_t _val = vld1q_f32(sptr + space_ofs[k] * 4);
                            _max = vmaxq_f32(_max, _val);
                        }

                        vst1q_f32(outptr + j * 4, _max);
                    }

                    outptr += outw * 4;
                }
            }
        }
        else if (pooling_type == PoolMethod_AVE)
        {
            if (avgpool_count_include_pad == 0)
            {
                // Exclude padded elements from the average: walk the window
                // with explicit coordinates and count only in-image taps.
                int wtailpad = 0;
                int htailpad = 0;

                if (pad_mode == 0) // full padding
                {
                    // Extra tail padding added by make_padding beyond the
                    // declared pads, so it can be excluded below too.
                    wtailpad = bottom_blob_bordered.w - bottom_blob.w - pad_left - pad_right;
                    htailpad = bottom_blob_bordered.h - bottom_blob.h - pad_top - pad_bottom;
                }

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    float* outptr = top_blob.channel(q);

                    for (int i = 0; i < outh; i++)
                    {
                        int sy0 = i * stride_h;

                        for (int j = 0; j < outw; j++)
                        {
                            int sx0 = j * stride_w;

                            float32x4_t _sum = vdupq_n_f32(0.f);
                            int area = 0;

                            for (int ki = 0; ki < kernel_h; ki++)
                            {
                                int sy = sy0 + ki;

                                if (sy < pad_top)
                                    continue;

                                if (sy >= h - pad_bottom - htailpad)
                                    break;

                                for (int kj = 0; kj < kernel_w; kj++)
                                {
                                    int sx = sx0 + kj;

                                    if (sx < pad_left)
                                        continue;

                                    if (sx >= w - pad_right - wtailpad)
                                        break;

                                    float32x4_t _val = vld1q_f32(m.row(sy) + sx * 4);
                                    _sum = vaddq_f32(_sum, _val);
                                    area += 1;
                                }
                            }

                            // NOTE(review): if a window lies entirely in padding,
                            // area stays 0 and 1.f/area is inf — presumably
                            // unreachable for valid pad/kernel configs; confirm.
                            float32x4_t _inv_area = vdupq_n_f32(1.f / area);
                            float32x4_t _avg = vmulq_f32(_sum, _inv_area);
                            vst1q_f32(outptr + j * 4, _avg);
                        }

                        outptr += outw * 4;
                    }
                }
            }
            else // if (avgpool_count_include_pad == 1)
            {
                // Padded elements count toward the divisor: every window
                // divides by the full kernel area maxk.
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    float* outptr = top_blob.channel(q);

                    float32x4_t _inv_maxk = vdupq_n_f32(1.f / maxk);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            const float* sptr = m.row(i * stride_h) + j * stride_w * 4;

                            float32x4_t _sum = vdupq_n_f32(0.f);

                            for (int k = 0; k < maxk; k++)
                            {
                                float32x4_t _val = vld1q_f32(sptr + space_ofs[k] * 4);
                                _sum = vaddq_f32(_sum, _val);
                            }

                            float32x4_t _avg = vmulq_f32(_sum, _inv_maxk);
                            vst1q_f32(outptr + j * 4, _avg);
                        }

                        outptr += outw * 4;
                    }
                }
            }
        }

        return 0;
    }
#endif // __ARM_NEON

    // elempack == 1 fallthrough: only square-kernel stride-2 max pooling has
    // a tuned NEON kernel here; everything else goes to the reference path.
    if (kernel_w != kernel_h || stride_w != stride_h)
    {
        return Pooling::forward(bottom_blob, top_blob, opt);
    }

    const int kernel_size = kernel_w;
    const int stride = stride_w;

    if (pooling_type != PoolMethod_MAX || stride != 2 || global_pooling == 1)
    {
        return Pooling::forward(bottom_blob, top_blob, opt);
    }

    if (kernel_size != 2 && kernel_size != 3)
    {
        return Pooling::forward(bottom_blob, top_blob, opt);
    }

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int outw = (w - kernel_w) / stride_w + 1;
    int outh = (h - kernel_h) / stride_h + 1;

    top_blob.create(outw, outh, channels, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    if (kernel_size == 2)
        pooling2x2s2_max_neon(bottom_blob_bordered, top_blob, opt);
    if (kernel_size == 3)
        pooling3x3s2_max_neon(bottom_blob_bordered, top_blob, opt);

    return 0;
}
376
377 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// fp16-storage forward pass (fp32 accumulation for averages).
//
// Blobs are stored as __fp16 with elempack 8, 4 or 1.  Max pooling compares
// directly in fp16; average pooling widens to fp32 before summing to limit
// rounding error, then narrows the result back to fp16.
int Pooling_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    // max value in NxN window
    // avg value in NxN window

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    // NCNN_LOGE("Pooling input %d x %d pad = %d %d %d %d ksize=%d %d stride=%d %d", w, h, pad_left, pad_right, pad_top, pad_bottom, kernel_w, kernel_h, stride_w, stride_h);

    if (global_pooling)
    {
        top_blob.create(channels, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        int size = w * h;

        if (pooling_type == PoolMethod_MAX)
        {
            if (elempack == 8)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const __fp16* ptr = bottom_blob.channel(q);

                    // (__fp16)-FLT_MAX saturates to -inf, a valid identity
                    // element for the max reduction.
                    float16x8_t _max = vdupq_n_f16((__fp16)-FLT_MAX);
                    for (int i = 0; i < size; i++)
                    {
                        float16x8_t _val = vld1q_f16(ptr);
                        _max = vmaxq_f16(_max, _val);
                        ptr += 8;
                    }

                    __fp16* outptr = top_blob;
                    vst1q_f16(outptr + q * 8, _max);
                }
            }

            if (elempack == 4)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const __fp16* ptr = bottom_blob.channel(q);

                    float16x4_t _max = vdup_n_f16((__fp16)-FLT_MAX);
                    for (int i = 0; i < size; i++)
                    {
                        float16x4_t _val = vld1_f16(ptr);
                        _max = vmax_f16(_max, _val);
                        ptr += 4;
                    }

                    __fp16* outptr = top_blob;
                    vst1_f16(outptr + q * 4, _max);
                }
            }

            if (elempack == 1)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const __fp16* ptr = bottom_blob.channel(q);

                    __fp16 max = (__fp16)-FLT_MAX;
                    for (int i = 0; i < size; i++)
                    {
                        max = std::max(max, ptr[i]);
                    }

                    __fp16* outptr = top_blob;
                    outptr[q] = max;
                }
            }
        }

        if (pooling_type == PoolMethod_AVE)
        {
            // No elempack==8 average path here; pack8 average blobs are not
            // expected on this (non-arithmetic) fp16 route.
            if (elempack == 4)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const __fp16* ptr = bottom_blob.channel(q);

                    // Accumulate in fp32 to avoid fp16 precision loss.
                    float32x4_t _sum = vdupq_n_f32(0.f);
                    for (int i = 0; i < size; i++)
                    {
                        float32x4_t _val = vcvt_f32_f16(vld1_f16(ptr));
                        _sum = vaddq_f32(_sum, _val);
                        ptr += 4;
                    }

                    float32x4_t _inv_size = vdupq_n_f32(1.f / size);
                    float32x4_t _avg = vmulq_f32(_sum, _inv_size);

                    __fp16* outptr = top_blob;
                    vst1_f16(outptr + q * 4, vcvt_f16_f32(_avg));
                }
            }

            if (elempack == 1)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const __fp16* ptr = bottom_blob.channel(q);

                    float sum = 0.f;
                    for (int i = 0; i < size; i++)
                    {
                        sum += (float)ptr[i];
                    }

                    __fp16* outptr = top_blob;
                    outptr[q] = (__fp16)(sum / size);
                }
            }
        }

        return 0;
    }

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int outw = (w - kernel_w) / stride_w + 1;
    int outh = (h - kernel_h) / stride_h + 1;

    top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    // Element offsets (in elempack units) of each kernel tap relative to the
    // window's top-left element; same scheme as the fp32 path.
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w - kernel_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2++;
            }
            p2 += gap;
        }
    }

    if (pooling_type == PoolMethod_MAX)
    {
        if (elempack == 8)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat m = bottom_blob_bordered.channel(q);
                __fp16* outptr = top_blob.channel(q);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * 8;

                        float16x8_t _max = vdupq_n_f16((__fp16)-FLT_MAX);

                        for (int k = 0; k < maxk; k++)
                        {
                            float16x8_t _val = vld1q_f16(sptr + space_ofs[k] * 8);
                            _max = vmaxq_f16(_max, _val);
                        }

                        vst1q_f16(outptr + j * 8, _max);
                    }

                    outptr += outw * 8;
                }
            }
        }

        if (elempack == 4)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat m = bottom_blob_bordered.channel(q);
                __fp16* outptr = top_blob.channel(q);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * 4;

                        float16x4_t _max = vdup_n_f16((__fp16)-FLT_MAX);

                        for (int k = 0; k < maxk; k++)
                        {
                            float16x4_t _val = vld1_f16(sptr + space_ofs[k] * 4);
                            _max = vmax_f16(_max, _val);
                        }

                        vst1_f16(outptr + j * 4, _max);
                    }

                    outptr += outw * 4;
                }
            }
        }

        if (elempack == 1)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat m = bottom_blob_bordered.channel(q);
                __fp16* outptr = top_blob.channel(q);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w;

                        __fp16 max = (__fp16)-FLT_MAX;

                        for (int k = 0; k < maxk; k++)
                        {
                            __fp16 val = sptr[space_ofs[k]];
                            max = std::max(max, val);
                        }

                        outptr[j] = max;
                    }

                    outptr += outw;
                }
            }
        }
    }

    if (pooling_type == PoolMethod_AVE)
    {
        if (avgpool_count_include_pad == 0)
        {
            // Exclude padding from the divisor; see fp32 path for scheme.
            int wtailpad = 0;
            int htailpad = 0;

            if (pad_mode == 0) // full padding
            {
                wtailpad = bottom_blob_bordered.w - bottom_blob.w - pad_left - pad_right;
                htailpad = bottom_blob_bordered.h - bottom_blob.h - pad_top - pad_bottom;
            }

            if (elempack == 4)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    __fp16* outptr = top_blob.channel(q);

                    for (int i = 0; i < outh; i++)
                    {
                        int sy0 = i * stride_h;

                        for (int j = 0; j < outw; j++)
                        {
                            int sx0 = j * stride_w;

                            float32x4_t _sum = vdupq_n_f32(0.f);
                            int area = 0;

                            for (int ki = 0; ki < kernel_h; ki++)
                            {
                                int sy = sy0 + ki;

                                if (sy < pad_top)
                                    continue;

                                if (sy >= h - pad_bottom - htailpad)
                                    break;

                                for (int kj = 0; kj < kernel_w; kj++)
                                {
                                    int sx = sx0 + kj;

                                    if (sx < pad_left)
                                        continue;

                                    if (sx >= w - pad_right - wtailpad)
                                        break;

                                    float32x4_t _val = vcvt_f32_f16(vld1_f16(m.row<const __fp16>(sy) + sx * 4));
                                    _sum = vaddq_f32(_sum, _val);
                                    area += 1;
                                }
                            }

                            float32x4_t _inv_area = vdupq_n_f32(1.f / area);
                            float32x4_t _avg = vmulq_f32(_sum, _inv_area);
                            vst1_f16(outptr + j * 4, vcvt_f16_f32(_avg));
                        }

                        outptr += outw * 4;
                    }
                }
            }

            if (elempack == 1)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    __fp16* outptr = top_blob.channel(q);

                    for (int i = 0; i < outh; i++)
                    {
                        int sy0 = i * stride_h;

                        for (int j = 0; j < outw; j++)
                        {
                            int sx0 = j * stride_w;

                            float sum = 0.f;
                            int area = 0;

                            for (int ki = 0; ki < kernel_h; ki++)
                            {
                                int sy = sy0 + ki;

                                if (sy < pad_top)
                                    continue;

                                if (sy >= h - pad_bottom - htailpad)
                                    break;

                                for (int kj = 0; kj < kernel_w; kj++)
                                {
                                    int sx = sx0 + kj;

                                    if (sx < pad_left)
                                        continue;

                                    if (sx >= w - pad_right - wtailpad)
                                        break;

                                    float val = (float)(m.row<const __fp16>(sy)[sx]);
                                    sum += val;
                                    area += 1;
                                }
                            }

                            outptr[j] = (__fp16)(sum / area);
                        }

                        outptr += outw;
                    }
                }
            }
        }

        if (avgpool_count_include_pad == 1)
        {
            // Divisor is always the full kernel area maxk.
            if (elempack == 4)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    __fp16* outptr = top_blob.channel(q);

                    float32x4_t _inv_maxk = vdupq_n_f32(1.f / maxk);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * 4;

                            float32x4_t _sum = vdupq_n_f32(0.f);

                            for (int k = 0; k < maxk; k++)
                            {
                                float32x4_t _val = vcvt_f32_f16(vld1_f16(sptr + space_ofs[k] * 4));
                                _sum = vaddq_f32(_sum, _val);
                            }

                            float32x4_t _avg = vmulq_f32(_sum, _inv_maxk);
                            vst1_f16(outptr + j * 4, vcvt_f16_f32(_avg));
                        }

                        outptr += outw * 4;
                    }
                }
            }

            if (elempack == 1)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    __fp16* outptr = top_blob.channel(q);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w;

                            float sum = 0.f;

                            for (int k = 0; k < maxk; k++)
                            {
                                float val = (float)(sptr[space_ofs[k]]);
                                sum += val;
                            }

                            outptr[j] = (__fp16)(sum / maxk);
                        }

                        outptr += outw;
                    }
                }
            }
        }
    }

    return 0;
}
827
// fp16-storage forward pass with fp16 arithmetic.
//
// Max pooling needs no fp16 arithmetic beyond comparisons, so it is delegated
// to forward_fp16s.  Average pooling here accumulates directly in fp16
// (vaddq_f16 / vadd_f16), trading a little precision for speed; divisors are
// applied as fp16 reciprocals.
int Pooling_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    // max value in NxN window
    // avg value in NxN window

    if (pooling_type == PoolMethod_MAX)
    {
        return forward_fp16s(bottom_blob, top_blob, opt);
    }

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    // NCNN_LOGE("Pooling input %d x %d pad = %d %d %d %d ksize=%d %d stride=%d %d", w, h, pad_left, pad_right, pad_top, pad_bottom, kernel_w, kernel_h, stride_w, stride_h);

    if (global_pooling)
    {
        top_blob.create(channels, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        int size = w * h;

        if (pooling_type == PoolMethod_AVE)
        {
            if (elempack == 8)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const __fp16* ptr = bottom_blob.channel(q);

                    // fp16 accumulation; large size may lose precision vs
                    // the fp32-accumulating forward_fp16s path.
                    float16x8_t _sum = vdupq_n_f16((__fp16)0.f);
                    for (int i = 0; i < size; i++)
                    {
                        float16x8_t _val = vld1q_f16(ptr);
                        _sum = vaddq_f16(_sum, _val);
                        ptr += 8;
                    }

                    float16x8_t _inv_size = vdupq_n_f16((__fp16)(1.f / size));
                    float16x8_t _avg = vmulq_f16(_sum, _inv_size);

                    __fp16* outptr = top_blob;
                    vst1q_f16(outptr + q * 8, _avg);
                }
            }

            if (elempack == 4)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const __fp16* ptr = bottom_blob.channel(q);

                    float16x4_t _sum = vdup_n_f16((__fp16)0.f);
                    for (int i = 0; i < size; i++)
                    {
                        float16x4_t _val = vld1_f16(ptr);
                        _sum = vadd_f16(_sum, _val);
                        ptr += 4;
                    }

                    float16x4_t _inv_size = vdup_n_f16((__fp16)(1.f / size));
                    float16x4_t _avg = vmul_f16(_sum, _inv_size);

                    __fp16* outptr = top_blob;
                    vst1_f16(outptr + q * 4, _avg);
                }
            }

            if (elempack == 1)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const __fp16* ptr = bottom_blob.channel(q);

                    __fp16 sum = (__fp16)0.f;
                    for (int i = 0; i < size; i++)
                    {
                        sum += ptr[i];
                    }

                    __fp16* outptr = top_blob;
                    outptr[q] = sum / size;
                }
            }
        }

        return 0;
    }

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int outw = (w - kernel_w) / stride_w + 1;
    int outh = (h - kernel_h) / stride_h + 1;

    top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    // Element offsets (in elempack units) of each kernel tap relative to the
    // window's top-left element; same scheme as the fp32 path.
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w - kernel_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2++;
            }
            p2 += gap;
        }
    }

    if (pooling_type == PoolMethod_AVE)
    {
        if (avgpool_count_include_pad == 0)
        {
            // Exclude padding from the divisor; see fp32 path for scheme.
            int wtailpad = 0;
            int htailpad = 0;

            if (pad_mode == 0) // full padding
            {
                wtailpad = bottom_blob_bordered.w - bottom_blob.w - pad_left - pad_right;
                htailpad = bottom_blob_bordered.h - bottom_blob.h - pad_top - pad_bottom;
            }

            if (elempack == 8)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    __fp16* outptr = top_blob.channel(q);

                    for (int i = 0; i < outh; i++)
                    {
                        int sy0 = i * stride_h;

                        for (int j = 0; j < outw; j++)
                        {
                            int sx0 = j * stride_w;

                            float16x8_t _sum = vdupq_n_f16((__fp16)0.f);
                            int area = 0;

                            for (int ki = 0; ki < kernel_h; ki++)
                            {
                                int sy = sy0 + ki;

                                if (sy < pad_top)
                                    continue;

                                if (sy >= h - pad_bottom - htailpad)
                                    break;

                                for (int kj = 0; kj < kernel_w; kj++)
                                {
                                    int sx = sx0 + kj;

                                    if (sx < pad_left)
                                        continue;

                                    if (sx >= w - pad_right - wtailpad)
                                        break;

                                    float16x8_t _val = vld1q_f16(m.row<const __fp16>(sy) + sx * 8);
                                    _sum = vaddq_f16(_sum, _val);
                                    area += 1;
                                }
                            }

                            float16x8_t _inv_area = vdupq_n_f16((__fp16)(1.f / area));
                            float16x8_t _avg = vmulq_f16(_sum, _inv_area);
                            vst1q_f16(outptr + j * 8, _avg);
                        }

                        outptr += outw * 8;
                    }
                }
            }

            if (elempack == 4)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    __fp16* outptr = top_blob.channel(q);

                    for (int i = 0; i < outh; i++)
                    {
                        int sy0 = i * stride_h;

                        for (int j = 0; j < outw; j++)
                        {
                            int sx0 = j * stride_w;

                            float16x4_t _sum = vdup_n_f16((__fp16)0.f);
                            int area = 0;

                            for (int ki = 0; ki < kernel_h; ki++)
                            {
                                int sy = sy0 + ki;

                                if (sy < pad_top)
                                    continue;

                                if (sy >= h - pad_bottom - htailpad)
                                    break;

                                for (int kj = 0; kj < kernel_w; kj++)
                                {
                                    int sx = sx0 + kj;

                                    if (sx < pad_left)
                                        continue;

                                    if (sx >= w - pad_right - wtailpad)
                                        break;

                                    float16x4_t _val = vld1_f16(m.row<const __fp16>(sy) + sx * 4);
                                    _sum = vadd_f16(_sum, _val);
                                    area += 1;
                                }
                            }

                            float16x4_t _inv_area = vdup_n_f16((__fp16)(1.f / area));
                            float16x4_t _avg = vmul_f16(_sum, _inv_area);
                            vst1_f16(outptr + j * 4, _avg);
                        }

                        outptr += outw * 4;
                    }
                }
            }

            if (elempack == 1)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    __fp16* outptr = top_blob.channel(q);

                    for (int i = 0; i < outh; i++)
                    {
                        int sy0 = i * stride_h;

                        for (int j = 0; j < outw; j++)
                        {
                            int sx0 = j * stride_w;

                            __fp16 sum = (__fp16)0.f;
                            int area = 0;

                            for (int ki = 0; ki < kernel_h; ki++)
                            {
                                int sy = sy0 + ki;

                                if (sy < pad_top)
                                    continue;

                                if (sy >= h - pad_bottom - htailpad)
                                    break;

                                for (int kj = 0; kj < kernel_w; kj++)
                                {
                                    int sx = sx0 + kj;

                                    if (sx < pad_left)
                                        continue;

                                    if (sx >= w - pad_right - wtailpad)
                                        break;

                                    __fp16 val = m.row<const __fp16>(sy)[sx];
                                    sum += val;
                                    area += 1;
                                }
                            }

                            outptr[j] = sum / area;
                        }

                        outptr += outw;
                    }
                }
            }
        }

        if (avgpool_count_include_pad == 1)
        {
            // Divisor is always the full kernel area maxk.
            if (elempack == 8)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    __fp16* outptr = top_blob.channel(q);

                    float16x8_t _inv_maxk = vdupq_n_f16((__fp16)(1.f / maxk));

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * 8;

                            float16x8_t _sum = vdupq_n_f16((__fp16)0.f);

                            for (int k = 0; k < maxk; k++)
                            {
                                float16x8_t _val = vld1q_f16(sptr + space_ofs[k] * 8);
                                _sum = vaddq_f16(_sum, _val);
                            }

                            float16x8_t _avg = vmulq_f16(_sum, _inv_maxk);
                            vst1q_f16(outptr + j * 8, _avg);
                        }

                        outptr += outw * 8;
                    }
                }
            }

            if (elempack == 4)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    __fp16* outptr = top_blob.channel(q);

                    float16x4_t _inv_maxk = vdup_n_f16((__fp16)(1.f / maxk));

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * 4;

                            float16x4_t _sum = vdup_n_f16((__fp16)0.f);

                            for (int k = 0; k < maxk; k++)
                            {
                                float16x4_t _val = vld1_f16(sptr + space_ofs[k] * 4);
                                _sum = vadd_f16(_sum, _val);
                            }

                            float16x4_t _avg = vmul_f16(_sum, _inv_maxk);
                            vst1_f16(outptr + j * 4, _avg);
                        }

                        outptr += outw * 4;
                    }
                }
            }

            if (elempack == 1)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    __fp16* outptr = top_blob.channel(q);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w;

                            __fp16 sum = (__fp16)0.f;

                            for (int k = 0; k < maxk; k++)
                            {
                                __fp16 val = sptr[space_ofs[k]];
                                sum += val;
                            }

                            outptr[j] = sum / maxk;
                        }

                        outptr += outw;
                    }
                }
            }
        }
    }

    return 0;
}
1239 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1240
1241 #if NCNN_BF16
forward_bf16s(const Mat & bottom_blob,Mat & top_blob,const Option & opt) const1242 int Pooling_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
1243 {
1244 // max value in NxN window
1245 // avg value in NxN window
1246
1247 int w = bottom_blob.w;
1248 int h = bottom_blob.h;
1249 int channels = bottom_blob.c;
1250 size_t elemsize = bottom_blob.elemsize;
1251 int elempack = bottom_blob.elempack;
1252
1253 // NCNN_LOGE("Pooling input %d x %d pad = %d %d %d %d ksize=%d %d stride=%d %d", w, h, pad_left, pad_right, pad_top, pad_bottom, kernel_w, kernel_h, stride_w, stride_h);
1254
1255 if (global_pooling)
1256 {
1257 top_blob.create(channels, elemsize, elempack, opt.blob_allocator);
1258 if (top_blob.empty())
1259 return -100;
1260
1261 int size = w * h;
1262
1263 if (pooling_type == PoolMethod_MAX)
1264 {
1265 #if __ARM_NEON
1266 if (elempack == 4)
1267 {
1268 #pragma omp parallel for num_threads(opt.num_threads)
1269 for (int q = 0; q < channels; q++)
1270 {
1271 const unsigned short* ptr = bottom_blob.channel(q);
1272
1273 float32x4_t _max = vdupq_n_f32(-FLT_MAX);
1274 for (int i = 0; i < size; i++)
1275 {
1276 float32x4_t _val = vcvt_f32_bf16(vld1_u16(ptr));
1277 _max = vmaxq_f32(_max, _val);
1278 ptr += 4;
1279 }
1280
1281 unsigned short* outptr = top_blob;
1282 vst1_u16(outptr + q * 4, vcvt_bf16_f32(_max));
1283 }
1284 }
1285 #endif // __ARM_NEON
1286
1287 if (elempack == 1)
1288 {
1289 #pragma omp parallel for num_threads(opt.num_threads)
1290 for (int q = 0; q < channels; q++)
1291 {
1292 const unsigned short* ptr = bottom_blob.channel(q);
1293
1294 float max = -FLT_MAX;
1295 for (int i = 0; i < size; i++)
1296 {
1297 max = std::max(max, bfloat16_to_float32(ptr[i]));
1298 }
1299
1300 unsigned short* outptr = top_blob;
1301 outptr[q] = float32_to_bfloat16(max);
1302 }
1303 }
1304 }
1305
1306 if (pooling_type == PoolMethod_AVE)
1307 {
1308 #if __ARM_NEON
1309 if (elempack == 4)
1310 {
1311 #pragma omp parallel for num_threads(opt.num_threads)
1312 for (int q = 0; q < channels; q++)
1313 {
1314 const unsigned short* ptr = bottom_blob.channel(q);
1315
1316 float32x4_t _sum = vdupq_n_f32(0.f);
1317 for (int i = 0; i < size; i++)
1318 {
1319 float32x4_t _val = vcvt_f32_bf16(vld1_u16(ptr));
1320 _sum = vaddq_f32(_sum, _val);
1321 ptr += 4;
1322 }
1323
1324 float32x4_t _inv_size = vdupq_n_f32(1.f / size);
1325 float32x4_t _avg = vmulq_f32(_sum, _inv_size);
1326
1327 unsigned short* outptr = top_blob;
1328 vst1_u16(outptr + q * 4, vcvt_bf16_f32(_avg));
1329 }
1330 }
1331 #endif // __ARM_NEON
1332
1333 if (elempack == 1)
1334 {
1335 #pragma omp parallel for num_threads(opt.num_threads)
1336 for (int q = 0; q < channels; q++)
1337 {
1338 const unsigned short* ptr = bottom_blob.channel(q);
1339
1340 float sum = 0.f;
1341 for (int i = 0; i < size; i++)
1342 {
1343 sum += bfloat16_to_float32(ptr[i]);
1344 }
1345
1346 unsigned short* outptr = top_blob;
1347 outptr[q] = float32_to_bfloat16(sum / size);
1348 }
1349 }
1350 }
1351
1352 return 0;
1353 }
1354
// Non-global path: pad the input according to the layer's pad_mode,
// derive the output size from the padded extent, and precompute the
// flat element offsets of every kernel tap.
1355 Mat bottom_blob_bordered;
1356 make_padding(bottom_blob, bottom_blob_bordered, opt);
1357 if (bottom_blob_bordered.empty())
1358 return -100;
1359
// w/h now refer to the PADDED input for the rest of the function
1360 w = bottom_blob_bordered.w;
1361 h = bottom_blob_bordered.h;
1362
// standard sliding-window output size (padding already applied above)
1363 int outw = (w - kernel_w) / stride_w + 1;
1364 int outh = (h - kernel_h) / stride_h + 1;
1365
1366 top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
1367 if (top_blob.empty())
1368 return -100;
1369
1370 const int maxk = kernel_w * kernel_h;
1371
// kernel offsets: space_ofs[k] is the element offset of tap k from the
// window's top-left corner, laid out row-major over the padded width
// (gap skips the remainder of each input row past the kernel).
1372 // kernel offsets
1373 std::vector<int> _space_ofs(maxk);
1374 int* space_ofs = &_space_ofs[0];
1375 {
1376 int p1 = 0;
1377 int p2 = 0;
1378 int gap = w - kernel_w;
1379 for (int i = 0; i < kernel_h; i++)
1380 {
1381 for (int j = 0; j < kernel_w; j++)
1382 {
1383 space_ofs[p1] = p2;
1384 p1++;
1385 p2++;
1386 }
1387 p2 += gap;
1388 }
1389 }
1390
// Windowed MAX pooling: for each output pixel, take the max of the
// maxk kernel taps (addressed via space_ofs) in fp32 and store bf16.
1391 if (pooling_type == PoolMethod_MAX)
1392 {
1393 #if __ARM_NEON
// elempack == 4: each tap is a 4-lane bf16 load; offsets scale by 4
// because every logical element occupies 4 unsigned shorts.
1394 if (elempack == 4)
1395 {
1396 #pragma omp parallel for num_threads(opt.num_threads)
1397 for (int q = 0; q < channels; q++)
1398 {
1399 const Mat m = bottom_blob_bordered.channel(q);
1400 unsigned short* outptr = top_blob.channel(q);
1401
1402 for (int i = 0; i < outh; i++)
1403 {
1404 for (int j = 0; j < outw; j++)
1405 {
// top-left tap of this window in the padded input
1406 const unsigned short* sptr = m.row<const unsigned short>(i * stride_h) + j * stride_w * 4;
1407
1408 float32x4_t _max = vdupq_n_f32(-FLT_MAX);
1409
1410 for (int k = 0; k < maxk; k++)
1411 {
1412 float32x4_t _val = vcvt_f32_bf16(vld1_u16(sptr + space_ofs[k] * 4));
1413 _max = vmaxq_f32(_max, _val);
1414 }
1415
1416 vst1_u16(outptr + j * 4, vcvt_bf16_f32(_max));
1417 }
1418
// advance one output row (4 shorts per packed element)
1419 outptr += outw * 4;
1420 }
1421 }
1422 }
1423 #endif // __ARM_NEON
1424
// elempack == 1: scalar equivalent of the loop above.
1425 if (elempack == 1)
1426 {
1427 #pragma omp parallel for num_threads(opt.num_threads)
1428 for (int q = 0; q < channels; q++)
1429 {
1430 const Mat m = bottom_blob_bordered.channel(q);
1431 unsigned short* outptr = top_blob.channel(q);
1432
1433 for (int i = 0; i < outh; i++)
1434 {
1435 for (int j = 0; j < outw; j++)
1436 {
1437 const unsigned short* sptr = m.row<const unsigned short>(i * stride_h) + j * stride_w;
1438
1439 float max = -FLT_MAX;
1440
1441 for (int k = 0; k < maxk; k++)
1442 {
1443 float val = bfloat16_to_float32(sptr[space_ofs[k]]);
1444 max = std::max(max, val);
1445 }
1446
1447 outptr[j] = float32_to_bfloat16(max);
1448 }
1449
1450 outptr += outw;
1451 }
1452 }
1453 }
1454 }
1455
// Windowed AVERAGE pooling, two variants selected by
// avgpool_count_include_pad:
//   0 - exclude padding: only real (non-pad) pixels are summed and the
//       divisor is the count of pixels actually visited ("area");
//   1 - include padding: every tap counts and the divisor is maxk.
1456 if (pooling_type == PoolMethod_AVE)
1457 {
1458 if (avgpool_count_include_pad == 0)
1459 {
// tail padding added by make_padding beyond the explicit pad_*
// amounts; only pad_mode 0 (full padding) can produce it.
1460 int wtailpad = 0;
1461 int htailpad = 0;
1462
1463 if (pad_mode == 0) // full padding
1464 {
1465 wtailpad = bottom_blob_bordered.w - bottom_blob.w - pad_left - pad_right;
1466 htailpad = bottom_blob_bordered.h - bottom_blob.h - pad_top - pad_bottom;
1467 }
1468
1469 #if __ARM_NEON
// elempack == 4: exclude-pad average on 4-lane bf16 data.
1470 if (elempack == 4)
1471 {
1472 #pragma omp parallel for num_threads(opt.num_threads)
1473 for (int q = 0; q < channels; q++)
1474 {
1475 const Mat m = bottom_blob_bordered.channel(q);
1476 unsigned short* outptr = top_blob.channel(q);
1477
1478 for (int i = 0; i < outh; i++)
1479 {
1480 int sy0 = i * stride_h;
1481
1482 for (int j = 0; j < outw; j++)
1483 {
1484 int sx0 = j * stride_w;
1485
1486 float32x4_t _sum = vdupq_n_f32(0.f);
// number of non-pad taps visited for this window
1487 int area = 0;
1488
1489 for (int ki = 0; ki < kernel_h; ki++)
1490 {
1491 int sy = sy0 + ki;
1492
// skip rows that fall inside the top padding
1493 if (sy < pad_top)
1494 continue;
1495
// rows at/after this bound are bottom padding
1496 if (sy >= h - pad_bottom - htailpad)
1497 break;
1498
1499 for (int kj = 0; kj < kernel_w; kj++)
1500 {
1501 int sx = sx0 + kj;
1502
1503 if (sx < pad_left)
1504 continue;
1505
1506 if (sx >= w - pad_right - wtailpad)
1507 break;
1508
1509 float32x4_t _val = vcvt_f32_bf16(vld1_u16(m.row<const unsigned short>(sy) + sx * 4));
1510 _sum = vaddq_f32(_sum, _val);
1511 area += 1;
1512 }
1513 }
1514
// NOTE(review): if a window lands entirely in padding, area
// stays 0 and 1.f / area divides by zero — presumably the
// layer's padding configuration prevents this; confirm upstream.
1515 float32x4_t _inv_area = vdupq_n_f32(1.f / area);
1516 float32x4_t _avg = vmulq_f32(_sum, _inv_area);
1517 vst1_u16(outptr + j * 4, vcvt_bf16_f32(_avg));
1518 }
1519
1520 outptr += outw * 4;
1521 }
1522 }
1523 }
1524 #endif // __ARM_NEON
1525
// elempack == 1: scalar exclude-pad average, same bounds logic.
1526 if (elempack == 1)
1527 {
1528 #pragma omp parallel for num_threads(opt.num_threads)
1529 for (int q = 0; q < channels; q++)
1530 {
1531 const Mat m = bottom_blob_bordered.channel(q);
1532 unsigned short* outptr = top_blob.channel(q);
1533
1534 for (int i = 0; i < outh; i++)
1535 {
1536 int sy0 = i * stride_h;
1537
1538 for (int j = 0; j < outw; j++)
1539 {
1540 int sx0 = j * stride_w;
1541
1542 float sum = 0;
1543 int area = 0;
1544
1545 for (int ki = 0; ki < kernel_h; ki++)
1546 {
1547 int sy = sy0 + ki;
1548
1549 if (sy < pad_top)
1550 continue;
1551
1552 if (sy >= h - pad_bottom - htailpad)
1553 break;
1554
1555 for (int kj = 0; kj < kernel_w; kj++)
1556 {
1557 int sx = sx0 + kj;
1558
1559 if (sx < pad_left)
1560 continue;
1561
1562 if (sx >= w - pad_right - wtailpad)
1563 break;
1564
1565 float val = bfloat16_to_float32(m.row<const unsigned short>(sy)[sx]);
1566 sum += val;
1567 area += 1;
1568 }
1569 }
1570
// NOTE(review): same area == 0 division concern as the pack4 path
1571 outptr[j] = float32_to_bfloat16(sum / area);
1572 }
1573
1574 outptr += outw;
1575 }
1576 }
1577 }
1578 }
1579
// include-pad variant: every one of the maxk taps contributes, so the
// precomputed space_ofs table can be used directly.
1580 if (avgpool_count_include_pad == 1)
1581 {
1582 #if __ARM_NEON
1583 if (elempack == 4)
1584 {
1585 #pragma omp parallel for num_threads(opt.num_threads)
1586 for (int q = 0; q < channels; q++)
1587 {
1588 const Mat m = bottom_blob_bordered.channel(q);
1589 unsigned short* outptr = top_blob.channel(q);
1590
// hoisted: divisor is the constant kernel size
1591 float32x4_t _inv_maxk = vdupq_n_f32(1.f / maxk);
1592
1593 for (int i = 0; i < outh; i++)
1594 {
1595 for (int j = 0; j < outw; j++)
1596 {
1597 const unsigned short* sptr = m.row<const unsigned short>(i * stride_h) + j * stride_w * 4;
1598
1599 float32x4_t _sum = vdupq_n_f32(0.f);
1600
1601 for (int k = 0; k < maxk; k++)
1602 {
1603 float32x4_t _val = vcvt_f32_bf16(vld1_u16(sptr + space_ofs[k] * 4));
1604 _sum = vaddq_f32(_sum, _val);
1605 }
1606
1607 float32x4_t _avg = vmulq_f32(_sum, _inv_maxk);
1608 vst1_u16(outptr + j * 4, vcvt_bf16_f32(_avg));
1609 }
1610
1611 outptr += outw * 4;
1612 }
1613 }
1614 }
1615 #endif // __ARM_NEON
1616
1617 if (elempack == 1)
1618 {
1619 #pragma omp parallel for num_threads(opt.num_threads)
1620 for (int q = 0; q < channels; q++)
1621 {
1622 const Mat m = bottom_blob_bordered.channel(q);
1623 unsigned short* outptr = top_blob.channel(q);
1624
1625 for (int i = 0; i < outh; i++)
1626 {
1627 for (int j = 0; j < outw; j++)
1628 {
1629 const unsigned short* sptr = m.row<const unsigned short>(i * stride_h) + j * stride_w;
1630
1631 float sum = 0.f;
1632
1633 for (int k = 0; k < maxk; k++)
1634 {
1635 float val = bfloat16_to_float32(sptr[space_ofs[k]]);
1636 sum += val;
1637 }
1638
1639 outptr[j] = float32_to_bfloat16(sum / maxk);
1640 }
1641
1642 outptr += outw;
1643 }
1644 }
1645 }
1646 }
1647 }
1648
1649 return 0;
1650 }
1651 #endif // NCNN_BF16
1652
1653 } // namespace ncnn
1654