// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

15 #include "deconvolutiondepthwise_arm.h"
16
17 #include "layer_type.h"
18
19 #if __ARM_NEON
20 #include <arm_neon.h>
21 #endif // __ARM_NEON
22
23 #include "arm_activation.h"
24
25 namespace ncnn {
26
// Constructor: advertise which optional runtime features this ARM
// implementation can consume. The framework queries these flags when
// deciding packing layout and storage precision for the blobs it feeds us.
DeconvolutionDepthWise_arm::DeconvolutionDepthWise_arm()
{
#if __ARM_NEON
    // SIMD pack4 layout is only implemented on NEON-capable builds
    support_packing = true;
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    // fp16 storage requires the fp16 vector-arithmetic extension (armv8.2+)
    support_fp16_storage = true;
#endif
#endif // __ARM_NEON

    // bf16 storage is always offered; forward_bf16s has a scalar fallback
    support_bf16_storage = true;
}
38
// One-time preparation before inference.
//
// Depth-wise case (channels == group == num_output):
//   - flip each maxk-sized kernel window (deconvolution applies the kernel
//     rotated 180 degrees; flipping here lets forward() run a plain gather)
//   - repack and/or cast the flipped weights to match the elempack implied
//     by the enabled fp16/bf16 storage and packing options
// General grouped case:
//   - build one inner ncnn::Deconvolution layer per group, each owning its
//     slice of the weights/bias, and create their pipelines
//
// Returns 0 on success.
int DeconvolutionDepthWise_arm::create_pipeline(const Option& opt)
{
    // create Deconvolution op for each group
    const int maxk = kernel_w * kernel_h;
    // recover the input channel count from the serialized weight size
    int channels = (weight_data_size / group) / maxk / (num_output / group) * group;

    // depth-wise
    if (channels == group && group == num_output)
    {
        int elempack = (support_packing && opt.use_packing_layout && channels % 4 == 0) ? 4 : 1;

        // 180-degree rotation of every kernel window: pt[maxk-1-k] = p[k]
        Mat weight_data_transposed(weight_data.w);
        {
            float* pt = weight_data_transposed;
            const float* p = weight_data;

            for (int i = 0; i < (channels / group) * (num_output / group) * group; i++)
            {
                for (int k = 0; k < maxk; k++)
                {
                    pt[maxk - 1 - k] = p[k];
                }

                p += maxk;
                pt += maxk;
            }
        }

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        if (opt.use_fp16_storage)
        {
            if (opt.use_packing_layout)
            {
                // pack8 only when fp16 arithmetic is also enabled, else pack4/pack1
                elempack = opt.use_fp16_arithmetic && channels % 8 == 0 ? 8 : channels % 4 == 0 ? 4 : 1;
            }

            if (elempack == 8)
            {
                Mat weight_data_r2 = weight_data_transposed.reshape(maxk, group);
                Mat weight_data_r2_packed;
                convert_packing(weight_data_r2, weight_data_r2_packed, 8);

                ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_fp16, opt);
            }

            if (elempack == 4)
            {
                Mat weight_data_r2 = weight_data_transposed.reshape(maxk, group);
                Mat weight_data_r2_packed;
                convert_packing(weight_data_r2, weight_data_r2_packed, 4);

                ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_fp16, opt);
            }

            if (elempack == 1)
            {
                ncnn::cast_float32_to_float16(weight_data_transposed, weight_data_fp16, opt);
            }

            // bias kept in fp16 as well for the fp16sa kernels
            ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);

            return 0;
        }
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

        if (opt.use_bf16_storage)
        {
#if __ARM_NEON
            if (elempack == 4)
            {
                Mat weight_data_r2 = weight_data_transposed.reshape(maxk, group);
                convert_packing(weight_data_r2, weight_data_pack4, 4);

                ncnn::cast_float32_to_bfloat16(weight_data_pack4, weight_data_bf16, opt);
            }
#endif // __ARM_NEON

            if (elempack == 1)
            {
                ncnn::cast_float32_to_bfloat16(weight_data_transposed, weight_data_bf16, opt);
            }

            return 0;
        }

#if __ARM_NEON
        // pack4
        if (elempack == 4)
        {
            Mat weight_data_r2 = weight_data_transposed.reshape(maxk, group);
            convert_packing(weight_data_r2, weight_data_pack4, 4);
        }
#endif // __ARM_NEON

        // pack1
        if (elempack == 1)
        {
            weight_data_pack1 = weight_data_transposed;
        }
    }
    else
    {
        // group deconvolution
        // drop any sub-layers left over from a previous create_pipeline call
        for (int i = 0; i < (int)group_ops.size(); i++)
            delete group_ops[i];

        group_ops.clear();

        const int channels_g = channels / group;
        const int num_output_g = num_output / group;

        group_ops.resize(group);

        for (int g = 0; g < group; g++)
        {
            // per-group views into the shared weight/bias storage (no copy)
            Mat weight_data_g = weight_data.range(maxk * channels_g * num_output_g * g, maxk * channels_g * num_output_g);
            Mat bias_data_g;
            if (bias_term)
                bias_data_g = bias_data.range(num_output_g * g, num_output_g);

            ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution);

            // set param
            ncnn::ParamDict pd;
            pd.set(0, num_output_g); // num_output
            pd.set(1, kernel_w);
            pd.set(11, kernel_h);
            pd.set(2, dilation_w);
            pd.set(12, dilation_h);
            pd.set(3, stride_w);
            pd.set(13, stride_h);
            pd.set(4, 0);  // pad_w — padding is cut by this outer layer afterwards
            pd.set(14, 0); // pad_h
            pd.set(5, bias_term);
            pd.set(6, maxk * channels_g * num_output_g); // weight_data_size
            pd.set(9, activation_type);
            pd.set(10, activation_params);

            op->load_param(pd);

            // set weights
            if (bias_term)
            {
                ncnn::Mat weights[2];
                weights[0] = weight_data_g;
                weights[1] = bias_data_g;

                op->load_model(ModelBinFromMatArray(weights));
            }
            else
            {
                ncnn::Mat weights[1];
                weights[0] = weight_data_g;

                op->load_model(ModelBinFromMatArray(weights));
            }

            op->create_pipeline(opt);

            group_ops[g] = op;
        }
    }

    return 0;
}
204
destroy_pipeline(const Option & opt)205 int DeconvolutionDepthWise_arm::destroy_pipeline(const Option& opt)
206 {
207 for (int i = 0; i < (int)group_ops.size(); i++)
208 {
209 group_ops[i]->destroy_pipeline(opt);
210 delete group_ops[i];
211 }
212 group_ops.clear();
213
214 return 0;
215 }
216
// fp32 forward pass.
//
// Dispatches to the fp16 / bf16 specializations when the input blob is
// stored in 16-bit and the matching option is enabled. Otherwise computes
// the depth-wise (or grouped) deconvolution in fp32.
//
// Deconvolution output size: out = (in - 1) * stride + dilation * (kernel - 1) + 1.
// The full (un-cropped) result is produced into top_blob_bordered first;
// cut_padding() then crops pad_* / output_pad_* / output_w-h into top_blob.
//
// Returns 0 on success, -100 on allocation failure.
int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int elembits = bottom_blob.elembits();

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    if (opt.use_fp16_storage && elembits == 16)
    {
        if (opt.use_fp16_arithmetic)
            return forward_fp16sa(bottom_blob, top_blob, opt);
        else
            return forward_fp16s(bottom_blob, top_blob, opt);
    }
#endif

    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s(bottom_blob, top_blob, opt);

    // convolv with NxN kernel
    // value = value + bias

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    int outw = (w - 1) * stride_w + kernel_extent_w;
    int outh = (h - 1) * stride_h + kernel_extent_h;
    int out_elempack = (support_packing && opt.use_packing_layout && num_output % 4 == 0) ? 4 : 1;
    size_t out_elemsize = elemsize / elempack * out_elempack;

    Mat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || output_pad_right > 0 || output_pad_bottom > 0 || (output_w > 0 && output_h > 0))
    {
        // cropping will happen later — render into workspace memory
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator);
    }
    else
    {
        // no cropping — render straight into the output blob
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    const int maxk = kernel_w * kernel_h;

    // depth-wise
    if (channels * elempack == group && group == num_output)
    {
#if __ARM_NEON
        if (elempack == 4)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int g = 0; g < channels; g++)
            {
                float* outptr = top_blob_bordered.channel(g);
                const float* kptr = (const float*)weight_data_pack4 + maxk * g * 4;
                const Mat m = bottom_blob.channel(g);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float32x4_t _sum = vdupq_n_f32(0.f);

                        if (bias_term)
                        {
                            _sum = vld1q_f32((const float*)bias_data + g * 4);
                        }

                        // gather formulation: for output pixel (i,j) visit each
                        // kernel tap and accumulate only the input samples that
                        // would scatter onto it — i.e. positions where
                        // (i + y*dilation - (extent-1)) is a non-negative
                        // multiple of stride and maps inside the input
                        for (int y = 0; y < kernel_h; y++)
                        {
                            int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                            if (sys < 0 || sys % stride_h != 0)
                                continue;

                            int sy = sys / stride_h;
                            if (sy >= h)
                                continue;

                            for (int x = 0; x < kernel_w; x++)
                            {
                                int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                if (sxs < 0 || sxs % stride_w != 0)
                                    continue;

                                int sx = sxs / stride_w;
                                if (sx >= w)
                                    continue;

                                const float* sptr = m.row(sy) + sx * 4;

                                float32x4_t _val = vld1q_f32(sptr);

                                int k = y * kernel_w + x;

                                float32x4_t _w = vld1q_f32(kptr + k * 4);

                                _sum = vmlaq_f32(_sum, _val, _w);
                            }
                        }

                        _sum = activation_ps(_sum, activation_type, activation_params);

                        vst1q_f32(outptr + j * 4, _sum);
                    }

                    outptr += outw * 4;
                }
            }
        }
#endif // __ARM_NEON

        if (elempack == 1)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int g = 0; g < channels; g++)
            {
                float* outptr = top_blob_bordered.channel(g);
                const float* kptr = (const float*)weight_data_pack1 + maxk * g;
                const Mat m = bottom_blob.channel(g);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float sum = 0.f;

                        if (bias_term)
                        {
                            sum = bias_data[g];
                        }

                        // same gather formulation as the pack4 branch, scalar
                        for (int y = 0; y < kernel_h; y++)
                        {
                            int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                            if (sys < 0 || sys % stride_h != 0)
                                continue;

                            int sy = sys / stride_h;
                            if (sy >= h)
                                continue;

                            const float* sptr = m.row(sy);

                            for (int x = 0; x < kernel_w; x++)
                            {
                                int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                if (sxs < 0 || sxs % stride_w != 0)
                                    continue;

                                int sx = sxs / stride_w;
                                if (sx >= w)
                                    continue;

                                float val = sptr[sx];

                                int k = y * kernel_w + x;

                                float w = kptr[k];

                                sum += val * w;
                            }
                        }

                        // inline scalar activation: 1=relu 2=leaky 3=clip 4=sigmoid
                        if (activation_type == 1)
                        {
                            sum = std::max(sum, 0.f);
                        }
                        else if (activation_type == 2)
                        {
                            float slope = activation_params[0];
                            sum = sum > 0.f ? sum : sum * slope;
                        }
                        else if (activation_type == 3)
                        {
                            float min = activation_params[0];
                            float max = activation_params[1];
                            if (sum < min)
                                sum = min;
                            if (sum > max)
                                sum = max;
                        }
                        else if (activation_type == 4)
                        {
                            sum = static_cast<float>(1.f / (1.f + exp(-sum)));
                        }

                        outptr[j] = sum;
                    }

                    outptr += outw;
                }
            }
        }
    }
    else
    {
        // group deconvolution
        const int channels_g = channels * elempack / group;
        const int num_output_g = num_output / group;

        // per-group packing may be coarser than the blob's packing
        int g_elempack = (support_packing && opt.use_packing_layout && channels_g % 4 == 0) ? 4 : 1;
        int out_g_elempack = (support_packing && opt.use_packing_layout && num_output_g % 4 == 0) ? 4 : 1;

        // unpacking
        Mat bottom_blob_unpacked = bottom_blob;
        if (elempack == 4 && g_elempack == 1)
        {
            Option opt_p = opt;
            opt_p.blob_allocator = opt.workspace_allocator;
            convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_p);
        }

        Mat top_blob_bordered_unpacked = top_blob_bordered;
        if (out_g_elempack == 1 && out_elempack == 4)
        {
            top_blob_bordered_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator);
            if (top_blob_bordered_unpacked.empty())
                return -100;
        }

        for (int g = 0; g < group; g++)
        {
            const Mat bottom_blob_g = bottom_blob_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
            Mat top_blob_bordered_g = top_blob_bordered_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);

            const ncnn::Layer* op = group_ops[g];

            Option opt_g = opt;
            opt_g.blob_allocator = top_blob_bordered_unpacked.allocator;

            // forward
            op->forward(bottom_blob_g, top_blob_bordered_g, opt_g);
        }

        // packing
        if (out_g_elempack == 1 && out_elempack == 4)
        {
            convert_packing(top_blob_bordered_unpacked, top_blob_bordered, 4, opt);
        }
        else
        {
            top_blob_bordered = top_blob_bordered_unpacked;
        }
    }

    cut_padding(top_blob_bordered, top_blob, opt);
    if (top_blob.empty())
        return -100;

    return 0;
}
473
474 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// fp16-storage / fp32-arithmetic forward pass.
//
// Blobs and weights are stored as __fp16; each load is widened to float32,
// accumulated with fp32 FMA, and narrowed back on store. Same gather-style
// deconvolution as forward(), then cut_padding() crops into top_blob.
//
// Returns 0 on success, -100 on allocation failure.
int DeconvolutionDepthWise_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    // deconvolution output size: (in - 1) * stride + kernel extent
    int outw = (w - 1) * stride_w + kernel_extent_w;
    int outh = (h - 1) * stride_h + kernel_extent_h;
    int out_elempack = (support_packing && opt.use_packing_layout && num_output % 4 == 0) ? 4 : 1;
    size_t out_elemsize = elemsize / elempack * out_elempack;

    Mat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || output_pad_right > 0 || output_pad_bottom > 0 || (output_w > 0 && output_h > 0))
    {
        // cropping will happen later — render into workspace memory
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator);
    }
    else
    {
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    const int maxk = kernel_w * kernel_h;

    // depth-wise
    if (channels * elempack == group && group == num_output)
    {
        if (elempack == 4)
        {
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < channels; g++)
                {
                    __fp16* outptr = top_blob_bordered.channel(g);
                    const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g * 4;
                    const Mat m = bottom_blob.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            float32x4_t _sum = vdupq_n_f32(0.f);

                            if (bias_term)
                            {
                                // bias stays fp32 in this (fp32-arithmetic) path
                                _sum = vld1q_f32((const float*)bias_data + g * 4);
                            }

                            // gather: accumulate input samples whose scatter
                            // position (stride multiple) lands on (i,j)
                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    const __fp16* sptr = m.row<const __fp16>(sy) + sx * 4;

                                    // widen fp16 -> fp32 before the FMA
                                    float32x4_t _val = vcvt_f32_f16(vld1_f16(sptr));

                                    int k = y * kernel_w + x;

                                    float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr + k * 4));

                                    _sum = vfmaq_f32(_sum, _val, _w);
                                }
                            }

                            _sum = activation_ps(_sum, activation_type, activation_params);

                            // narrow fp32 -> fp16 on store
                            vst1_f16(outptr + j * 4, vcvt_f16_f32(_sum));
                        }

                        outptr += outw * 4;
                    }
                }
            }
        }

        if (elempack == 1)
        {
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < channels; g++)
                {
                    __fp16* outptr = top_blob_bordered.channel(g);
                    const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g;
                    const Mat m = bottom_blob.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            float sum = 0.f;

                            if (bias_term)
                            {
                                sum = bias_data[g];
                            }

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                const __fp16* sptr = m.row<const __fp16>(sy);

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    float val = (float)sptr[sx];

                                    int k = y * kernel_w + x;

                                    float w = (float)kptr[k];

                                    sum += val * w;
                                }
                            }

                            sum = activation_ss(sum, activation_type, activation_params);

                            outptr[j] = (__fp16)sum;
                        }

                        outptr += outw;
                    }
                }
            }
        }
    }
    else
    {
        // group deconvolution
        const int channels_g = channels * elempack / group;
        const int num_output_g = num_output / group;

        int g_elempack = (support_packing && opt.use_packing_layout && channels_g % 4 == 0) ? 4 : 1;
        int out_g_elempack = (support_packing && opt.use_packing_layout && num_output_g % 4 == 0) ? 4 : 1;

        // unpacking
        Mat bottom_blob_unpacked = bottom_blob;
        if (elempack == 4 && g_elempack == 1)
        {
            Option opt_p = opt;
            opt_p.blob_allocator = opt.workspace_allocator;
            convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_p);
        }

        Mat top_blob_bordered_unpacked = top_blob_bordered;
        if (out_g_elempack == 1 && out_elempack == 4)
        {
            top_blob_bordered_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator);
            if (top_blob_bordered_unpacked.empty())
                return -100;
        }

        for (int g = 0; g < group; g++)
        {
            const Mat bottom_blob_g = bottom_blob_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
            Mat top_blob_bordered_g = top_blob_bordered_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);

            const ncnn::Layer* op = group_ops[g];

            Option opt_g = opt;
            opt_g.blob_allocator = top_blob_bordered_unpacked.allocator;

            // forward
            op->forward(bottom_blob_g, top_blob_bordered_g, opt_g);
        }

        // packing
        if (out_g_elempack == 1 && out_elempack == 4)
        {
            convert_packing(top_blob_bordered_unpacked, top_blob_bordered, 4, opt);
        }
        else
        {
            top_blob_bordered = top_blob_bordered_unpacked;
        }
    }

    cut_padding(top_blob_bordered, top_blob, opt);
    if (top_blob.empty())
        return -100;

    return 0;
}
694
// fp16-storage / fp16-arithmetic forward pass.
//
// Both operands and the accumulator are fp16; pack8 uses float16x8_t FMA.
// Bias comes from the pre-cast bias_data_fp16 in the vector branches.
// Same gather-style deconvolution as forward(), then cut_padding() crops
// into top_blob.
//
// Returns 0 on success, -100 on allocation failure.
int DeconvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    int outw = (w - 1) * stride_w + kernel_extent_w;
    int outh = (h - 1) * stride_h + kernel_extent_h;
    int out_elempack = 1;
    if (opt.use_packing_layout)
    {
        // fp16 arithmetic unlocks pack8, otherwise pack4/pack1
        out_elempack = opt.use_fp16_arithmetic && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
    }
    size_t out_elemsize = elemsize / elempack * out_elempack;

    Mat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || output_pad_right > 0 || output_pad_bottom > 0 || (output_w > 0 && output_h > 0))
    {
        // cropping will happen later — render into workspace memory
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator);
    }
    else
    {
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    const int maxk = kernel_w * kernel_h;

    // depth-wise
    if (channels * elempack == group && group == num_output)
    {
        if (elempack == 8)
        {
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < channels; g++)
                {
                    __fp16* outptr = top_blob_bordered.channel(g);
                    const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g * 8;
                    const Mat m = bottom_blob.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            float16x8_t _sum = vdupq_n_f16((__fp16)0.f);

                            if (bias_term)
                            {
                                _sum = vld1q_f16((const __fp16*)bias_data_fp16 + g * 8);
                            }

                            // gather: accumulate input samples whose scatter
                            // position (stride multiple) lands on (i,j)
                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    const __fp16* sptr = m.row<const __fp16>(sy) + sx * 8;

                                    float16x8_t _val = vld1q_f16(sptr);

                                    int k = y * kernel_w + x;

                                    float16x8_t _w = vld1q_f16(kptr + k * 8);

                                    // fp16 fused multiply-add, 8 lanes
                                    _sum = vfmaq_f16(_sum, _val, _w);
                                }
                            }

                            _sum = activation_ps(_sum, activation_type, activation_params);

                            vst1q_f16(outptr + j * 8, _sum);
                        }

                        outptr += outw * 8;
                    }
                }
            }
        }

        if (elempack == 4)
        {
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < channels; g++)
                {
                    __fp16* outptr = top_blob_bordered.channel(g);
                    const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g * 4;
                    const Mat m = bottom_blob.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            float16x4_t _sum = vdup_n_f16((__fp16)0.f);

                            if (bias_term)
                            {
                                _sum = vld1_f16((const __fp16*)bias_data_fp16 + g * 4);
                            }

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    const __fp16* sptr = m.row<const __fp16>(sy) + sx * 4;

                                    float16x4_t _val = vld1_f16(sptr);

                                    int k = y * kernel_w + x;

                                    float16x4_t _w = vld1_f16(kptr + k * 4);

                                    // fp16 fused multiply-add, 4 lanes
                                    _sum = vfma_f16(_sum, _val, _w);
                                }
                            }

                            _sum = activation_ps(_sum, activation_type, activation_params);

                            vst1_f16(outptr + j * 4, _sum);
                        }

                        outptr += outw * 4;
                    }
                }
            }
        }

        if (elempack == 1)
        {
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < channels; g++)
                {
                    __fp16* outptr = top_blob_bordered.channel(g);
                    const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g;
                    const Mat m = bottom_blob.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            // scalar branch accumulates in fp32 for accuracy
                            float sum = 0.f;

                            if (bias_term)
                            {
                                sum = bias_data[g];
                            }

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                const __fp16* sptr = m.row<const __fp16>(sy);

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    __fp16 val = sptr[sx];

                                    int k = y * kernel_w + x;

                                    __fp16 w = kptr[k];

                                    sum += val * w;
                                }
                            }

                            sum = activation_ss(sum, activation_type, activation_params);

                            outptr[j] = (__fp16)sum;
                        }

                        outptr += outw;
                    }
                }
            }
        }
    }
    else
    {
        // group deconvolution
        const int channels_g = channels * elempack / group;
        const int num_output_g = num_output / group;

        int g_elempack = 1;
        int out_g_elempack = 1;
        if (opt.use_packing_layout)
        {
            g_elempack = opt.use_fp16_arithmetic && channels_g % 8 == 0 ? 8 : channels_g % 4 == 0 ? 4 : 1;
            out_g_elempack = opt.use_fp16_arithmetic && num_output_g % 8 == 0 ? 8 : num_output_g % 4 == 0 ? 4 : 1;
        }

        // unpacking
        Mat bottom_blob_unpacked = bottom_blob;
        if (elempack > g_elempack)
        {
            Option opt_p = opt;
            opt_p.blob_allocator = opt.workspace_allocator;
            convert_packing(bottom_blob, bottom_blob_unpacked, g_elempack, opt_p);
        }

        Mat top_blob_bordered_unpacked = top_blob_bordered;
        if (out_g_elempack < out_elempack)
        {
            top_blob_bordered_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator);
            if (top_blob_bordered_unpacked.empty())
                return -100;
        }

        for (int g = 0; g < group; g++)
        {
            const Mat bottom_blob_g = bottom_blob_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
            Mat top_blob_bordered_g = top_blob_bordered_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);

            const ncnn::Layer* op = group_ops[g];

            Option opt_g = opt;
            opt_g.blob_allocator = top_blob_bordered_unpacked.allocator;

            // forward
            op->forward(bottom_blob_g, top_blob_bordered_g, opt_g);
        }

        // packing
        if (out_g_elempack < out_elempack)
        {
            convert_packing(top_blob_bordered_unpacked, top_blob_bordered, out_elempack, opt);
        }
        else
        {
            top_blob_bordered = top_blob_bordered_unpacked;
        }
    }

    cut_padding(top_blob_bordered, top_blob, opt);
    if (top_blob.empty())
        return -100;

    return 0;
}
987 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
988
// bf16-storage forward pass (fp32 arithmetic).
//
// Blobs and weights are stored as bfloat16 in unsigned short; loads are
// widened to float32, accumulated in fp32, and narrowed back on store.
// Same gather-style deconvolution as forward(), then cut_padding() crops
// into top_blob.
//
// Returns 0 on success, -100 on allocation failure.
int DeconvolutionDepthWise_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    int outw = (w - 1) * stride_w + kernel_extent_w;
    int outh = (h - 1) * stride_h + kernel_extent_h;
    // NOTE(review): unlike forward(), this omits the support_packing check —
    // confirm out_elempack==4 cannot be selected on builds without the pack4
    // kernel (the pack4 branch below is compiled only under __ARM_NEON)
    int out_elempack = opt.use_packing_layout && num_output % 4 == 0 ? 4 : 1;
    size_t out_elemsize = elemsize / elempack * out_elempack;

    Mat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || output_pad_right > 0 || output_pad_bottom > 0 || (output_w > 0 && output_h > 0))
    {
        // cropping will happen later — render into workspace memory
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator);
    }
    else
    {
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    const int maxk = kernel_w * kernel_h;

    // depth-wise
    if (channels * elempack == group && group == num_output)
    {
#if __ARM_NEON
        if (elempack == 4)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int g = 0; g < channels; g++)
            {
                unsigned short* outptr = top_blob_bordered.channel(g);
                const unsigned short* kptr = (const unsigned short*)weight_data_bf16 + maxk * g * 4;
                const Mat m = bottom_blob.channel(g);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float32x4_t _sum = vdupq_n_f32(0.f);

                        if (bias_term)
                        {
                            // bias stays fp32
                            _sum = vld1q_f32((const float*)bias_data + g * 4);
                        }

                        // gather: accumulate input samples whose scatter
                        // position (stride multiple) lands on (i,j)
                        for (int y = 0; y < kernel_h; y++)
                        {
                            int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                            if (sys < 0 || sys % stride_h != 0)
                                continue;

                            int sy = sys / stride_h;
                            if (sy >= h)
                                continue;

                            for (int x = 0; x < kernel_w; x++)
                            {
                                int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                if (sxs < 0 || sxs % stride_w != 0)
                                    continue;

                                int sx = sxs / stride_w;
                                if (sx >= w)
                                    continue;

                                const unsigned short* sptr = m.row<const unsigned short>(sy) + sx * 4;

                                // widen bf16 -> fp32 before the multiply-add
                                float32x4_t _val = vcvt_f32_bf16(vld1_u16(sptr));

                                int k = y * kernel_w + x;

                                float32x4_t _w = vcvt_f32_bf16(vld1_u16(kptr + k * 4));

                                _sum = vmlaq_f32(_sum, _val, _w);
                            }
                        }

                        _sum = activation_ps(_sum, activation_type, activation_params);

                        // narrow fp32 -> bf16 on store
                        vst1_u16(outptr + j * 4, vcvt_bf16_f32(_sum));
                    }

                    outptr += outw * 4;
                }
            }
        }
#endif // __ARM_NEON

        if (elempack == 1)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int g = 0; g < channels; g++)
            {
                unsigned short* outptr = top_blob_bordered.channel(g);
                const unsigned short* kptr = (const unsigned short*)weight_data_bf16 + maxk * g;
                const Mat m = bottom_blob.channel(g);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float sum = 0.f;

                        if (bias_term)
                        {
                            sum = bias_data[g];
                        }

                        for (int y = 0; y < kernel_h; y++)
                        {
                            int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                            if (sys < 0 || sys % stride_h != 0)
                                continue;

                            int sy = sys / stride_h;
                            if (sy >= h)
                                continue;

                            const unsigned short* sptr = m.row<const unsigned short>(sy);

                            for (int x = 0; x < kernel_w; x++)
                            {
                                int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                if (sxs < 0 || sxs % stride_w != 0)
                                    continue;

                                int sx = sxs / stride_w;
                                if (sx >= w)
                                    continue;

                                float val = bfloat16_to_float32(sptr[sx]);

                                int k = y * kernel_w + x;

                                float w = bfloat16_to_float32(kptr[k]);

                                sum += val * w;
                            }
                        }

                        // inline scalar activation: 1=relu 2=leaky 3=clip 4=sigmoid
                        if (activation_type == 1)
                        {
                            sum = std::max(sum, 0.f);
                        }
                        else if (activation_type == 2)
                        {
                            float slope = activation_params[0];
                            sum = sum > 0.f ? sum : sum * slope;
                        }
                        else if (activation_type == 3)
                        {
                            float min = activation_params[0];
                            float max = activation_params[1];
                            if (sum < min)
                                sum = min;
                            if (sum > max)
                                sum = max;
                        }
                        else if (activation_type == 4)
                        {
                            sum = static_cast<float>(1.f / (1.f + exp(-sum)));
                        }

                        outptr[j] = float32_to_bfloat16(sum);
                    }

                    outptr += outw;
                }
            }
        }
    }
    else
    {
        // group deconvolution
        const int channels_g = channels * elempack / group;
        const int num_output_g = num_output / group;

        int g_elempack = (support_packing && opt.use_packing_layout && channels_g % 4 == 0) ? 4 : 1;
        int out_g_elempack = (support_packing && opt.use_packing_layout && num_output_g % 4 == 0) ? 4 : 1;

        // unpacking
        Mat bottom_blob_unpacked = bottom_blob;
        if (elempack == 4 && g_elempack == 1)
        {
            Option opt_p = opt;
            opt_p.blob_allocator = opt.workspace_allocator;
            convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_p);
        }

        Mat top_blob_bordered_unpacked = top_blob_bordered;
        if (out_g_elempack == 1 && out_elempack == 4)
        {
            top_blob_bordered_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator);
            if (top_blob_bordered_unpacked.empty())
                return -100;
        }

        for (int g = 0; g < group; g++)
        {
            const Mat bottom_blob_g = bottom_blob_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
            Mat top_blob_bordered_g = top_blob_bordered_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);

            const ncnn::Layer* op = group_ops[g];

            Option opt_g = opt;
            opt_g.blob_allocator = top_blob_bordered_unpacked.allocator;

            // forward
            op->forward(bottom_blob_g, top_blob_bordered_g, opt_g);
        }

        // packing
        if (out_g_elempack == 1 && out_elempack == 4)
        {
            convert_packing(top_blob_bordered_unpacked, top_blob_bordered, 4, opt);
        }
        else
        {
            top_blob_bordered = top_blob_bordered_unpacked;
        }
    }

    cut_padding(top_blob_bordered, top_blob, opt);
    if (top_blob.empty())
        return -100;

    return 0;
}
1227
1228 } // namespace ncnn
1229