// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "deconvolutiondepthwise_arm.h"

#include "layer_type.h"

#if __ARM_NEON
#include <arm_neon.h>
#include "neon_mathfun.h"
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#include "neon_mathfun_fp16s.h"
#endif
#endif // __ARM_NEON

#include "neon_activation.h"

namespace ncnn {

DeconvolutionDepthWise_arm::DeconvolutionDepthWise_arm()
{
#if __ARM_NEON
    support_packing = true;
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    support_fp16_storage = true;
#endif
#endif // __ARM_NEON

    support_bf16_storage = true;
}

int DeconvolutionDepthWise_arm::create_pipeline(const Option& opt)
{
    // create Deconvolution op for each group
    const int maxk = kernel_w * kernel_h;
    int channels = (weight_data_size / group) / maxk / (num_output / group) * group;

    // depth-wise
    if (channels == group && group == num_output)
    {
        int elempack = (support_packing && opt.use_packing_layout && channels % 4 == 0) ? 4 : 1;

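        // transposed convolution is evaluated as a correlation with a flipped
        // kernel, so reverse the maxk taps of every filter once here and keep
        // the forward inner loops as plain multiply-accumulate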
        Mat weight_data_transposed(weight_data.w);
        {
            float* pt = weight_data_transposed;
            const float* p = weight_data;

            for (int i = 0; i < (channels / group) * (num_output / group) * group; i++)
            {
                for (int k = 0; k < maxk; k++)
                {
                    pt[maxk - 1 - k] = p[k];
                }

                p += maxk;
                pt += maxk;
            }
        }

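        // fp16 storage: repack the flipped weights to the elempack chosen above
        // (8 with fp16 arithmetic, else 4 or 1) and cast weights and bias to fp16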
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        if (opt.use_fp16_storage)
        {
            if (opt.use_packing_layout)
            {
                elempack = opt.use_fp16_arithmetic && channels % 8 == 0 ? 8 : channels % 4 == 0 ? 4 : 1;
            }

            if (elempack == 8)
            {
                Mat weight_data_r2 = weight_data_transposed.reshape(maxk, group);
                Mat weight_data_r2_packed;
                convert_packing(weight_data_r2, weight_data_r2_packed, 8);

                ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_fp16, opt);
            }

            if (elempack == 4)
            {
                Mat weight_data_r2 = weight_data_transposed.reshape(maxk, group);
                Mat weight_data_r2_packed;
                convert_packing(weight_data_r2, weight_data_r2_packed, 4);

                ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_fp16, opt);
            }

            if (elempack == 1)
            {
                ncnn::cast_float32_to_float16(weight_data_transposed, weight_data_fp16, opt);
            }

            ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);

            return 0;
        }
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

        if (opt.use_bf16_storage)
        {
#if __ARM_NEON
            if (elempack == 4)
            {
                Mat weight_data_r2 = weight_data_transposed.reshape(maxk, group);
                convert_packing(weight_data_r2, weight_data_pack4, 4);

                ncnn::cast_float32_to_bfloat16(weight_data_pack4, weight_data_bf16, opt);
            }
#endif // __ARM_NEON

            if (elempack == 1)
            {
                ncnn::cast_float32_to_bfloat16(weight_data_transposed, weight_data_bf16, opt);
            }

            return 0;
        }

#if __ARM_NEON
        // pack4
        if (elempack == 4)
        {
            Mat weight_data_r2 = weight_data_transposed.reshape(maxk, group);
            convert_packing(weight_data_r2, weight_data_pack4, 4);
        }
#endif // __ARM_NEON

        // pack1
        if (elempack == 1)
        {
            weight_data_pack1 = weight_data_transposed;
        }
    }
    else
    {
        // group deconvolution
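        // non-depthwise grouped deconvolution is delegated to one ncnn::Deconvolution
        // sub-layer per group; the sub-layers run with zero padding because this
        // layer trims the border itself with cut_padding() after the group forwards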
        for (int i = 0; i < (int)group_ops.size(); i++)
            delete group_ops[i];

        group_ops.clear();

        const int channels_g = channels / group;
        const int num_output_g = num_output / group;

        group_ops.resize(group);

        for (int g = 0; g < group; g++)
        {
            Mat weight_data_g = weight_data.range(maxk * channels_g * num_output_g * g, maxk * channels_g * num_output_g);
            Mat bias_data_g;
            if (bias_term)
                bias_data_g = bias_data.range(num_output_g * g, num_output_g);

            ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution);

            // set param
            ncnn::ParamDict pd;
            pd.set(0, num_output_g); // num_output
            pd.set(1, kernel_w);
            pd.set(11, kernel_h);
            pd.set(2, dilation_w);
            pd.set(12, dilation_h);
            pd.set(3, stride_w);
            pd.set(13, stride_h);
            pd.set(4, 0);  // pad_w
            pd.set(14, 0); // pad_h
            pd.set(5, bias_term);
            pd.set(6, maxk * channels_g * num_output_g); // weight_data_size
            pd.set(9, activation_type);
            pd.set(10, activation_params);

            op->load_param(pd);

            // set weights
            if (bias_term)
            {
                ncnn::Mat weights[2];
                weights[0] = weight_data_g;
                weights[1] = bias_data_g;

                op->load_model(ModelBinFromMatArray(weights));
            }
            else
            {
                ncnn::Mat weights[1];
                weights[0] = weight_data_g;

                op->load_model(ModelBinFromMatArray(weights));
            }

            op->create_pipeline(opt);

            group_ops[g] = op;
        }
    }

    return 0;
}

int DeconvolutionDepthWise_arm::destroy_pipeline(const Option& opt)
{
    for (int i = 0; i < (int)group_ops.size(); i++)
    {
        group_ops[i]->destroy_pipeline(opt);
        delete group_ops[i];
    }
    group_ops.clear();

    return 0;
}

int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
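    // route 16-bit storage blobs to the fp16 or bf16 kernels up front,
    // depending on which storage option produced them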
    int elembits = bottom_blob.elembits();

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    if (opt.use_fp16_storage && elembits == 16)
    {
        if (opt.use_fp16_arithmetic)
            return forward_fp16sa(bottom_blob, top_blob, opt);
        else
            return forward_fp16s(bottom_blob, top_blob, opt);
    }
#endif

    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s(bottom_blob, top_blob, opt);

    // deconvolve with NxN kernel
    // value = value + bias

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    int outw = (w - 1) * stride_w + kernel_extent_w;
    int outh = (h - 1) * stride_h + kernel_extent_h;
    int out_elempack = (support_packing && opt.use_packing_layout && num_output % 4 == 0) ? 4 : 1;
    size_t out_elemsize = elemsize / elempack * out_elempack;

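    // when padding or a fixed output size is requested, render into a temporary
    // bordered blob and let cut_padding() crop it into top_blob at the end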
    Mat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || output_pad_right > 0 || output_pad_bottom > 0 || (output_w > 0 && output_h > 0))
    {
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator);
    }
    else
    {
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    const int maxk = kernel_w * kernel_h;

    // depth-wise
    if (channels * elempack == group && group == num_output)
    {
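        // transposed convolution evaluated as a gather over the output:
        // output pixel (i, j) and kernel tap (y, x) read input pixel (sy, sx) with
        //   sy = (i + y * dilation_h - (kernel_extent_h - 1)) / stride_h
        //   sx = (j + x * dilation_w - (kernel_extent_w - 1)) / stride_w
        // and only contribute when the division is exact and in range;
        // the weights were already flipped in create_pipeline()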
#if __ARM_NEON
        if (elempack == 4)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int g = 0; g < channels; g++)
            {
                float* outptr = top_blob_bordered.channel(g);
                const float* kptr = (const float*)weight_data_pack4 + maxk * g * 4;
                const Mat m = bottom_blob.channel(g);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float32x4_t _sum = vdupq_n_f32(0.f);

                        if (bias_term)
                        {
                            _sum = vld1q_f32((const float*)bias_data + g * 4);
                        }

                        for (int y = 0; y < kernel_h; y++)
                        {
                            int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                            if (sys < 0 || sys % stride_h != 0)
                                continue;

                            int sy = sys / stride_h;
                            if (sy >= h)
                                continue;

                            for (int x = 0; x < kernel_w; x++)
                            {
                                int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                if (sxs < 0 || sxs % stride_w != 0)
                                    continue;

                                int sx = sxs / stride_w;
                                if (sx >= w)
                                    continue;

                                const float* sptr = m.row(sy) + sx * 4;

                                float32x4_t _val = vld1q_f32(sptr);

                                int k = y * kernel_w + x;

                                float32x4_t _w = vld1q_f32(kptr + k * 4);

                                _sum = vmlaq_f32(_sum, _val, _w);
                            }
                        }

                        _sum = activation_ps(_sum, activation_type, activation_params);

                        vst1q_f32(outptr + j * 4, _sum);
                    }

                    outptr += outw * 4;
                }
            }
        }
#endif // __ARM_NEON

        if (elempack == 1)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int g = 0; g < channels; g++)
            {
                float* outptr = top_blob_bordered.channel(g);
                const float* kptr = (const float*)weight_data_pack1 + maxk * g;
                const Mat m = bottom_blob.channel(g);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float sum = 0.f;

                        if (bias_term)
                        {
                            sum = bias_data[g];
                        }

                        for (int y = 0; y < kernel_h; y++)
                        {
                            int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                            if (sys < 0 || sys % stride_h != 0)
                                continue;

                            int sy = sys / stride_h;
                            if (sy >= h)
                                continue;

                            const float* sptr = m.row(sy);

                            for (int x = 0; x < kernel_w; x++)
                            {
                                int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                if (sxs < 0 || sxs % stride_w != 0)
                                    continue;

                                int sx = sxs / stride_w;
                                if (sx >= w)
                                    continue;

                                float val = sptr[sx];

                                int k = y * kernel_w + x;

                                float w = kptr[k];

                                sum += val * w;
                            }
                        }

                        if (activation_type == 1)
                        {
                            sum = std::max(sum, 0.f);
                        }
                        else if (activation_type == 2)
                        {
                            float slope = activation_params[0];
                            sum = sum > 0.f ? sum : sum * slope;
                        }
                        else if (activation_type == 3)
                        {
                            float min = activation_params[0];
                            float max = activation_params[1];
                            if (sum < min)
                                sum = min;
                            if (sum > max)
                                sum = max;
                        }
                        else if (activation_type == 4)
                        {
                            sum = static_cast<float>(1.f / (1.f + exp(-sum)));
                        }

                        outptr[j] = sum;
                    }

                    outptr += outw;
                }
            }
        }
    }
    else
    {
        // group deconvolution
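        // unpack the input (and repack the output) when a group's channel count
        // does not allow the blob's packing, so each per-group Deconvolution
        // sub-layer sees a layout it supports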
        const int channels_g = channels * elempack / group;
        const int num_output_g = num_output / group;

        int g_elempack = (support_packing && opt.use_packing_layout && channels_g % 4 == 0) ? 4 : 1;
        int out_g_elempack = (support_packing && opt.use_packing_layout && num_output_g % 4 == 0) ? 4 : 1;

        // unpacking
        Mat bottom_blob_unpacked = bottom_blob;
        if (elempack == 4 && g_elempack == 1)
        {
            Option opt_p = opt;
            opt_p.blob_allocator = opt.workspace_allocator;
            convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_p);
        }

        Mat top_blob_bordered_unpacked = top_blob_bordered;
        if (out_g_elempack == 1 && out_elempack == 4)
        {
            top_blob_bordered_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator);
            if (top_blob_bordered_unpacked.empty())
                return -100;
        }

        for (int g = 0; g < group; g++)
        {
            const Mat bottom_blob_g = bottom_blob_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
            Mat top_blob_bordered_g = top_blob_bordered_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);

            const ncnn::Layer* op = group_ops[g];

            Option opt_g = opt;
            opt_g.blob_allocator = top_blob_bordered_unpacked.allocator;

            // forward
            op->forward(bottom_blob_g, top_blob_bordered_g, opt_g);
        }

        // packing
        if (out_g_elempack == 1 && out_elempack == 4)
        {
            convert_packing(top_blob_bordered_unpacked, top_blob_bordered, 4, opt);
        }
        else
        {
            top_blob_bordered = top_blob_bordered_unpacked;
        }
    }

    cut_padding(top_blob_bordered, top_blob, opt);
    if (top_blob.empty())
        return -100;

    return 0;
}

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
int DeconvolutionDepthWise_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    int outw = (w - 1) * stride_w + kernel_extent_w;
    int outh = (h - 1) * stride_h + kernel_extent_h;
    int out_elempack = (support_packing && opt.use_packing_layout && num_output % 4 == 0) ? 4 : 1;
    size_t out_elemsize = elemsize / elempack * out_elempack;

    Mat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || output_pad_right > 0 || output_pad_bottom > 0 || (output_w > 0 && output_h > 0))
    {
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator);
    }
    else
    {
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    const int maxk = kernel_w * kernel_h;

    // depth-wise
    if (channels * elempack == group && group == num_output)
    {
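        // fp16 storage path: values are widened to fp32 for the accumulation and
        // the activation, then narrowed back to fp16 on store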
        if (elempack == 4)
        {
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < channels; g++)
                {
                    __fp16* outptr = top_blob_bordered.channel(g);
                    const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g * 4;
                    const Mat m = bottom_blob.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            float32x4_t _sum = vdupq_n_f32(0.f);

                            if (bias_term)
                            {
                                _sum = vld1q_f32((const float*)bias_data + g * 4);
                            }

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    const __fp16* sptr = m.row<const __fp16>(sy) + sx * 4;

                                    float32x4_t _val = vcvt_f32_f16(vld1_f16(sptr));

                                    int k = y * kernel_w + x;

                                    float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr + k * 4));

                                    _sum = vfmaq_f32(_sum, _val, _w);
                                }
                            }

                            _sum = activation_ps(_sum, activation_type, activation_params);

                            vst1_f16(outptr + j * 4, vcvt_f16_f32(_sum));
                        }

                        outptr += outw * 4;
                    }
                }
            }
        }

        if (elempack == 1)
        {
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < channels; g++)
                {
                    __fp16* outptr = top_blob_bordered.channel(g);
                    const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g;
                    const Mat m = bottom_blob.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            float sum = 0.f;

                            if (bias_term)
                            {
                                sum = bias_data[g];
                            }

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                const __fp16* sptr = m.row<const __fp16>(sy);

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    float val = (float)sptr[sx];

                                    int k = y * kernel_w + x;

                                    float w = (float)kptr[k];

                                    sum += val * w;
                                }
                            }

                            sum = activation_ss(sum, activation_type, activation_params);

                            outptr[j] = (__fp16)sum;
                        }

                        outptr += outw;
                    }
                }
            }
        }
    }
    else
    {
        // group deconvolution
        const int channels_g = channels * elempack / group;
        const int num_output_g = num_output / group;

        int g_elempack = (support_packing && opt.use_packing_layout && channels_g % 4 == 0) ? 4 : 1;
        int out_g_elempack = (support_packing && opt.use_packing_layout && num_output_g % 4 == 0) ? 4 : 1;

        // unpacking
        Mat bottom_blob_unpacked = bottom_blob;
        if (elempack == 4 && g_elempack == 1)
        {
            Option opt_p = opt;
            opt_p.blob_allocator = opt.workspace_allocator;
            convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_p);
        }

        Mat top_blob_bordered_unpacked = top_blob_bordered;
        if (out_g_elempack == 1 && out_elempack == 4)
        {
            top_blob_bordered_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator);
            if (top_blob_bordered_unpacked.empty())
                return -100;
        }

        for (int g = 0; g < group; g++)
        {
            const Mat bottom_blob_g = bottom_blob_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
            Mat top_blob_bordered_g = top_blob_bordered_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);

            const ncnn::Layer* op = group_ops[g];

            Option opt_g = opt;
            opt_g.blob_allocator = top_blob_bordered_unpacked.allocator;

            // forward
            op->forward(bottom_blob_g, top_blob_bordered_g, opt_g);
        }

        // packing
        if (out_g_elempack == 1 && out_elempack == 4)
        {
            convert_packing(top_blob_bordered_unpacked, top_blob_bordered, 4, opt);
        }
        else
        {
            top_blob_bordered = top_blob_bordered_unpacked;
        }
    }

    cut_padding(top_blob_bordered, top_blob, opt);
    if (top_blob.empty())
        return -100;

    return 0;
}

int DeconvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    int outw = (w - 1) * stride_w + kernel_extent_w;
    int outh = (h - 1) * stride_h + kernel_extent_h;
    int out_elempack = 1;
    if (opt.use_packing_layout)
    {
        out_elempack = opt.use_fp16_arithmetic && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
    }
    size_t out_elemsize = elemsize / elempack * out_elempack;

    Mat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || output_pad_right > 0 || output_pad_bottom > 0 || (output_w > 0 && output_h > 0))
    {
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator);
    }
    else
    {
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    const int maxk = kernel_w * kernel_h;

    // depth-wise
    if (channels * elempack == group && group == num_output)
    {
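        // fp16 arithmetic path: bias, multiplies and accumulation all stay in fp16
        // (vfmaq_f16 / vfma_f16), with pack8 processing eight channels per vector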
        if (elempack == 8)
        {
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < channels; g++)
                {
                    __fp16* outptr = top_blob_bordered.channel(g);
                    const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g * 8;
                    const Mat m = bottom_blob.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            float16x8_t _sum = vdupq_n_f16((__fp16)0.f);

                            if (bias_term)
                            {
                                _sum = vld1q_f16((const __fp16*)bias_data_fp16 + g * 8);
                            }

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    const __fp16* sptr = m.row<const __fp16>(sy) + sx * 8;

                                    float16x8_t _val = vld1q_f16(sptr);

                                    int k = y * kernel_w + x;

                                    float16x8_t _w = vld1q_f16(kptr + k * 8);

                                    _sum = vfmaq_f16(_sum, _val, _w);
                                }
                            }

                            _sum = activation_ps(_sum, activation_type, activation_params);

                            vst1q_f16(outptr + j * 8, _sum);
                        }

                        outptr += outw * 8;
                    }
                }
            }
        }

        if (elempack == 4)
        {
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < channels; g++)
                {
                    __fp16* outptr = top_blob_bordered.channel(g);
                    const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g * 4;
                    const Mat m = bottom_blob.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            float16x4_t _sum = vdup_n_f16((__fp16)0.f);

                            if (bias_term)
                            {
                                _sum = vld1_f16((const __fp16*)bias_data_fp16 + g * 4);
                            }

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    const __fp16* sptr = m.row<const __fp16>(sy) + sx * 4;

                                    float16x4_t _val = vld1_f16(sptr);

                                    int k = y * kernel_w + x;

                                    float16x4_t _w = vld1_f16(kptr + k * 4);

                                    _sum = vfma_f16(_sum, _val, _w);
                                }
                            }

                            _sum = activation_ps(_sum, activation_type, activation_params);

                            vst1_f16(outptr + j * 4, _sum);
                        }

                        outptr += outw * 4;
                    }
                }
            }
        }

        if (elempack == 1)
        {
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < channels; g++)
                {
                    __fp16* outptr = top_blob_bordered.channel(g);
                    const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g;
                    const Mat m = bottom_blob.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            float sum = 0.f;

                            if (bias_term)
                            {
                                sum = bias_data[g];
                            }

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                const __fp16* sptr = m.row<const __fp16>(sy);

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    __fp16 val = sptr[sx];

                                    int k = y * kernel_w + x;

                                    __fp16 w = kptr[k];

                                    sum += val * w;
                                }
                            }

                            sum = activation_ss(sum, activation_type, activation_params);

                            outptr[j] = (__fp16)sum;
                        }

                        outptr += outw;
                    }
                }
            }
        }
    }
    else
    {
        // group deconvolution
        const int channels_g = channels * elempack / group;
        const int num_output_g = num_output / group;

        int g_elempack = 1;
        int out_g_elempack = 1;
        if (opt.use_packing_layout)
        {
            g_elempack = opt.use_fp16_arithmetic && channels_g % 8 == 0 ? 8 : channels_g % 4 == 0 ? 4 : 1;
            out_g_elempack = opt.use_fp16_arithmetic && num_output_g % 8 == 0 ? 8 : num_output_g % 4 == 0 ? 4 : 1;
        }

        // unpacking
        Mat bottom_blob_unpacked = bottom_blob;
        if (elempack > g_elempack)
        {
            Option opt_p = opt;
            opt_p.blob_allocator = opt.workspace_allocator;
            convert_packing(bottom_blob, bottom_blob_unpacked, g_elempack, opt_p);
        }

        Mat top_blob_bordered_unpacked = top_blob_bordered;
        if (out_g_elempack < out_elempack)
        {
            top_blob_bordered_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator);
            if (top_blob_bordered_unpacked.empty())
                return -100;
        }

        for (int g = 0; g < group; g++)
        {
            const Mat bottom_blob_g = bottom_blob_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
            Mat top_blob_bordered_g = top_blob_bordered_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);

            const ncnn::Layer* op = group_ops[g];

            Option opt_g = opt;
            opt_g.blob_allocator = top_blob_bordered_unpacked.allocator;

            // forward
            op->forward(bottom_blob_g, top_blob_bordered_g, opt_g);
        }

        // packing
        if (out_g_elempack < out_elempack)
        {
            convert_packing(top_blob_bordered_unpacked, top_blob_bordered, out_elempack, opt);
        }
        else
        {
            top_blob_bordered = top_blob_bordered_unpacked;
        }
    }

    cut_padding(top_blob_bordered, top_blob, opt);
    if (top_blob.empty())
        return -100;

    return 0;
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

int DeconvolutionDepthWise_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    int outw = (w - 1) * stride_w + kernel_extent_w;
    int outh = (h - 1) * stride_h + kernel_extent_h;
    int out_elempack = opt.use_packing_layout && num_output % 4 == 0 ? 4 : 1;
    size_t out_elemsize = elemsize / elempack * out_elempack;

    Mat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || output_pad_right > 0 || output_pad_bottom > 0 || (output_w > 0 && output_h > 0))
    {
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator);
    }
    else
    {
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    const int maxk = kernel_w * kernel_h;

    // depth-wise
    if (channels * elempack == group && group == num_output)
    {
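        // bf16 storage path: values are widened to fp32 for the accumulation and
        // the activation, then converted back to bf16 on store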
#if __ARM_NEON
        if (elempack == 4)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int g = 0; g < channels; g++)
            {
                unsigned short* outptr = top_blob_bordered.channel(g);
                const unsigned short* kptr = (const unsigned short*)weight_data_bf16 + maxk * g * 4;
                const Mat m = bottom_blob.channel(g);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float32x4_t _sum = vdupq_n_f32(0.f);

                        if (bias_term)
                        {
                            _sum = vld1q_f32((const float*)bias_data + g * 4);
                        }

                        for (int y = 0; y < kernel_h; y++)
                        {
                            int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                            if (sys < 0 || sys % stride_h != 0)
                                continue;

                            int sy = sys / stride_h;
                            if (sy >= h)
                                continue;

                            for (int x = 0; x < kernel_w; x++)
                            {
                                int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                if (sxs < 0 || sxs % stride_w != 0)
                                    continue;

                                int sx = sxs / stride_w;
                                if (sx >= w)
                                    continue;

                                const unsigned short* sptr = m.row<const unsigned short>(sy) + sx * 4;

                                float32x4_t _val = vcvt_f32_bf16(vld1_u16(sptr));

                                int k = y * kernel_w + x;

                                float32x4_t _w = vcvt_f32_bf16(vld1_u16(kptr + k * 4));

                                _sum = vmlaq_f32(_sum, _val, _w);
                            }
                        }

                        _sum = activation_ps(_sum, activation_type, activation_params);

                        vst1_u16(outptr + j * 4, vcvt_bf16_f32(_sum));
                    }

                    outptr += outw * 4;
                }
            }
        }
#endif // __ARM_NEON

        if (elempack == 1)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int g = 0; g < channels; g++)
            {
                unsigned short* outptr = top_blob_bordered.channel(g);
                const unsigned short* kptr = (const unsigned short*)weight_data_bf16 + maxk * g;
                const Mat m = bottom_blob.channel(g);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float sum = 0.f;

                        if (bias_term)
                        {
                            sum = bias_data[g];
                        }

                        for (int y = 0; y < kernel_h; y++)
                        {
                            int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                            if (sys < 0 || sys % stride_h != 0)
                                continue;

                            int sy = sys / stride_h;
                            if (sy >= h)
                                continue;

                            const unsigned short* sptr = m.row<const unsigned short>(sy);

                            for (int x = 0; x < kernel_w; x++)
                            {
                                int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                if (sxs < 0 || sxs % stride_w != 0)
                                    continue;

                                int sx = sxs / stride_w;
                                if (sx >= w)
                                    continue;

                                float val = bfloat16_to_float32(sptr[sx]);

                                int k = y * kernel_w + x;

                                float w = bfloat16_to_float32(kptr[k]);

                                sum += val * w;
                            }
                        }

                        if (activation_type == 1)
                        {
                            sum = std::max(sum, 0.f);
                        }
                        else if (activation_type == 2)
                        {
                            float slope = activation_params[0];
                            sum = sum > 0.f ? sum : sum * slope;
                        }
                        else if (activation_type == 3)
                        {
                            float min = activation_params[0];
                            float max = activation_params[1];
                            if (sum < min)
                                sum = min;
                            if (sum > max)
                                sum = max;
                        }
                        else if (activation_type == 4)
                        {
                            sum = static_cast<float>(1.f / (1.f + exp(-sum)));
                        }

                        outptr[j] = float32_to_bfloat16(sum);
                    }

                    outptr += outw;
                }
            }
        }
    }
    else
    {
        // group deconvolution
        const int channels_g = channels * elempack / group;
        const int num_output_g = num_output / group;

        int g_elempack = (support_packing && opt.use_packing_layout && channels_g % 4 == 0) ? 4 : 1;
        int out_g_elempack = (support_packing && opt.use_packing_layout && num_output_g % 4 == 0) ? 4 : 1;

        // unpacking
        Mat bottom_blob_unpacked = bottom_blob;
        if (elempack == 4 && g_elempack == 1)
        {
            Option opt_p = opt;
            opt_p.blob_allocator = opt.workspace_allocator;
            convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_p);
        }

        Mat top_blob_bordered_unpacked = top_blob_bordered;
        if (out_g_elempack == 1 && out_elempack == 4)
        {
            top_blob_bordered_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator);
            if (top_blob_bordered_unpacked.empty())
                return -100;
        }

        for (int g = 0; g < group; g++)
        {
            const Mat bottom_blob_g = bottom_blob_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
            Mat top_blob_bordered_g = top_blob_bordered_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);

            const ncnn::Layer* op = group_ops[g];

            Option opt_g = opt;
            opt_g.blob_allocator = top_blob_bordered_unpacked.allocator;

            // forward
            op->forward(bottom_blob_g, top_blob_bordered_g, opt_g);
        }

        // packing
        if (out_g_elempack == 1 && out_elempack == 4)
        {
            convert_packing(top_blob_bordered_unpacked, top_blob_bordered, 4, opt);
        }
        else
        {
            top_blob_bordered = top_blob_bordered_unpacked;
        }
    }

    cut_padding(top_blob_bordered, top_blob, opt);
    if (top_blob.empty())
        return -100;

    return 0;
}

} // namespace ncnn