// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "deconvolutiondepthwise_arm.h"

#include "layer_type.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_activation.h"

namespace ncnn {

DeconvolutionDepthWise_arm::DeconvolutionDepthWise_arm()
{
#if __ARM_NEON
    support_packing = true;
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    support_fp16_storage = true;
#endif
#endif // __ARM_NEON

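    // the bf16 kernels below keep a scalar elempack 1 path, so bf16 storage
    // should work even without NEON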
    support_bf16_storage = true;
}

int DeconvolutionDepthWise_arm::create_pipeline(const Option& opt)
{
    // create Deconvolution op for each group
    const int maxk = kernel_w * kernel_h;
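    // weight_data_size == maxk * (channels / group) * (num_output / group) * group,
    // so the total input channel count can be recovered from it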
    int channels = (weight_data_size / group) / maxk / (num_output / group) * group;

    // depth-wise
    if (channels == group && group == num_output)
    {
        int elempack = (support_packing && opt.use_packing_layout && channels % 4 == 0) ? 4 : 1;

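        // the forward pass gathers over the output as a correlation, which
        // needs the kernel taps in reverse order, so flip each maxk-tap
        // filter once here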
        Mat weight_data_transposed(weight_data.w);
        {
            float* pt = weight_data_transposed;
            const float* p = weight_data;

            for (int i = 0; i < (channels / group) * (num_output / group) * group; i++)
            {
                for (int k = 0; k < maxk; k++)
                {
                    pt[maxk - 1 - k] = p[k];
                }

                p += maxk;
                pt += maxk;
            }
        }

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        if (opt.use_fp16_storage)
        {
            if (opt.use_packing_layout)
            {
                elempack = opt.use_fp16_arithmetic && channels % 8 == 0 ? 8 : channels % 4 == 0 ? 4 : 1;
            }

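            // repack the flipped weights to the chosen elempack and store them as fp16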
            if (elempack == 8)
            {
                Mat weight_data_r2 = weight_data_transposed.reshape(maxk, group);
                Mat weight_data_r2_packed;
                convert_packing(weight_data_r2, weight_data_r2_packed, 8);

                ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_fp16, opt);
            }

            if (elempack == 4)
            {
                Mat weight_data_r2 = weight_data_transposed.reshape(maxk, group);
                Mat weight_data_r2_packed;
                convert_packing(weight_data_r2, weight_data_r2_packed, 4);

                ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_fp16, opt);
            }

            if (elempack == 1)
            {
                ncnn::cast_float32_to_float16(weight_data_transposed, weight_data_fp16, opt);
            }

            ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);

            return 0;
        }
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

        if (opt.use_bf16_storage)
        {
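            // repack (if needed) and cast the flipped fp32 weights down to bf16 storage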
#if __ARM_NEON
            if (elempack == 4)
            {
                Mat weight_data_r2 = weight_data_transposed.reshape(maxk, group);
                convert_packing(weight_data_r2, weight_data_pack4, 4);

                ncnn::cast_float32_to_bfloat16(weight_data_pack4, weight_data_bf16, opt);
            }
#endif // __ARM_NEON

            if (elempack == 1)
            {
                ncnn::cast_float32_to_bfloat16(weight_data_transposed, weight_data_bf16, opt);
            }

            return 0;
        }

#if __ARM_NEON
        // pack4
        if (elempack == 4)
        {
            Mat weight_data_r2 = weight_data_transposed.reshape(maxk, group);
            convert_packing(weight_data_r2, weight_data_pack4, 4);
        }
#endif // __ARM_NEON

        // pack1
        if (elempack == 1)
        {
            weight_data_pack1 = weight_data_transposed;
        }
    }
    else
    {
        // group deconvolution
        for (int i = 0; i < (int)group_ops.size(); i++)
            delete group_ops[i];

        group_ops.clear();

        const int channels_g = channels / group;
        const int num_output_g = num_output / group;

        group_ops.resize(group);

        for (int g = 0; g < group; g++)
        {
            Mat weight_data_g = weight_data.range(maxk * channels_g * num_output_g * g, maxk * channels_g * num_output_g);
            Mat bias_data_g;
            if (bias_term)
                bias_data_g = bias_data.range(num_output_g * g, num_output_g);

            ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution);

            // set param
            ncnn::ParamDict pd;
            pd.set(0, num_output_g); // num_output
            pd.set(1, kernel_w);
            pd.set(11, kernel_h);
            pd.set(2, dilation_w);
            pd.set(12, dilation_h);
            pd.set(3, stride_w);
            pd.set(13, stride_h);
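            // padding is applied later by cut_padding on the full bordered
            // output, so each per-group op runs unpadded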
            pd.set(4, 0);  // pad_w
            pd.set(14, 0); // pad_h
            pd.set(5, bias_term);
            pd.set(6, maxk * channels_g * num_output_g); // weight_data_size
            pd.set(9, activation_type);
            pd.set(10, activation_params);

            op->load_param(pd);

            // set weights
            if (bias_term)
            {
                ncnn::Mat weights[2];
                weights[0] = weight_data_g;
                weights[1] = bias_data_g;

                op->load_model(ModelBinFromMatArray(weights));
            }
            else
            {
                ncnn::Mat weights[1];
                weights[0] = weight_data_g;

                op->load_model(ModelBinFromMatArray(weights));
            }

            op->create_pipeline(opt);

            group_ops[g] = op;
        }
    }

    return 0;
}

int DeconvolutionDepthWise_arm::destroy_pipeline(const Option& opt)
{
    for (int i = 0; i < (int)group_ops.size(); i++)
    {
        group_ops[i]->destroy_pipeline(opt);
        delete group_ops[i];
    }
    group_ops.clear();

    return 0;
}

int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int elembits = bottom_blob.elembits();

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    if (opt.use_fp16_storage && elembits == 16)
    {
        if (opt.use_fp16_arithmetic)
            return forward_fp16sa(bottom_blob, top_blob, opt);
        else
            return forward_fp16s(bottom_blob, top_blob, opt);
    }
#endif

    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s(bottom_blob, top_blob, opt);

    // deconvolve with NxN kernel
    // value = value + bias

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    int outw = (w - 1) * stride_w + kernel_extent_w;
    int outh = (h - 1) * stride_h + kernel_extent_h;
    int out_elempack = (support_packing && opt.use_packing_layout && num_output % 4 == 0) ? 4 : 1;
    size_t out_elemsize = elemsize / elempack * out_elempack;

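    // render into a bordered workspace blob when the output must be cropped
    // afterwards; otherwise write straight into top_blob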
    Mat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || output_pad_right > 0 || output_pad_bottom > 0 || (output_w > 0 && output_h > 0))
    {
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator);
    }
    else
    {
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    const int maxk = kernel_w * kernel_h;

    // depth-wise
    if (channels * elempack == group && group == num_output)
    {
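        // gather formulation: since the kernel was flipped in create_pipeline,
        // input sample (sy, sx) reaches output (i, j) through tap (y, x) when
        //   i == sy * stride_h + (kernel_extent_h - 1) - y * dilation_h
        // (and likewise for j), so a tap contributes only when
        // sys = i + y * dilation_h - (kernel_extent_h - 1) is a non-negative
        // multiple of stride_h that lands inside the input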
#if __ARM_NEON
        if (elempack == 4)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int g = 0; g < channels; g++)
            {
                float* outptr = top_blob_bordered.channel(g);
                const float* kptr = (const float*)weight_data_pack4 + maxk * g * 4;
                const Mat m = bottom_blob.channel(g);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float32x4_t _sum = vdupq_n_f32(0.f);

                        if (bias_term)
                        {
                            _sum = vld1q_f32((const float*)bias_data + g * 4);
                        }

                        for (int y = 0; y < kernel_h; y++)
                        {
                            int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                            if (sys < 0 || sys % stride_h != 0)
                                continue;

                            int sy = sys / stride_h;
                            if (sy >= h)
                                continue;

                            for (int x = 0; x < kernel_w; x++)
                            {
                                int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                if (sxs < 0 || sxs % stride_w != 0)
                                    continue;

                                int sx = sxs / stride_w;
                                if (sx >= w)
                                    continue;

                                const float* sptr = m.row(sy) + sx * 4;

                                float32x4_t _val = vld1q_f32(sptr);

                                int k = y * kernel_w + x;

                                float32x4_t _w = vld1q_f32(kptr + k * 4);

                                _sum = vmlaq_f32(_sum, _val, _w);
                            }
                        }

                        _sum = activation_ps(_sum, activation_type, activation_params);

                        vst1q_f32(outptr + j * 4, _sum);
                    }

                    outptr += outw * 4;
                }
            }
        }
#endif // __ARM_NEON

        if (elempack == 1)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int g = 0; g < channels; g++)
            {
                float* outptr = top_blob_bordered.channel(g);
                const float* kptr = (const float*)weight_data_pack1 + maxk * g;
                const Mat m = bottom_blob.channel(g);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float sum = 0.f;

                        if (bias_term)
                        {
                            sum = bias_data[g];
                        }

                        for (int y = 0; y < kernel_h; y++)
                        {
                            int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                            if (sys < 0 || sys % stride_h != 0)
                                continue;

                            int sy = sys / stride_h;
                            if (sy >= h)
                                continue;

                            const float* sptr = m.row(sy);

                            for (int x = 0; x < kernel_w; x++)
                            {
                                int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                if (sxs < 0 || sxs % stride_w != 0)
                                    continue;

                                int sx = sxs / stride_w;
                                if (sx >= w)
                                    continue;

                                float val = sptr[sx];

                                int k = y * kernel_w + x;

                                float w = kptr[k];

                                sum += val * w;
                            }
                        }

                        sum = activation_ss(sum, activation_type, activation_params);

                        outptr[j] = sum;
                    }

                    outptr += outw;
                }
            }
        }
    }
    else
    {
        // group deconvolution
        const int channels_g = channels * elempack / group;
        const int num_output_g = num_output / group;

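        // a group's per-group channel count may not match the blob's elempack;
        // if so, run the group at the smaller packing and convert around it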
        int g_elempack = (support_packing && opt.use_packing_layout && channels_g % 4 == 0) ? 4 : 1;
        int out_g_elempack = (support_packing && opt.use_packing_layout && num_output_g % 4 == 0) ? 4 : 1;

        // unpacking
        Mat bottom_blob_unpacked = bottom_blob;
        if (elempack == 4 && g_elempack == 1)
        {
            Option opt_p = opt;
            opt_p.blob_allocator = opt.workspace_allocator;
            convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_p);
        }

        Mat top_blob_bordered_unpacked = top_blob_bordered;
        if (out_g_elempack == 1 && out_elempack == 4)
        {
            top_blob_bordered_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator);
            if (top_blob_bordered_unpacked.empty())
                return -100;
        }

        for (int g = 0; g < group; g++)
        {
            const Mat bottom_blob_g = bottom_blob_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
            Mat top_blob_bordered_g = top_blob_bordered_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);

            const ncnn::Layer* op = group_ops[g];

            Option opt_g = opt;
            opt_g.blob_allocator = top_blob_bordered_unpacked.allocator;

            // forward
            op->forward(bottom_blob_g, top_blob_bordered_g, opt_g);
        }

        // packing
        if (out_g_elempack == 1 && out_elempack == 4)
        {
            convert_packing(top_blob_bordered_unpacked, top_blob_bordered, 4, opt);
        }
        else
        {
            top_blob_bordered = top_blob_bordered_unpacked;
        }
    }

    cut_padding(top_blob_bordered, top_blob, opt);
    if (top_blob.empty())
        return -100;

    return 0;
}

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
int DeconvolutionDepthWise_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    int outw = (w - 1) * stride_w + kernel_extent_w;
    int outh = (h - 1) * stride_h + kernel_extent_h;
    int out_elempack = (support_packing && opt.use_packing_layout && num_output % 4 == 0) ? 4 : 1;
    size_t out_elemsize = elemsize / elempack * out_elempack;

    Mat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || output_pad_right > 0 || output_pad_bottom > 0 || (output_w > 0 && output_h > 0))
    {
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator);
    }
    else
    {
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    const int maxk = kernel_w * kernel_h;

    // depth-wise
    if (channels * elempack == group && group == num_output)
    {
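        // fp16 storage with fp32 arithmetic: widen each load to fp32,
        // accumulate in fp32, then narrow back to fp16 on store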
        if (elempack == 4)
        {
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < channels; g++)
                {
                    __fp16* outptr = top_blob_bordered.channel(g);
                    const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g * 4;
                    const Mat m = bottom_blob.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            float32x4_t _sum = vdupq_n_f32(0.f);

                            if (bias_term)
                            {
                                _sum = vld1q_f32((const float*)bias_data + g * 4);
                            }

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    const __fp16* sptr = m.row<const __fp16>(sy) + sx * 4;

                                    float32x4_t _val = vcvt_f32_f16(vld1_f16(sptr));

                                    int k = y * kernel_w + x;

                                    float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr + k * 4));

                                    _sum = vfmaq_f32(_sum, _val, _w);
                                }
                            }

                            _sum = activation_ps(_sum, activation_type, activation_params);

                            vst1_f16(outptr + j * 4, vcvt_f16_f32(_sum));
                        }

                        outptr += outw * 4;
                    }
                }
            }
        }

        if (elempack == 1)
        {
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < channels; g++)
                {
                    __fp16* outptr = top_blob_bordered.channel(g);
                    const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g;
                    const Mat m = bottom_blob.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            float sum = 0.f;

                            if (bias_term)
                            {
                                sum = bias_data[g];
                            }

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                const __fp16* sptr = m.row<const __fp16>(sy);

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    float val = (float)sptr[sx];

                                    int k = y * kernel_w + x;

                                    float w = (float)kptr[k];

                                    sum += val * w;
                                }
                            }

                            sum = activation_ss(sum, activation_type, activation_params);

                            outptr[j] = (__fp16)sum;
                        }

                        outptr += outw;
                    }
                }
            }
        }
    }
    else
    {
        // group deconvolution
        const int channels_g = channels * elempack / group;
        const int num_output_g = num_output / group;

        int g_elempack = (support_packing && opt.use_packing_layout && channels_g % 4 == 0) ? 4 : 1;
        int out_g_elempack = (support_packing && opt.use_packing_layout && num_output_g % 4 == 0) ? 4 : 1;

        // unpacking
        Mat bottom_blob_unpacked = bottom_blob;
        if (elempack == 4 && g_elempack == 1)
        {
            Option opt_p = opt;
            opt_p.blob_allocator = opt.workspace_allocator;
            convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_p);
        }

        Mat top_blob_bordered_unpacked = top_blob_bordered;
        if (out_g_elempack == 1 && out_elempack == 4)
        {
            top_blob_bordered_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator);
            if (top_blob_bordered_unpacked.empty())
                return -100;
        }

        for (int g = 0; g < group; g++)
        {
            const Mat bottom_blob_g = bottom_blob_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
            Mat top_blob_bordered_g = top_blob_bordered_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);

            const ncnn::Layer* op = group_ops[g];

            Option opt_g = opt;
            opt_g.blob_allocator = top_blob_bordered_unpacked.allocator;

            // forward
            op->forward(bottom_blob_g, top_blob_bordered_g, opt_g);
        }

        // packing
        if (out_g_elempack == 1 && out_elempack == 4)
        {
            convert_packing(top_blob_bordered_unpacked, top_blob_bordered, 4, opt);
        }
        else
        {
            top_blob_bordered = top_blob_bordered_unpacked;
        }
    }

    cut_padding(top_blob_bordered, top_blob, opt);
    if (top_blob.empty())
        return -100;

    return 0;
}

int DeconvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    int outw = (w - 1) * stride_w + kernel_extent_w;
    int outh = (h - 1) * stride_h + kernel_extent_h;
    int out_elempack = 1;
    if (opt.use_packing_layout)
    {
        out_elempack = opt.use_fp16_arithmetic && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
    }
    size_t out_elemsize = elemsize / elempack * out_elempack;

    Mat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || output_pad_right > 0 || output_pad_bottom > 0 || (output_w > 0 && output_h > 0))
    {
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator);
    }
    else
    {
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    const int maxk = kernel_w * kernel_h;

    // depth-wise
    if (channels * elempack == group && group == num_output)
    {
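        // fp16 arithmetic: accumulate directly in fp16 vectors (8- or 4-wide);
        // the scalar elempack 1 path still accumulates in fp32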
        if (elempack == 8)
        {
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < channels; g++)
                {
                    __fp16* outptr = top_blob_bordered.channel(g);
                    const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g * 8;
                    const Mat m = bottom_blob.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            float16x8_t _sum = vdupq_n_f16((__fp16)0.f);

                            if (bias_term)
                            {
                                _sum = vld1q_f16((const __fp16*)bias_data_fp16 + g * 8);
                            }

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    const __fp16* sptr = m.row<const __fp16>(sy) + sx * 8;

                                    float16x8_t _val = vld1q_f16(sptr);

                                    int k = y * kernel_w + x;

                                    float16x8_t _w = vld1q_f16(kptr + k * 8);

                                    _sum = vfmaq_f16(_sum, _val, _w);
                                }
                            }

                            _sum = activation_ps(_sum, activation_type, activation_params);

                            vst1q_f16(outptr + j * 8, _sum);
                        }

                        outptr += outw * 8;
                    }
                }
            }
        }

        if (elempack == 4)
        {
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < channels; g++)
                {
                    __fp16* outptr = top_blob_bordered.channel(g);
                    const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g * 4;
                    const Mat m = bottom_blob.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            float16x4_t _sum = vdup_n_f16((__fp16)0.f);

                            if (bias_term)
                            {
                                _sum = vld1_f16((const __fp16*)bias_data_fp16 + g * 4);
                            }

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    const __fp16* sptr = m.row<const __fp16>(sy) + sx * 4;

                                    float16x4_t _val = vld1_f16(sptr);

                                    int k = y * kernel_w + x;

                                    float16x4_t _w = vld1_f16(kptr + k * 4);

                                    _sum = vfma_f16(_sum, _val, _w);
                                }
                            }

                            _sum = activation_ps(_sum, activation_type, activation_params);

                            vst1_f16(outptr + j * 4, _sum);
                        }

                        outptr += outw * 4;
                    }
                }
            }
        }

        if (elempack == 1)
        {
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < channels; g++)
                {
                    __fp16* outptr = top_blob_bordered.channel(g);
                    const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g;
                    const Mat m = bottom_blob.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            float sum = 0.f;

                            if (bias_term)
                            {
                                sum = bias_data[g];
                            }

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                const __fp16* sptr = m.row<const __fp16>(sy);

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    __fp16 val = sptr[sx];

                                    int k = y * kernel_w + x;

                                    __fp16 w = kptr[k];

                                    sum += val * w;
                                }
                            }

                            sum = activation_ss(sum, activation_type, activation_params);

                            outptr[j] = (__fp16)sum;
                        }

                        outptr += outw;
                    }
                }
            }
        }
    }
    else
    {
        // group deconvolution
        const int channels_g = channels * elempack / group;
        const int num_output_g = num_output / group;

        int g_elempack = 1;
        int out_g_elempack = 1;
        if (opt.use_packing_layout)
        {
            g_elempack = opt.use_fp16_arithmetic && channels_g % 8 == 0 ? 8 : channels_g % 4 == 0 ? 4 : 1;
            out_g_elempack = opt.use_fp16_arithmetic && num_output_g % 8 == 0 ? 8 : num_output_g % 4 == 0 ? 4 : 1;
        }

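        // fp16 arithmetic allows 8-wide packing, so a group may need
        // repacking down to 4 or 1 before running its Deconvolution op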
        // unpacking
        Mat bottom_blob_unpacked = bottom_blob;
        if (elempack > g_elempack)
        {
            Option opt_p = opt;
            opt_p.blob_allocator = opt.workspace_allocator;
            convert_packing(bottom_blob, bottom_blob_unpacked, g_elempack, opt_p);
        }

        Mat top_blob_bordered_unpacked = top_blob_bordered;
        if (out_g_elempack < out_elempack)
        {
            top_blob_bordered_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator);
            if (top_blob_bordered_unpacked.empty())
                return -100;
        }

        for (int g = 0; g < group; g++)
        {
            const Mat bottom_blob_g = bottom_blob_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
            Mat top_blob_bordered_g = top_blob_bordered_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);

            const ncnn::Layer* op = group_ops[g];

            Option opt_g = opt;
            opt_g.blob_allocator = top_blob_bordered_unpacked.allocator;

            // forward
            op->forward(bottom_blob_g, top_blob_bordered_g, opt_g);
        }

        // packing
        if (out_g_elempack < out_elempack)
        {
            convert_packing(top_blob_bordered_unpacked, top_blob_bordered, out_elempack, opt);
        }
        else
        {
            top_blob_bordered = top_blob_bordered_unpacked;
        }
    }

    cut_padding(top_blob_bordered, top_blob, opt);
    if (top_blob.empty())
        return -100;

    return 0;
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

int DeconvolutionDepthWise_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    int outw = (w - 1) * stride_w + kernel_extent_w;
    int outh = (h - 1) * stride_h + kernel_extent_h;
    int out_elempack = opt.use_packing_layout && num_output % 4 == 0 ? 4 : 1;
    size_t out_elemsize = elemsize / elempack * out_elempack;

    Mat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || output_pad_right > 0 || output_pad_bottom > 0 || (output_w > 0 && output_h > 0))
    {
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator);
    }
    else
    {
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    const int maxk = kernel_w * kernel_h;

    // depth-wise
    if (channels * elempack == group && group == num_output)
    {
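        // bf16 storage: widen loads to fp32, accumulate in fp32, and narrow
        // the result back to bf16 on store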
#if __ARM_NEON
        if (elempack == 4)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int g = 0; g < channels; g++)
            {
                unsigned short* outptr = top_blob_bordered.channel(g);
                const unsigned short* kptr = (const unsigned short*)weight_data_bf16 + maxk * g * 4;
                const Mat m = bottom_blob.channel(g);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float32x4_t _sum = vdupq_n_f32(0.f);

                        if (bias_term)
                        {
                            _sum = vld1q_f32((const float*)bias_data + g * 4);
                        }

                        for (int y = 0; y < kernel_h; y++)
                        {
                            int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                            if (sys < 0 || sys % stride_h != 0)
                                continue;

                            int sy = sys / stride_h;
                            if (sy >= h)
                                continue;

                            for (int x = 0; x < kernel_w; x++)
                            {
                                int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                if (sxs < 0 || sxs % stride_w != 0)
                                    continue;

                                int sx = sxs / stride_w;
                                if (sx >= w)
                                    continue;

                                const unsigned short* sptr = m.row<const unsigned short>(sy) + sx * 4;

                                float32x4_t _val = vcvt_f32_bf16(vld1_u16(sptr));

                                int k = y * kernel_w + x;

                                float32x4_t _w = vcvt_f32_bf16(vld1_u16(kptr + k * 4));

                                _sum = vmlaq_f32(_sum, _val, _w);
                            }
                        }

                        _sum = activation_ps(_sum, activation_type, activation_params);

                        vst1_u16(outptr + j * 4, vcvt_bf16_f32(_sum));
                    }

                    outptr += outw * 4;
                }
            }
        }
#endif // __ARM_NEON

        if (elempack == 1)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int g = 0; g < channels; g++)
            {
                unsigned short* outptr = top_blob_bordered.channel(g);
                const unsigned short* kptr = (const unsigned short*)weight_data_bf16 + maxk * g;
                const Mat m = bottom_blob.channel(g);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float sum = 0.f;

                        if (bias_term)
                        {
                            sum = bias_data[g];
                        }

                        for (int y = 0; y < kernel_h; y++)
                        {
                            int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                            if (sys < 0 || sys % stride_h != 0)
                                continue;

                            int sy = sys / stride_h;
                            if (sy >= h)
                                continue;

                            const unsigned short* sptr = m.row<const unsigned short>(sy);

                            for (int x = 0; x < kernel_w; x++)
                            {
                                int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                if (sxs < 0 || sxs % stride_w != 0)
                                    continue;

                                int sx = sxs / stride_w;
                                if (sx >= w)
                                    continue;

                                float val = bfloat16_to_float32(sptr[sx]);

                                int k = y * kernel_w + x;

                                float w = bfloat16_to_float32(kptr[k]);

                                sum += val * w;
                            }
                        }

                        sum = activation_ss(sum, activation_type, activation_params);

                        outptr[j] = float32_to_bfloat16(sum);
                    }

                    outptr += outw;
                }
            }
        }
    }
    else
    {
        // group deconvolution
        const int channels_g = channels * elempack / group;
        const int num_output_g = num_output / group;

        int g_elempack = (support_packing && opt.use_packing_layout && channels_g % 4 == 0) ? 4 : 1;
        int out_g_elempack = (support_packing && opt.use_packing_layout && num_output_g % 4 == 0) ? 4 : 1;

        // unpacking
        Mat bottom_blob_unpacked = bottom_blob;
        if (elempack == 4 && g_elempack == 1)
        {
            Option opt_p = opt;
            opt_p.blob_allocator = opt.workspace_allocator;
            convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_p);
        }

        Mat top_blob_bordered_unpacked = top_blob_bordered;
        if (out_g_elempack == 1 && out_elempack == 4)
        {
            top_blob_bordered_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator);
            if (top_blob_bordered_unpacked.empty())
                return -100;
        }

        for (int g = 0; g < group; g++)
        {
            const Mat bottom_blob_g = bottom_blob_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
            Mat top_blob_bordered_g = top_blob_bordered_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);

            const ncnn::Layer* op = group_ops[g];

            Option opt_g = opt;
            opt_g.blob_allocator = top_blob_bordered_unpacked.allocator;

            // forward
            op->forward(bottom_blob_g, top_blob_bordered_g, opt_g);
        }

        // packing
        if (out_g_elempack == 1 && out_elempack == 4)
        {
            convert_packing(top_blob_bordered_unpacked, top_blob_bordered, 4, opt);
        }
        else
        {
            top_blob_bordered = top_blob_bordered_unpacked;
        }
    }

    cut_padding(top_blob_bordered, top_blob, opt);
    if (top_blob.empty())
        return -100;

    return 0;
}

} // namespace ncnn