// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "deconvolutiondepthwise_arm.h"

#include "layer_type.h"

#include <algorithm>
#include <math.h>

#if __ARM_NEON
#include <arm_neon.h>
#include "neon_mathfun.h"
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#include "neon_mathfun_fp16s.h"
#endif
#endif // __ARM_NEON

#include "neon_activation.h"

namespace ncnn {

DeconvolutionDepthWise_arm::DeconvolutionDepthWise_arm()
{
#if __ARM_NEON
    support_packing = true;
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    support_fp16_storage = true;
#endif
#endif // __ARM_NEON

    support_bf16_storage = true;
}

int DeconvolutionDepthWise_arm::create_pipeline(const Option& opt)
{
    // create Deconvolution op for each group
    const int maxk = kernel_w * kernel_h;
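    // the flattened weight layout is maxk * (channels / group) * (num_output / group) * group,
    // so the total input channel count can be recovered from weight_data_size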
    int channels = (weight_data_size / group) / maxk / (num_output / group) * group;

    // depth-wise
    if (channels == group && group == num_output)
    {
        int elempack = (support_packing && opt.use_packing_layout && channels % 4 == 0) ? 4 : 1;

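        // deconvolution walks the kernel in the opposite direction to convolution,
        // so flip each maxk-sized kernel window once up front; the forward pass can
        // then gather input taps in plain convolution order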
        Mat weight_data_transposed(weight_data.w);
        {
            float* pt = weight_data_transposed;
            const float* p = weight_data;

            for (int i = 0; i < (channels / group) * (num_output / group) * group; i++)
            {
                for (int k = 0; k < maxk; k++)
                {
                    pt[maxk - 1 - k] = p[k];
                }

                p += maxk;
                pt += maxk;
            }
        }

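        // fp16 storage: repack the flipped weights to the chosen elempack and cast
        // them to fp16 once at pipeline-creation time; pack8 only pays off when
        // fp16 arithmetic is available, otherwise pack4/pack1 keeps fp32 math usable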
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        if (opt.use_fp16_storage)
        {
            if (opt.use_packing_layout)
            {
                elempack = opt.use_fp16_arithmetic && channels % 8 == 0 ? 8 : channels % 4 == 0 ? 4 : 1;
            }

            if (elempack == 8)
            {
                Mat weight_data_r2 = weight_data_transposed.reshape(maxk, group);
                Mat weight_data_r2_packed;
                convert_packing(weight_data_r2, weight_data_r2_packed, 8);

                ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_fp16, opt);
            }

            if (elempack == 4)
            {
                Mat weight_data_r2 = weight_data_transposed.reshape(maxk, group);
                Mat weight_data_r2_packed;
                convert_packing(weight_data_r2, weight_data_r2_packed, 4);

                ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_fp16, opt);
            }

            if (elempack == 1)
            {
                ncnn::cast_float32_to_float16(weight_data_transposed, weight_data_fp16, opt);
            }

            ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);

            return 0;
        }
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

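        // bf16 storage: bfloat16 keeps the upper 16 bits of an fp32 bit pattern,
        // so the casts below are cheap truncations of the already-flipped weights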
        if (opt.use_bf16_storage)
        {
#if __ARM_NEON
            if (elempack == 4)
            {
                Mat weight_data_r2 = weight_data_transposed.reshape(maxk, group);
                convert_packing(weight_data_r2, weight_data_pack4, 4);

                ncnn::cast_float32_to_bfloat16(weight_data_pack4, weight_data_bf16, opt);
            }
#endif // __ARM_NEON

            if (elempack == 1)
            {
                ncnn::cast_float32_to_bfloat16(weight_data_transposed, weight_data_bf16, opt);
            }

            return 0;
        }

#if __ARM_NEON
        // pack4
        if (elempack == 4)
        {
            Mat weight_data_r2 = weight_data_transposed.reshape(maxk, group);
            convert_packing(weight_data_r2, weight_data_pack4, 4);
        }
#endif // __ARM_NEON

        // pack1
        if (elempack == 1)
        {
            weight_data_pack1 = weight_data_transposed;
        }
    }
    else
    {
        // group deconvolution
        for (int i = 0; i < (int)group_ops.size(); i++)
            delete group_ops[i];

        group_ops.clear();

        const int channels_g = channels / group;
        const int num_output_g = num_output / group;

        group_ops.resize(group);

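        // each group becomes an independent ncnn::Deconvolution layer over its own
        // weight and bias slice; padding stays with this outer layer, which is why
        // the per-group ops are created with pad_w = pad_h = 0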
        for (int g = 0; g < group; g++)
        {
            Mat weight_data_g = weight_data.range(maxk * channels_g * num_output_g * g, maxk * channels_g * num_output_g);
            Mat bias_data_g;
            if (bias_term)
                bias_data_g = bias_data.range(num_output_g * g, num_output_g);

            ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution);

            // set param
            ncnn::ParamDict pd;
            pd.set(0, num_output_g); // num_output
            pd.set(1, kernel_w);
            pd.set(11, kernel_h);
            pd.set(2, dilation_w);
            pd.set(12, dilation_h);
            pd.set(3, stride_w);
            pd.set(13, stride_h);
            pd.set(4, 0);  // pad_w
            pd.set(14, 0); // pad_h
            pd.set(5, bias_term);
            pd.set(6, maxk * channels_g * num_output_g); // weight_data_size
            pd.set(9, activation_type);
            pd.set(10, activation_params);

            op->load_param(pd);

            // set weights
            if (bias_term)
            {
                ncnn::Mat weights[2];
                weights[0] = weight_data_g;
                weights[1] = bias_data_g;

                op->load_model(ModelBinFromMatArray(weights));
            }
            else
            {
                ncnn::Mat weights[1];
                weights[0] = weight_data_g;

                op->load_model(ModelBinFromMatArray(weights));
            }

            op->create_pipeline(opt);

            group_ops[g] = op;
        }
    }

    return 0;
}

int DeconvolutionDepthWise_arm::destroy_pipeline(const Option& opt)
{
    for (int i = 0; i < (int)group_ops.size(); i++)
    {
        group_ops[i]->destroy_pipeline(opt);
        delete group_ops[i];
    }
    group_ops.clear();

    return 0;
}

int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int elembits = bottom_blob.elembits();

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    if (opt.use_fp16_storage && elembits == 16)
    {
        if (opt.use_fp16_arithmetic)
            return forward_fp16sa(bottom_blob, top_blob, opt);
        else
            return forward_fp16s(bottom_blob, top_blob, opt);
    }
#endif

    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s(bottom_blob, top_blob, opt);

    // deconvolve with the flipped NxN kernel
    // value = value + bias

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    int outw = (w - 1) * stride_w + kernel_extent_w;
    int outh = (h - 1) * stride_h + kernel_extent_h;
    int out_elempack = (support_packing && opt.use_packing_layout && num_output % 4 == 0) ? 4 : 1;
    size_t out_elemsize = elemsize / elempack * out_elempack;

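    // e.g. w = 4, stride_w = 2, kernel_w = 3, dilation_w = 1
    // gives outw = (4 - 1) * 2 + 3 = 9 before any padding is cut
    //
    // render into a bordered blob when padding or a fixed output size is requested;
    // cut_padding() crops it into top_blob at the end of forward()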
    Mat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || output_pad_right > 0 || output_pad_bottom > 0 || (output_w > 0 && output_h > 0))
    {
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator);
    }
    else
    {
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    const int maxk = kernel_w * kernel_h;

    // depth-wise
    if (channels * elempack == group && group == num_output)
    {
#if __ARM_NEON
        if (elempack == 4)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int g = 0; g < channels; g++)
            {
                float* outptr = top_blob_bordered.channel(g);
                const float* kptr = (const float*)weight_data_pack4 + maxk * g * 4;
                const Mat m = bottom_blob.channel(g);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float32x4_t _sum = vdupq_n_f32(0.f);

                        if (bias_term)
                        {
                            _sum = vld1q_f32((const float*)bias_data + g * 4);
                        }

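                        // gather-style deconvolution: for output pixel (i, j), keep
                        // only kernel taps whose source index lands on a whole stride
                        // step inside the input, i.e. sys = i + y * dilation_h - (kernel_extent_h - 1)
                        // must be non-negative and divisible by stride_h (same for sxs)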
                        for (int y = 0; y < kernel_h; y++)
                        {
                            int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                            if (sys < 0 || sys % stride_h != 0)
                                continue;

                            int sy = sys / stride_h;
                            if (sy >= h)
                                continue;

                            for (int x = 0; x < kernel_w; x++)
                            {
                                int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                if (sxs < 0 || sxs % stride_w != 0)
                                    continue;

                                int sx = sxs / stride_w;
                                if (sx >= w)
                                    continue;

                                const float* sptr = m.row(sy) + sx * 4;

                                float32x4_t _val = vld1q_f32(sptr);

                                int k = y * kernel_w + x;

                                float32x4_t _w = vld1q_f32(kptr + k * 4);

                                _sum = vmlaq_f32(_sum, _val, _w);
                            }
                        }

                        _sum = activation_ps(_sum, activation_type, activation_params);

                        vst1q_f32(outptr + j * 4, _sum);
                    }

                    outptr += outw * 4;
                }
            }
        }
#endif // __ARM_NEON

        if (elempack == 1)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int g = 0; g < channels; g++)
            {
                float* outptr = top_blob_bordered.channel(g);
                const float* kptr = (const float*)weight_data_pack1 + maxk * g;
                const Mat m = bottom_blob.channel(g);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float sum = 0.f;

                        if (bias_term)
                        {
                            sum = bias_data[g];
                        }

                        for (int y = 0; y < kernel_h; y++)
                        {
                            int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                            if (sys < 0 || sys % stride_h != 0)
                                continue;

                            int sy = sys / stride_h;
                            if (sy >= h)
                                continue;

                            const float* sptr = m.row(sy);

                            for (int x = 0; x < kernel_w; x++)
                            {
                                int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                if (sxs < 0 || sxs % stride_w != 0)
                                    continue;

                                int sx = sxs / stride_w;
                                if (sx >= w)
                                    continue;

                                float val = sptr[sx];

                                int k = y * kernel_w + x;

                                float w = kptr[k];

                                sum += val * w;
                            }
                        }

                        if (activation_type == 1)
                        {
                            sum = std::max(sum, 0.f);
                        }
                        else if (activation_type == 2)
                        {
                            float slope = activation_params[0];
                            sum = sum > 0.f ? sum : sum * slope;
                        }
                        else if (activation_type == 3)
                        {
                            float min = activation_params[0];
                            float max = activation_params[1];
                            if (sum < min)
                                sum = min;
                            if (sum > max)
                                sum = max;
                        }
                        else if (activation_type == 4)
                        {
                            sum = static_cast<float>(1.f / (1.f + exp(-sum)));
                        }

                        outptr[j] = sum;
                    }

                    outptr += outw;
                }
            }
        }
    }
    else
    {
        // group deconvolution
        const int channels_g = channels * elempack / group;
        const int num_output_g = num_output / group;

        int g_elempack = (support_packing && opt.use_packing_layout && channels_g % 4 == 0) ? 4 : 1;
        int out_g_elempack = (support_packing && opt.use_packing_layout && num_output_g % 4 == 0) ? 4 : 1;

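        // per-group channel slices must start on an element-pack boundary; when the
        // group boundaries do not line up with pack4, fall back to pack1 views on
        // both the input and output side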
        // unpacking
        Mat bottom_blob_unpacked = bottom_blob;
        if (elempack == 4 && g_elempack == 1)
        {
            Option opt_p = opt;
            opt_p.blob_allocator = opt.workspace_allocator;
            convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_p);
        }

        Mat top_blob_bordered_unpacked = top_blob_bordered;
        if (out_g_elempack == 1 && out_elempack == 4)
        {
            top_blob_bordered_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator);
            if (top_blob_bordered_unpacked.empty())
                return -100;
        }

        for (int g = 0; g < group; g++)
        {
            const Mat bottom_blob_g = bottom_blob_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
            Mat top_blob_bordered_g = top_blob_bordered_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);

            const ncnn::Layer* op = group_ops[g];

            Option opt_g = opt;
            opt_g.blob_allocator = top_blob_bordered_unpacked.allocator;

            // forward
            op->forward(bottom_blob_g, top_blob_bordered_g, opt_g);
        }

        // packing
        if (out_g_elempack == 1 && out_elempack == 4)
        {
            convert_packing(top_blob_bordered_unpacked, top_blob_bordered, 4, opt);
        }
        else
        {
            top_blob_bordered = top_blob_bordered_unpacked;
        }
    }

    cut_padding(top_blob_bordered, top_blob, opt);
    if (top_blob.empty())
        return -100;

    return 0;
}

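// fp16s stores weights and blobs as fp16 but accumulates in fp32;
// fp16sa (below) additionally computes in fp16 and can use pack8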
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
int DeconvolutionDepthWise_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    int outw = (w - 1) * stride_w + kernel_extent_w;
    int outh = (h - 1) * stride_h + kernel_extent_h;
    int out_elempack = (support_packing && opt.use_packing_layout && num_output % 4 == 0) ? 4 : 1;
    size_t out_elemsize = elemsize / elempack * out_elempack;

    Mat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || output_pad_right > 0 || output_pad_bottom > 0 || (output_w > 0 && output_h > 0))
    {
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator);
    }
    else
    {
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    const int maxk = kernel_w * kernel_h;

    // depth-wise
    if (channels * elempack == group && group == num_output)
    {
        if (elempack == 4)
        {
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < channels; g++)
                {
                    __fp16* outptr = top_blob_bordered.channel(g);
                    const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g * 4;
                    const Mat m = bottom_blob.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            float32x4_t _sum = vdupq_n_f32(0.f);

                            if (bias_term)
                            {
                                _sum = vld1q_f32((const float*)bias_data + g * 4);
                            }

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    const __fp16* sptr = m.row<const __fp16>(sy) + sx * 4;

                                    float32x4_t _val = vcvt_f32_f16(vld1_f16(sptr));

                                    int k = y * kernel_w + x;

                                    float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr + k * 4));

                                    _sum = vfmaq_f32(_sum, _val, _w);
                                }
                            }

                            _sum = activation_ps(_sum, activation_type, activation_params);

                            vst1_f16(outptr + j * 4, vcvt_f16_f32(_sum));
                        }

                        outptr += outw * 4;
                    }
                }
            }
        }

        if (elempack == 1)
        {
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < channels; g++)
                {
                    __fp16* outptr = top_blob_bordered.channel(g);
                    const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g;
                    const Mat m = bottom_blob.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            float sum = 0.f;

                            if (bias_term)
                            {
                                sum = bias_data[g];
                            }

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                const __fp16* sptr = m.row<const __fp16>(sy);

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    float val = (float)sptr[sx];

                                    int k = y * kernel_w + x;

                                    float w = (float)kptr[k];

                                    sum += val * w;
                                }
                            }

                            sum = activation_ss(sum, activation_type, activation_params);

                            outptr[j] = (__fp16)sum;
                        }

                        outptr += outw;
                    }
                }
            }
        }
    }
    else
    {
        // group deconvolution
        const int channels_g = channels * elempack / group;
        const int num_output_g = num_output / group;

        int g_elempack = (support_packing && opt.use_packing_layout && channels_g % 4 == 0) ? 4 : 1;
        int out_g_elempack = (support_packing && opt.use_packing_layout && num_output_g % 4 == 0) ? 4 : 1;

        // unpacking
        Mat bottom_blob_unpacked = bottom_blob;
        if (elempack == 4 && g_elempack == 1)
        {
            Option opt_p = opt;
            opt_p.blob_allocator = opt.workspace_allocator;
            convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_p);
        }

        Mat top_blob_bordered_unpacked = top_blob_bordered;
        if (out_g_elempack == 1 && out_elempack == 4)
        {
            top_blob_bordered_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator);
            if (top_blob_bordered_unpacked.empty())
                return -100;
        }

        for (int g = 0; g < group; g++)
        {
            const Mat bottom_blob_g = bottom_blob_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
            Mat top_blob_bordered_g = top_blob_bordered_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);

            const ncnn::Layer* op = group_ops[g];

            Option opt_g = opt;
            opt_g.blob_allocator = top_blob_bordered_unpacked.allocator;

            // forward
            op->forward(bottom_blob_g, top_blob_bordered_g, opt_g);
        }

        // packing
        if (out_g_elempack == 1 && out_elempack == 4)
        {
            convert_packing(top_blob_bordered_unpacked, top_blob_bordered, 4, opt);
        }
        else
        {
            top_blob_bordered = top_blob_bordered_unpacked;
        }
    }

    cut_padding(top_blob_bordered, top_blob, opt);
    if (top_blob.empty())
        return -100;

    return 0;
}

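// fp16 storage + fp16 arithmetic: sums are accumulated directly in
// float16x8_t / float16x4_t, trading a little precision for twice the
// lanes per vector multiply-accumulate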
int DeconvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    int outw = (w - 1) * stride_w + kernel_extent_w;
    int outh = (h - 1) * stride_h + kernel_extent_h;
    int out_elempack = 1;
    if (opt.use_packing_layout)
    {
        out_elempack = opt.use_fp16_arithmetic && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
    }
    size_t out_elemsize = elemsize / elempack * out_elempack;

    Mat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || output_pad_right > 0 || output_pad_bottom > 0 || (output_w > 0 && output_h > 0))
    {
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator);
    }
    else
    {
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    const int maxk = kernel_w * kernel_h;

    // depth-wise
    if (channels * elempack == group && group == num_output)
    {
        if (elempack == 8)
        {
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < channels; g++)
                {
                    __fp16* outptr = top_blob_bordered.channel(g);
                    const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g * 8;
                    const Mat m = bottom_blob.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            float16x8_t _sum = vdupq_n_f16((__fp16)0.f);

                            if (bias_term)
                            {
                                _sum = vld1q_f16((const __fp16*)bias_data_fp16 + g * 8);
                            }

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    const __fp16* sptr = m.row<const __fp16>(sy) + sx * 8;

                                    float16x8_t _val = vld1q_f16(sptr);

                                    int k = y * kernel_w + x;

                                    float16x8_t _w = vld1q_f16(kptr + k * 8);

                                    _sum = vfmaq_f16(_sum, _val, _w);
                                }
                            }

                            _sum = activation_ps(_sum, activation_type, activation_params);

                            vst1q_f16(outptr + j * 8, _sum);
                        }

                        outptr += outw * 8;
                    }
                }
            }
        }

        if (elempack == 4)
        {
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < channels; g++)
                {
                    __fp16* outptr = top_blob_bordered.channel(g);
                    const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g * 4;
                    const Mat m = bottom_blob.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            float16x4_t _sum = vdup_n_f16((__fp16)0.f);

                            if (bias_term)
                            {
                                _sum = vld1_f16((const __fp16*)bias_data_fp16 + g * 4);
                            }

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    const __fp16* sptr = m.row<const __fp16>(sy) + sx * 4;

                                    float16x4_t _val = vld1_f16(sptr);

                                    int k = y * kernel_w + x;

                                    float16x4_t _w = vld1_f16(kptr + k * 4);

                                    _sum = vfma_f16(_sum, _val, _w);
                                }
                            }

                            _sum = activation_ps(_sum, activation_type, activation_params);

                            vst1_f16(outptr + j * 4, _sum);
                        }

                        outptr += outw * 4;
                    }
                }
            }
        }

        if (elempack == 1)
        {
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < channels; g++)
                {
                    __fp16* outptr = top_blob_bordered.channel(g);
                    const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * g;
                    const Mat m = bottom_blob.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            float sum = 0.f;

                            if (bias_term)
                            {
                                sum = bias_data[g];
                            }

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                const __fp16* sptr = m.row<const __fp16>(sy);

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    __fp16 val = sptr[sx];

                                    int k = y * kernel_w + x;

                                    __fp16 w = kptr[k];

                                    sum += val * w;
                                }
                            }

                            sum = activation_ss(sum, activation_type, activation_params);

                            outptr[j] = (__fp16)sum;
                        }

                        outptr += outw;
                    }
                }
            }
        }
    }
    else
    {
        // group deconvolution
        const int channels_g = channels * elempack / group;
        const int num_output_g = num_output / group;

        int g_elempack = 1;
        int out_g_elempack = 1;
        if (opt.use_packing_layout)
        {
            g_elempack = opt.use_fp16_arithmetic && channels_g % 8 == 0 ? 8 : channels_g % 4 == 0 ? 4 : 1;
            out_g_elempack = opt.use_fp16_arithmetic && num_output_g % 8 == 0 ? 8 : num_output_g % 4 == 0 ? 4 : 1;
        }

        // unpacking
        Mat bottom_blob_unpacked = bottom_blob;
        if (elempack > g_elempack)
        {
            Option opt_p = opt;
            opt_p.blob_allocator = opt.workspace_allocator;
            convert_packing(bottom_blob, bottom_blob_unpacked, g_elempack, opt_p);
        }

        Mat top_blob_bordered_unpacked = top_blob_bordered;
        if (out_g_elempack < out_elempack)
        {
            top_blob_bordered_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator);
            if (top_blob_bordered_unpacked.empty())
                return -100;
        }

        for (int g = 0; g < group; g++)
        {
            const Mat bottom_blob_g = bottom_blob_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
            Mat top_blob_bordered_g = top_blob_bordered_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);

            const ncnn::Layer* op = group_ops[g];

            Option opt_g = opt;
            opt_g.blob_allocator = top_blob_bordered_unpacked.allocator;

            // forward
            op->forward(bottom_blob_g, top_blob_bordered_g, opt_g);
        }

        // packing
        if (out_g_elempack < out_elempack)
        {
            convert_packing(top_blob_bordered_unpacked, top_blob_bordered, out_elempack, opt);
        }
        else
        {
            top_blob_bordered = top_blob_bordered_unpacked;
        }
    }

    cut_padding(top_blob_bordered, top_blob, opt);
    if (top_blob.empty())
        return -100;

    return 0;
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

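// bf16 storage: values are widened to fp32 (vcvt_f32_bf16 shifts the bf16 bits
// into the high half of an fp32) before the multiply-accumulate, then narrowed
// back to bf16 on store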
int DeconvolutionDepthWise_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    int outw = (w - 1) * stride_w + kernel_extent_w;
    int outh = (h - 1) * stride_h + kernel_extent_h;
    int out_elempack = opt.use_packing_layout && num_output % 4 == 0 ? 4 : 1;
    size_t out_elemsize = elemsize / elempack * out_elempack;

    Mat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || output_pad_right > 0 || output_pad_bottom > 0 || (output_w > 0 && output_h > 0))
    {
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator);
    }
    else
    {
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    const int maxk = kernel_w * kernel_h;

    // depth-wise
    if (channels * elempack == group && group == num_output)
    {
#if __ARM_NEON
        if (elempack == 4)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int g = 0; g < channels; g++)
            {
                unsigned short* outptr = top_blob_bordered.channel(g);
                const unsigned short* kptr = (const unsigned short*)weight_data_bf16 + maxk * g * 4;
                const Mat m = bottom_blob.channel(g);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float32x4_t _sum = vdupq_n_f32(0.f);

                        if (bias_term)
                        {
                            _sum = vld1q_f32((const float*)bias_data + g * 4);
                        }

                        for (int y = 0; y < kernel_h; y++)
                        {
                            int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                            if (sys < 0 || sys % stride_h != 0)
                                continue;

                            int sy = sys / stride_h;
                            if (sy >= h)
                                continue;

                            for (int x = 0; x < kernel_w; x++)
                            {
                                int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                if (sxs < 0 || sxs % stride_w != 0)
                                    continue;

                                int sx = sxs / stride_w;
                                if (sx >= w)
                                    continue;

                                const unsigned short* sptr = m.row<const unsigned short>(sy) + sx * 4;

                                float32x4_t _val = vcvt_f32_bf16(vld1_u16(sptr));

                                int k = y * kernel_w + x;

                                float32x4_t _w = vcvt_f32_bf16(vld1_u16(kptr + k * 4));

                                _sum = vmlaq_f32(_sum, _val, _w);
                            }
                        }

                        _sum = activation_ps(_sum, activation_type, activation_params);

                        vst1_u16(outptr + j * 4, vcvt_bf16_f32(_sum));
                    }

                    outptr += outw * 4;
                }
            }
        }
#endif // __ARM_NEON

        if (elempack == 1)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int g = 0; g < channels; g++)
            {
                unsigned short* outptr = top_blob_bordered.channel(g);
                const unsigned short* kptr = (const unsigned short*)weight_data_bf16 + maxk * g;
                const Mat m = bottom_blob.channel(g);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float sum = 0.f;

                        if (bias_term)
                        {
                            sum = bias_data[g];
                        }

                        for (int y = 0; y < kernel_h; y++)
                        {
                            int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                            if (sys < 0 || sys % stride_h != 0)
                                continue;

                            int sy = sys / stride_h;
                            if (sy >= h)
                                continue;

                            const unsigned short* sptr = m.row<const unsigned short>(sy);

                            for (int x = 0; x < kernel_w; x++)
                            {
                                int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                if (sxs < 0 || sxs % stride_w != 0)
                                    continue;

                                int sx = sxs / stride_w;
                                if (sx >= w)
                                    continue;

                                float val = bfloat16_to_float32(sptr[sx]);

                                int k = y * kernel_w + x;

                                float w = bfloat16_to_float32(kptr[k]);

                                sum += val * w;
                            }
                        }

                        if (activation_type == 1)
                        {
                            sum = std::max(sum, 0.f);
                        }
                        else if (activation_type == 2)
                        {
                            float slope = activation_params[0];
                            sum = sum > 0.f ? sum : sum * slope;
                        }
                        else if (activation_type == 3)
                        {
                            float min = activation_params[0];
                            float max = activation_params[1];
                            if (sum < min)
                                sum = min;
                            if (sum > max)
                                sum = max;
                        }
                        else if (activation_type == 4)
                        {
                            sum = static_cast<float>(1.f / (1.f + exp(-sum)));
                        }

                        outptr[j] = float32_to_bfloat16(sum);
                    }

                    outptr += outw;
                }
            }
        }
    }
    else
    {
        // group deconvolution
        const int channels_g = channels * elempack / group;
        const int num_output_g = num_output / group;

        int g_elempack = (support_packing && opt.use_packing_layout && channels_g % 4 == 0) ? 4 : 1;
        int out_g_elempack = (support_packing && opt.use_packing_layout && num_output_g % 4 == 0) ? 4 : 1;

        // unpacking
        Mat bottom_blob_unpacked = bottom_blob;
        if (elempack == 4 && g_elempack == 1)
        {
            Option opt_p = opt;
            opt_p.blob_allocator = opt.workspace_allocator;
            convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_p);
        }

        Mat top_blob_bordered_unpacked = top_blob_bordered;
        if (out_g_elempack == 1 && out_elempack == 4)
        {
            top_blob_bordered_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator);
            if (top_blob_bordered_unpacked.empty())
                return -100;
        }

        for (int g = 0; g < group; g++)
        {
            const Mat bottom_blob_g = bottom_blob_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
            Mat top_blob_bordered_g = top_blob_bordered_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);

            const ncnn::Layer* op = group_ops[g];

            Option opt_g = opt;
            opt_g.blob_allocator = top_blob_bordered_unpacked.allocator;

            // forward
            op->forward(bottom_blob_g, top_blob_bordered_g, opt_g);
        }

        // packing
        if (out_g_elempack == 1 && out_elempack == 4)
        {
            convert_packing(top_blob_bordered_unpacked, top_blob_bordered, 4, opt);
        }
        else
        {
            top_blob_bordered = top_blob_bordered_unpacked;
        }
    }

    cut_padding(top_blob_bordered, top_blob, opt);
    if (top_blob.empty())
        return -100;

    return 0;
}

} // namespace ncnn