// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "deconvolution_arm.h"

#include "layer_type.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_activation.h"

namespace ncnn {

#include "deconvolution_3x3.h"
#include "deconvolution_4x4.h"

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#include "deconvolution_4x4_fp16s.h"
#endif

Deconvolution_arm::Deconvolution_arm()
{
#if __ARM_NEON
    support_packing = true;
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    support_fp16_storage = true;
#endif
#endif // __ARM_NEON

    support_bf16_storage = true;

    activation = 0;
}

int Deconvolution_arm::create_pipeline(const Option& opt)
{
    if (activation_type == 1)
    {
        activation = ncnn::create_layer(ncnn::LayerType::ReLU);

        ncnn::ParamDict pd;
        activation->load_param(pd);
    }
    else if (activation_type == 2)
    {
        activation = ncnn::create_layer(ncnn::LayerType::ReLU);

        ncnn::ParamDict pd;
        pd.set(0, activation_params[0]); // slope
        activation->load_param(pd);
    }
    else if (activation_type == 3)
    {
        activation = ncnn::create_layer(ncnn::LayerType::Clip);

        ncnn::ParamDict pd;
        pd.set(0, activation_params[0]); // min
        pd.set(1, activation_params[1]); // max
        activation->load_param(pd);
    }
    else if (activation_type == 4)
    {
        activation = ncnn::create_layer(ncnn::LayerType::Sigmoid);

        ncnn::ParamDict pd;
        activation->load_param(pd);
    }

    if (activation)
    {
        activation->create_pipeline(opt);
    }

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    if (opt.use_fp16_storage)
    {
        return create_pipeline_fp16s(opt);
    }
#endif

    if (opt.use_bf16_storage)
    {
        return create_pipeline_bf16s(opt);
    }

    const int maxk = kernel_w * kernel_h;
    int num_input = weight_data_size / maxk / num_output;

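    // a deconvolution is computed here as a convolution with a flipped kernel:
    // the kw*kh taps of each input/output channel pair are reversed in place so
    // the gather loops in forward() can walk the kernel like a convolution would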
    Mat weight_data_transposed(weight_data.w);
    {
        float* pt = weight_data_transposed;
        const float* p = weight_data;

        for (int i = 0; i < num_input * num_output; i++)
        {
            for (int k = 0; k < maxk; k++)
            {
                pt[maxk - 1 - k] = p[k];
            }

            p += maxk;
            pt += maxk;
        }
    }

    int elempack = (support_packing && opt.use_packing_layout && num_input % 4 == 0) ? 4 : 1;
    int out_elempack = (support_packing && opt.use_packing_layout && num_output % 4 == 0) ? 4 : 1;

#if __ARM_NEON
    // pack4
    if (elempack == 4 && out_elempack == 4)
    {
        // src = kw-kh-inch-outch
        // dst = 4b-4a-kw-kh-inch/4a-outch/4b
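        // for each kernel tap the 4x4 block of weights is interleaved
        // output-lane-fastest: w(outch q+b, inch p+a) lands at g00[a * 4 + b]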
        {
            Mat weight_data_r2 = weight_data_transposed.reshape(maxk, num_input, num_output);

            weight_data_pack4.create(maxk, num_input / 4, num_output / 4, (size_t)4 * 16, 16);

            for (int q = 0; q + 3 < num_output; q += 4)
            {
                const Mat k0 = weight_data_r2.channel(q);
                const Mat k1 = weight_data_r2.channel(q + 1);
                const Mat k2 = weight_data_r2.channel(q + 2);
                const Mat k3 = weight_data_r2.channel(q + 3);

                Mat g0 = weight_data_pack4.channel(q / 4);

                for (int p = 0; p + 3 < num_input; p += 4)
                {
                    const float* k00 = k0.row(p);
                    const float* k01 = k0.row(p + 1);
                    const float* k02 = k0.row(p + 2);
                    const float* k03 = k0.row(p + 3);

                    const float* k10 = k1.row(p);
                    const float* k11 = k1.row(p + 1);
                    const float* k12 = k1.row(p + 2);
                    const float* k13 = k1.row(p + 3);

                    const float* k20 = k2.row(p);
                    const float* k21 = k2.row(p + 1);
                    const float* k22 = k2.row(p + 2);
                    const float* k23 = k2.row(p + 3);

                    const float* k30 = k3.row(p);
                    const float* k31 = k3.row(p + 1);
                    const float* k32 = k3.row(p + 2);
                    const float* k33 = k3.row(p + 3);

                    float* g00 = g0.row(p / 4);

                    for (int k = 0; k < maxk; k++)
                    {
                        g00[0] = k00[k];
                        g00[1] = k10[k];
                        g00[2] = k20[k];
                        g00[3] = k30[k];

                        g00[4] = k01[k];
                        g00[5] = k11[k];
                        g00[6] = k21[k];
                        g00[7] = k31[k];

                        g00[8] = k02[k];
                        g00[9] = k12[k];
                        g00[10] = k22[k];
                        g00[11] = k32[k];

                        g00[12] = k03[k];
                        g00[13] = k13[k];
                        g00[14] = k23[k];
                        g00[15] = k33[k];

                        g00 += 16;
                    }
                }
            }
        }
    }

    // pack1to4
    if (elempack == 1 && out_elempack == 4)
    {
        // src = kw-kh-inch-outch
        // dst = 4b-kw-kh-inch-outch/4b
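        // each kernel tap stores its 4 output-channel weights contiguously,
        // matching the single vld1q_f32 load in the pack1to4 forward path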
        {
            Mat weight_data_r2 = weight_data_transposed.reshape(maxk, num_input, num_output);

            weight_data_pack1to4.create(maxk, num_input, num_output / 4, (size_t)4 * 4, 4);

            for (int q = 0; q + 3 < num_output; q += 4)
            {
                const Mat k0 = weight_data_r2.channel(q);
                const Mat k1 = weight_data_r2.channel(q + 1);
                const Mat k2 = weight_data_r2.channel(q + 2);
                const Mat k3 = weight_data_r2.channel(q + 3);

                Mat g0 = weight_data_pack1to4.channel(q / 4);

                for (int p = 0; p < num_input; p++)
                {
                    const float* k00 = k0.row(p);
                    const float* k10 = k1.row(p);
                    const float* k20 = k2.row(p);
                    const float* k30 = k3.row(p);

                    float* g00 = g0.row(p);

                    for (int k = 0; k < maxk; k++)
                    {
                        g00[0] = k00[k];
                        g00[1] = k10[k];
                        g00[2] = k20[k];
                        g00[3] = k30[k];

                        g00 += 4;
                    }
                }
            }
        }
    }

    // pack4to1
    if (elempack == 4 && out_elempack == 1)
    {
        // src = kw-kh-inch-outch
        // dst = 4a-kw-kh-inch/4a-outch
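        // each kernel tap stores its 4 input-channel weights contiguously so the
        // forward path can load them with one vld1q_f32 and reduce horizontally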
        {
            Mat weight_data_r2 = weight_data_transposed.reshape(maxk, num_input, num_output);

            weight_data_pack4to1.create(maxk, num_input / 4, num_output, (size_t)4 * 4, 4);

            for (int q = 0; q < num_output; q++)
            {
                const Mat k0 = weight_data_r2.channel(q);
                Mat g0 = weight_data_pack4to1.channel(q);

                for (int p = 0; p + 3 < num_input; p += 4)
                {
                    const float* k00 = k0.row(p);
                    const float* k01 = k0.row(p + 1);
                    const float* k02 = k0.row(p + 2);
                    const float* k03 = k0.row(p + 3);

                    float* g00 = g0.row(p / 4);

                    for (int k = 0; k < maxk; k++)
                    {
                        g00[0] = k00[k];
                        g00[1] = k01[k];
                        g00[2] = k02[k];
                        g00[3] = k03[k];

                        g00 += 4;
                    }
                }
            }
        }
    }
#endif // __ARM_NEON

    // pack1
    if (elempack == 1 && out_elempack == 1)
    {
        weight_data_pack1 = weight_data_transposed;
    }

    return 0;
}

int Deconvolution_arm::destroy_pipeline(const Option& opt)
{
    if (activation)
    {
        activation->destroy_pipeline(opt);
        delete activation;
        activation = 0;
    }

    return 0;
}

int Deconvolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int elembits = bottom_blob.elembits();

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    if (opt.use_fp16_storage && elembits == 16)
    {
        if (opt.use_fp16_arithmetic)
            return forward_fp16sa(bottom_blob, top_blob, opt);
        else
            return forward_fp16s(bottom_blob, top_blob, opt);
    }
#endif

    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s(bottom_blob, top_blob, opt);

    // deconvolve with NxN kernel
    // value = value + bias
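    // gather formulation used below: output pixel (i, j) accumulates every input
    // pixel (sy, sx) and kernel tap (y, x) satisfying
    //   sy * stride_h == i + y * dilation_h - (kernel_extent_h - 1)
    //   sx * stride_w == j + x * dilation_w - (kernel_extent_w - 1)
    // taps that fall outside the input or between strides are skipped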

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    //     NCNN_LOGE("Deconvolution input %d x %d  pad = %d %d  ksize=%d %d  stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    int outw = (w - 1) * stride_w + kernel_extent_w;
    int outh = (h - 1) * stride_h + kernel_extent_h;
    int out_elempack = (support_packing && opt.use_packing_layout && num_output % 4 == 0) ? 4 : 1;
    size_t out_elemsize = elemsize / elempack * out_elempack;

    Mat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || output_pad_right > 0 || output_pad_bottom > 0 || (output_w > 0 && output_h > 0))
    {
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator);
    }
    else
    {
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    const int maxk = kernel_w * kernel_h;

#if __ARM_NEON
    if (elempack == 4 && out_elempack == 4)
    {
        // num_output
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < num_output / out_elempack; p++)
        {
            float* outptr = top_blob_bordered.channel(p);

            for (int i = 0; i < outh; i++)
            {
                for (int j = 0; j < outw; j++)
                {
                    float32x4_t _sum = vdupq_n_f32(0.f);

                    if (bias_term)
                    {
                        _sum = vld1q_f32(((const float*)bias_data) + p * 4);
                    }

                    const float* kptr = (const float*)weight_data_pack4 + maxk * channels * p * 16;

                    // channels
                    for (int q = 0; q < channels; q++)
                    {
                        const Mat m = bottom_blob.channel(q);

                        for (int y = 0; y < kernel_h; y++)
                        {
                            int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                            if (sys < 0 || sys % stride_h != 0)
                                continue;

                            int sy = sys / stride_h;
                            if (sy >= h)
                                continue;

                            for (int x = 0; x < kernel_w; x++)
                            {
                                int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                if (sxs < 0 || sxs % stride_w != 0)
                                    continue;

                                int sx = sxs / stride_w;
                                if (sx >= w)
                                    continue;

                                const float* sptr = m.row(sy) + sx * 4;

                                float32x4_t _val = vld1q_f32(sptr);

                                int k = y * kernel_w + x;

                                float32x4_t _w0 = vld1q_f32(kptr + k * 16);
                                float32x4_t _w1 = vld1q_f32(kptr + k * 16 + 4);
                                float32x4_t _w2 = vld1q_f32(kptr + k * 16 + 8);
                                float32x4_t _w3 = vld1q_f32(kptr + k * 16 + 12);

#if __aarch64__
                                _sum = vmlaq_laneq_f32(_sum, _w0, _val, 0);
                                _sum = vmlaq_laneq_f32(_sum, _w1, _val, 1);
                                _sum = vmlaq_laneq_f32(_sum, _w2, _val, 2);
                                _sum = vmlaq_laneq_f32(_sum, _w3, _val, 3);
#else
                                _sum = vmlaq_lane_f32(_sum, _w0, vget_low_f32(_val), 0);
                                _sum = vmlaq_lane_f32(_sum, _w1, vget_low_f32(_val), 1);
                                _sum = vmlaq_lane_f32(_sum, _w2, vget_high_f32(_val), 0);
                                _sum = vmlaq_lane_f32(_sum, _w3, vget_high_f32(_val), 1);
#endif
                            }
                        }

                        kptr += maxk * 16;
                    }

                    _sum = activation_ps(_sum, activation_type, activation_params);

                    vst1q_f32(outptr + j * 4, _sum);
                }

                outptr += outw * 4;
            }
        }
    }

    if (elempack == 1 && out_elempack == 4)
    {
        // num_output
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < num_output / out_elempack; p++)
        {
            float* outptr = top_blob_bordered.channel(p);

            for (int i = 0; i < outh; i++)
            {
                for (int j = 0; j < outw; j++)
                {
                    float32x4_t _sum = vdupq_n_f32(0.f);

                    if (bias_term)
                    {
                        _sum = vld1q_f32(((const float*)bias_data) + p * 4);
                    }

                    const float* kptr = (const float*)weight_data_pack1to4 + maxk * channels * p * 4;

                    // channels
                    for (int q = 0; q < channels; q++)
                    {
                        const Mat m = bottom_blob.channel(q);

                        for (int y = 0; y < kernel_h; y++)
                        {
                            int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                            if (sys < 0 || sys % stride_h != 0)
                                continue;

                            int sy = sys / stride_h;
                            if (sy >= h)
                                continue;

                            const float* sptr = m.row(sy);

                            for (int x = 0; x < kernel_w; x++)
                            {
                                int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                if (sxs < 0 || sxs % stride_w != 0)
                                    continue;

                                int sx = sxs / stride_w;
                                if (sx >= w)
                                    continue;

                                float32x4_t _val = vdupq_n_f32(sptr[sx]);

                                int k = y * kernel_w + x;

                                float32x4_t _w = vld1q_f32(kptr + k * 4);

                                _sum = vmlaq_f32(_sum, _val, _w);
                            }
                        }

                        kptr += maxk * 4;
                    }

                    _sum = activation_ps(_sum, activation_type, activation_params);

                    vst1q_f32(outptr + j * 4, _sum);
                }

                outptr += outw * 4;
            }
        }
    }

    if (elempack == 4 && out_elempack == 1)
    {
        // num_output
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < num_output / out_elempack; p++)
        {
            float* outptr = top_blob_bordered.channel(p);

            for (int i = 0; i < outh; i++)
            {
                for (int j = 0; j < outw; j++)
                {
                    float sum = 0.f;

                    if (bias_term)
                    {
                        sum = bias_data[p];
                    }

                    const float* kptr = (const float*)weight_data_pack4to1 + maxk * channels * p * 4;

                    // channels
                    for (int q = 0; q < channels; q++)
                    {
                        const Mat m = bottom_blob.channel(q);

                        for (int y = 0; y < kernel_h; y++)
                        {
                            int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                            if (sys < 0 || sys % stride_h != 0)
                                continue;

                            int sy = sys / stride_h;
                            if (sy >= h)
                                continue;

                            for (int x = 0; x < kernel_w; x++)
                            {
                                int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                if (sxs < 0 || sxs % stride_w != 0)
                                    continue;

                                int sx = sxs / stride_w;
                                if (sx >= w)
                                    continue;

                                const float* sptr = m.row(sy) + sx * 4;

                                float32x4_t _val = vld1q_f32(sptr);

                                int k = y * kernel_w + x;

                                float32x4_t _w = vld1q_f32(kptr + k * 4);

                                float32x4_t _s4 = vmulq_f32(_val, _w);
#if __aarch64__
                                sum += vaddvq_f32(_s4); // dot
#else
                                float32x2_t _ss = vadd_f32(vget_low_f32(_s4), vget_high_f32(_s4));
                                _ss = vpadd_f32(_ss, _ss);
                                sum += vget_lane_f32(_ss, 0);
#endif
                            }
                        }

                        kptr += maxk * 4;
                    }

                    sum = activation_ss(sum, activation_type, activation_params);

                    outptr[j] = sum;
                }

                outptr += outw;
            }
        }
    }
#endif // __ARM_NEON

    if (elempack == 1 && out_elempack == 1)
    {
        if (kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1)
        {
            deconv3x3s1_neon(bottom_blob, top_blob_bordered, weight_data, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob_bordered, opt);
            }
        }
        else if (kernel_w == 3 && kernel_h == 3 && stride_w == 2 && stride_h == 2 && dilation_w == 1 && dilation_h == 1)
        {
            deconv3x3s2_neon(bottom_blob, top_blob_bordered, weight_data, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob_bordered, opt);
            }
        }
        else if (kernel_w == 4 && kernel_h == 4 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1)
        {
            deconv4x4s1_neon(bottom_blob, top_blob_bordered, weight_data, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob_bordered, opt);
            }
        }
        else if (kernel_w == 4 && kernel_h == 4 && stride_w == 2 && stride_h == 2 && dilation_w == 1 && dilation_h == 1)
        {
            deconv4x4s2_neon(bottom_blob, top_blob_bordered, weight_data, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob_bordered, opt);
            }
        }
        else
        {
            // num_output
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < num_output; p++)
            {
                float* outptr = top_blob_bordered.channel(p);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float sum = 0.f;

                        if (bias_term)
                        {
                            sum = bias_data[p];
                        }

                        const float* kptr = (const float*)weight_data_pack1 + maxk * channels * p;

                        // channels
                        for (int q = 0; q < channels; q++)
                        {
                            const Mat m = bottom_blob.channel(q);

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                const float* sptr = m.row(sy);

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    float val = sptr[sx];

                                    int k = y * kernel_w + x;

                                    float w = kptr[k];

                                    sum += val * w;
                                }
                            }

                            kptr += maxk;
                        }

                        if (activation_type == 1)
                        {
                            sum = std::max(sum, 0.f);
                        }
                        else if (activation_type == 2)
                        {
                            float slope = activation_params[0];
                            sum = sum > 0.f ? sum : sum * slope;
                        }
                        else if (activation_type == 3)
                        {
                            float min = activation_params[0];
                            float max = activation_params[1];
                            if (sum < min)
                                sum = min;
                            if (sum > max)
                                sum = max;
                        }
                        else if (activation_type == 4)
                        {
                            sum = static_cast<float>(1.f / (1.f + exp(-sum)));
                        }

                        outptr[j] = sum;
                    }

                    outptr += outw;
                }
            }
        }
    }

    cut_padding(top_blob_bordered, top_blob, opt);
    if (top_blob.empty())
        return -100;

    return 0;
}

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
int Deconvolution_arm::create_pipeline_fp16s(const Option& opt)
{
    const int maxk = kernel_w * kernel_h;
    const int num_input = weight_data_size / maxk / num_output;

    int elempack = 1;
    int out_elempack = 1;

    if (opt.use_packing_layout)
    {
        elempack = opt.use_fp16_arithmetic && num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 : 1;
        out_elempack = opt.use_fp16_arithmetic && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
    }

    Mat weight_data_transposed(weight_data.w);
    {
        float* pt = weight_data_transposed;
        const float* p = weight_data;

        for (int i = 0; i < num_input * num_output; i++)
        {
            for (int k = 0; k < maxk; k++)
            {
                pt[maxk - 1 - k] = p[k];
            }

            p += maxk;
            pt += maxk;
        }
    }

    // src = kw-kh-inch-outch
    // dst = pb-pa-kw-kh-inch/pa-outch/pb
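    // pa = elempack, pb = out_elempack; for each kernel tap the pa*pb weights are
    // stored output-lane-fastest: w(outch q+j, inch p+i) lands at g00[i * pb + j]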
755     {
756         Mat weight_data_r2 = weight_data_transposed.reshape(maxk, num_input, num_output);
757 
758         weight_data_fp16.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)2u * elempack * out_elempack, elempack * out_elempack);
759 
760         for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
761         {
762             Mat g0 = weight_data_fp16.channel(q / out_elempack);
763 
764             for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
765             {
766                 __fp16* g00 = g0.row<__fp16>(p / elempack);
767 
768                 for (int k = 0; k < maxk; k++)
769                 {
770                     for (int i = 0; i < elempack; i++)
771                     {
772                         for (int j = 0; j < out_elempack; j++)
773                         {
774                             const float* k00 = weight_data_r2.channel(q + j).row(p + i);
775 
776                             g00[0] = (__fp16)k00[k];
777 
778                             g00++;
779                         }
780                     }
781                 }
782             }
783         }
784     }
785 
786     if (elempack == 1 && out_elempack == 1 && opt.use_fp16_arithmetic)
787     {
788         if (kernel_w == 4 && kernel_h == 4 && stride_w == 2 && stride_h == 2 && dilation_w == 1 && dilation_h == 1)
789         {
790             ncnn::cast_float32_to_float16(weight_data, weight_data_fp16, opt);
791         }
792     }
793 
794     ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);
795 
796     return 0;
797 }
798 
forward_fp16s(const Mat & bottom_blob,Mat & top_blob,const Option & opt) const799 int Deconvolution_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
800 {
801     // deconvolv with NxN kernel
802     // value = value + bias
803 
804     int w = bottom_blob.w;
805     int h = bottom_blob.h;
806     int channels = bottom_blob.c;
807     size_t elemsize = bottom_blob.elemsize;
808     int elempack = bottom_blob.elempack;
809 
810     //     NCNN_LOGE("Deconvolution input %d x %d  pad = %d %d  ksize=%d %d  stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);
811 
812     const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
813     const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
814 
815     int outw = (w - 1) * stride_w + kernel_extent_w;
816     int outh = (h - 1) * stride_h + kernel_extent_h;
817     int out_elempack = opt.use_packing_layout && num_output % 4 == 0 ? 4 : 1;
818     size_t out_elemsize = elemsize / elempack * out_elempack;
819 
820     Mat top_blob_bordered;
821     if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || output_pad_right > 0 || output_pad_bottom > 0 || (output_w > 0 && output_h > 0))
822     {
823         top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator);
824     }
825     else
826     {
827         top_blob_bordered = top_blob;
828         top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
829     }
830     if (top_blob_bordered.empty())
831         return -100;
832 
833     const int maxk = kernel_w * kernel_h;
834 
835     if (elempack == 4 && out_elempack == 4)
836     {
837         {
838             // num_output
839             #pragma omp parallel for num_threads(opt.num_threads)
840             for (int p = 0; p < num_output / out_elempack; p++)
841             {
842                 __fp16* outptr = top_blob_bordered.channel(p);
843 
844                 for (int i = 0; i < outh; i++)
845                 {
846                     for (int j = 0; j < outw; j++)
847                     {
848                         float32x4_t _sum = vdupq_n_f32(0.f);
849 
850                         if (bias_term)
851                         {
852                             _sum = vld1q_f32(((const float*)bias_data) + p * 4);
853                         }
854 
855                         const __fp16* kptr = weight_data_fp16.channel(p);
856 
857                         // channels
858                         for (int q = 0; q < channels; q++)
859                         {
860                             const Mat m = bottom_blob.channel(q);
861 
862                             for (int y = 0; y < kernel_h; y++)
863                             {
864                                 int sys = (i + y * dilation_h - (kernel_extent_h - 1));
865                                 if (sys < 0 || sys % stride_h != 0)
866                                     continue;
867 
868                                 int sy = sys / stride_h;
869                                 if (sy >= h)
870                                     continue;
871 
872                                 for (int x = 0; x < kernel_w; x++)
873                                 {
874                                     int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
875                                     if (sxs < 0 || sxs % stride_w != 0)
876                                         continue;
877 
878                                     int sx = sxs / stride_w;
879                                     if (sx >= w)
880                                         continue;
881 
882                                     const __fp16* sptr = m.row<const __fp16>(sy) + sx * 4;
883 
884                                     float32x4_t _val = vcvt_f32_f16(vld1_f16(sptr));
885 
886                                     int k = y * kernel_w + x;
887 
888                                     float32x4_t _w0 = vcvt_f32_f16(vld1_f16(kptr + k * 16));
889                                     float32x4_t _w1 = vcvt_f32_f16(vld1_f16(kptr + k * 16 + 4));
890                                     float32x4_t _w2 = vcvt_f32_f16(vld1_f16(kptr + k * 16 + 8));
891                                     float32x4_t _w3 = vcvt_f32_f16(vld1_f16(kptr + k * 16 + 12));
892 
893                                     _sum = vfmaq_laneq_f32(_sum, _w0, _val, 0);
894                                     _sum = vfmaq_laneq_f32(_sum, _w1, _val, 1);
895                                     _sum = vfmaq_laneq_f32(_sum, _w2, _val, 2);
896                                     _sum = vfmaq_laneq_f32(_sum, _w3, _val, 3);
897                                 }
898                             }
899 
900                             kptr += maxk * 16;
901                         }
902 
903                         _sum = activation_ps(_sum, activation_type, activation_params);
904 
905                         vst1_f16(outptr + j * 4, vcvt_f16_f32(_sum));
906                     }
907 
908                     outptr += outw * 4;
909                 }
910             }
911         }
912     }
913 
914     if (elempack == 1 && out_elempack == 4)
915     {
916         {
917             // num_output
918             #pragma omp parallel for num_threads(opt.num_threads)
919             for (int p = 0; p < num_output / out_elempack; p++)
920             {
921                 __fp16* outptr = top_blob_bordered.channel(p);
922 
923                 for (int i = 0; i < outh; i++)
924                 {
925                     for (int j = 0; j < outw; j++)
926                     {
927                         float32x4_t _sum = vdupq_n_f32(0.f);
928 
929                         if (bias_term)
930                         {
931                             _sum = vld1q_f32(((const float*)bias_data) + p * 4);
932                         }
933 
934                         const __fp16* kptr = weight_data_fp16.channel(p);
935 
936                         // channels
937                         for (int q = 0; q < channels; q++)
938                         {
939                             const Mat m = bottom_blob.channel(q);
940 
941                             for (int y = 0; y < kernel_h; y++)
942                             {
943                                 int sys = (i + y * dilation_h - (kernel_extent_h - 1));
944                                 if (sys < 0 || sys % stride_h != 0)
945                                     continue;
946 
947                                 int sy = sys / stride_h;
948                                 if (sy >= h)
949                                     continue;
950 
951                                 const __fp16* sptr = m.row<const __fp16>(sy);
952 
953                                 for (int x = 0; x < kernel_w; x++)
954                                 {
955                                     int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
956                                     if (sxs < 0 || sxs % stride_w != 0)
957                                         continue;
958 
959                                     int sx = sxs / stride_w;
960                                     if (sx >= w)
961                                         continue;
962 
963                                     float32x4_t _val = vdupq_n_f32((float)sptr[sx]);
964 
965                                     int k = y * kernel_w + x;
966 
967                                     float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr + k * 4));
968 
969                                     _sum = vfmaq_f32(_sum, _val, _w);
970                                 }
971                             }
972 
973                             kptr += maxk * 4;
974                         }
975 
976                         _sum = activation_ps(_sum, activation_type, activation_params);
977 
978                         vst1_f16(outptr + j * 4, vcvt_f16_f32(_sum));
979                     }
980 
981                     outptr += outw * 4;
982                 }
983             }
984         }
985     }
986 
987     if (elempack == 4 && out_elempack == 1)
988     {
989         {
990             // num_output
991             #pragma omp parallel for num_threads(opt.num_threads)
992             for (int p = 0; p < num_output / out_elempack; p++)
993             {
994                 __fp16* outptr = top_blob_bordered.channel(p);
995 
996                 for (int i = 0; i < outh; i++)
997                 {
998                     for (int j = 0; j < outw; j++)
999                     {
1000                         float sum = 0.f;
1001 
1002                         if (bias_term)
1003                         {
1004                             sum = bias_data[p];
1005                         }
1006 
1007                         const __fp16* kptr = weight_data_fp16.channel(p);
1008 
1009                         // channels
1010                         for (int q = 0; q < channels; q++)
1011                         {
1012                             const Mat m = bottom_blob.channel(q);
1013 
1014                             for (int y = 0; y < kernel_h; y++)
1015                             {
1016                                 int sys = (i + y * dilation_h - (kernel_extent_h - 1));
1017                                 if (sys < 0 || sys % stride_h != 0)
1018                                     continue;
1019 
1020                                 int sy = sys / stride_h;
1021                                 if (sy >= h)
1022                                     continue;
1023 
1024                                 for (int x = 0; x < kernel_w; x++)
1025                                 {
1026                                     int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
1027                                     if (sxs < 0 || sxs % stride_w != 0)
1028                                         continue;
1029 
1030                                     int sx = sxs / stride_w;
1031                                     if (sx >= w)
1032                                         continue;
1033 
1034                                     const __fp16* sptr = m.row<const __fp16>(sy) + sx * 4;
1035 
1036                                     float32x4_t _val = vcvt_f32_f16(vld1_f16(sptr));
1037 
1038                                     int k = y * kernel_w + x;
1039 
1040                                     float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr + k * 4));
1041 
1042                                     float32x4_t _s4 = vmulq_f32(_val, _w);
1043 
1044                                     sum += vaddvq_f32(_s4); // dot
1045                                 }
1046                             }
1047 
1048                             kptr += maxk * 4;
1049                         }
1050 
1051                         sum = activation_ss(sum, activation_type, activation_params);
1052 
1053                         outptr[j] = (__fp16)sum;
1054                     }
1055 
1056                     outptr += outw;
1057                 }
1058             }
1059         }
1060     }
1061 
1062     if (elempack == 1 && out_elempack == 1)
1063     {
1064         {
1065             // num_output
1066             #pragma omp parallel for num_threads(opt.num_threads)
1067             for (int p = 0; p < num_output; p++)
1068             {
1069                 __fp16* outptr = top_blob_bordered.channel(p);
1070 
1071                 for (int i = 0; i < outh; i++)
1072                 {
1073                     for (int j = 0; j < outw; j++)
1074                     {
1075                         float sum = 0.f;
1076 
1077                         if (bias_term)
1078                         {
1079                             sum = bias_data[p];
1080                         }
1081 
1082                         const __fp16* kptr = weight_data_fp16.channel(p);
1083 
1084                         // channels
1085                         for (int q = 0; q < channels; q++)
1086                         {
1087                             const Mat m = bottom_blob.channel(q);
1088 
1089                             for (int y = 0; y < kernel_h; y++)
1090                             {
1091                                 int sys = (i + y * dilation_h - (kernel_extent_h - 1));
1092                                 if (sys < 0 || sys % stride_h != 0)
1093                                     continue;
1094 
1095                                 int sy = sys / stride_h;
1096                                 if (sy >= h)
1097                                     continue;
1098 
1099                                 const __fp16* sptr = m.row<const __fp16>(sy);
1100 
1101                                 for (int x = 0; x < kernel_w; x++)
1102                                 {
1103                                     int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
1104                                     if (sxs < 0 || sxs % stride_w != 0)
1105                                         continue;
1106 
1107                                     int sx = sxs / stride_w;
1108                                     if (sx >= w)
1109                                         continue;
1110 
1111                                     float val = (float)sptr[sx];
1112 
1113                                     int k = y * kernel_w + x;
1114 
1115                                     float w = (float)kptr[k];
1116 
1117                                     sum += val * w;
1118                                 }
1119                             }
1120 
1121                             kptr += maxk;
1122                         }
1123 
1124                         sum = activation_ss(sum, activation_type, activation_params);
1125 
1126                         outptr[j] = (__fp16)sum;
1127                     }
1128 
1129                     outptr += outw;
1130                 }
1131             }
1132         }
1133     }
1134 
1135     cut_padding(top_blob_bordered, top_blob, opt);
1136     if (top_blob.empty())
1137         return -100;
1138 
1139     return 0;
1140 }
1141 
forward_fp16sa(const Mat & bottom_blob,Mat & top_blob,const Option & opt) const1142 int Deconvolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
1143 {
1144     // deconvolv with NxN kernel
1145     // value = value + bias
1146 
1147     int w = bottom_blob.w;
1148     int h = bottom_blob.h;
1149     int channels = bottom_blob.c;
1150     size_t elemsize = bottom_blob.elemsize;
1151     int elempack = bottom_blob.elempack;
1152 
1153     //     NCNN_LOGE("Deconvolution input %d x %d  pad = %d %d  ksize=%d %d  stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);
1154 
1155     const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
1156     const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
1157 
1158     int outw = (w - 1) * stride_w + kernel_extent_w;
1159     int outh = (h - 1) * stride_h + kernel_extent_h;
1160     int out_elempack = 1;
1161     if (opt.use_packing_layout)
1162     {
1163         out_elempack = opt.use_fp16_arithmetic && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
1164     }
1165     size_t out_elemsize = elemsize / elempack * out_elempack;
1166 
1167     Mat top_blob_bordered;
1168     if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || output_pad_right > 0 || output_pad_bottom > 0 || (output_w > 0 && output_h > 0))
1169     {
1170         top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator);
1171     }
1172     else
1173     {
1174         top_blob_bordered = top_blob;
1175         top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
1176     }
1177     if (top_blob_bordered.empty())
1178         return -100;
1179 
1180     const int maxk = kernel_w * kernel_h;
1181 
1182     if (elempack == 8 && out_elempack == 8)
1183     {
1184         {
1185             // num_output
1186             #pragma omp parallel for num_threads(opt.num_threads)
1187             for (int p = 0; p < num_output / out_elempack; p++)
1188             {
1189                 __fp16* outptr = top_blob_bordered.channel(p);
1190 
1191                 for (int i = 0; i < outh; i++)
1192                 {
1193                     for (int j = 0; j < outw; j++)
1194                     {
1195                         float16x8_t _sum = vdupq_n_f16((__fp16)0.f);
1196 
1197                         if (bias_term)
1198                         {
1199                             _sum = vld1q_f16((const __fp16*)bias_data_fp16 + p * 8);
1200                         }
1201 
1202                         const __fp16* kptr = weight_data_fp16.channel(p);
1203 
1204                         // channels
1205                         for (int q = 0; q < channels; q++)
1206                         {
1207                             const Mat m = bottom_blob.channel(q);
1208 
1209                             for (int y = 0; y < kernel_h; y++)
1210                             {
1211                                 int sys = (i + y * dilation_h - (kernel_extent_h - 1));
1212                                 if (sys < 0 || sys % stride_h != 0)
1213                                     continue;
1214 
1215                                 int sy = sys / stride_h;
1216                                 if (sy >= h)
1217                                     continue;
1218 
1219                                 for (int x = 0; x < kernel_w; x++)
1220                                 {
1221                                     int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
1222                                     if (sxs < 0 || sxs % stride_w != 0)
1223                                         continue;
1224 
1225                                     int sx = sxs / stride_w;
1226                                     if (sx >= w)
1227                                         continue;
1228 
1229                                     const __fp16* sptr = m.row<const __fp16>(sy) + sx * 8;
1230 
1231                                     float16x8_t _val = vld1q_f16(sptr);
1232 
1233                                     int k = y * kernel_w + x;
1234 
1235                                     float16x8_t _w0 = vld1q_f16(kptr + k * 64);
1236                                     float16x8_t _w1 = vld1q_f16(kptr + k * 64 + 8);
1237                                     float16x8_t _w2 = vld1q_f16(kptr + k * 64 + 16);
1238                                     float16x8_t _w3 = vld1q_f16(kptr + k * 64 + 24);
1239                                     float16x8_t _w4 = vld1q_f16(kptr + k * 64 + 32);
1240                                     float16x8_t _w5 = vld1q_f16(kptr + k * 64 + 40);
1241                                     float16x8_t _w6 = vld1q_f16(kptr + k * 64 + 48);
1242                                     float16x8_t _w7 = vld1q_f16(kptr + k * 64 + 56);
1243 
1244                                     _sum = vfmaq_laneq_f16(_sum, _w0, _val, 0);
1245                                     _sum = vfmaq_laneq_f16(_sum, _w1, _val, 1);
1246                                     _sum = vfmaq_laneq_f16(_sum, _w2, _val, 2);
1247                                     _sum = vfmaq_laneq_f16(_sum, _w3, _val, 3);
1248                                     _sum = vfmaq_laneq_f16(_sum, _w4, _val, 4);
1249                                     _sum = vfmaq_laneq_f16(_sum, _w5, _val, 5);
1250                                     _sum = vfmaq_laneq_f16(_sum, _w6, _val, 6);
1251                                     _sum = vfmaq_laneq_f16(_sum, _w7, _val, 7);
1252                                 }
1253                             }
1254 
1255                             kptr += maxk * 64;
1256                         }
1257 
1258                         _sum = activation_ps(_sum, activation_type, activation_params);
1259 
1260                         vst1q_f16(outptr + j * 8, _sum);
1261                     }
1262 
1263                     outptr += outw * 8;
1264                 }
1265             }
1266         }
1267     }
1268 
1269     if (elempack == 1 && out_elempack == 8)
1270     {
1271         {
1272             // num_output
1273             #pragma omp parallel for num_threads(opt.num_threads)
1274             for (int p = 0; p < num_output / out_elempack; p++)
1275             {
1276                 __fp16* outptr = top_blob_bordered.channel(p);
1277 
1278                 for (int i = 0; i < outh; i++)
1279                 {
1280                     for (int j = 0; j < outw; j++)
1281                     {
1282                         float16x8_t _sum = vdupq_n_f16((__fp16)0.f);
1283 
1284                         if (bias_term)
1285                         {
1286                             _sum = vld1q_f16((const __fp16*)bias_data_fp16 + p * 8);
1287                         }
1288 
1289                         const __fp16* kptr = weight_data_fp16.channel(p);
1290 
1291                         // channels
1292                         for (int q = 0; q < channels; q++)
1293                         {
1294                             const Mat m = bottom_blob.channel(q);
1295 
1296                             for (int y = 0; y < kernel_h; y++)
1297                             {
1298                                 int sys = (i + y * dilation_h - (kernel_extent_h - 1));
1299                                 if (sys < 0 || sys % stride_h != 0)
1300                                     continue;
1301 
1302                                 int sy = sys / stride_h;
1303                                 if (sy >= h)
1304                                     continue;
1305 
1306                                 const __fp16* sptr = m.row<const __fp16>(sy);
1307 
1308                                 for (int x = 0; x < kernel_w; x++)
1309                                 {
1310                                     int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
1311                                     if (sxs < 0 || sxs % stride_w != 0)
1312                                         continue;
1313 
1314                                     int sx = sxs / stride_w;
1315                                     if (sx >= w)
1316                                         continue;
1317 
1318                                     float16x8_t _val = vdupq_n_f16(sptr[sx]);
1319 
1320                                     int k = y * kernel_w + x;
1321 
1322                                     float16x8_t _w = vld1q_f16(kptr + k * 8);
1323 
1324                                     _sum = vfmaq_f16(_sum, _val, _w);
1325                                 }
1326                             }
1327 
1328                             kptr += maxk * 8;
1329                         }
1330 
1331                         _sum = activation_ps(_sum, activation_type, activation_params);
1332 
1333                         vst1q_f16(outptr + j * 8, _sum);
1334                     }
1335 
1336                     outptr += outw * 8;
1337                 }
1338             }
1339         }
1340     }
1341 
1342     if (elempack == 4 && out_elempack == 8)
1343     {
1344         {
1345             // num_output
1346             #pragma omp parallel for num_threads(opt.num_threads)
1347             for (int p = 0; p < num_output / out_elempack; p++)
1348             {
1349                 __fp16* outptr = top_blob_bordered.channel(p);
1350 
1351                 for (int i = 0; i < outh; i++)
1352                 {
1353                     for (int j = 0; j < outw; j++)
1354                     {
1355                         float16x8_t _sum = vdupq_n_f16((__fp16)0.f);
1356 
1357                         if (bias_term)
1358                         {
1359                             _sum = vld1q_f16((const __fp16*)bias_data_fp16 + p * 8);
1360                         }
1361 
1362                         const __fp16* kptr = weight_data_fp16.channel(p);
1363 
1364                         // channels
1365                         for (int q = 0; q < channels; q++)
1366                         {
1367                             const Mat m = bottom_blob.channel(q);
1368 
1369                             for (int y = 0; y < kernel_h; y++)
1370                             {
1371                                 int sys = (i + y * dilation_h - (kernel_extent_h - 1));
1372                                 if (sys < 0 || sys % stride_h != 0)
1373                                     continue;
1374 
1375                                 int sy = sys / stride_h;
1376                                 if (sy >= h)
1377                                     continue;
1378 
1379                                 for (int x = 0; x < kernel_w; x++)
1380                                 {
1381                                     int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
1382                                     if (sxs < 0 || sxs % stride_w != 0)
1383                                         continue;
1384 
1385                                     int sx = sxs / stride_w;
1386                                     if (sx >= w)
1387                                         continue;
1388 
1389                                     const __fp16* sptr = m.row<const __fp16>(sy) + sx * 4;
1390 
1391                                     float16x4_t _val = vld1_f16(sptr);
1392 
1393                                     int k = y * kernel_w + x;
1394 
1395                                     float16x8_t _w0 = vld1q_f16(kptr + k * 32);
1396                                     float16x8_t _w1 = vld1q_f16(kptr + k * 32 + 8);
1397                                     float16x8_t _w2 = vld1q_f16(kptr + k * 32 + 16);
1398                                     float16x8_t _w3 = vld1q_f16(kptr + k * 32 + 24);
1399 
1400                                     _sum = vfmaq_lane_f16(_sum, _w0, _val, 0);
1401                                     _sum = vfmaq_lane_f16(_sum, _w1, _val, 1);
1402                                     _sum = vfmaq_lane_f16(_sum, _w2, _val, 2);
1403                                     _sum = vfmaq_lane_f16(_sum, _w3, _val, 3);
1404                                 }
1405                             }
1406 
1407                             kptr += maxk * 32;
1408                         }
1409 
1410                         _sum = activation_ps(_sum, activation_type, activation_params);
1411 
1412                         vst1q_f16(outptr + j * 8, _sum);
1413                     }
1414 
1415                     outptr += outw * 8;
1416                 }
1417             }
1418         }
1419     }
1420 
    if (elempack == 8 && out_elempack == 1)
    {
        {
            // num_output
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < num_output / out_elempack; p++)
            {
                __fp16* outptr = top_blob_bordered.channel(p);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float sum = 0.f;

                        if (bias_term)
                        {
                            sum = bias_data[p];
                        }

                        const __fp16* kptr = weight_data_fp16.channel(p);

                        // channels
                        for (int q = 0; q < channels; q++)
                        {
                            const Mat m = bottom_blob.channel(q);

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    const __fp16* sptr = m.row<const __fp16>(sy) + sx * 8;

                                    float16x8_t _val = vld1q_f16(sptr);

                                    int k = y * kernel_w + x;

                                    float16x8_t _w = vld1q_f16(kptr + k * 8);

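                                    // dot product over the eight packed input channels:
                                    // multiply lanewise in fp16, fold the two halves,
                                    // then widen to fp32 before the horizontal add so
                                    // the final reduction keeps full precision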
                                    float16x8_t _s8 = vmulq_f16(_val, _w);

                                    float16x4_t _s4 = vadd_f16(vget_low_f16(_s8), vget_high_f16(_s8));
                                    sum += vaddvq_f32(vcvt_f32_f16(_s4)); // dot
                                }
                            }

                            kptr += maxk * 8;
                        }

                        sum = activation_ss(sum, activation_type, activation_params);

                        outptr[j] = (__fp16)sum;
                    }

                    outptr += outw;
                }
            }
        }
    }

    if (elempack == 8 && out_elempack == 4)
    {
        {
            // num_output
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < num_output / out_elempack; p++)
            {
                __fp16* outptr = top_blob_bordered.channel(p);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float16x4_t _sum = vdup_n_f16((__fp16)0.f);

                        if (bias_term)
                        {
                            _sum = vld1_f16((const __fp16*)bias_data_fp16 + p * 4);
                        }

                        const __fp16* kptr = weight_data_fp16.channel(p);

                        // channels
                        for (int q = 0; q < channels; q++)
                        {
                            const Mat m = bottom_blob.channel(q);

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    const __fp16* sptr = m.row<const __fp16>(sy) + sx * 8;

                                    float16x8_t _val = vld1q_f16(sptr);

                                    int k = y * kernel_w + x;

                                    float16x4_t _w0 = vld1_f16(kptr + k * 32);
                                    float16x4_t _w1 = vld1_f16(kptr + k * 32 + 4);
                                    float16x4_t _w2 = vld1_f16(kptr + k * 32 + 8);
                                    float16x4_t _w3 = vld1_f16(kptr + k * 32 + 12);
                                    float16x4_t _w4 = vld1_f16(kptr + k * 32 + 16);
                                    float16x4_t _w5 = vld1_f16(kptr + k * 32 + 20);
                                    float16x4_t _w6 = vld1_f16(kptr + k * 32 + 24);
                                    float16x4_t _w7 = vld1_f16(kptr + k * 32 + 28);

                                    _sum = vfma_laneq_f16(_sum, _w0, _val, 0);
                                    _sum = vfma_laneq_f16(_sum, _w1, _val, 1);
                                    _sum = vfma_laneq_f16(_sum, _w2, _val, 2);
                                    _sum = vfma_laneq_f16(_sum, _w3, _val, 3);
                                    _sum = vfma_laneq_f16(_sum, _w4, _val, 4);
                                    _sum = vfma_laneq_f16(_sum, _w5, _val, 5);
                                    _sum = vfma_laneq_f16(_sum, _w6, _val, 6);
                                    _sum = vfma_laneq_f16(_sum, _w7, _val, 7);
                                }
                            }

                            kptr += maxk * 32;
                        }

                        _sum = activation_ps(_sum, activation_type, activation_params);

                        vst1_f16(outptr + j * 4, _sum);
                    }

                    outptr += outw * 4;
                }
            }
        }
    }

    if (elempack == 4 && out_elempack == 4)
    {
        {
            // num_output
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < num_output / out_elempack; p++)
            {
                __fp16* outptr = top_blob_bordered.channel(p);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float16x4_t _sum = vdup_n_f16((__fp16)0.f);

                        if (bias_term)
                        {
                            _sum = vld1_f16((const __fp16*)bias_data_fp16 + p * 4);
                        }

                        const __fp16* kptr = weight_data_fp16.channel(p);

                        // channels
                        for (int q = 0; q < channels; q++)
                        {
                            const Mat m = bottom_blob.channel(q);

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    const __fp16* sptr = m.row<const __fp16>(sy) + sx * 4;

                                    float16x4_t _val = vld1_f16(sptr);

                                    int k = y * kernel_w + x;

                                    float16x4_t _w0 = vld1_f16(kptr + k * 16);
                                    float16x4_t _w1 = vld1_f16(kptr + k * 16 + 4);
                                    float16x4_t _w2 = vld1_f16(kptr + k * 16 + 8);
                                    float16x4_t _w3 = vld1_f16(kptr + k * 16 + 12);

                                    _sum = vfma_lane_f16(_sum, _w0, _val, 0);
                                    _sum = vfma_lane_f16(_sum, _w1, _val, 1);
                                    _sum = vfma_lane_f16(_sum, _w2, _val, 2);
                                    _sum = vfma_lane_f16(_sum, _w3, _val, 3);
                                }
                            }

                            kptr += maxk * 16;
                        }

                        _sum = activation_ps(_sum, activation_type, activation_params);

                        vst1_f16(outptr + j * 4, _sum);
                    }

                    outptr += outw * 4;
                }
            }
        }
    }

    if (elempack == 1 && out_elempack == 4)
    {
        {
            // num_output
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < num_output / out_elempack; p++)
            {
                __fp16* outptr = top_blob_bordered.channel(p);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float16x4_t _sum = vdup_n_f16((__fp16)0.f);

                        if (bias_term)
                        {
                            _sum = vld1_f16((const __fp16*)bias_data_fp16 + p * 4);
                        }

                        const __fp16* kptr = weight_data_fp16.channel(p);

                        // channels
                        for (int q = 0; q < channels; q++)
                        {
                            const Mat m = bottom_blob.channel(q);

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                const __fp16* sptr = m.row<const __fp16>(sy);

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    float16x4_t _val = vdup_n_f16(sptr[sx]);

                                    int k = y * kernel_w + x;

                                    float16x4_t _w = vld1_f16(kptr + k * 4);

                                    _sum = vfma_f16(_sum, _val, _w);
                                }
                            }

                            kptr += maxk * 4;
                        }

                        _sum = activation_ps(_sum, activation_type, activation_params);

                        vst1_f16(outptr + j * 4, _sum);
                    }

                    outptr += outw * 4;
                }
            }
        }
    }

    if (elempack == 4 && out_elempack == 1)
    {
        {
            // num_output
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < num_output / out_elempack; p++)
            {
                __fp16* outptr = top_blob_bordered.channel(p);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float sum = 0.f;

                        if (bias_term)
                        {
                            sum = bias_data[p];
                        }

                        const __fp16* kptr = weight_data_fp16.channel(p);

                        // channels
                        for (int q = 0; q < channels; q++)
                        {
                            const Mat m = bottom_blob.channel(q);

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    const __fp16* sptr = m.row<const __fp16>(sy) + sx * 4;

                                    float16x4_t _val = vld1_f16(sptr);

                                    int k = y * kernel_w + x;

                                    float16x4_t _w = vld1_f16(kptr + k * 4);

                                    float16x4_t _s4 = vmul_f16(_val, _w);

                                    sum += vaddvq_f32(vcvt_f32_f16(_s4)); // dot
                                }
                            }

                            kptr += maxk * 4;
                        }

                        sum = activation_ss(sum, activation_type, activation_params);

                        outptr[j] = (__fp16)sum;
                    }

                    outptr += outw;
                }
            }
        }
    }

    if (elempack == 1 && out_elempack == 1)
    {
        if (kernel_w == 4 && kernel_h == 4 && stride_w == 2 && stride_h == 2 && dilation_w == 1 && dilation_h == 1)
        {
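            // specialized 4x4 stride-2 kernel from deconvolution_4x4_fp16s.h;
            // the activation layer runs as a separate in-place pass afterwards
            // because the specialized kernel does not fuse it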
            deconv4x4s2_fp16sa_neon(bottom_blob, top_blob_bordered, weight_data_fp16, bias_data_fp16, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob_bordered, opt);
            }
        }
        else
        {
            // num_output
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < num_output; p++)
            {
                __fp16* outptr = top_blob_bordered.channel(p);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float sum = 0.f;

                        if (bias_term)
                        {
                            sum = bias_data[p];
                        }

                        const __fp16* kptr = weight_data_fp16.channel(p);

                        // channels
                        for (int q = 0; q < channels; q++)
                        {
                            const Mat m = bottom_blob.channel(q);

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                const __fp16* sptr = m.row<const __fp16>(sy);

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    __fp16 val = sptr[sx];

                                    int k = y * kernel_w + x;

                                    __fp16 w = kptr[k];

                                    sum += val * w;
                                }
                            }

                            kptr += maxk;
                        }

                        sum = activation_ss(sum, activation_type, activation_params);

                        outptr[j] = (__fp16)sum;
                    }

                    outptr += outw;
                }
            }
        }
    }

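    // crop the bordered output down to the final top_blob according to the
    // pad_* / output_w / output_h parameters (helper inherited from the base
    // Deconvolution layer)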
    cut_padding(top_blob_bordered, top_blob, opt);
    if (top_blob.empty())
        return -100;

    return 0;
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

int Deconvolution_arm::create_pipeline_bf16s(const Option& opt)
{
    const int maxk = kernel_w * kernel_h;
    const int num_input = weight_data_size / maxk / num_output;

    int elempack = opt.use_packing_layout && num_input % 4 == 0 ? 4 : 1;
    int out_elempack = opt.use_packing_layout && num_output % 4 == 0 ? 4 : 1;

    Mat weight_data_transposed(weight_data.w);
    {
        float* pt = weight_data_transposed;
        const float* p = weight_data;

        for (int i = 0; i < num_input * num_output; i++)
        {
            for (int k = 0; k < maxk; k++)
            {
                pt[maxk - 1 - k] = p[k];
            }

            p += maxk;
            pt += maxk;
        }
    }
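
    // flipping each maxk-length kernel window (k -> maxk - 1 - k) rotates the
    // kernel by 180 degrees, e.g. for a 3x3 kernel indices 0..8 become 8..0,
    // which lets the gather loops in forward_bf16s index the weights in plain
    // correlation order, k = y * kernel_w + x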

    // src = kw-kh-inch-outch
    // dst = pb-pa-kw-kh-inch/pa-outch/pb
    {
        Mat weight_data_r2 = weight_data_transposed.reshape(maxk, num_input, num_output);

        weight_data_bf16.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)2u * elempack * out_elempack, elempack * out_elempack);

        for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
        {
            Mat g0 = weight_data_bf16.channel(q / out_elempack);

            for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
            {
                unsigned short* g00 = g0.row<unsigned short>(p / elempack);

                for (int k = 0; k < maxk; k++)
                {
                    for (int i = 0; i < elempack; i++)
                    {
                        for (int j = 0; j < out_elempack; j++)
                        {
                            const float* k00 = weight_data_r2.channel(q + j).row(p + i);

                            g00[0] = float32_to_bfloat16(k00[k]);

                            g00++;
                        }
                    }
                }
            }
        }
    }
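
    // a worked example of the packed layout, assuming elempack = 4 and
    // out_elempack = 4: the weight for output channel q + j, input channel
    // p + i and kernel tap k lands in channel q/4, row p/4, at element
    // k * 16 + i * 4 + j -- which matches the kptr + k * 16 + i * 4 access
    // pattern of the pack4 loops in forward_bf16s below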

    return 0;
}

int Deconvolution_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    // deconvolve with NxN kernel
    // value = value + bias

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    //     NCNN_LOGE("Deconvolution input %d x %d  pad = %d %d  ksize=%d %d  stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    int outw = (w - 1) * stride_w + kernel_extent_w;
    int outh = (h - 1) * stride_h + kernel_extent_h;
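    // e.g. w = 4, stride_w = 2, kernel_w = 3, dilation_w = 1 gives
    // kernel_extent_w = 3 and outw = (4 - 1) * 2 + 3 = 9; any output_pad_* /
    // output_w cropping is applied later by cut_padding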
    int out_elempack = opt.use_packing_layout && num_output % 4 == 0 ? 4 : 1;
    size_t out_elemsize = elemsize / elempack * out_elempack;

    Mat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || output_pad_right > 0 || output_pad_bottom > 0 || (output_w > 0 && output_h > 0))
    {
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator);
    }
    else
    {
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    const int maxk = kernel_w * kernel_h;

#if __ARM_NEON
    if (elempack == 4 && out_elempack == 4)
    {
        {
            // num_output
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < num_output / out_elempack; p++)
            {
                unsigned short* outptr = top_blob_bordered.channel(p);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float32x4_t _sum = vdupq_n_f32(0.f);

                        if (bias_term)
                        {
                            _sum = vld1q_f32(((const float*)bias_data) + p * 4);
                        }

                        const unsigned short* kptr = weight_data_bf16.channel(p);

                        // channels
                        for (int q = 0; q < channels; q++)
                        {
                            const Mat m = bottom_blob.channel(q);

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    const unsigned short* sptr = m.row<const unsigned short>(sy) + sx * 4;

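                                    // bf16 is the high 16 bits of an fp32, so ncnn's
                                    // vcvt_f32_bf16 helper widens each u16 lane by a
                                    // 16-bit left shift reinterpreted as float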
                                    float32x4_t _val = vcvt_f32_bf16(vld1_u16(sptr));

                                    int k = y * kernel_w + x;

                                    float32x4_t _w0 = vcvt_f32_bf16(vld1_u16(kptr + k * 16));
                                    float32x4_t _w1 = vcvt_f32_bf16(vld1_u16(kptr + k * 16 + 4));
                                    float32x4_t _w2 = vcvt_f32_bf16(vld1_u16(kptr + k * 16 + 8));
                                    float32x4_t _w3 = vcvt_f32_bf16(vld1_u16(kptr + k * 16 + 12));

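                                    // vmlaq_laneq_f32 (lane from a 128-bit vector) is
                                    // AArch64-only, so 32-bit ARM splits _val into
                                    // 64-bit halves for vmlaq_lane_f32 instead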
#if __aarch64__
                                    _sum = vmlaq_laneq_f32(_sum, _w0, _val, 0);
                                    _sum = vmlaq_laneq_f32(_sum, _w1, _val, 1);
                                    _sum = vmlaq_laneq_f32(_sum, _w2, _val, 2);
                                    _sum = vmlaq_laneq_f32(_sum, _w3, _val, 3);
#else
                                    _sum = vmlaq_lane_f32(_sum, _w0, vget_low_f32(_val), 0);
                                    _sum = vmlaq_lane_f32(_sum, _w1, vget_low_f32(_val), 1);
                                    _sum = vmlaq_lane_f32(_sum, _w2, vget_high_f32(_val), 0);
                                    _sum = vmlaq_lane_f32(_sum, _w3, vget_high_f32(_val), 1);
#endif
                                }
                            }

                            kptr += maxk * 16;
                        }

                        _sum = activation_ps(_sum, activation_type, activation_params);

                        vst1_u16(outptr + j * 4, vcvt_bf16_f32(_sum));
                    }

                    outptr += outw * 4;
                }
            }
        }
    }

    if (elempack == 1 && out_elempack == 4)
    {
        {
            // num_output
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < num_output / out_elempack; p++)
            {
                unsigned short* outptr = top_blob_bordered.channel(p);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float32x4_t _sum = vdupq_n_f32(0.f);

                        if (bias_term)
                        {
                            _sum = vld1q_f32(((const float*)bias_data) + p * 4);
                        }

                        const unsigned short* kptr = weight_data_bf16.channel(p);

                        // channels
                        for (int q = 0; q < channels; q++)
                        {
                            const Mat m = bottom_blob.channel(q);

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                const unsigned short* sptr = m.row<const unsigned short>(sy);

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    float32x4_t _val = vdupq_n_f32(bfloat16_to_float32(sptr[sx]));

                                    int k = y * kernel_w + x;

                                    float32x4_t _w = vcvt_f32_bf16(vld1_u16(kptr + k * 4));

                                    _sum = vmlaq_f32(_sum, _val, _w);
                                }
                            }

                            kptr += maxk * 4;
                        }

                        _sum = activation_ps(_sum, activation_type, activation_params);

                        vst1_u16(outptr + j * 4, vcvt_bf16_f32(_sum));
                    }

                    outptr += outw * 4;
                }
            }
        }
    }

    if (elempack == 4 && out_elempack == 1)
    {
        {
            // num_output
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < num_output / out_elempack; p++)
            {
                unsigned short* outptr = top_blob_bordered.channel(p);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float sum = 0.f;

                        if (bias_term)
                        {
                            sum = bias_data[p];
                        }

                        const unsigned short* kptr = weight_data_bf16.channel(p);

                        // channels
                        for (int q = 0; q < channels; q++)
                        {
                            const Mat m = bottom_blob.channel(q);

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    const unsigned short* sptr = m.row<const unsigned short>(sy) + sx * 4;

                                    float32x4_t _val = vcvt_f32_bf16(vld1_u16(sptr));

                                    int k = y * kernel_w + x;

                                    float32x4_t _w = vcvt_f32_bf16(vld1_u16(kptr + k * 4));

                                    float32x4_t _s4 = vmulq_f32(_val, _w);
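                                    // horizontal add of the four products; vaddvq_f32
                                    // is AArch64-only, so ARMv7 reduces with a
                                    // pairwise add instead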
#if __aarch64__
                                    sum += vaddvq_f32(_s4); // dot
#else
                                    float32x2_t _ss = vadd_f32(vget_low_f32(_s4), vget_high_f32(_s4));
                                    _ss = vpadd_f32(_ss, _ss);
                                    sum += vget_lane_f32(_ss, 0);
#endif
                                }
                            }

                            kptr += maxk * 4;
                        }

                        sum = activation_ss(sum, activation_type, activation_params);

                        outptr[j] = float32_to_bfloat16(sum);
                    }

                    outptr += outw;
                }
            }
        }
    }
#endif // __ARM_NEON

    if (elempack == 1 && out_elempack == 1)
    {
        {
            // num_output
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < num_output; p++)
            {
                unsigned short* outptr = top_blob_bordered.channel(p);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float sum = 0.f;

                        if (bias_term)
                        {
                            sum = bias_data[p];
                        }

                        const unsigned short* kptr = weight_data_bf16.channel(p);

                        // channels
                        for (int q = 0; q < channels; q++)
                        {
                            const Mat m = bottom_blob.channel(q);

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                const unsigned short* sptr = m.row<const unsigned short>(sy);

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    float val = bfloat16_to_float32(sptr[sx]);

                                    int k = y * kernel_w + x;

                                    float w = bfloat16_to_float32(kptr[k]);

                                    sum += val * w;
                                }
                            }

                            kptr += maxk;
                        }

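                        // scalar activation, inlined here but equivalent to the
                        // activation_ss() used by the vector paths above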
                        if (activation_type == 1)
                        {
                            sum = std::max(sum, 0.f);
                        }
                        else if (activation_type == 2)
                        {
                            float slope = activation_params[0];
                            sum = sum > 0.f ? sum : sum * slope;
                        }
                        else if (activation_type == 3)
                        {
                            float min = activation_params[0];
                            float max = activation_params[1];
                            if (sum < min)
                                sum = min;
                            if (sum > max)
                                sum = max;
                        }
                        else if (activation_type == 4)
                        {
                            sum = static_cast<float>(1.f / (1.f + exp(-sum)));
                        }

                        outptr[j] = float32_to_bfloat16(sum);
                    }

                    outptr += outw;
                }
            }
        }
    }

    cut_padding(top_blob_bordered, top_blob, opt);
    if (top_blob.empty())
        return -100;

    return 0;
}

} // namespace ncnn