// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "convolution1d_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_activation.h"
#include "arm_usability.h"

namespace ncnn {

Convolution1D_arm::Convolution1D_arm()
{
#if __ARM_NEON
    support_packing = true;
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    support_fp16_storage = true;
#endif
#endif // __ARM_NEON

#if NCNN_BF16
    support_bf16_storage = true;
#endif
}

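// create_pipeline repacks weight_data for the chosen elempack so that the
// forward kernels below can stream weights with contiguous vector loads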
int Convolution1D_arm::create_pipeline(const Option& opt)
{
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    if (opt.use_fp16_storage)
    {
        return create_pipeline_fp16s(opt);
    }
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage)
    {
        return create_pipeline_bf16s(opt);
    }
#endif

    const int num_input = weight_data_size / kernel_w / num_output;

    int elempack = 1;
    int out_elempack = 1;

#if __ARM_NEON
    if (opt.use_packing_layout)
    {
        elempack = num_input % 4 == 0 ? 4 : 1;
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif

    // src = kw-inch-outch
    // dst = pb-pa-kw-inch/pa-outch/pb
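    // e.g. with elempack = 4 and out_elempack = 4, each kernel tap k stores a
    // 4x4 block: the 4 output-channel weights (pb) for input channel p, then
    // for p + 1, p + 2, p + 3 (pa), matching the j-inside-i loop order below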
    {
        Mat weight_data_r2 = weight_data.reshape(kernel_w, num_input, num_output);

        weight_data_packed.create(kernel_w, num_input / elempack, num_output / out_elempack, (size_t)4u * elempack * out_elempack, elempack * out_elempack);

        for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
        {
            Mat g0 = weight_data_packed.channel(q / out_elempack);

            for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
            {
                float* g00 = g0.row(p / elempack);

                for (int k = 0; k < kernel_w; k++)
                {
                    for (int i = 0; i < elempack; i++)
                    {
                        for (int j = 0; j < out_elempack; j++)
                        {
                            const float* k00 = weight_data_r2.channel(q + j).row(p + i);

                            g00[0] = k00[k];

                            g00++;
                        }
                    }
                }
            }
        }
    }

    return 0;
}

int Convolution1D_arm::destroy_pipeline(const Option& /*opt*/)
{
    return 0;
}

int Convolution1D_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int elembits = bottom_blob.elembits();

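    // 16-bit elements mean the blob already carries fp16 or bf16 storage;
    // dispatch to the matching kernel instead of the fp32 path below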
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    if (opt.use_fp16_storage && elembits == 16)
    {
        if (opt.use_fp16_arithmetic)
            return forward_fp16sa(bottom_blob, top_blob, opt);
        else
            return forward_fp16s(bottom_blob, top_blob, opt);
    }
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s(bottom_blob, top_blob, opt);
#endif

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
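    // input span of one kernel application, e.g. kernel_w = 3 with
    // dilation_w = 2 covers 2 * (3 - 1) + 1 = 5 input positions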

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int out_elempack = (support_packing && opt.use_packing_layout && num_output % 4 == 0) ? 4 : 1;
    size_t out_elemsize = elemsize / elempack * out_elempack;

    const int outw = (w - kernel_extent_w) / stride_w + 1;
    const int outh = num_output / out_elempack;

    top_blob.create(outw, outh, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

#if __ARM_NEON
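    // one kernel per (input elempack, output elempack) pair; every branch
    // walks output rows p, output positions j, input rows q and kernel taps k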
    if (elempack == 4 && out_elempack == 4)
    {
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < outh; p++)
            {
                float* outptr = top_blob.row(p);

                for (int j = 0; j < outw; j++)
                {
                    float32x4_t _sum = vdupq_n_f32(0.f);

                    if (bias_term)
                    {
                        _sum = vld1q_f32((const float*)bias_data + p * 4);
                    }

                    const float* kptr = weight_data_packed.channel(p);

                    for (int q = 0; q < h; q++)
                    {
                        const float* sptr = bottom_blob_bordered.row(q) + j * stride_w * 4;

                        for (int k = 0; k < kernel_w; k++)
                        {
                            float32x4_t _val = vld1q_f32(sptr);

                            float32x4_t _w0 = vld1q_f32(kptr);
                            float32x4_t _w1 = vld1q_f32(kptr + 4);
                            float32x4_t _w2 = vld1q_f32(kptr + 8);
                            float32x4_t _w3 = vld1q_f32(kptr + 12);

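                            // broadcast each input lane against its weight
                            // column (4x4 matrix-vector product); vmlaq_laneq_f32
                            // is AArch64-only, so the 32-bit build indexes the
                            // low/high halves with vmlaq_lane_f32 instead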
#if __aarch64__
                            _sum = vmlaq_laneq_f32(_sum, _w0, _val, 0);
                            _sum = vmlaq_laneq_f32(_sum, _w1, _val, 1);
                            _sum = vmlaq_laneq_f32(_sum, _w2, _val, 2);
                            _sum = vmlaq_laneq_f32(_sum, _w3, _val, 3);
#else
                            _sum = vmlaq_lane_f32(_sum, _w0, vget_low_f32(_val), 0);
                            _sum = vmlaq_lane_f32(_sum, _w1, vget_low_f32(_val), 1);
                            _sum = vmlaq_lane_f32(_sum, _w2, vget_high_f32(_val), 0);
                            _sum = vmlaq_lane_f32(_sum, _w3, vget_high_f32(_val), 1);
#endif

                            sptr += dilation_w * 4;
                            kptr += 16;
                        }
                    }

                    _sum = activation_ps(_sum, activation_type, activation_params);

                    vst1q_f32(outptr, _sum);
                    outptr += 4;
                }
            }
        }
    }

    if (elempack == 1 && out_elempack == 4)
    {
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < outh; p++)
            {
                float* outptr = top_blob.row(p);

                for (int j = 0; j < outw; j++)
                {
                    float32x4_t _sum = vdupq_n_f32(0.f);

                    if (bias_term)
                    {
                        _sum = vld1q_f32((const float*)bias_data + p * 4);
                    }

                    const float* kptr = weight_data_packed.channel(p);

                    for (int q = 0; q < h; q++)
                    {
                        const float* sptr = bottom_blob_bordered.row(q) + j * stride_w;

                        for (int k = 0; k < kernel_w; k++)
                        {
                            float32x4_t _val = vdupq_n_f32(sptr[0]);
                            float32x4_t _w = vld1q_f32(kptr);
                            _sum = vmlaq_f32(_sum, _val, _w);

                            sptr += dilation_w;
                            kptr += 4;
                        }
                    }

                    _sum = activation_ps(_sum, activation_type, activation_params);

                    vst1q_f32(outptr, _sum);
                    outptr += 4;
                }
            }
        }
    }

    if (elempack == 4 && out_elempack == 1)
    {
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < outh; p++)
            {
                float* outptr = top_blob.row(p);

                for (int j = 0; j < outw; j++)
                {
                    float sum = 0.f;

                    if (bias_term)
                    {
                        sum = bias_data[p];
                    }

                    const float* kptr = weight_data_packed.channel(p);

                    for (int q = 0; q < h; q++)
                    {
                        const float* sptr = bottom_blob_bordered.row(q) + j * stride_w * 4;

                        for (int k = 0; k < kernel_w; k++)
                        {
                            float32x4_t _val = vld1q_f32(sptr);
                            float32x4_t _w = vld1q_f32(kptr);
                            float32x4_t _s4 = vmulq_f32(_val, _w);
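                            // horizontal add of the four partial products:
                            // AArch64 has vaddvq_f32, ARMv7 reduces with
                            // pairwise adds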
#if __aarch64__
                            sum += vaddvq_f32(_s4); // dot
#else
                            float32x2_t _ss = vadd_f32(vget_low_f32(_s4), vget_high_f32(_s4));
                            _ss = vpadd_f32(_ss, _ss);
                            sum += vget_lane_f32(_ss, 0);
#endif

                            sptr += dilation_w * 4;
                            kptr += 4;
                        }
                    }

                    sum = activation_ss(sum, activation_type, activation_params);

                    outptr[j] = sum;
                }
            }
        }
    }
#endif // __ARM_NEON

    if (elempack == 1 && out_elempack == 1)
    {
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < outh; p++)
            {
                float* outptr = top_blob.row(p);

                for (int j = 0; j < outw; j++)
                {
                    float sum = 0.f;

                    if (bias_term)
                    {
                        sum = bias_data[p];
                    }

                    const float* kptr = (const float*)weight_data + kernel_w * h * p;

                    for (int q = 0; q < h; q++)
                    {
                        const float* sptr = bottom_blob_bordered.row(q) + j * stride_w;

                        for (int k = 0; k < kernel_w; k++)
                        {
                            float val = sptr[0];
                            float wt = kptr[0];
                            sum += val * wt;

                            sptr += dilation_w;
                            kptr += 1;
                        }
                    }

                    sum = activation_ss(sum, activation_type, activation_params);

                    outptr[j] = sum;
                }
            }
        }
    }

    return 0;
}

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
int Convolution1D_arm::create_pipeline_fp16s(const Option& opt)
{
    const int num_input = weight_data_size / kernel_w / num_output;

    int elempack = 1;
    int out_elempack = 1;

    if (opt.use_packing_layout)
    {
        elempack = opt.use_fp16_arithmetic && num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 : 1;
        out_elempack = opt.use_fp16_arithmetic && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
    }
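
    // pack by 8 only when fp16 arithmetic is enabled, since those kernels
    // accumulate in float16x8_t registers; fp16-storage-only packs by 4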

    // src = kw-inch-outch
    // dst = pb-pa-kw-inch/pa-outch/pb
    {
        Mat weight_data_r2 = weight_data.reshape(kernel_w, num_input, num_output);

        weight_data_fp16.create(kernel_w, num_input / elempack, num_output / out_elempack, (size_t)2u * elempack * out_elempack, elempack * out_elempack);

        for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
        {
            Mat g0 = weight_data_fp16.channel(q / out_elempack);

            for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
            {
                __fp16* g00 = g0.row<__fp16>(p / elempack);

                for (int k = 0; k < kernel_w; k++)
                {
                    for (int i = 0; i < elempack; i++)
                    {
                        for (int j = 0; j < out_elempack; j++)
                        {
                            const float* k00 = weight_data_r2.channel(q + j).row(p + i);

                            g00[0] = (__fp16)k00[k];

                            g00++;
                        }
                    }
                }
            }
        }
    }

    ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);

    return 0;
}

int Convolution1D_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int out_elempack = (support_packing && opt.use_packing_layout && num_output % 4 == 0) ? 4 : 1;
    size_t out_elemsize = elemsize / elempack * out_elempack;

    const int outw = (w - kernel_extent_w) / stride_w + 1;
    const int outh = num_output / out_elempack;

    top_blob.create(outw, outh, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

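    // fp16 storage with fp32 arithmetic: operands are widened with
    // vcvt_f32_f16, accumulated in float32x4_t, and narrowed again on store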
    if (elempack == 4 && out_elempack == 4)
    {
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < outh; p++)
            {
                __fp16* outptr = top_blob.row<__fp16>(p);

                for (int j = 0; j < outw; j++)
                {
                    float32x4_t _sum = vdupq_n_f32(0.f);

                    if (bias_term)
                    {
                        _sum = vld1q_f32((const float*)bias_data + p * 4);
                    }

                    const __fp16* kptr = weight_data_fp16.channel(p);

                    for (int q = 0; q < h; q++)
                    {
                        const __fp16* sptr = bottom_blob_bordered.row<const __fp16>(q) + j * stride_w * 4;

                        for (int k = 0; k < kernel_w; k++)
                        {
                            float32x4_t _val = vcvt_f32_f16(vld1_f16(sptr));

                            float32x4_t _w0 = vcvt_f32_f16(vld1_f16(kptr));
                            float32x4_t _w1 = vcvt_f32_f16(vld1_f16(kptr + 4));
                            float32x4_t _w2 = vcvt_f32_f16(vld1_f16(kptr + 8));
                            float32x4_t _w3 = vcvt_f32_f16(vld1_f16(kptr + 12));

                            _sum = vfmaq_laneq_f32(_sum, _w0, _val, 0);
                            _sum = vfmaq_laneq_f32(_sum, _w1, _val, 1);
                            _sum = vfmaq_laneq_f32(_sum, _w2, _val, 2);
                            _sum = vfmaq_laneq_f32(_sum, _w3, _val, 3);

                            sptr += dilation_w * 4;
                            kptr += 16;
                        }
                    }

                    _sum = activation_ps(_sum, activation_type, activation_params);

                    vst1_f16(outptr, vcvt_f16_f32(_sum));
                    outptr += 4;
                }
            }
        }
    }

    if (elempack == 1 && out_elempack == 4)
    {
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < outh; p++)
            {
                __fp16* outptr = top_blob.row<__fp16>(p);

                for (int j = 0; j < outw; j++)
                {
                    float32x4_t _sum = vdupq_n_f32(0.f);

                    if (bias_term)
                    {
                        _sum = vld1q_f32((const float*)bias_data + p * 4);
                    }

                    const __fp16* kptr = weight_data_fp16.channel(p);

                    for (int q = 0; q < h; q++)
                    {
                        const __fp16* sptr = bottom_blob_bordered.row<const __fp16>(q) + j * stride_w;

                        for (int k = 0; k < kernel_w; k++)
                        {
                            float32x4_t _val = vcvt_f32_f16(vdup_n_f16(sptr[0]));
                            float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr));
                            _sum = vfmaq_f32(_sum, _val, _w);

                            sptr += dilation_w;
                            kptr += 4;
                        }
                    }

                    _sum = activation_ps(_sum, activation_type, activation_params);

                    vst1_f16(outptr, vcvt_f16_f32(_sum));
                    outptr += 4;
                }
            }
        }
    }

    if (elempack == 4 && out_elempack == 1)
    {
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < outh; p++)
            {
                __fp16* outptr = top_blob.row<__fp16>(p);

                for (int j = 0; j < outw; j++)
                {
                    float sum = 0.f;

                    if (bias_term)
                    {
                        sum = bias_data[p];
                    }

                    const __fp16* kptr = weight_data_fp16.channel(p);

                    for (int q = 0; q < h; q++)
                    {
                        const __fp16* sptr = bottom_blob_bordered.row<const __fp16>(q) + j * stride_w * 4;

                        for (int k = 0; k < kernel_w; k++)
                        {
                            float32x4_t _val = vcvt_f32_f16(vld1_f16(sptr));
                            float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr));
                            float32x4_t _s4 = vmulq_f32(_val, _w);

                            sum += vaddvq_f32(_s4); // dot

                            sptr += dilation_w * 4;
                            kptr += 4;
                        }
                    }

                    sum = activation_ss(sum, activation_type, activation_params);

                    outptr[j] = (__fp16)sum;
                }
            }
        }
    }

    if (elempack == 1 && out_elempack == 1)
    {
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < outh; p++)
            {
                __fp16* outptr = top_blob.row<__fp16>(p);

                for (int j = 0; j < outw; j++)
                {
                    float sum = 0.f;

                    if (bias_term)
                    {
                        sum = bias_data[p];
                    }

                    const __fp16* kptr = weight_data_fp16.channel(p);

                    for (int q = 0; q < h; q++)
                    {
                        const __fp16* sptr = bottom_blob_bordered.row<const __fp16>(q) + j * stride_w;

                        for (int k = 0; k < kernel_w; k++)
                        {
                            float val = (float)sptr[0];
                            float wt = (float)kptr[0];
                            sum += val * wt;

                            sptr += dilation_w;
                            kptr += 1;
                        }
                    }

                    sum = activation_ss(sum, activation_type, activation_params);

                    outptr[j] = (__fp16)sum;
                }
            }
        }
    }

    return 0;
}

int Convolution1D_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int out_elempack = 1;
    if (opt.use_packing_layout)
    {
        out_elempack = opt.use_fp16_arithmetic && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
    }
    size_t out_elemsize = elemsize / elempack * out_elempack;

    const int outw = (w - kernel_extent_w) / stride_w + 1;
    const int outh = num_output / out_elempack;

    top_blob.create(outw, outh, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

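    // fp16 arithmetic throughout: sums stay in float16x4_t / float16x8_t
    // registers, trading some precision for twice the lanes per FMA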
    if (elempack == 8 && out_elempack == 8)
    {
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < outh; p++)
            {
                __fp16* outptr = top_blob.row<__fp16>(p);

                for (int j = 0; j < outw; j++)
                {
                    float16x8_t _sum = vdupq_n_f16((__fp16)0.f);

                    if (bias_term)
                    {
                        _sum = vld1q_f16((const __fp16*)bias_data_fp16 + p * 8);
                    }

                    const __fp16* kptr = weight_data_fp16.channel(p);

                    for (int q = 0; q < h; q++)
                    {
                        const __fp16* sptr = bottom_blob_bordered.row<const __fp16>(q) + j * stride_w * 8;

                        for (int k = 0; k < kernel_w; k++)
                        {
                            float16x8_t _val = vld1q_f16(sptr);

                            float16x8_t _w0 = vld1q_f16(kptr);
                            float16x8_t _w1 = vld1q_f16(kptr + 8);
                            float16x8_t _w2 = vld1q_f16(kptr + 16);
                            float16x8_t _w3 = vld1q_f16(kptr + 24);
                            float16x8_t _w4 = vld1q_f16(kptr + 32);
                            float16x8_t _w5 = vld1q_f16(kptr + 40);
                            float16x8_t _w6 = vld1q_f16(kptr + 48);
                            float16x8_t _w7 = vld1q_f16(kptr + 56);

                            _sum = vfmaq_laneq_f16(_sum, _w0, _val, 0);
                            _sum = vfmaq_laneq_f16(_sum, _w1, _val, 1);
                            _sum = vfmaq_laneq_f16(_sum, _w2, _val, 2);
                            _sum = vfmaq_laneq_f16(_sum, _w3, _val, 3);
                            _sum = vfmaq_laneq_f16(_sum, _w4, _val, 4);
                            _sum = vfmaq_laneq_f16(_sum, _w5, _val, 5);
                            _sum = vfmaq_laneq_f16(_sum, _w6, _val, 6);
                            _sum = vfmaq_laneq_f16(_sum, _w7, _val, 7);

                            sptr += dilation_w * 8;
                            kptr += 64;
                        }
                    }

                    _sum = activation_ps(_sum, activation_type, activation_params);

                    vst1q_f16(outptr, _sum);
                    outptr += 8;
                }
            }
        }
    }

    if (elempack == 1 && out_elempack == 8)
    {
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < outh; p++)
            {
                __fp16* outptr = top_blob.row<__fp16>(p);

                for (int j = 0; j < outw; j++)
                {
                    float16x8_t _sum = vdupq_n_f16((__fp16)0.f);

                    if (bias_term)
                    {
                        _sum = vld1q_f16((const __fp16*)bias_data_fp16 + p * 8);
                    }

                    const __fp16* kptr = weight_data_fp16.channel(p);

                    for (int q = 0; q < h; q++)
                    {
                        const __fp16* sptr = bottom_blob_bordered.row<const __fp16>(q) + j * stride_w;

                        for (int k = 0; k < kernel_w; k++)
                        {
                            float16x8_t _val = vdupq_n_f16(sptr[0]);
                            float16x8_t _w = vld1q_f16(kptr);
                            _sum = vfmaq_f16(_sum, _val, _w);

                            sptr += dilation_w;
                            kptr += 8;
                        }
                    }

                    _sum = activation_ps(_sum, activation_type, activation_params);

                    vst1q_f16(outptr, _sum);
                    outptr += 8;
                }
            }
        }
    }

    if (elempack == 4 && out_elempack == 8)
    {
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < outh; p++)
            {
                __fp16* outptr = top_blob.row<__fp16>(p);

                for (int j = 0; j < outw; j++)
                {
                    float16x8_t _sum = vdupq_n_f16((__fp16)0.f);

                    if (bias_term)
                    {
                        _sum = vld1q_f16((const __fp16*)bias_data_fp16 + p * 8);
                    }

                    const __fp16* kptr = weight_data_fp16.channel(p);

                    for (int q = 0; q < h; q++)
                    {
                        const __fp16* sptr = bottom_blob_bordered.row<const __fp16>(q) + j * stride_w * 4;

                        for (int k = 0; k < kernel_w; k++)
                        {
                            float16x4_t _val = vld1_f16(sptr);

                            float16x8_t _w0 = vld1q_f16(kptr);
                            float16x8_t _w1 = vld1q_f16(kptr + 8);
                            float16x8_t _w2 = vld1q_f16(kptr + 16);
                            float16x8_t _w3 = vld1q_f16(kptr + 24);

                            _sum = vfmaq_lane_f16(_sum, _w0, _val, 0);
                            _sum = vfmaq_lane_f16(_sum, _w1, _val, 1);
                            _sum = vfmaq_lane_f16(_sum, _w2, _val, 2);
                            _sum = vfmaq_lane_f16(_sum, _w3, _val, 3);

                            sptr += dilation_w * 4;
                            kptr += 32;
                        }
                    }

                    _sum = activation_ps(_sum, activation_type, activation_params);

                    vst1q_f16(outptr, _sum);
                    outptr += 8;
                }
            }
        }
    }

    if (elempack == 8 && out_elempack == 1)
    {
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < outh; p++)
            {
                __fp16* outptr = top_blob.row<__fp16>(p);

                for (int j = 0; j < outw; j++)
                {
                    float sum = 0.f;

                    if (bias_term)
                    {
                        sum = ((const __fp16*)bias_data_fp16)[p];
                    }

                    const __fp16* kptr = weight_data_fp16.channel(p);

                    for (int q = 0; q < h; q++)
                    {
                        const __fp16* sptr = bottom_blob_bordered.row<const __fp16>(q) + j * stride_w * 8;

                        for (int k = 0; k < kernel_w; k++)
                        {
                            float16x8_t _val = vld1q_f16(sptr);
                            float16x8_t _w = vld1q_f16(kptr);
                            float16x8_t _s8 = vmulq_f16(_val, _w);

                            float16x4_t _s4 = vadd_f16(vget_low_f16(_s8), vget_high_f16(_s8));
                            sum += vaddvq_f32(vcvt_f32_f16(_s4)); // dot

                            sptr += dilation_w * 8;
                            kptr += 8;
                        }
                    }

                    sum = activation_ss(sum, activation_type, activation_params);

                    outptr[j] = sum;
                }
            }
        }
    }

    if (elempack == 8 && out_elempack == 4)
    {
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < outh; p++)
            {
                __fp16* outptr = top_blob.row<__fp16>(p);

                for (int j = 0; j < outw; j++)
                {
                    float16x4_t _sum = vdup_n_f16((__fp16)0.f);

                    if (bias_term)
                    {
                        _sum = vld1_f16((const __fp16*)bias_data_fp16 + p * 4);
                    }

                    const __fp16* kptr = weight_data_fp16.channel(p);

                    for (int q = 0; q < h; q++)
                    {
                        const __fp16* sptr = bottom_blob_bordered.row<const __fp16>(q) + j * stride_w * 8;

                        for (int k = 0; k < kernel_w; k++)
                        {
                            float16x8_t _val = vld1q_f16(sptr);

                            float16x4_t _w0 = vld1_f16(kptr);
                            float16x4_t _w1 = vld1_f16(kptr + 4);
                            float16x4_t _w2 = vld1_f16(kptr + 8);
                            float16x4_t _w3 = vld1_f16(kptr + 12);
                            float16x4_t _w4 = vld1_f16(kptr + 16);
                            float16x4_t _w5 = vld1_f16(kptr + 20);
                            float16x4_t _w6 = vld1_f16(kptr + 24);
                            float16x4_t _w7 = vld1_f16(kptr + 28);

                            _sum = vfma_laneq_f16(_sum, _w0, _val, 0);
                            _sum = vfma_laneq_f16(_sum, _w1, _val, 1);
                            _sum = vfma_laneq_f16(_sum, _w2, _val, 2);
                            _sum = vfma_laneq_f16(_sum, _w3, _val, 3);
                            _sum = vfma_laneq_f16(_sum, _w4, _val, 4);
                            _sum = vfma_laneq_f16(_sum, _w5, _val, 5);
                            _sum = vfma_laneq_f16(_sum, _w6, _val, 6);
                            _sum = vfma_laneq_f16(_sum, _w7, _val, 7);

                            sptr += dilation_w * 8;
                            kptr += 32;
                        }
                    }

                    _sum = activation_ps(_sum, activation_type, activation_params);

                    vst1_f16(outptr, _sum);
                    outptr += 4;
                }
            }
        }
    }

    if (elempack == 4 && out_elempack == 4)
    {
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < outh; p++)
            {
                __fp16* outptr = top_blob.row<__fp16>(p);

                for (int j = 0; j < outw; j++)
                {
                    float16x4_t _sum = vdup_n_f16((__fp16)0.f);

                    if (bias_term)
                    {
                        _sum = vld1_f16((const __fp16*)bias_data_fp16 + p * 4);
                    }

                    const __fp16* kptr = weight_data_fp16.channel(p);

                    for (int q = 0; q < h; q++)
                    {
                        const __fp16* sptr = bottom_blob_bordered.row<const __fp16>(q) + j * stride_w * 4;

                        for (int k = 0; k < kernel_w; k++)
                        {
                            float16x4_t _val = vld1_f16(sptr);

                            float16x4_t _w0 = vld1_f16(kptr);
                            float16x4_t _w1 = vld1_f16(kptr + 4);
                            float16x4_t _w2 = vld1_f16(kptr + 8);
                            float16x4_t _w3 = vld1_f16(kptr + 12);

                            _sum = vfma_lane_f16(_sum, _w0, _val, 0);
                            _sum = vfma_lane_f16(_sum, _w1, _val, 1);
                            _sum = vfma_lane_f16(_sum, _w2, _val, 2);
                            _sum = vfma_lane_f16(_sum, _w3, _val, 3);

                            sptr += dilation_w * 4;
                            kptr += 16;
                        }
                    }

                    _sum = activation_ps(_sum, activation_type, activation_params);

                    vst1_f16(outptr, _sum);
                    outptr += 4;
                }
            }
        }
    }

    if (elempack == 1 && out_elempack == 4)
    {
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < outh; p++)
            {
                __fp16* outptr = top_blob.row<__fp16>(p);

                for (int j = 0; j < outw; j++)
                {
                    float16x4_t _sum = vdup_n_f16((__fp16)0.f);

                    if (bias_term)
                    {
                        _sum = vld1_f16((const __fp16*)bias_data_fp16 + p * 4);
                    }

                    const __fp16* kptr = weight_data_fp16.channel(p);

                    for (int q = 0; q < h; q++)
                    {
                        const __fp16* sptr = bottom_blob_bordered.row<const __fp16>(q) + j * stride_w;

                        for (int k = 0; k < kernel_w; k++)
                        {
                            float16x4_t _val = vdup_n_f16(sptr[0]);
                            float16x4_t _w = vld1_f16(kptr);
                            _sum = vfma_f16(_sum, _val, _w);

                            sptr += dilation_w;
                            kptr += 4;
                        }
                    }

                    _sum = activation_ps(_sum, activation_type, activation_params);

                    vst1_f16(outptr, _sum);
                    outptr += 4;
                }
            }
        }
    }

    if (elempack == 4 && out_elempack == 1)
    {
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < outh; p++)
            {
                __fp16* outptr = top_blob.row<__fp16>(p);

                for (int j = 0; j < outw; j++)
                {
                    float sum = 0.f;

                    if (bias_term)
                    {
                        sum = ((const __fp16*)bias_data_fp16)[p];
                    }

                    const __fp16* kptr = weight_data_fp16.channel(p);

                    for (int q = 0; q < h; q++)
                    {
                        const __fp16* sptr = bottom_blob_bordered.row<const __fp16>(q) + j * stride_w * 4;

                        for (int k = 0; k < kernel_w; k++)
                        {
                            float16x4_t _val = vld1_f16(sptr);
                            float16x4_t _w = vld1_f16(kptr);
                            float16x4_t _s4 = vmul_f16(_val, _w);

                            sum += vaddvq_f32(vcvt_f32_f16(_s4)); // dot

                            sptr += dilation_w * 4;
                            kptr += 4;
                        }
                    }

                    sum = activation_ss(sum, activation_type, activation_params);

                    outptr[j] = sum;
                }
            }
        }
    }

    if (elempack == 1 && out_elempack == 1)
    {
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < outh; p++)
            {
                __fp16* outptr = top_blob.row<__fp16>(p);

                for (int j = 0; j < outw; j++)
                {
                    float sum = 0.f;

                    if (bias_term)
                    {
                        sum = bias_data[p];
                    }

                    const __fp16* kptr = weight_data_fp16.channel(p);

                    for (int q = 0; q < h; q++)
                    {
                        const __fp16* sptr = bottom_blob_bordered.row<const __fp16>(q) + j * stride_w;

                        for (int k = 0; k < kernel_w; k++)
                        {
                            float val = (float)sptr[0];
                            float wt = (float)kptr[0];
                            sum += val * wt;

                            sptr += dilation_w;
                            kptr += 1;
                        }
                    }

                    sum = activation_ss(sum, activation_type, activation_params);

                    outptr[j] = (__fp16)sum;
                }
            }
        }
    }

    return 0;
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

#if NCNN_BF16
int Convolution1D_arm::create_pipeline_bf16s(const Option& opt)
{
    const int num_input = weight_data_size / kernel_w / num_output;

    int elempack = 1;
    int out_elempack = 1;

#if __ARM_NEON
    if (opt.use_packing_layout)
    {
        elempack = num_input % 4 == 0 ? 4 : 1;
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif

    // src = kw-inch-outch
    // dst = pb-pa-kw-inch/pa-outch/pb
    {
        Mat weight_data_r2 = weight_data.reshape(kernel_w, num_input, num_output);

        weight_data_bf16.create(kernel_w, num_input / elempack, num_output / out_elempack, (size_t)2u * elempack * out_elempack, elempack * out_elempack);

        for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
        {
            Mat g0 = weight_data_bf16.channel(q / out_elempack);

            for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
            {
                unsigned short* g00 = g0.row<unsigned short>(p / elempack);

                for (int k = 0; k < kernel_w; k++)
                {
                    for (int i = 0; i < elempack; i++)
                    {
                        for (int j = 0; j < out_elempack; j++)
                        {
                            const float* k00 = weight_data_r2.channel(q + j).row(p + i);

                            g00[0] = float32_to_bfloat16(k00[k]);

                            g00++;
                        }
                    }
                }
            }
        }
    }

    return 0;
}

int Convolution1D_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int out_elempack = (support_packing && opt.use_packing_layout && num_output % 4 == 0) ? 4 : 1;
    size_t out_elemsize = elemsize / elempack * out_elempack;

    const int outw = (w - kernel_extent_w) / stride_w + 1;
    const int outh = num_output / out_elempack;

    top_blob.create(outw, outh, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

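    // bf16 stores the high 16 bits of an fp32 value; the vcvt_f32_bf16 /
    // vcvt_bf16_f32 helpers here are ncnn's shift-based conversions, so no
    // hardware bf16 instructions are required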
#if __ARM_NEON
    if (elempack == 4 && out_elempack == 4)
    {
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < outh; p++)
            {
                unsigned short* outptr = top_blob.row<unsigned short>(p);

                for (int j = 0; j < outw; j++)
                {
                    float32x4_t _sum = vdupq_n_f32(0.f);

                    if (bias_term)
                    {
                        _sum = vld1q_f32((const float*)bias_data + p * 4);
                    }

                    const unsigned short* kptr = weight_data_bf16.channel(p);

                    for (int q = 0; q < h; q++)
                    {
                        const unsigned short* sptr = bottom_blob_bordered.row<const unsigned short>(q) + j * stride_w * 4;

                        for (int k = 0; k < kernel_w; k++)
                        {
                            float32x4_t _val = vcvt_f32_bf16(vld1_u16(sptr));

                            float32x4_t _w0 = vcvt_f32_bf16(vld1_u16(kptr));
                            float32x4_t _w1 = vcvt_f32_bf16(vld1_u16(kptr + 4));
                            float32x4_t _w2 = vcvt_f32_bf16(vld1_u16(kptr + 8));
                            float32x4_t _w3 = vcvt_f32_bf16(vld1_u16(kptr + 12));

#if __aarch64__
                            _sum = vmlaq_laneq_f32(_sum, _w0, _val, 0);
                            _sum = vmlaq_laneq_f32(_sum, _w1, _val, 1);
                            _sum = vmlaq_laneq_f32(_sum, _w2, _val, 2);
                            _sum = vmlaq_laneq_f32(_sum, _w3, _val, 3);
#else
                            _sum = vmlaq_lane_f32(_sum, _w0, vget_low_f32(_val), 0);
                            _sum = vmlaq_lane_f32(_sum, _w1, vget_low_f32(_val), 1);
                            _sum = vmlaq_lane_f32(_sum, _w2, vget_high_f32(_val), 0);
                            _sum = vmlaq_lane_f32(_sum, _w3, vget_high_f32(_val), 1);
#endif

                            sptr += dilation_w * 4;
                            kptr += 16;
                        }
                    }

                    _sum = activation_ps(_sum, activation_type, activation_params);

                    vst1_u16(outptr, vcvt_bf16_f32(_sum));
                    outptr += 4;
                }
            }
        }
    }

    if (elempack == 1 && out_elempack == 4)
    {
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < outh; p++)
            {
                unsigned short* outptr = top_blob.row<unsigned short>(p);

                for (int j = 0; j < outw; j++)
                {
                    float32x4_t _sum = vdupq_n_f32(0.f);

                    if (bias_term)
                    {
                        _sum = vld1q_f32((const float*)bias_data + p * 4);
                    }

                    const unsigned short* kptr = weight_data_bf16.channel(p);

                    for (int q = 0; q < h; q++)
                    {
                        const unsigned short* sptr = bottom_blob_bordered.row<const unsigned short>(q) + j * stride_w;

                        for (int k = 0; k < kernel_w; k++)
                        {
                            float32x4_t _val = vdupq_n_f32(bfloat16_to_float32(sptr[0]));
                            float32x4_t _w = vcvt_f32_bf16(vld1_u16(kptr));
                            _sum = vmlaq_f32(_sum, _val, _w);

                            sptr += dilation_w;
                            kptr += 4;
                        }
                    }

                    _sum = activation_ps(_sum, activation_type, activation_params);

                    vst1_u16(outptr, vcvt_bf16_f32(_sum));
                    outptr += 4;
                }
            }
        }
    }

    if (elempack == 4 && out_elempack == 1)
    {
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < outh; p++)
            {
                unsigned short* outptr = top_blob.row<unsigned short>(p);

                for (int j = 0; j < outw; j++)
                {
                    float sum = 0.f;

                    if (bias_term)
                    {
                        sum = bias_data[p];
                    }

                    const unsigned short* kptr = weight_data_bf16.channel(p);

                    for (int q = 0; q < h; q++)
                    {
                        const unsigned short* sptr = bottom_blob_bordered.row<const unsigned short>(q) + j * stride_w * 4;

                        for (int k = 0; k < kernel_w; k++)
                        {
                            float32x4_t _val = vcvt_f32_bf16(vld1_u16(sptr));
                            float32x4_t _w = vcvt_f32_bf16(vld1_u16(kptr));
                            float32x4_t _s4 = vmulq_f32(_val, _w);
#if __aarch64__
                            sum += vaddvq_f32(_s4); // dot
#else
                            float32x2_t _ss = vadd_f32(vget_low_f32(_s4), vget_high_f32(_s4));
                            _ss = vpadd_f32(_ss, _ss);
                            sum += vget_lane_f32(_ss, 0);
#endif

                            sptr += dilation_w * 4;
                            kptr += 4;
                        }
                    }

                    sum = activation_ss(sum, activation_type, activation_params);

                    outptr[j] = float32_to_bfloat16(sum);
                }
            }
        }
    }
#endif // __ARM_NEON

    if (elempack == 1 && out_elempack == 1)
    {
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < outh; p++)
            {
                unsigned short* outptr = top_blob.row<unsigned short>(p);

                for (int j = 0; j < outw; j++)
                {
                    float sum = 0.f;

                    if (bias_term)
                    {
                        sum = bias_data[p];
                    }

                    const unsigned short* kptr = weight_data_bf16.channel(p);

                    for (int q = 0; q < h; q++)
                    {
                        const unsigned short* sptr = bottom_blob_bordered.row<unsigned short>(q) + j * stride_w;

                        for (int k = 0; k < kernel_w; k++)
                        {
                            float val = bfloat16_to_float32(sptr[0]);
                            float wt = bfloat16_to_float32(kptr[0]);
                            sum += val * wt;

                            sptr += dilation_w;
                            kptr += 1;
                        }
                    }

                    sum = activation_ss(sum, activation_type, activation_params);

                    outptr[j] = float32_to_bfloat16(sum);
                }
            }
        }
    }

    return 0;
}
#endif // NCNN_BF16

} // namespace ncnn