// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "interp_arm.h"

#include <math.h>

#include <algorithm>

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON
22
23 namespace ncnn {
24
25 #include "interp_bicubic.h"
26 #include "interp_bicubic_bf16s.h"
27 #include "interp_bilinear.h"
28 #include "interp_bilinear_bf16s.h"
29
30 #if __ARM_NEON
31 #include "interp_bicubic_pack4.h"
32 #include "interp_bicubic_pack4_bf16s.h"
33 #include "interp_bilinear_pack4.h"
34 #include "interp_bilinear_pack4_bf16s.h"
35 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
36 #include "interp_bicubic_fp16s.h"
37 #include "interp_bicubic_pack4_fp16s.h"
38 #include "interp_bicubic_pack8_fp16s.h"
39 #include "interp_bilinear_fp16s.h"
40 #include "interp_bilinear_pack4_fp16s.h"
41 #include "interp_bilinear_pack8_fp16s.h"
42 #endif
43 #endif
44
Interp_arm()45 Interp_arm::Interp_arm()
46 {
47 #if __ARM_NEON
48 support_packing = true;
49 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
50 support_fp16_storage = true;
51 #endif
52 #endif // __ARM_NEON
53
54 support_bf16_storage = true;
55 }
56
forward(const std::vector<Mat> & bottom_blobs,std::vector<Mat> & top_blobs,const Option & opt) const57 int Interp_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
58 {
59 const Mat& bottom_blob = bottom_blobs[0];
60 const Mat& reference_blob = bottom_blobs[1];
61 Mat& top_blob = top_blobs[0];
62
63 int elembits = bottom_blob.elembits();
64
65 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
66 if (opt.use_fp16_storage && elembits == 16)
67 {
68 if (opt.use_fp16_arithmetic)
69 return forward_fp16sa(bottom_blobs, top_blobs, opt);
70 else
71 return forward_fp16s(bottom_blobs, top_blobs, opt);
72 }
73 #endif
74
75 if (opt.use_bf16_storage && elembits == 16)
76 return forward_bf16s(bottom_blobs, top_blobs, opt);
77
78 int h = bottom_blob.h;
79 int w = bottom_blob.w;
80 int channels = bottom_blob.c;
81 size_t elemsize = bottom_blob.elemsize;
82 int elempack = bottom_blob.elempack;
83
84 int outw = reference_blob.w;
85 int outh = reference_blob.h;
86
87 if (bottom_blob.dims == 1)
88 {
89 top_blob.create(outw, outh, w, elemsize, elempack, opt.blob_allocator);
90 if (top_blob.empty())
91 return -100;
92
93 #if __ARM_NEON
94 if (elempack == 4)
95 {
96 #pragma omp parallel for num_threads(opt.num_threads)
97 for (int q = 0; q < w; q++)
98 {
99 Mat top_blob_c = top_blob.channel(q);
100 float32x4_t _v = vld1q_f32((const float*)bottom_blob + q * 4);
101 top_blob_c.fill(_v);
102 }
103
104 return 0;
105 }
106 #endif // __ARM_NEON
107
108 #pragma omp parallel for num_threads(opt.num_threads)
109 for (int q = 0; q < w; q++)
110 {
111 Mat top_blob_c = top_blob.channel(q);
112 const float v = bottom_blob[q];
113 top_blob_c.fill(v);
114 }
115
116 return 0;
117 }
118
119 if (outw == w && outh == h)
120 {
121 top_blob = bottom_blob;
122 return 0;
123 }
124
125 top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
126 if (top_blob.empty())
127 return -100;
128
129 #if __ARM_NEON
130 if (elempack == 4)
131 {
132 if (resize_type == 1) // nearest
133 {
134 const float hs = outh ? h / (float)outh : 1.f / height_scale;
135 const float ws = outw ? w / (float)outw : 1.f / width_scale;
136
137 #pragma omp parallel for num_threads(opt.num_threads)
138 for (int q = 0; q < channels; q++)
139 {
140 const Mat src = bottom_blob.channel(q);
141 Mat dst = top_blob.channel(q);
142
143 for (int y = 0; y < outh; y++)
144 {
145 int in_y = std::min((int)(y * hs), (h - 1));
146
147 const float* ptr = src.row(in_y);
148 float* outptr = dst.row(y);
149 for (int x = 0; x < outw; x++)
150 {
151 int in_x = std::min((int)(x * ws), (w - 1));
152
153 float32x4_t _p = vld1q_f32(ptr + in_x * 4);
154 vst1q_f32(outptr, _p);
155
156 outptr += 4;
157 }
158 }
159 }
160 }
161
162 if (resize_type == 2) // bilinear
163 {
164 int* buf = new int[outw + outh + outw * 2 + outh * 2];
165
166 int* xofs = buf; //new int[outw];
167 int* yofs = buf + outw; //new int[outh];
168
169 float* alpha = (float*)(buf + outw + outh); //new float[outw * 2];
170 float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2];
171
172 linear_coeffs(w, outw, xofs, alpha, align_corner);
173 linear_coeffs(h, outh, yofs, beta, align_corner);
174
175 #pragma omp parallel for num_threads(opt.num_threads)
176 for (int q = 0; q < channels; q++)
177 {
178 const Mat src = bottom_blob.channel(q);
179 Mat dst = top_blob.channel(q);
180
181 resize_bilinear_image_pack4(src, dst, alpha, xofs, beta, yofs);
182 }
183
184 delete[] buf;
185 }
186
187 if (resize_type == 3) // bicubic
188 {
189 int* buf = new int[outw + outh + outw * 4 + outh * 4];
190
191 int* xofs = buf; //new int[outw];
192 int* yofs = buf + outw; //new int[outh];
193
194 float* alpha = (float*)(buf + outw + outh); //new float[outw * 4];
195 float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4];
196
197 cubic_coeffs(w, outw, xofs, alpha);
198 cubic_coeffs(h, outh, yofs, beta);
199
200 #pragma omp parallel for num_threads(opt.num_threads)
201 for (int q = 0; q < channels; q++)
202 {
203 const Mat src = bottom_blob.channel(q);
204 Mat dst = top_blob.channel(q);
205
206 resize_bicubic_image_pack4(src, dst, alpha, xofs, beta, yofs);
207 }
208
209 delete[] buf;
210 }
211
212 return 0;
213 }
214 #endif // __ARM_NEON
215
216 if (resize_type == 1) // nearest
217 {
218 const float hs = outh ? h / (float)outh : 1.f / height_scale;
219 const float ws = outw ? w / (float)outw : 1.f / width_scale;
220
221 #pragma omp parallel for num_threads(opt.num_threads)
222 for (int q = 0; q < channels; q++)
223 {
224 const Mat src = bottom_blob.channel(q);
225 Mat dst = top_blob.channel(q);
226
227 for (int y = 0; y < outh; y++)
228 {
229 int in_y = std::min((int)(y * hs), (h - 1));
230
231 const float* ptr = src.row(in_y);
232 float* outptr = dst.row(y);
233 for (int x = 0; x < outw; x++)
234 {
235 int in_x = std::min((int)(x * ws), (w - 1));
236 *outptr++ = ptr[in_x];
237 }
238 }
239 }
240 }
241
242 if (resize_type == 2) // bilinear
243 {
244 int* buf = new int[outw + outh + outw * 2 + outh * 2];
245
246 int* xofs = buf; //new int[outw];
247 int* yofs = buf + outw; //new int[outh];
248
249 float* alpha = (float*)(buf + outw + outh); //new float[outw * 2];
250 float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2];
251
252 linear_coeffs(w, outw, xofs, alpha, align_corner);
253 linear_coeffs(h, outh, yofs, beta, align_corner);
254
255 #pragma omp parallel for num_threads(opt.num_threads)
256 for (int q = 0; q < channels; q++)
257 {
258 const Mat src = bottom_blob.channel(q);
259 Mat dst = top_blob.channel(q);
260
261 resize_bilinear_image(src, dst, alpha, xofs, beta, yofs);
262 }
263
264 delete[] buf;
265 }
266
267 if (resize_type == 3) // bicubic
268 {
269 int* buf = new int[outw + outh + outw * 4 + outh * 4];
270
271 int* xofs = buf; //new int[outw];
272 int* yofs = buf + outw; //new int[outh];
273
274 float* alpha = (float*)(buf + outw + outh); //new float[outw * 4];
275 float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4];
276
277 cubic_coeffs(w, outw, xofs, alpha);
278 cubic_coeffs(h, outh, yofs, beta);
279
280 #pragma omp parallel for num_threads(opt.num_threads)
281 for (int q = 0; q < channels; q++)
282 {
283 const Mat src = bottom_blob.channel(q);
284 Mat dst = top_blob.channel(q);
285
286 resize_bicubic_image(src, dst, alpha, xofs, beta, yofs);
287 }
288
289 delete[] buf;
290 }
291
292 return 0;
293 }
294
295 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
forward_fp16s(const std::vector<Mat> & bottom_blobs,std::vector<Mat> & top_blobs,const Option & opt) const296 int Interp_arm::forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
297 {
298 const Mat& bottom_blob = bottom_blobs[0];
299 const Mat& reference_blob = bottom_blobs[1];
300 Mat& top_blob = top_blobs[0];
301
302 int h = bottom_blob.h;
303 int w = bottom_blob.w;
304 int channels = bottom_blob.c;
305 size_t elemsize = bottom_blob.elemsize;
306 int elempack = bottom_blob.elempack;
307
308 int outw = reference_blob.w;
309 int outh = reference_blob.h;
310
311 if (bottom_blob.dims == 1)
312 {
313 top_blob.create(outw, outh, w, elemsize, elempack, opt.blob_allocator);
314 if (top_blob.empty())
315 return -100;
316
317 if (elempack == 4)
318 {
319 #pragma omp parallel for num_threads(opt.num_threads)
320 for (int q = 0; q < w; q++)
321 {
322 Mat top_blob_c = top_blob.channel(q);
323 float16x4_t _v = vld1_f16((const __fp16*)bottom_blob + q * 4);
324 top_blob_c.fill(_v);
325 }
326
327 return 0;
328 }
329
330 #pragma omp parallel for num_threads(opt.num_threads)
331 for (int q = 0; q < w; q++)
332 {
333 Mat top_blob_c = top_blob.channel(q);
334 const __fp16* ptr = bottom_blob;
335 top_blob_c.fill(ptr[q]);
336 }
337
338 return 0;
339 }
340
341 if (outw == w && outh == h)
342 {
343 top_blob = bottom_blob;
344 return 0;
345 }
346
347 top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
348 if (top_blob.empty())
349 return -100;
350
351 if (elempack == 4)
352 {
353 if (resize_type == 1) // nearest
354 {
355 const float hs = outh ? h / (float)outh : 1.f / height_scale;
356 const float ws = outw ? w / (float)outw : 1.f / width_scale;
357
358 #pragma omp parallel for num_threads(opt.num_threads)
359 for (int q = 0; q < channels; q++)
360 {
361 const Mat src = bottom_blob.channel(q);
362 Mat dst = top_blob.channel(q);
363
364 for (int y = 0; y < outh; y++)
365 {
366 int in_y = std::min((int)(y * hs), (h - 1));
367
368 const __fp16* ptr = src.row<const __fp16>(in_y);
369 __fp16* outptr = dst.row<__fp16>(y);
370 for (int x = 0; x < outw; x++)
371 {
372 int in_x = std::min((int)(x * ws), (w - 1));
373
374 float16x4_t _p = vld1_f16(ptr + in_x * 4);
375 vst1_f16(outptr, _p);
376
377 outptr += 4;
378 }
379 }
380 }
381 }
382
383 if (resize_type == 2) // bilinear
384 {
385 int* buf = new int[outw + outh + outw * 2 + outh * 2];
386
387 int* xofs = buf; //new int[outw];
388 int* yofs = buf + outw; //new int[outh];
389
390 float* alpha = (float*)(buf + outw + outh); //new float[outw * 2];
391 float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2];
392
393 linear_coeffs(w, outw, xofs, alpha, align_corner);
394 linear_coeffs(h, outh, yofs, beta, align_corner);
395
396 #pragma omp parallel for num_threads(opt.num_threads)
397 for (int q = 0; q < channels; q++)
398 {
399 const Mat src = bottom_blob.channel(q);
400 Mat dst = top_blob.channel(q);
401
402 resize_bilinear_image_pack4_fp16s(src, dst, alpha, xofs, beta, yofs);
403 }
404
405 delete[] buf;
406 }
407
408 if (resize_type == 3) // bicubic
409 {
410 int* buf = new int[outw + outh + outw * 4 + outh * 4];
411
412 int* xofs = buf; //new int[outw];
413 int* yofs = buf + outw; //new int[outh];
414
415 float* alpha = (float*)(buf + outw + outh); //new float[outw * 4];
416 float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4];
417
418 cubic_coeffs(w, outw, xofs, alpha);
419 cubic_coeffs(h, outh, yofs, beta);
420
421 #pragma omp parallel for num_threads(opt.num_threads)
422 for (int q = 0; q < channels; q++)
423 {
424 const Mat src = bottom_blob.channel(q);
425 Mat dst = top_blob.channel(q);
426
427 resize_bicubic_image_pack4_fp16s(src, dst, alpha, xofs, beta, yofs);
428 }
429
430 delete[] buf;
431 }
432
433 return 0;
434 }
435
436 if (resize_type == 1) // nearest
437 {
438 const float hs = outh ? h / (float)outh : 1.f / height_scale;
439 const float ws = outw ? w / (float)outw : 1.f / width_scale;
440
441 #pragma omp parallel for num_threads(opt.num_threads)
442 for (int q = 0; q < channels; q++)
443 {
444 const Mat src = bottom_blob.channel(q);
445 Mat dst = top_blob.channel(q);
446
447 for (int y = 0; y < outh; y++)
448 {
449 int in_y = std::min((int)(y * hs), (h - 1));
450
451 const __fp16* ptr = src.row<const __fp16>(in_y);
452 __fp16* outptr = dst.row<__fp16>(y);
453 for (int x = 0; x < outw; x++)
454 {
455 int in_x = std::min((int)(x * ws), (w - 1));
456 *outptr++ = ptr[in_x];
457 }
458 }
459 }
460 }
461
462 if (resize_type == 2) // bilinear
463 {
464 int* buf = new int[outw + outh + outw * 2 + outh * 2];
465
466 int* xofs = buf; //new int[outw];
467 int* yofs = buf + outw; //new int[outh];
468
469 float* alpha = (float*)(buf + outw + outh); //new float[outw * 2];
470 float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2];
471
472 linear_coeffs(w, outw, xofs, alpha, align_corner);
473 linear_coeffs(h, outh, yofs, beta, align_corner);
474
475 #pragma omp parallel for num_threads(opt.num_threads)
476 for (int q = 0; q < channels; q++)
477 {
478 const Mat src = bottom_blob.channel(q);
479 Mat dst = top_blob.channel(q);
480
481 resize_bilinear_image_fp16s(src, dst, alpha, xofs, beta, yofs);
482 }
483
484 delete[] buf;
485 }
486
487 if (resize_type == 3) // bicubic
488 {
489 int* buf = new int[outw + outh + outw * 4 + outh * 4];
490
491 int* xofs = buf; //new int[outw];
492 int* yofs = buf + outw; //new int[outh];
493
494 float* alpha = (float*)(buf + outw + outh); //new float[outw * 4];
495 float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4];
496
497 cubic_coeffs(w, outw, xofs, alpha);
498 cubic_coeffs(h, outh, yofs, beta);
499
500 #pragma omp parallel for num_threads(opt.num_threads)
501 for (int q = 0; q < channels; q++)
502 {
503 const Mat src = bottom_blob.channel(q);
504 Mat dst = top_blob.channel(q);
505
506 resize_bicubic_image_fp16s(src, dst, alpha, xofs, beta, yofs);
507 }
508
509 delete[] buf;
510 }
511
512 return 0;
513 }
514
forward_fp16sa(const std::vector<Mat> & bottom_blobs,std::vector<Mat> & top_blobs,const Option & opt) const515 int Interp_arm::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
516 {
517 const Mat& bottom_blob = bottom_blobs[0];
518 const Mat& reference_blob = bottom_blobs[1];
519 Mat& top_blob = top_blobs[0];
520
521 int h = bottom_blob.h;
522 int w = bottom_blob.w;
523 int channels = bottom_blob.c;
524 size_t elemsize = bottom_blob.elemsize;
525 int elempack = bottom_blob.elempack;
526
527 int outw = reference_blob.w;
528 int outh = reference_blob.h;
529
530 if (bottom_blob.dims == 1)
531 {
532 top_blob.create(outw, outh, w, elemsize, elempack, opt.blob_allocator);
533 if (top_blob.empty())
534 return -100;
535
536 if (elempack == 8)
537 {
538 #pragma omp parallel for num_threads(opt.num_threads)
539 for (int q = 0; q < w; q++)
540 {
541 Mat top_blob_c = top_blob.channel(q);
542 float16x8_t _v = vld1q_f16((const __fp16*)bottom_blob + q * 8);
543 top_blob_c.fill(_v);
544 }
545
546 return 0;
547 }
548
549 if (elempack == 4)
550 {
551 #pragma omp parallel for num_threads(opt.num_threads)
552 for (int q = 0; q < w; q++)
553 {
554 Mat top_blob_c = top_blob.channel(q);
555 float16x4_t _v = vld1_f16((const __fp16*)bottom_blob + q * 4);
556 top_blob_c.fill(_v);
557 }
558
559 return 0;
560 }
561
562 #pragma omp parallel for num_threads(opt.num_threads)
563 for (int q = 0; q < w; q++)
564 {
565 Mat top_blob_c = top_blob.channel(q);
566 const __fp16* ptr = bottom_blob;
567 top_blob_c.fill(ptr[q]);
568 }
569
570 return 0;
571 }
572
573 if (outw == w && outh == h)
574 {
575 top_blob = bottom_blob;
576 return 0;
577 }
578
579 top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
580 if (top_blob.empty())
581 return -100;
582
583 if (elempack == 8)
584 {
585 if (resize_type == 1) // nearest
586 {
587 const float hs = outh ? h / (float)outh : 1.f / height_scale;
588 const float ws = outw ? w / (float)outw : 1.f / width_scale;
589
590 #pragma omp parallel for num_threads(opt.num_threads)
591 for (int q = 0; q < channels; q++)
592 {
593 const Mat src = bottom_blob.channel(q);
594 Mat dst = top_blob.channel(q);
595
596 for (int y = 0; y < outh; y++)
597 {
598 int in_y = std::min((int)(y * hs), (h - 1));
599
600 const __fp16* ptr = src.row<const __fp16>(in_y);
601 __fp16* outptr = dst.row<__fp16>(y);
602 for (int x = 0; x < outw; x++)
603 {
604 int in_x = std::min((int)(x * ws), (w - 1));
605
606 float16x8_t _p = vld1q_f16(ptr + in_x * 8);
607 vst1q_f16(outptr, _p);
608
609 outptr += 8;
610 }
611 }
612 }
613 }
614
615 if (resize_type == 2) // bilinear
616 {
617 int* buf = new int[outw + outh + outw * 2 + outh * 2];
618
619 int* xofs = buf; //new int[outw];
620 int* yofs = buf + outw; //new int[outh];
621
622 __fp16* alpha = (__fp16*)(buf + outw + outh); //new __fp16[outw * 2];
623 __fp16* beta = (__fp16*)(buf + outw + outh + outw * 2); //new __fp16[outh * 2];
624
625 linear_coeffs_fp16sa(w, outw, xofs, alpha, align_corner);
626 linear_coeffs_fp16sa(h, outh, yofs, beta, align_corner);
627
628 #pragma omp parallel for num_threads(opt.num_threads)
629 for (int q = 0; q < channels; q++)
630 {
631 const Mat src = bottom_blob.channel(q);
632 Mat dst = top_blob.channel(q);
633
634 resize_bilinear_image_pack8_fp16sa(src, dst, alpha, xofs, beta, yofs);
635 }
636
637 delete[] buf;
638 }
639
640 if (resize_type == 3) // bicubic
641 {
642 int* buf = new int[outw + outh + outw * 4 + outh * 4];
643
644 int* xofs = buf; //new int[outw];
645 int* yofs = buf + outw; //new int[outh];
646
647 __fp16* alpha = (__fp16*)(buf + outw + outh); //new __fp16[outw * 4];
648 __fp16* beta = (__fp16*)(buf + outw + outh + outw * 4); //new __fp16[outh * 4];
649
650 cubic_coeffs_fp16sa(w, outw, xofs, alpha);
651 cubic_coeffs_fp16sa(h, outh, yofs, beta);
652
653 #pragma omp parallel for num_threads(opt.num_threads)
654 for (int q = 0; q < channels; q++)
655 {
656 const Mat src = bottom_blob.channel(q);
657 Mat dst = top_blob.channel(q);
658
659 resize_bicubic_image_pack8_fp16sa(src, dst, alpha, xofs, beta, yofs);
660 }
661
662 delete[] buf;
663 }
664
665 return 0;
666 }
667
668 if (elempack == 4)
669 {
670 if (resize_type == 1) // nearest
671 {
672 const float hs = outh ? h / (float)outh : 1.f / height_scale;
673 const float ws = outw ? w / (float)outw : 1.f / width_scale;
674
675 #pragma omp parallel for num_threads(opt.num_threads)
676 for (int q = 0; q < channels; q++)
677 {
678 const Mat src = bottom_blob.channel(q);
679 Mat dst = top_blob.channel(q);
680
681 for (int y = 0; y < outh; y++)
682 {
683 int in_y = std::min((int)(y * hs), (h - 1));
684
685 const __fp16* ptr = src.row<const __fp16>(in_y);
686 __fp16* outptr = dst.row<__fp16>(y);
687 for (int x = 0; x < outw; x++)
688 {
689 int in_x = std::min((int)(x * ws), (w - 1));
690
691 float16x4_t _p = vld1_f16(ptr + in_x * 4);
692 vst1_f16(outptr, _p);
693
694 outptr += 4;
695 }
696 }
697 }
698 }
699
700 if (resize_type == 2) // bilinear
701 {
702 int* buf = new int[outw + outh + outw * 2 + outh * 2];
703
704 int* xofs = buf; //new int[outw];
705 int* yofs = buf + outw; //new int[outh];
706
707 __fp16* alpha = (__fp16*)(buf + outw + outh); //new __fp16[outw * 2];
708 __fp16* beta = (__fp16*)(buf + outw + outh + outw * 2); //new __fp16[outh * 2];
709
710 linear_coeffs_fp16sa(w, outw, xofs, alpha, align_corner);
711 linear_coeffs_fp16sa(h, outh, yofs, beta, align_corner);
712
713 #pragma omp parallel for num_threads(opt.num_threads)
714 for (int q = 0; q < channels; q++)
715 {
716 const Mat src = bottom_blob.channel(q);
717 Mat dst = top_blob.channel(q);
718
719 resize_bilinear_image_pack4_fp16sa(src, dst, alpha, xofs, beta, yofs);
720 }
721
722 delete[] buf;
723 }
724
725 if (resize_type == 3) // bicubic
726 {
727 int* buf = new int[outw + outh + outw * 4 + outh * 4];
728
729 int* xofs = buf; //new int[outw];
730 int* yofs = buf + outw; //new int[outh];
731
732 __fp16* alpha = (__fp16*)(buf + outw + outh); //new __fp16[outw * 4];
733 __fp16* beta = (__fp16*)(buf + outw + outh + outw * 4); //new __fp16[outh * 4];
734
735 cubic_coeffs_fp16sa(w, outw, xofs, alpha);
736 cubic_coeffs_fp16sa(h, outh, yofs, beta);
737
738 #pragma omp parallel for num_threads(opt.num_threads)
739 for (int q = 0; q < channels; q++)
740 {
741 const Mat src = bottom_blob.channel(q);
742 Mat dst = top_blob.channel(q);
743
744 resize_bicubic_image_pack4_fp16sa(src, dst, alpha, xofs, beta, yofs);
745 }
746
747 delete[] buf;
748 }
749
750 return 0;
751 }
752
753 if (resize_type == 1) // nearest
754 {
755 const float hs = outh ? h / (float)outh : 1.f / height_scale;
756 const float ws = outw ? w / (float)outw : 1.f / width_scale;
757
758 #pragma omp parallel for num_threads(opt.num_threads)
759 for (int q = 0; q < channels; q++)
760 {
761 const Mat src = bottom_blob.channel(q);
762 Mat dst = top_blob.channel(q);
763
764 for (int y = 0; y < outh; y++)
765 {
766 int in_y = std::min((int)(y * hs), (h - 1));
767
768 const __fp16* ptr = src.row<const __fp16>(in_y);
769 __fp16* outptr = dst.row<__fp16>(y);
770 for (int x = 0; x < outw; x++)
771 {
772 int in_x = std::min((int)(x * ws), (w - 1));
773 *outptr++ = ptr[in_x];
774 }
775 }
776 }
777 }
778
779 if (resize_type == 2) // bilinear
780 {
781 int* buf = new int[outw + outh + outw * 2 + outh * 2];
782
783 int* xofs = buf; //new int[outw];
784 int* yofs = buf + outw; //new int[outh];
785
786 __fp16* alpha = (__fp16*)(buf + outw + outh); //new __fp16[outw * 2];
787 __fp16* beta = (__fp16*)(buf + outw + outh + outw * 2); //new __fp16[outh * 2];
788
789 linear_coeffs_fp16sa(w, outw, xofs, alpha, align_corner);
790 linear_coeffs_fp16sa(h, outh, yofs, beta, align_corner);
791
792 #pragma omp parallel for num_threads(opt.num_threads)
793 for (int q = 0; q < channels; q++)
794 {
795 const Mat src = bottom_blob.channel(q);
796 Mat dst = top_blob.channel(q);
797
798 resize_bilinear_image_fp16sa(src, dst, alpha, xofs, beta, yofs);
799 }
800
801 delete[] buf;
802 }
803
804 if (resize_type == 3) // bicubic
805 {
806 int* buf = new int[outw + outh + outw * 4 + outh * 4];
807
808 int* xofs = buf; //new int[outw];
809 int* yofs = buf + outw; //new int[outh];
810
811 __fp16* alpha = (__fp16*)(buf + outw + outh); //new __fp16[outw * 4];
812 __fp16* beta = (__fp16*)(buf + outw + outh + outw * 4); //new __fp16[outh * 4];
813
814 cubic_coeffs_fp16sa(w, outw, xofs, alpha);
815 cubic_coeffs_fp16sa(h, outh, yofs, beta);
816
817 #pragma omp parallel for num_threads(opt.num_threads)
818 for (int q = 0; q < channels; q++)
819 {
820 const Mat src = bottom_blob.channel(q);
821 Mat dst = top_blob.channel(q);
822
823 resize_bicubic_image_fp16sa(src, dst, alpha, xofs, beta, yofs);
824 }
825
826 delete[] buf;
827 }
828
829 return 0;
830 }
831 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
832
forward_bf16s(const std::vector<Mat> & bottom_blobs,std::vector<Mat> & top_blobs,const Option & opt) const833 int Interp_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
834 {
835 const Mat& bottom_blob = bottom_blobs[0];
836 const Mat& reference_blob = bottom_blobs[1];
837 Mat& top_blob = top_blobs[0];
838
839 int h = bottom_blob.h;
840 int w = bottom_blob.w;
841 int channels = bottom_blob.c;
842 size_t elemsize = bottom_blob.elemsize;
843 int elempack = bottom_blob.elempack;
844
845 int outw = reference_blob.w;
846 int outh = reference_blob.h;
847
848 if (bottom_blob.dims == 1)
849 {
850 top_blob.create(outw, outh, w, elemsize, elempack, opt.blob_allocator);
851 if (top_blob.empty())
852 return -100;
853
854 #if __ARM_NEON
855 if (elempack == 4)
856 {
857 #pragma omp parallel for num_threads(opt.num_threads)
858 for (int q = 0; q < w; q++)
859 {
860 Mat top_blob_c = top_blob.channel(q);
861 uint16x4_t _v = vld1_u16((const unsigned short*)bottom_blob + q * 4);
862 top_blob_c.fill(_v);
863 }
864
865 return 0;
866 }
867 #endif // __ARM_NEON
868
869 #pragma omp parallel for num_threads(opt.num_threads)
870 for (int q = 0; q < w; q++)
871 {
872 Mat top_blob_c = top_blob.channel(q);
873 const unsigned short* ptr = bottom_blob;
874 top_blob_c.fill(ptr[q]);
875 }
876
877 return 0;
878 }
879
880 if (outw == w && outh == h)
881 {
882 top_blob = bottom_blob;
883 return 0;
884 }
885
886 top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
887 if (top_blob.empty())
888 return -100;
889
890 #if __ARM_NEON
891 if (elempack == 4)
892 {
893 if (resize_type == 1) // nearest
894 {
895 const float hs = outh ? h / (float)outh : 1.f / height_scale;
896 const float ws = outw ? w / (float)outw : 1.f / width_scale;
897
898 #pragma omp parallel for num_threads(opt.num_threads)
899 for (int q = 0; q < channels; q++)
900 {
901 const Mat src = bottom_blob.channel(q);
902 Mat dst = top_blob.channel(q);
903
904 for (int y = 0; y < outh; y++)
905 {
906 int in_y = std::min((int)(y * hs), (h - 1));
907
908 const unsigned short* ptr = src.row<const unsigned short>(in_y);
909 unsigned short* outptr = dst.row<unsigned short>(y);
910 for (int x = 0; x < outw; x++)
911 {
912 int in_x = std::min((int)(x * ws), (w - 1));
913
914 uint16x4_t _p = vld1_u16(ptr + in_x * 4);
915 vst1_u16(outptr, _p);
916
917 outptr += 4;
918 }
919 }
920 }
921 }
922
923 if (resize_type == 2) // bilinear
924 {
925 int* buf = new int[outw + outh + outw * 2 + outh * 2];
926
927 int* xofs = buf; //new int[outw];
928 int* yofs = buf + outw; //new int[outh];
929
930 float* alpha = (float*)(buf + outw + outh); //new float[outw * 2];
931 float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2];
932
933 linear_coeffs(w, outw, xofs, alpha, align_corner);
934 linear_coeffs(h, outh, yofs, beta, align_corner);
935
936 #pragma omp parallel for num_threads(opt.num_threads)
937 for (int q = 0; q < channels; q++)
938 {
939 const Mat src = bottom_blob.channel(q);
940 Mat dst = top_blob.channel(q);
941
942 resize_bilinear_image_pack4_bf16s(src, dst, alpha, xofs, beta, yofs);
943 }
944
945 delete[] buf;
946 }
947
948 if (resize_type == 3) // bicubic
949 {
950 int* buf = new int[outw + outh + outw * 4 + outh * 4];
951
952 int* xofs = buf; //new int[outw];
953 int* yofs = buf + outw; //new int[outh];
954
955 float* alpha = (float*)(buf + outw + outh); //new float[outw * 4];
956 float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4];
957
958 cubic_coeffs(w, outw, xofs, alpha);
959 cubic_coeffs(h, outh, yofs, beta);
960
961 #pragma omp parallel for num_threads(opt.num_threads)
962 for (int q = 0; q < channels; q++)
963 {
964 const Mat src = bottom_blob.channel(q);
965 Mat dst = top_blob.channel(q);
966
967 resize_bicubic_image_pack4_bf16s(src, dst, alpha, xofs, beta, yofs);
968 }
969
970 delete[] buf;
971 }
972
973 return 0;
974 }
975 #endif // __ARM_NEON
976
977 if (resize_type == 1) // nearest
978 {
979 const float hs = outh ? h / (float)outh : 1.f / height_scale;
980 const float ws = outw ? w / (float)outw : 1.f / width_scale;
981
982 #pragma omp parallel for num_threads(opt.num_threads)
983 for (int q = 0; q < channels; q++)
984 {
985 const Mat src = bottom_blob.channel(q);
986 Mat dst = top_blob.channel(q);
987
988 for (int y = 0; y < outh; y++)
989 {
990 int in_y = std::min((int)(y * hs), (h - 1));
991
992 const unsigned short* ptr = src.row<const unsigned short>(in_y);
993 unsigned short* outptr = dst.row<unsigned short>(y);
994 for (int x = 0; x < outw; x++)
995 {
996 int in_x = std::min((int)(x * ws), (w - 1));
997 *outptr++ = ptr[in_x];
998 }
999 }
1000 }
1001 }
1002
1003 if (resize_type == 2) // bilinear
1004 {
1005 int* buf = new int[outw + outh + outw * 2 + outh * 2];
1006
1007 int* xofs = buf; //new int[outw];
1008 int* yofs = buf + outw; //new int[outh];
1009
1010 float* alpha = (float*)(buf + outw + outh); //new float[outw * 2];
1011 float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2];
1012
1013 linear_coeffs(w, outw, xofs, alpha, align_corner);
1014 linear_coeffs(h, outh, yofs, beta, align_corner);
1015
1016 #pragma omp parallel for num_threads(opt.num_threads)
1017 for (int q = 0; q < channels; q++)
1018 {
1019 const Mat src = bottom_blob.channel(q);
1020 Mat dst = top_blob.channel(q);
1021
1022 resize_bilinear_image_bf16s(src, dst, alpha, xofs, beta, yofs);
1023 }
1024
1025 delete[] buf;
1026 }
1027
1028 if (resize_type == 3) // bicubic
1029 {
1030 int* buf = new int[outw + outh + outw * 4 + outh * 4];
1031
1032 int* xofs = buf; //new int[outw];
1033 int* yofs = buf + outw; //new int[outh];
1034
1035 float* alpha = (float*)(buf + outw + outh); //new float[outw * 4];
1036 float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4];
1037
1038 cubic_coeffs(w, outw, xofs, alpha);
1039 cubic_coeffs(h, outh, yofs, beta);
1040
1041 #pragma omp parallel for num_threads(opt.num_threads)
1042 for (int q = 0; q < channels; q++)
1043 {
1044 const Mat src = bottom_blob.channel(q);
1045 Mat dst = top_blob.channel(q);
1046
1047 resize_bicubic_image_bf16s(src, dst, alpha, xofs, beta, yofs);
1048 }
1049
1050 delete[] buf;
1051 }
1052
1053 return 0;
1054 }
1055
1056 } // namespace ncnn
1057