// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "interp_arm.h"

#include <math.h>

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

namespace ncnn {

#include "interp_bicubic.h"
#include "interp_bicubic_bf16s.h"
#include "interp_bilinear.h"
#include "interp_bilinear_bf16s.h"

#if __ARM_NEON
#include "interp_bicubic_pack4.h"
#include "interp_bicubic_pack4_bf16s.h"
#include "interp_bilinear_pack4.h"
#include "interp_bilinear_pack4_bf16s.h"
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#include "interp_bicubic_fp16s.h"
#include "interp_bicubic_pack4_fp16s.h"
#include "interp_bicubic_pack8_fp16s.h"
#include "interp_bilinear_fp16s.h"
#include "interp_bilinear_pack4_fp16s.h"
#include "interp_bilinear_pack8_fp16s.h"
#endif
#endif

Interp_arm::Interp_arm()
{
#if __ARM_NEON
    support_packing = true;
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    support_fp16_storage = true;
#endif
#endif // __ARM_NEON

    support_bf16_storage = true;
}

int Interp_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    const Mat& reference_blob = bottom_blobs[1];
    Mat& top_blob = top_blobs[0];

    int elembits = bottom_blob.elembits();

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    if (opt.use_fp16_storage && elembits == 16)
    {
        if (opt.use_fp16_arithmetic)
            return forward_fp16sa(bottom_blobs, top_blobs, opt);
        else
            return forward_fp16s(bottom_blobs, top_blobs, opt);
    }
#endif

    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s(bottom_blobs, top_blobs, opt);

    int h = bottom_blob.h;
    int w = bottom_blob.w;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    int outw = reference_blob.w;
    int outh = reference_blob.h;

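    // a 1-D bottom blob is broadcast: each of its w values fills one outw x outh output channel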
    if (bottom_blob.dims == 1)
    {
        top_blob.create(outw, outh, w, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

#if __ARM_NEON
        if (elempack == 4)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < w; q++)
            {
                Mat top_blob_c = top_blob.channel(q);
                float32x4_t _v = vld1q_f32((const float*)bottom_blob + q * 4);
                top_blob_c.fill(_v);
            }

            return 0;
        }
#endif // __ARM_NEON

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < w; q++)
        {
            Mat top_blob_c = top_blob.channel(q);
            const float v = bottom_blob[q];
            top_blob_c.fill(v);
        }

        return 0;
    }

    if (outw == w && outh == h)
    {
        top_blob = bottom_blob;
        return 0;
    }

    top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

#if __ARM_NEON
    if (elempack == 4)
    {
        if (resize_type == 1) // nearest
        {
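            // when the reference blob carries no spatial size, fall back to the reciprocal of the layer's scale parameters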
            const float hs = outh ? h / (float)outh : 1.f / height_scale;
            const float ws = outw ? w / (float)outw : 1.f / width_scale;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat src = bottom_blob.channel(q);
                Mat dst = top_blob.channel(q);

                for (int y = 0; y < outh; y++)
                {
                    int in_y = std::min((int)(y * hs), (h - 1));

                    const float* ptr = src.row(in_y);
                    float* outptr = dst.row(y);
                    for (int x = 0; x < outw; x++)
                    {
                        int in_x = std::min((int)(x * ws), (w - 1));

                        float32x4_t _p = vld1q_f32(ptr + in_x * 4);
                        vst1q_f32(outptr, _p);

                        outptr += 4;
                    }
                }
            }
        }

        if (resize_type == 2) // bilinear
        {
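            // one allocation holds both the integer offsets and the float coefficients:
            // [xofs (outw) | yofs (outh) | alpha (outw * 2) | beta (outh * 2)]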
            int* buf = new int[outw + outh + outw * 2 + outh * 2];

            int* xofs = buf;        //new int[outw];
            int* yofs = buf + outw; //new int[outh];

            float* alpha = (float*)(buf + outw + outh);           //new float[outw * 2];
            float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2];

            linear_coeffs(w, outw, xofs, alpha, align_corner);
            linear_coeffs(h, outh, yofs, beta, align_corner);

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat src = bottom_blob.channel(q);
                Mat dst = top_blob.channel(q);

                resize_bilinear_image_pack4(src, dst, alpha, xofs, beta, yofs);
            }

            delete[] buf;
        }

        if (resize_type == 3) // bicubic
        {
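            // bicubic uses four taps per output position, hence outw * 4 / outh * 4 coefficients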
            int* buf = new int[outw + outh + outw * 4 + outh * 4];

            int* xofs = buf;        //new int[outw];
            int* yofs = buf + outw; //new int[outh];

            float* alpha = (float*)(buf + outw + outh);           //new float[outw * 4];
            float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4];

            cubic_coeffs(w, outw, xofs, alpha);
            cubic_coeffs(h, outh, yofs, beta);

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat src = bottom_blob.channel(q);
                Mat dst = top_blob.channel(q);

                resize_bicubic_image_pack4(src, dst, alpha, xofs, beta, yofs);
            }

            delete[] buf;
        }

        return 0;
    }
#endif // __ARM_NEON

    if (resize_type == 1) // nearest
    {
        const float hs = outh ? h / (float)outh : 1.f / height_scale;
        const float ws = outw ? w / (float)outw : 1.f / width_scale;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const Mat src = bottom_blob.channel(q);
            Mat dst = top_blob.channel(q);

            for (int y = 0; y < outh; y++)
            {
                int in_y = std::min((int)(y * hs), (h - 1));

                const float* ptr = src.row(in_y);
                float* outptr = dst.row(y);
                for (int x = 0; x < outw; x++)
                {
                    int in_x = std::min((int)(x * ws), (w - 1));
                    *outptr++ = ptr[in_x];
                }
            }
        }
    }

    if (resize_type == 2) // bilinear
    {
        int* buf = new int[outw + outh + outw * 2 + outh * 2];

        int* xofs = buf;        //new int[outw];
        int* yofs = buf + outw; //new int[outh];

        float* alpha = (float*)(buf + outw + outh);           //new float[outw * 2];
        float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2];

        linear_coeffs(w, outw, xofs, alpha, align_corner);
        linear_coeffs(h, outh, yofs, beta, align_corner);

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const Mat src = bottom_blob.channel(q);
            Mat dst = top_blob.channel(q);

            resize_bilinear_image(src, dst, alpha, xofs, beta, yofs);
        }

        delete[] buf;
    }

    if (resize_type == 3) // bicubic
    {
        int* buf = new int[outw + outh + outw * 4 + outh * 4];

        int* xofs = buf;        //new int[outw];
        int* yofs = buf + outw; //new int[outh];

        float* alpha = (float*)(buf + outw + outh);           //new float[outw * 4];
        float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4];

        cubic_coeffs(w, outw, xofs, alpha);
        cubic_coeffs(h, outh, yofs, beta);

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const Mat src = bottom_blob.channel(q);
            Mat dst = top_blob.channel(q);

            resize_bicubic_image(src, dst, alpha, xofs, beta, yofs);
        }

        delete[] buf;
    }

    return 0;
}

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
int Interp_arm::forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    const Mat& reference_blob = bottom_blobs[1];
    Mat& top_blob = top_blobs[0];

    int h = bottom_blob.h;
    int w = bottom_blob.w;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    int outw = reference_blob.w;
    int outh = reference_blob.h;

    if (bottom_blob.dims == 1)
    {
        top_blob.create(outw, outh, w, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        if (elempack == 4)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < w; q++)
            {
                Mat top_blob_c = top_blob.channel(q);
                float16x4_t _v = vld1_f16((const __fp16*)bottom_blob + q * 4);
                top_blob_c.fill(_v);
            }

            return 0;
        }

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < w; q++)
        {
            Mat top_blob_c = top_blob.channel(q);
            const __fp16* ptr = bottom_blob;
            top_blob_c.fill(ptr[q]);
        }

        return 0;
    }

    if (outw == w && outh == h)
    {
        top_blob = bottom_blob;
        return 0;
    }

    top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    if (elempack == 4)
    {
        if (resize_type == 1) // nearest
        {
            const float hs = outh ? h / (float)outh : 1.f / height_scale;
            const float ws = outw ? w / (float)outw : 1.f / width_scale;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat src = bottom_blob.channel(q);
                Mat dst = top_blob.channel(q);

                for (int y = 0; y < outh; y++)
                {
                    int in_y = std::min((int)(y * hs), (h - 1));

                    const __fp16* ptr = src.row<const __fp16>(in_y);
                    __fp16* outptr = dst.row<__fp16>(y);
                    for (int x = 0; x < outw; x++)
                    {
                        int in_x = std::min((int)(x * ws), (w - 1));

                        float16x4_t _p = vld1_f16(ptr + in_x * 4);
                        vst1_f16(outptr, _p);

                        outptr += 4;
                    }
                }
            }
        }

        if (resize_type == 2) // bilinear
        {
            int* buf = new int[outw + outh + outw * 2 + outh * 2];

            int* xofs = buf;        //new int[outw];
            int* yofs = buf + outw; //new int[outh];

            float* alpha = (float*)(buf + outw + outh);           //new float[outw * 2];
            float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2];

            linear_coeffs(w, outw, xofs, alpha, align_corner);
            linear_coeffs(h, outh, yofs, beta, align_corner);

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat src = bottom_blob.channel(q);
                Mat dst = top_blob.channel(q);

                resize_bilinear_image_pack4_fp16s(src, dst, alpha, xofs, beta, yofs);
            }

            delete[] buf;
        }

        if (resize_type == 3) // bicubic
        {
            int* buf = new int[outw + outh + outw * 4 + outh * 4];

            int* xofs = buf;        //new int[outw];
            int* yofs = buf + outw; //new int[outh];

            float* alpha = (float*)(buf + outw + outh);           //new float[outw * 4];
            float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4];

            cubic_coeffs(w, outw, xofs, alpha);
            cubic_coeffs(h, outh, yofs, beta);

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat src = bottom_blob.channel(q);
                Mat dst = top_blob.channel(q);

                resize_bicubic_image_pack4_fp16s(src, dst, alpha, xofs, beta, yofs);
            }

            delete[] buf;
        }

        return 0;
    }

    if (resize_type == 1) // nearest
    {
        const float hs = outh ? h / (float)outh : 1.f / height_scale;
        const float ws = outw ? w / (float)outw : 1.f / width_scale;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const Mat src = bottom_blob.channel(q);
            Mat dst = top_blob.channel(q);

            for (int y = 0; y < outh; y++)
            {
                int in_y = std::min((int)(y * hs), (h - 1));

                const __fp16* ptr = src.row<const __fp16>(in_y);
                __fp16* outptr = dst.row<__fp16>(y);
                for (int x = 0; x < outw; x++)
                {
                    int in_x = std::min((int)(x * ws), (w - 1));
                    *outptr++ = ptr[in_x];
                }
            }
        }
    }

    if (resize_type == 2) // bilinear
    {
        int* buf = new int[outw + outh + outw * 2 + outh * 2];

        int* xofs = buf;        //new int[outw];
        int* yofs = buf + outw; //new int[outh];

        float* alpha = (float*)(buf + outw + outh);           //new float[outw * 2];
        float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2];

        linear_coeffs(w, outw, xofs, alpha, align_corner);
        linear_coeffs(h, outh, yofs, beta, align_corner);

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const Mat src = bottom_blob.channel(q);
            Mat dst = top_blob.channel(q);

            resize_bilinear_image_fp16s(src, dst, alpha, xofs, beta, yofs);
        }

        delete[] buf;
    }

    if (resize_type == 3) // bicubic
    {
        int* buf = new int[outw + outh + outw * 4 + outh * 4];

        int* xofs = buf;        //new int[outw];
        int* yofs = buf + outw; //new int[outh];

        float* alpha = (float*)(buf + outw + outh);           //new float[outw * 4];
        float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4];

        cubic_coeffs(w, outw, xofs, alpha);
        cubic_coeffs(h, outh, yofs, beta);

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const Mat src = bottom_blob.channel(q);
            Mat dst = top_blob.channel(q);

            resize_bicubic_image_fp16s(src, dst, alpha, xofs, beta, yofs);
        }

        delete[] buf;
    }

    return 0;
}

int Interp_arm::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    const Mat& reference_blob = bottom_blobs[1];
    Mat& top_blob = top_blobs[0];

    int h = bottom_blob.h;
    int w = bottom_blob.w;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    int outw = reference_blob.w;
    int outh = reference_blob.h;

    if (bottom_blob.dims == 1)
    {
        top_blob.create(outw, outh, w, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        if (elempack == 8)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < w; q++)
            {
                Mat top_blob_c = top_blob.channel(q);
                float16x8_t _v = vld1q_f16((const __fp16*)bottom_blob + q * 8);
                top_blob_c.fill(_v);
            }

            return 0;
        }

        if (elempack == 4)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < w; q++)
            {
                Mat top_blob_c = top_blob.channel(q);
                float16x4_t _v = vld1_f16((const __fp16*)bottom_blob + q * 4);
                top_blob_c.fill(_v);
            }

            return 0;
        }

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < w; q++)
        {
            Mat top_blob_c = top_blob.channel(q);
            const __fp16* ptr = bottom_blob;
            top_blob_c.fill(ptr[q]);
        }

        return 0;
    }

    if (outw == w && outh == h)
    {
        top_blob = bottom_blob;
        return 0;
    }

    top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    if (elempack == 8)
    {
        if (resize_type == 1) // nearest
        {
            const float hs = outh ? h / (float)outh : 1.f / height_scale;
            const float ws = outw ? w / (float)outw : 1.f / width_scale;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat src = bottom_blob.channel(q);
                Mat dst = top_blob.channel(q);

                for (int y = 0; y < outh; y++)
                {
                    int in_y = std::min((int)(y * hs), (h - 1));

                    const __fp16* ptr = src.row<const __fp16>(in_y);
                    __fp16* outptr = dst.row<__fp16>(y);
                    for (int x = 0; x < outw; x++)
                    {
                        int in_x = std::min((int)(x * ws), (w - 1));

                        float16x8_t _p = vld1q_f16(ptr + in_x * 8);
                        vst1q_f16(outptr, _p);

                        outptr += 8;
                    }
                }
            }
        }

        if (resize_type == 2) // bilinear
        {
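            // coefficients are generated directly as __fp16 so the pack8 fp16sa kernels consume them without conversion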
            int* buf = new int[outw + outh + outw * 2 + outh * 2];

            int* xofs = buf;        //new int[outw];
            int* yofs = buf + outw; //new int[outh];

            __fp16* alpha = (__fp16*)(buf + outw + outh);           //new __fp16[outw * 2];
            __fp16* beta = (__fp16*)(buf + outw + outh + outw * 2); //new __fp16[outh * 2];

            linear_coeffs_fp16sa(w, outw, xofs, alpha, align_corner);
            linear_coeffs_fp16sa(h, outh, yofs, beta, align_corner);

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat src = bottom_blob.channel(q);
                Mat dst = top_blob.channel(q);

                resize_bilinear_image_pack8_fp16sa(src, dst, alpha, xofs, beta, yofs);
            }

            delete[] buf;
        }

        if (resize_type == 3) // bicubic
        {
            int* buf = new int[outw + outh + outw * 4 + outh * 4];

            int* xofs = buf;        //new int[outw];
            int* yofs = buf + outw; //new int[outh];

            __fp16* alpha = (__fp16*)(buf + outw + outh);           //new __fp16[outw * 4];
            __fp16* beta = (__fp16*)(buf + outw + outh + outw * 4); //new __fp16[outh * 4];

            cubic_coeffs_fp16sa(w, outw, xofs, alpha);
            cubic_coeffs_fp16sa(h, outh, yofs, beta);

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat src = bottom_blob.channel(q);
                Mat dst = top_blob.channel(q);

                resize_bicubic_image_pack8_fp16sa(src, dst, alpha, xofs, beta, yofs);
            }

            delete[] buf;
        }

        return 0;
    }

    if (elempack == 4)
    {
        if (resize_type == 1) // nearest
        {
            const float hs = outh ? h / (float)outh : 1.f / height_scale;
            const float ws = outw ? w / (float)outw : 1.f / width_scale;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat src = bottom_blob.channel(q);
                Mat dst = top_blob.channel(q);

                for (int y = 0; y < outh; y++)
                {
                    int in_y = std::min((int)(y * hs), (h - 1));

                    const __fp16* ptr = src.row<const __fp16>(in_y);
                    __fp16* outptr = dst.row<__fp16>(y);
                    for (int x = 0; x < outw; x++)
                    {
                        int in_x = std::min((int)(x * ws), (w - 1));

                        float16x4_t _p = vld1_f16(ptr + in_x * 4);
                        vst1_f16(outptr, _p);

                        outptr += 4;
                    }
                }
            }
        }

        if (resize_type == 2) // bilinear
        {
            int* buf = new int[outw + outh + outw * 2 + outh * 2];

            int* xofs = buf;        //new int[outw];
            int* yofs = buf + outw; //new int[outh];

            __fp16* alpha = (__fp16*)(buf + outw + outh);           //new __fp16[outw * 2];
            __fp16* beta = (__fp16*)(buf + outw + outh + outw * 2); //new __fp16[outh * 2];

            linear_coeffs_fp16sa(w, outw, xofs, alpha, align_corner);
            linear_coeffs_fp16sa(h, outh, yofs, beta, align_corner);

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat src = bottom_blob.channel(q);
                Mat dst = top_blob.channel(q);

                resize_bilinear_image_pack4_fp16sa(src, dst, alpha, xofs, beta, yofs);
            }

            delete[] buf;
        }

        if (resize_type == 3) // bicubic
        {
            int* buf = new int[outw + outh + outw * 4 + outh * 4];

            int* xofs = buf;        //new int[outw];
            int* yofs = buf + outw; //new int[outh];

            __fp16* alpha = (__fp16*)(buf + outw + outh);           //new __fp16[outw * 4];
            __fp16* beta = (__fp16*)(buf + outw + outh + outw * 4); //new __fp16[outh * 4];

            cubic_coeffs_fp16sa(w, outw, xofs, alpha);
            cubic_coeffs_fp16sa(h, outh, yofs, beta);

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat src = bottom_blob.channel(q);
                Mat dst = top_blob.channel(q);

                resize_bicubic_image_pack4_fp16sa(src, dst, alpha, xofs, beta, yofs);
            }

            delete[] buf;
        }

        return 0;
    }

    if (resize_type == 1) // nearest
    {
        const float hs = outh ? h / (float)outh : 1.f / height_scale;
        const float ws = outw ? w / (float)outw : 1.f / width_scale;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const Mat src = bottom_blob.channel(q);
            Mat dst = top_blob.channel(q);

            for (int y = 0; y < outh; y++)
            {
                int in_y = std::min((int)(y * hs), (h - 1));

                const __fp16* ptr = src.row<const __fp16>(in_y);
                __fp16* outptr = dst.row<__fp16>(y);
                for (int x = 0; x < outw; x++)
                {
                    int in_x = std::min((int)(x * ws), (w - 1));
                    *outptr++ = ptr[in_x];
                }
            }
        }
    }

    if (resize_type == 2) // bilinear
    {
        int* buf = new int[outw + outh + outw * 2 + outh * 2];

        int* xofs = buf;        //new int[outw];
        int* yofs = buf + outw; //new int[outh];

        __fp16* alpha = (__fp16*)(buf + outw + outh);           //new __fp16[outw * 2];
        __fp16* beta = (__fp16*)(buf + outw + outh + outw * 2); //new __fp16[outh * 2];

        linear_coeffs_fp16sa(w, outw, xofs, alpha, align_corner);
        linear_coeffs_fp16sa(h, outh, yofs, beta, align_corner);

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const Mat src = bottom_blob.channel(q);
            Mat dst = top_blob.channel(q);

            resize_bilinear_image_fp16sa(src, dst, alpha, xofs, beta, yofs);
        }

        delete[] buf;
    }

    if (resize_type == 3) // bicubic
    {
        int* buf = new int[outw + outh + outw * 4 + outh * 4];

        int* xofs = buf;        //new int[outw];
        int* yofs = buf + outw; //new int[outh];

        __fp16* alpha = (__fp16*)(buf + outw + outh);           //new __fp16[outw * 4];
        __fp16* beta = (__fp16*)(buf + outw + outh + outw * 4); //new __fp16[outh * 4];

        cubic_coeffs_fp16sa(w, outw, xofs, alpha);
        cubic_coeffs_fp16sa(h, outh, yofs, beta);

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const Mat src = bottom_blob.channel(q);
            Mat dst = top_blob.channel(q);

            resize_bicubic_image_fp16sa(src, dst, alpha, xofs, beta, yofs);
        }

        delete[] buf;
    }

    return 0;
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

int Interp_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    const Mat& reference_blob = bottom_blobs[1];
    Mat& top_blob = top_blobs[0];

    int h = bottom_blob.h;
    int w = bottom_blob.w;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    int outw = reference_blob.w;
    int outh = reference_blob.h;

    if (bottom_blob.dims == 1)
    {
        top_blob.create(outw, outh, w, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

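        // bf16 values are treated as raw 16-bit storage here; broadcasting just copies the bit patterns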
#if __ARM_NEON
        if (elempack == 4)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < w; q++)
            {
                Mat top_blob_c = top_blob.channel(q);
                uint16x4_t _v = vld1_u16((const unsigned short*)bottom_blob + q * 4);
                top_blob_c.fill(_v);
            }

            return 0;
        }
#endif // __ARM_NEON

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < w; q++)
        {
            Mat top_blob_c = top_blob.channel(q);
            const unsigned short* ptr = bottom_blob;
            top_blob_c.fill(ptr[q]);
        }

        return 0;
    }

    if (outw == w && outh == h)
    {
        top_blob = bottom_blob;
        return 0;
    }

    top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

#if __ARM_NEON
    if (elempack == 4)
    {
        if (resize_type == 1) // nearest
        {
            const float hs = outh ? h / (float)outh : 1.f / height_scale;
            const float ws = outw ? w / (float)outw : 1.f / width_scale;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat src = bottom_blob.channel(q);
                Mat dst = top_blob.channel(q);

                for (int y = 0; y < outh; y++)
                {
                    int in_y = std::min((int)(y * hs), (h - 1));

                    const unsigned short* ptr = src.row<const unsigned short>(in_y);
                    unsigned short* outptr = dst.row<unsigned short>(y);
                    for (int x = 0; x < outw; x++)
                    {
                        int in_x = std::min((int)(x * ws), (w - 1));

                        uint16x4_t _p = vld1_u16(ptr + in_x * 4);
                        vst1_u16(outptr, _p);

                        outptr += 4;
                    }
                }
            }
        }

        if (resize_type == 2) // bilinear
        {
            int* buf = new int[outw + outh + outw * 2 + outh * 2];

            int* xofs = buf;        //new int[outw];
            int* yofs = buf + outw; //new int[outh];

            float* alpha = (float*)(buf + outw + outh);           //new float[outw * 2];
            float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2];

            linear_coeffs(w, outw, xofs, alpha, align_corner);
            linear_coeffs(h, outh, yofs, beta, align_corner);

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat src = bottom_blob.channel(q);
                Mat dst = top_blob.channel(q);

                resize_bilinear_image_pack4_bf16s(src, dst, alpha, xofs, beta, yofs);
            }

            delete[] buf;
        }

        if (resize_type == 3) // bicubic
        {
            int* buf = new int[outw + outh + outw * 4 + outh * 4];

            int* xofs = buf;        //new int[outw];
            int* yofs = buf + outw; //new int[outh];

            float* alpha = (float*)(buf + outw + outh);           //new float[outw * 4];
            float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4];

            cubic_coeffs(w, outw, xofs, alpha);
            cubic_coeffs(h, outh, yofs, beta);

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat src = bottom_blob.channel(q);
                Mat dst = top_blob.channel(q);

                resize_bicubic_image_pack4_bf16s(src, dst, alpha, xofs, beta, yofs);
            }

            delete[] buf;
        }

        return 0;
    }
#endif // __ARM_NEON

    if (resize_type == 1) // nearest
    {
        const float hs = outh ? h / (float)outh : 1.f / height_scale;
        const float ws = outw ? w / (float)outw : 1.f / width_scale;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const Mat src = bottom_blob.channel(q);
            Mat dst = top_blob.channel(q);

            for (int y = 0; y < outh; y++)
            {
                int in_y = std::min((int)(y * hs), (h - 1));

                const unsigned short* ptr = src.row<const unsigned short>(in_y);
                unsigned short* outptr = dst.row<unsigned short>(y);
                for (int x = 0; x < outw; x++)
                {
                    int in_x = std::min((int)(x * ws), (w - 1));
                    *outptr++ = ptr[in_x];
                }
            }
        }
    }

    if (resize_type == 2) // bilinear
    {
        int* buf = new int[outw + outh + outw * 2 + outh * 2];

        int* xofs = buf;        //new int[outw];
        int* yofs = buf + outw; //new int[outh];

        float* alpha = (float*)(buf + outw + outh);           //new float[outw * 2];
        float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2];

        linear_coeffs(w, outw, xofs, alpha, align_corner);
        linear_coeffs(h, outh, yofs, beta, align_corner);

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const Mat src = bottom_blob.channel(q);
            Mat dst = top_blob.channel(q);

            resize_bilinear_image_bf16s(src, dst, alpha, xofs, beta, yofs);
        }

        delete[] buf;
    }

    if (resize_type == 3) // bicubic
    {
        int* buf = new int[outw + outh + outw * 4 + outh * 4];

        int* xofs = buf;        //new int[outw];
        int* yofs = buf + outw; //new int[outh];

        float* alpha = (float*)(buf + outw + outh);           //new float[outw * 4];
        float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4];

        cubic_coeffs(w, outw, xofs, alpha);
        cubic_coeffs(h, outh, yofs, beta);

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const Mat src = bottom_blob.channel(q);
            Mat dst = top_blob.channel(q);

            resize_bicubic_image_bf16s(src, dst, alpha, xofs, beta, yofs);
        }

        delete[] buf;
    }

    return 0;
}

} // namespace ncnn