1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
15 #include "mat.h"
16 
17 #include <limits.h>
18 #include <math.h>
19 #if __ARM_NEON
20 #include <arm_neon.h>
21 #endif // __ARM_NEON
22 #include "platform.h"
23 
24 namespace ncnn {
25 
26 #if NCNN_PIXEL
// Convert packed 8-bit RGB pixels into a planar 3-channel float Mat.
// Each byte is widened to float unchanged (values stay in the 0..255 range).
//   rgb       - interleaved R,G,B source bytes
//   w, h      - image size in pixels
//   stride    - bytes per source row (>= w * 3; extra bytes are padding)
//   m         - output Mat, created here as w x h x 3 with 4-byte floats
//   allocator - allocator for m's storage
// Returns 0 on success, -100 when the Mat allocation fails.
static int from_rgb(const unsigned char* rgb, int w, int h, int stride, Mat& m, Allocator* allocator)
{
    m.create(w, h, 3, 4u, allocator);
    if (m.empty())
        return -100;

    const int wgap = stride - w * 3;
    if (wgap == 0)
    {
        // Rows are contiguous in memory: fold the image into one long row
        // so the vectorized inner loop covers w*h pixels at once.
        w = w * h;
        h = 1;
    }

    float* ptr0 = m.channel(0); // destination R plane
    float* ptr1 = m.channel(1); // destination G plane
    float* ptr2 = m.channel(2); // destination B plane

    for (int y = 0; y < h; y++)
    {
#if __ARM_NEON
        int nn = w >> 3;            // 8-pixel NEON iterations
        int remain = w - (nn << 3); // tail pixels handled by the scalar loop
#else
        int remain = w;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
        for (; nn > 0; nn--)
        {
            // De-interleave 8 RGB pixels into separate R/G/B byte vectors.
            uint8x8x3_t _rgb = vld3_u8(rgb);
            // Widen u8 -> u16, then u16 -> u32 -> f32 in low/high halves.
            uint16x8_t _r16 = vmovl_u8(_rgb.val[0]);
            uint16x8_t _g16 = vmovl_u8(_rgb.val[1]);
            uint16x8_t _b16 = vmovl_u8(_rgb.val[2]);

            float32x4_t _rlow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_r16)));
            float32x4_t _rhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_r16)));
            float32x4_t _glow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_g16)));
            float32x4_t _ghigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_g16)));
            float32x4_t _blow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_b16)));
            float32x4_t _bhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_b16)));

            vst1q_f32(ptr0, _rlow);
            vst1q_f32(ptr0 + 4, _rhigh);
            vst1q_f32(ptr1, _glow);
            vst1q_f32(ptr1 + 4, _ghigh);
            vst1q_f32(ptr2, _blow);
            vst1q_f32(ptr2 + 4, _bhigh);

            rgb += 3 * 8;
            ptr0 += 8;
            ptr1 += 8;
            ptr2 += 8;
        }
#else
        if (nn > 0)
        {
            // armv7 NEON: same de-interleave / widen / u32->f32 convert as the
            // aarch64 path above, hand scheduled. The post-increment addressing
            // modes advance rgb/ptr0/ptr1/ptr2 in place, and nn counts down.
            asm volatile(
                "0:                             \n"
                "pld        [%1, #256]          \n"
                "vld3.u8    {d0-d2}, [%1]!      \n"
                "vmovl.u8   q8, d0              \n"
                "vmovl.u8   q9, d1              \n"
                "vmovl.u8   q10, d2             \n"
                "vmovl.u16  q0, d16             \n"
                "vmovl.u16  q1, d17             \n"
                "vmovl.u16  q2, d18             \n"
                "vmovl.u16  q3, d19             \n"
                "vmovl.u16  q8, d20             \n"
                "vmovl.u16  q9, d21             \n"
                "vcvt.f32.u32   q0, q0          \n"
                "vcvt.f32.u32   q1, q1          \n"
                "vcvt.f32.u32   q2, q2          \n"
                "vcvt.f32.u32   q3, q3          \n"
                "vcvt.f32.u32   q8, q8          \n"
                "subs       %0, #1              \n"
                "vst1.f32   {d0-d3}, [%2]!      \n"
                "vcvt.f32.u32   q9, q9          \n"
                "vst1.f32   {d4-d7}, [%3]!      \n"
                "vst1.f32   {d16-d19}, [%4]!    \n"
                "bne        0b                  \n"
                : "=r"(nn),   // %0
                "=r"(rgb),  // %1
                "=r"(ptr0), // %2
                "=r"(ptr1), // %3
                "=r"(ptr2)  // %4
                : "0"(nn),
                "1"(rgb),
                "2"(ptr0),
                "3"(ptr1),
                "4"(ptr2)
                : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10");
        }
#endif // __aarch64__
#endif // __ARM_NEON
        // Scalar tail (and the whole row when NEON is unavailable).
        for (; remain > 0; remain--)
        {
            *ptr0 = rgb[0];
            *ptr1 = rgb[1];
            *ptr2 = rgb[2];

            rgb += 3;
            ptr0++;
            ptr1++;
            ptr2++;
        }

        // Skip the row-padding bytes of the source.
        rgb += wgap;
    }

    return 0;
}
139 
to_rgb(const Mat & m,unsigned char * rgb,int stride)140 static void to_rgb(const Mat& m, unsigned char* rgb, int stride)
141 {
142     int w = m.w;
143     int h = m.h;
144 
145     const int wgap = stride - w * 3;
146     if (wgap == 0)
147     {
148         w = w * h;
149         h = 1;
150     }
151 
152     const float* ptr0 = m.channel(0);
153     const float* ptr1 = m.channel(1);
154     const float* ptr2 = m.channel(2);
155 
156     for (int y = 0; y < h; y++)
157     {
158 #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
159 
160 #if __ARM_NEON
161         int nn = w >> 3;
162         int remain = w - (nn << 3);
163 #else
164         int remain = w;
165 #endif // __ARM_NEON
166 
167 #if __ARM_NEON
168         for (; nn > 0; nn--)
169         {
170             float32x4_t _rlow = vld1q_f32(ptr0);
171             float32x4_t _rhigh = vld1q_f32(ptr0 + 4);
172             float32x4_t _glow = vld1q_f32(ptr1);
173             float32x4_t _ghigh = vld1q_f32(ptr1 + 4);
174             float32x4_t _blow = vld1q_f32(ptr2);
175             float32x4_t _bhigh = vld1q_f32(ptr2 + 4);
176 
177             int16x8_t _r16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_rlow)), vmovn_s32(vcvtq_s32_f32(_rhigh)));
178             int16x8_t _g16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_glow)), vmovn_s32(vcvtq_s32_f32(_ghigh)));
179             int16x8_t _b16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_blow)), vmovn_s32(vcvtq_s32_f32(_bhigh)));
180 
181             uint8x8x3_t _rgb;
182             _rgb.val[0] = vqmovun_s16(_r16);
183             _rgb.val[1] = vqmovun_s16(_g16);
184             _rgb.val[2] = vqmovun_s16(_b16);
185 
186             vst3_u8(rgb, _rgb);
187 
188             rgb += 3 * 8;
189             ptr0 += 8;
190             ptr1 += 8;
191             ptr2 += 8;
192         }
193 #endif // __ARM_NEON
194         for (; remain > 0; remain--)
195         {
196             rgb[0] = SATURATE_CAST_UCHAR(*ptr0);
197             rgb[1] = SATURATE_CAST_UCHAR(*ptr1);
198             rgb[2] = SATURATE_CAST_UCHAR(*ptr2);
199 
200             rgb += 3;
201             ptr0++;
202             ptr1++;
203             ptr2++;
204         }
205 
206 #undef SATURATE_CAST_UCHAR
207         rgb += wgap;
208     }
209 }
210 
// Convert packed 8-bit grayscale pixels into a 1-channel float Mat.
// Each byte is widened to float unchanged (values stay in the 0..255 range).
//   gray      - source bytes, one per pixel
//   w, h      - image size in pixels
//   stride    - bytes per source row (>= w; extra bytes are padding)
//   m         - output Mat, created here as w x h x 1 with 4-byte floats
//   allocator - allocator for m's storage
// Returns 0 on success, -100 when the Mat allocation fails.
static int from_gray(const unsigned char* gray, int w, int h, int stride, Mat& m, Allocator* allocator)
{
    m.create(w, h, 1, 4u, allocator);
    if (m.empty())
        return -100;

    const int wgap = stride - w;
    if (wgap == 0)
    {
        // Contiguous rows: process the whole image as one long row.
        w = w * h;
        h = 1;
    }

    float* ptr = m;

    for (int y = 0; y < h; y++)
    {
#if __ARM_NEON
        int nn = w >> 4;            // 16-pixel NEON iterations
        int remain = w - (nn << 4); // tail pixels handled by the scalar loop
#else
        int remain = w;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
        for (; nn > 0; nn--)
        {
            // Load 16 bytes, widen u8 -> u16 -> u32 -> f32 in four quads.
            uint8x16_t _gray = vld1q_u8(gray);
            uint16x8_t _gray16_0 = vmovl_u8(vget_low_u8(_gray));
            uint16x8_t _gray16_1 = vmovl_u8(vget_high_u8(_gray));

            float32x4_t _graylow_0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_gray16_0)));
            float32x4_t _grayhigh_0 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_gray16_0)));
            float32x4_t _graylow_1 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_gray16_1)));
            float32x4_t _grayhigh_1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_gray16_1)));

            vst1q_f32(ptr, _graylow_0);
            vst1q_f32(ptr + 4, _grayhigh_0);
            vst1q_f32(ptr + 8, _graylow_1);
            vst1q_f32(ptr + 12, _grayhigh_1);

            gray += 16;
            ptr += 16;
        }
#else
        if (nn > 0)
        {
            // armv7 NEON: same widen / convert as the aarch64 path above.
            // Post-increment addressing advances gray/ptr; nn counts down.
            asm volatile(
                "0:                             \n"
                "pld        [%1, #128]          \n"
                "vld1.u8    {d0,d1}, [%1]!      \n"
                "vmovl.u8   q8, d0              \n"
                "vmovl.u8   q9, d1              \n"
                "vmovl.u16  q0, d16             \n"
                "vmovl.u16  q1, d17             \n"
                "vmovl.u16  q2, d18             \n"
                "vmovl.u16  q3, d19             \n"
                "vcvt.f32.u32   q0, q0          \n"
                "vcvt.f32.u32   q1, q1          \n"
                "vcvt.f32.u32   q2, q2          \n"
                "vcvt.f32.u32   q3, q3          \n"
                "subs       %0, #1              \n"
                "vst1.f32   {d0-d3}, [%2]!      \n"
                "vst1.f32   {d4-d7}, [%2]!      \n"
                "bne        0b                  \n"
                : "=r"(nn),   // %0
                "=r"(gray), // %1
                "=r"(ptr)   // %2
                : "0"(nn),
                "1"(gray),
                "2"(ptr)
                : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9");
        }
#endif // __aarch64__
#endif // __ARM_NEON
        // Scalar tail (and the whole row when NEON is unavailable).
        for (; remain > 0; remain--)
        {
            *ptr = *gray;

            gray++;
            ptr++;
        }

        // Skip the row-padding bytes of the source.
        gray += wgap;
    }

    return 0;
}
300 
to_gray(const Mat & m,unsigned char * gray,int stride)301 static void to_gray(const Mat& m, unsigned char* gray, int stride)
302 {
303     int w = m.w;
304     int h = m.h;
305 
306     const int wgap = stride - w;
307     if (wgap == 0)
308     {
309         w = w * h;
310         h = 1;
311     }
312 
313     const float* ptr = m;
314 
315     for (int y = 0; y < h; y++)
316     {
317 #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
318 
319 #if __ARM_NEON
320         int nn = w >> 3;
321         int remain = w - (nn << 3);
322 #else
323         int remain = w;
324 #endif // __ARM_NEON
325 
326 #if __ARM_NEON
327         for (; nn > 0; nn--)
328         {
329             float32x4_t _glow = vld1q_f32(ptr);
330             float32x4_t _ghigh = vld1q_f32(ptr + 4);
331 
332             int16x8_t _g16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_glow)), vmovn_s32(vcvtq_s32_f32(_ghigh)));
333 
334             uint8x8_t _gray = vqmovun_s16(_g16);
335 
336             vst1_u8(gray, _gray);
337 
338             gray += 8;
339             ptr += 8;
340         }
341 #endif // __ARM_NEON
342         for (; remain > 0; remain--)
343         {
344             *gray = SATURATE_CAST_UCHAR(*ptr);
345 
346             gray++;
347             ptr++;
348         }
349 
350 #undef SATURATE_CAST_UCHAR
351         gray += wgap;
352     }
353 }
354 
// Convert packed 8-bit RGBA pixels into a planar 4-channel float Mat.
// Each byte is widened to float unchanged (values stay in the 0..255 range).
//   rgba      - interleaved R,G,B,A source bytes
//   w, h      - image size in pixels
//   stride    - bytes per source row (>= w * 4; extra bytes are padding)
//   m         - output Mat, created here as w x h x 4 with 4-byte floats
//   allocator - allocator for m's storage
// Returns 0 on success, -100 when the Mat allocation fails.
static int from_rgba(const unsigned char* rgba, int w, int h, int stride, Mat& m, Allocator* allocator)
{
    m.create(w, h, 4, 4u, allocator);
    if (m.empty())
        return -100;

    const int wgap = stride - w * 4;
    if (wgap == 0)
    {
        // Contiguous rows: process the whole image as one long row.
        w = w * h;
        h = 1;
    }

    float* ptr0 = m.channel(0); // destination R plane
    float* ptr1 = m.channel(1); // destination G plane
    float* ptr2 = m.channel(2); // destination B plane
    float* ptr3 = m.channel(3); // destination A plane

    for (int y = 0; y < h; y++)
    {
#if __ARM_NEON
        int nn = w >> 3;            // 8-pixel NEON iterations
        int remain = w - (nn << 3); // tail pixels handled by the scalar loop
#else
        int remain = w;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
        for (; nn > 0; nn--)
        {
            // De-interleave 8 RGBA pixels, widen u8 -> s16 -> s32 -> f32.
            // (Signed widening is equivalent here: values fit in 0..255.)
            uint8x8x4_t _rgba = vld4_u8(rgba);
            int16x8_t _r16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[0]));
            int16x8_t _g16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[1]));
            int16x8_t _b16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[2]));
            int16x8_t _a16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[3]));

            float32x4_t _rlow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_r16)));
            float32x4_t _rhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_r16)));
            float32x4_t _glow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_g16)));
            float32x4_t _ghigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_g16)));
            float32x4_t _blow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_b16)));
            float32x4_t _bhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_b16)));
            float32x4_t _alow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_a16)));
            float32x4_t _ahigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_a16)));

            vst1q_f32(ptr0, _rlow);
            vst1q_f32(ptr0 + 4, _rhigh);
            vst1q_f32(ptr1, _glow);
            vst1q_f32(ptr1 + 4, _ghigh);
            vst1q_f32(ptr2, _blow);
            vst1q_f32(ptr2 + 4, _bhigh);
            vst1q_f32(ptr3, _alow);
            vst1q_f32(ptr3 + 4, _ahigh);

            rgba += 4 * 8;
            ptr0 += 8;
            ptr1 += 8;
            ptr2 += 8;
            ptr3 += 8;
        }
#else
        if (nn > 0)
        {
            // armv7 NEON: same de-interleave / widen / convert as the aarch64
            // path above, hand scheduled. Post-increment addressing advances
            // rgba and the four plane pointers; nn counts down.
            asm volatile(
                "0:                             \n"
                "pld        [%1, #256]          \n"
                "vld4.u8    {d0-d3}, [%1]!      \n"
                "vmovl.u8   q8, d0              \n"
                "vmovl.u8   q9, d1              \n"
                "vmovl.u8   q10, d2             \n"
                "vmovl.u8   q11, d3             \n"
                "vmovl.u16  q0, d16             \n"
                "vmovl.u16  q1, d17             \n"
                "vmovl.u16  q2, d18             \n"
                "vmovl.u16  q3, d19             \n"
                "vmovl.u16  q8, d20             \n"
                "vmovl.u16  q9, d21             \n"
                "vmovl.u16  q10, d22            \n"
                "vmovl.u16  q11, d23            \n"
                "vcvt.f32.u32   q0, q0          \n"
                "vcvt.f32.u32   q1, q1          \n"
                "vcvt.f32.u32   q2, q2          \n"
                "vcvt.f32.u32   q3, q3          \n"
                "vcvt.f32.u32   q8, q8          \n"
                "vcvt.f32.u32   q9, q9          \n"
                "subs       %0, #1              \n"
                "vst1.f32   {d0-d3}, [%2]!      \n"
                "vcvt.f32.u32   q10, q10        \n"
                "vcvt.f32.u32   q11, q11        \n"
                "vst1.f32   {d4-d7}, [%3]!      \n"
                "vst1.f32   {d16-d19}, [%4]!    \n"
                "vst1.f32   {d20-d23}, [%5]!    \n"
                "bne        0b                  \n"
                : "=r"(nn),   // %0
                "=r"(rgba), // %1
                "=r"(ptr0), // %2
                "=r"(ptr1), // %3
                "=r"(ptr2), // %4
                "=r"(ptr3)  // %5
                : "0"(nn),
                "1"(rgba),
                "2"(ptr0),
                "3"(ptr1),
                "4"(ptr2),
                "5"(ptr3)
                : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
        }
#endif // __aarch64__
#endif // __ARM_NEON
        // Scalar tail (and the whole row when NEON is unavailable).
        for (; remain > 0; remain--)
        {
            *ptr0 = rgba[0];
            *ptr1 = rgba[1];
            *ptr2 = rgba[2];
            *ptr3 = rgba[3];

            rgba += 4;
            ptr0++;
            ptr1++;
            ptr2++;
            ptr3++;
        }

        // Skip the row-padding bytes of the source.
        rgba += wgap;
    }

    return 0;
}
484 
to_rgba(const Mat & m,unsigned char * rgba,int stride)485 static void to_rgba(const Mat& m, unsigned char* rgba, int stride)
486 {
487     int w = m.w;
488     int h = m.h;
489 
490     const int wgap = stride - w * 4;
491     if (wgap == 0)
492     {
493         w = w * h;
494         h = 1;
495     }
496 
497     const float* ptr0 = m.channel(0);
498     const float* ptr1 = m.channel(1);
499     const float* ptr2 = m.channel(2);
500     const float* ptr3 = m.channel(3);
501 
502     for (int y = 0; y < h; y++)
503     {
504 #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
505 
506 #if __ARM_NEON
507         int nn = w >> 3;
508         int remain = w - (nn << 3);
509 #else
510         int remain = w;
511 #endif // __ARM_NEON
512 
513 #if __ARM_NEON
514         for (; nn > 0; nn--)
515         {
516             float32x4_t _rlow = vld1q_f32(ptr0);
517             float32x4_t _rhigh = vld1q_f32(ptr0 + 4);
518             float32x4_t _glow = vld1q_f32(ptr1);
519             float32x4_t _ghigh = vld1q_f32(ptr1 + 4);
520             float32x4_t _blow = vld1q_f32(ptr2);
521             float32x4_t _bhigh = vld1q_f32(ptr2 + 4);
522             float32x4_t _alow = vld1q_f32(ptr3);
523             float32x4_t _ahigh = vld1q_f32(ptr3 + 4);
524 
525             int16x8_t _r16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_rlow)), vmovn_s32(vcvtq_s32_f32(_rhigh)));
526             int16x8_t _g16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_glow)), vmovn_s32(vcvtq_s32_f32(_ghigh)));
527             int16x8_t _b16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_blow)), vmovn_s32(vcvtq_s32_f32(_bhigh)));
528             int16x8_t _a16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_alow)), vmovn_s32(vcvtq_s32_f32(_ahigh)));
529 
530             uint8x8x4_t _rgba;
531             _rgba.val[0] = vqmovun_s16(_r16);
532             _rgba.val[1] = vqmovun_s16(_g16);
533             _rgba.val[2] = vqmovun_s16(_b16);
534             _rgba.val[3] = vqmovun_s16(_a16);
535 
536             vst4_u8(rgba, _rgba);
537 
538             rgba += 4 * 8;
539             ptr0 += 8;
540             ptr1 += 8;
541             ptr2 += 8;
542             ptr3 += 8;
543         }
544 #endif // __ARM_NEON
545         for (; remain > 0; remain--)
546         {
547             rgba[0] = SATURATE_CAST_UCHAR(*ptr0);
548             rgba[1] = SATURATE_CAST_UCHAR(*ptr1);
549             rgba[2] = SATURATE_CAST_UCHAR(*ptr2);
550             rgba[3] = SATURATE_CAST_UCHAR(*ptr3);
551 
552             rgba += 4;
553             ptr0++;
554             ptr1++;
555             ptr2++;
556             ptr3++;
557         }
558 
559 #undef SATURATE_CAST_UCHAR
560         rgba += wgap;
561     }
562 }
563 
// Convert packed 8-bit RGB pixels into a planar 3-channel float Mat with the
// channel order swapped (channel 0 = B, 1 = G, 2 = R). Identical to from_rgb
// except that the R and B destination planes are exchanged.
//   rgb       - interleaved R,G,B source bytes
//   w, h      - image size in pixels
//   stride    - bytes per source row (>= w * 3; extra bytes are padding)
//   m         - output Mat, created here as w x h x 3 with 4-byte floats
//   allocator - allocator for m's storage
// Returns 0 on success, -100 when the Mat allocation fails.
static int from_rgb2bgr(const unsigned char* rgb, int w, int h, int stride, Mat& m, Allocator* allocator)
{
    m.create(w, h, 3, 4u, allocator);
    if (m.empty())
        return -100;

    const int wgap = stride - w * 3;
    if (wgap == 0)
    {
        // Contiguous rows: process the whole image as one long row.
        w = w * h;
        h = 1;
    }

    float* ptr0 = m.channel(0); // destination B plane
    float* ptr1 = m.channel(1); // destination G plane
    float* ptr2 = m.channel(2); // destination R plane

    for (int y = 0; y < h; y++)
    {
#if __ARM_NEON
        int nn = w >> 3;            // 8-pixel NEON iterations
        int remain = w - (nn << 3); // tail pixels handled by the scalar loop
#else
        int remain = w;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
        for (; nn > 0; nn--)
        {
            // De-interleave 8 RGB pixels, widen u8 -> u16 -> u32 -> f32.
            uint8x8x3_t _rgb = vld3_u8(rgb);
            uint16x8_t _r16 = vmovl_u8(_rgb.val[0]);
            uint16x8_t _g16 = vmovl_u8(_rgb.val[1]);
            uint16x8_t _b16 = vmovl_u8(_rgb.val[2]);

            float32x4_t _rlow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_r16)));
            float32x4_t _rhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_r16)));
            float32x4_t _glow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_g16)));
            float32x4_t _ghigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_g16)));
            float32x4_t _blow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_b16)));
            float32x4_t _bhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_b16)));

            // R goes to channel 2, B to channel 0 (the RGB->BGR swap).
            vst1q_f32(ptr2, _rlow);
            vst1q_f32(ptr2 + 4, _rhigh);
            vst1q_f32(ptr1, _glow);
            vst1q_f32(ptr1 + 4, _ghigh);
            vst1q_f32(ptr0, _blow);
            vst1q_f32(ptr0 + 4, _bhigh);

            rgb += 3 * 8;
            ptr0 += 8;
            ptr1 += 8;
            ptr2 += 8;
        }
#else
        if (nn > 0)
        {
            // armv7 NEON: same as from_rgb's asm block, except the store
            // destinations %2 and %4 are exchanged to realize the R/B swap.
            asm volatile(
                "0:                             \n"
                "pld        [%1, #256]          \n"
                "vld3.u8    {d0-d2}, [%1]!      \n"
                "vmovl.u8   q8, d0              \n"
                "vmovl.u8   q9, d1              \n"
                "vmovl.u8   q10, d2             \n"
                "vmovl.u16  q0, d16             \n"
                "vmovl.u16  q1, d17             \n"
                "vmovl.u16  q2, d18             \n"
                "vmovl.u16  q3, d19             \n"
                "vmovl.u16  q8, d20             \n"
                "vmovl.u16  q9, d21             \n"
                "vcvt.f32.u32   q0, q0          \n"
                "vcvt.f32.u32   q1, q1          \n"
                "vcvt.f32.u32   q2, q2          \n"
                "vcvt.f32.u32   q3, q3          \n"
                "vcvt.f32.u32   q8, q8          \n"
                "subs       %0, #1              \n"
                "vst1.f32   {d0-d3}, [%4]!      \n"
                "vcvt.f32.u32   q9, q9          \n"
                "vst1.f32   {d4-d7}, [%3]!      \n"
                "vst1.f32   {d16-d19}, [%2]!    \n"
                "bne        0b                  \n"
                : "=r"(nn),   // %0
                "=r"(rgb),  // %1
                "=r"(ptr0), // %2
                "=r"(ptr1), // %3
                "=r"(ptr2)  // %4
                : "0"(nn),
                "1"(rgb),
                "2"(ptr0),
                "3"(ptr1),
                "4"(ptr2)
                : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10");
        }
#endif // __aarch64__
#endif // __ARM_NEON
        // Scalar tail (and the whole row when NEON is unavailable).
        for (; remain > 0; remain--)
        {
            *ptr0 = rgb[2];
            *ptr1 = rgb[1];
            *ptr2 = rgb[0];

            rgb += 3;
            ptr0++;
            ptr1++;
            ptr2++;
        }

        // Skip the row-padding bytes of the source.
        rgb += wgap;
    }

    return 0;
}
676 
to_bgr2rgb(const Mat & m,unsigned char * rgb,int stride)677 static void to_bgr2rgb(const Mat& m, unsigned char* rgb, int stride)
678 {
679     int w = m.w;
680     int h = m.h;
681 
682     const int wgap = stride - w * 3;
683     if (wgap == 0)
684     {
685         w = w * h;
686         h = 1;
687     }
688 
689     const float* ptr0 = m.channel(0);
690     const float* ptr1 = m.channel(1);
691     const float* ptr2 = m.channel(2);
692 
693     for (int y = 0; y < h; y++)
694     {
695 #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
696 
697 #if __ARM_NEON
698         int nn = w >> 3;
699         int remain = w - (nn << 3);
700 #else
701         int remain = w;
702 #endif // __ARM_NEON
703 
704 #if __ARM_NEON
705         for (; nn > 0; nn--)
706         {
707             float32x4_t _rlow = vld1q_f32(ptr2);
708             float32x4_t _rhigh = vld1q_f32(ptr2 + 4);
709             float32x4_t _glow = vld1q_f32(ptr1);
710             float32x4_t _ghigh = vld1q_f32(ptr1 + 4);
711             float32x4_t _blow = vld1q_f32(ptr0);
712             float32x4_t _bhigh = vld1q_f32(ptr0 + 4);
713 
714             int16x8_t _r16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_rlow)), vmovn_s32(vcvtq_s32_f32(_rhigh)));
715             int16x8_t _g16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_glow)), vmovn_s32(vcvtq_s32_f32(_ghigh)));
716             int16x8_t _b16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_blow)), vmovn_s32(vcvtq_s32_f32(_bhigh)));
717 
718             uint8x8x3_t _rgb;
719             _rgb.val[0] = vqmovun_s16(_r16);
720             _rgb.val[1] = vqmovun_s16(_g16);
721             _rgb.val[2] = vqmovun_s16(_b16);
722 
723             vst3_u8(rgb, _rgb);
724 
725             rgb += 3 * 8;
726             ptr0 += 8;
727             ptr1 += 8;
728             ptr2 += 8;
729         }
730 #endif // __ARM_NEON
731         for (; remain > 0; remain--)
732         {
733             rgb[2] = SATURATE_CAST_UCHAR(*ptr0);
734             rgb[1] = SATURATE_CAST_UCHAR(*ptr1);
735             rgb[0] = SATURATE_CAST_UCHAR(*ptr2);
736 
737             rgb += 3;
738             ptr0++;
739             ptr1++;
740             ptr2++;
741         }
742 
743 #undef SATURATE_CAST_UCHAR
744         rgb += wgap;
745     }
746 }
747 
// Convert packed 8-bit RGB pixels into a 1-channel grayscale float Mat using
// fixed-point luma weights (approximating 0.299 R + 0.587 G + 0.114 B).
//   rgb       - interleaved R,G,B source bytes
//   w, h      - image size in pixels
//   stride    - bytes per source row (>= w * 3; extra bytes are padding)
//   m         - output Mat, created here as w x h x 1 with 4-byte floats
//   allocator - allocator for m's storage
// Returns 0 on success, -100 when the Mat allocation fails.
static int from_rgb2gray(const unsigned char* rgb, int w, int h, int stride, Mat& m, Allocator* allocator)
{
    // coeffs for r g b = 0.299f, 0.587f, 0.114f
    // Weights are scaled by 2^Y_shift so the weighted sum of three bytes
    // fits in 16 bits (77 + 150 + 29 = 256 = 2^8).
    const unsigned char Y_shift = 8; //14
    const unsigned char R2Y = 77;
    const unsigned char G2Y = 150;
    const unsigned char B2Y = 29;

    m.create(w, h, 1, 4u, allocator);
    if (m.empty())
        return -100;

    const int wgap = stride - w * 3;
    if (wgap == 0)
    {
        // Contiguous rows: process the whole image as one long row.
        w = w * h;
        h = 1;
    }

    float* ptr = m;

    for (int y = 0; y < h; y++)
    {
#if __ARM_NEON
        int nn = w >> 3;            // 8-pixel NEON iterations
        int remain = w - (nn << 3); // tail pixels handled by the scalar loop
#else
        int remain = w;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
        uint8x8_t _R2Y = vdup_n_u8(R2Y);
        uint8x8_t _G2Y = vdup_n_u8(G2Y);
        uint8x8_t _B2Y = vdup_n_u8(B2Y);
        for (; nn > 0; nn--)
        {
            uint8x8x3_t _rgb = vld3_u8(rgb);

            // y = (R*77 + G*150 + B*29) >> 8, accumulated in u16 lanes.
            uint16x8_t _y16 = vmull_u8(_rgb.val[0], _R2Y);
            _y16 = vmlal_u8(_y16, _rgb.val[1], _G2Y);
            _y16 = vmlal_u8(_y16, _rgb.val[2], _B2Y);
            _y16 = vshrq_n_u16(_y16, Y_shift);

            float32x4_t _ylow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_y16)));
            float32x4_t _yhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_y16)));

            vst1q_f32(ptr, _ylow);
            vst1q_f32(ptr + 4, _yhigh);

            rgb += 3 * 8;
            ptr += 8;
        }
#else
        if (nn > 0)
        {
            // armv7 NEON: d16/d17/d18 hold the broadcast R/G/B weights; the
            // loop computes the same fixed-point luma as the aarch64 path.
            asm volatile(
                "vdup.u8    d16, %6             \n"
                "vdup.u8    d17, %7             \n"
                "vdup.u8    d18, %8             \n"
                "0:                             \n"
                "pld        [%1, #256]          \n"
                "vld3.u8    {d0-d2}, [%1]!      \n"
                "vmull.u8   q2, d0, d16         \n"
                "vmlal.u8   q2, d1, d17         \n"
                "vmlal.u8   q2, d2, d18         \n"
                "vshr.u16   q2, q2, #8          \n" // Y_shift
                "vmovl.u16  q0, d4              \n"
                "vmovl.u16  q1, d5              \n"
                "vcvt.f32.u32   q0, q0          \n"
                "vcvt.f32.u32   q1, q1          \n"
                "subs       %0, #1              \n"
                "vst1.f32   {d0-d3}, [%2]!      \n"
                "bne        0b                  \n"
                : "=r"(nn),  // %0
                "=r"(rgb), // %1
                "=r"(ptr)  // %2
                : "0"(nn),
                "1"(rgb),
                "2"(ptr),
                "r"(R2Y), // %6
                "r"(G2Y), // %7
                "r"(B2Y)  // %8
                : "cc", "memory", "q0", "q1", "q2", "q8", "q9");
        }
#endif // __aarch64__
#endif // __ARM_NEON
        // Scalar tail (and the whole row when NEON is unavailable).
        for (; remain > 0; remain--)
        {
            *ptr = static_cast<float>((rgb[0] * R2Y + rgb[1] * G2Y + rgb[2] * B2Y) >> Y_shift);

            rgb += 3;
            ptr++;
        }

        // Skip the row-padding bytes of the source.
        rgb += wgap;
    }

    return 0;
}
848 
from_rgb2rgba(const unsigned char * rgb,int w,int h,int stride,Mat & m,Allocator * allocator)849 static int from_rgb2rgba(const unsigned char* rgb, int w, int h, int stride, Mat& m, Allocator* allocator)
850 {
851     m.create(w, h, 4, 4u, allocator);
852     if (m.empty())
853         return -100;
854 
855     Mat rgb_channels = m.channel_range(0, 3);
856     from_rgb(rgb, w, h, stride, rgb_channels, allocator);
857 
858     Mat alpha_channel = m.channel(3);
859     alpha_channel.fill(255.f);
860 
861     return 0;
862 }
863 
to_rgb2rgba(const Mat & m,unsigned char * rgba,int stride)864 static void to_rgb2rgba(const Mat& m, unsigned char* rgba, int stride)
865 {
866     int w = m.w;
867     int h = m.h;
868 
869     const int wgap = stride - w * 4;
870     if (wgap == 0)
871     {
872         w = w * h;
873         h = 1;
874     }
875 
876     const float* ptr0 = m.channel(0);
877     const float* ptr1 = m.channel(1);
878     const float* ptr2 = m.channel(2);
879 
880     for (int y = 0; y < h; y++)
881     {
882 #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
883 
884 #if __ARM_NEON
885         int nn = w >> 3;
886         int remain = w - (nn << 3);
887 #else
888         int remain = w;
889 #endif // __ARM_NEON
890 
891 #if __ARM_NEON
892         uint8x8_t _a = vdup_n_u8(255);
893         for (; nn > 0; nn--)
894         {
895             float32x4_t _rlow = vld1q_f32(ptr0);
896             float32x4_t _rhigh = vld1q_f32(ptr0 + 4);
897             float32x4_t _glow = vld1q_f32(ptr1);
898             float32x4_t _ghigh = vld1q_f32(ptr1 + 4);
899             float32x4_t _blow = vld1q_f32(ptr2);
900             float32x4_t _bhigh = vld1q_f32(ptr2 + 4);
901 
902             int16x8_t _r16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_rlow)), vmovn_s32(vcvtq_s32_f32(_rhigh)));
903             int16x8_t _g16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_glow)), vmovn_s32(vcvtq_s32_f32(_ghigh)));
904             int16x8_t _b16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_blow)), vmovn_s32(vcvtq_s32_f32(_bhigh)));
905 
906             uint8x8x4_t _rgba;
907             _rgba.val[0] = vqmovun_s16(_r16);
908             _rgba.val[1] = vqmovun_s16(_g16);
909             _rgba.val[2] = vqmovun_s16(_b16);
910             _rgba.val[3] = _a;
911 
912             vst4_u8(rgba, _rgba);
913 
914             rgba += 4 * 8;
915             ptr0 += 8;
916             ptr1 += 8;
917             ptr2 += 8;
918         }
919 #endif // __ARM_NEON
920         for (; remain > 0; remain--)
921         {
922             rgba[0] = SATURATE_CAST_UCHAR(*ptr0);
923             rgba[1] = SATURATE_CAST_UCHAR(*ptr1);
924             rgba[2] = SATURATE_CAST_UCHAR(*ptr2);
925             rgba[3] = 255;
926 
927             rgba += 4;
928             ptr0++;
929             ptr1++;
930             ptr2++;
931         }
932 
933 #undef SATURATE_CAST_UCHAR
934         rgba += wgap;
935     }
936 }
937 
static int from_bgr2gray(const unsigned char* bgr, int w, int h, int stride, Mat& m, Allocator* allocator)
{
    // Convert packed 8-bit BGR pixels into a single-channel float grayscale Mat.
    // Fixed-point luma: Y = (77*R + 150*G + 29*B) >> 8 (coefficients sum to 256).
    // Returns 0 on success, -100 if allocation fails.
    // coeffs for r g b = 0.299f, 0.587f, 0.114f
    const unsigned char Y_shift = 8; //14
    const unsigned char R2Y = 77;
    const unsigned char G2Y = 150;
    const unsigned char B2Y = 29;

    m.create(w, h, 1, 4u, allocator);
    if (m.empty())
        return -100;

    // If source rows are contiguous (no padding), flatten the image into one
    // long row so a single inner loop covers everything.
    const int wgap = stride - w * 3;
    if (wgap == 0)
    {
        w = w * h;
        h = 1;
    }

    float* ptr = m;

    for (int y = 0; y < h; y++)
    {
#if __ARM_NEON
        int nn = w >> 3;            // 8 pixels per NEON iteration
        int remain = w - (nn << 3); // leftover pixels for the scalar tail
#else
        int remain = w;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
        uint8x8_t _R2Y = vdup_n_u8(R2Y);
        uint8x8_t _G2Y = vdup_n_u8(G2Y);
        uint8x8_t _B2Y = vdup_n_u8(B2Y);
        for (; nn > 0; nn--)
        {
            // de-interleave 8 BGR pixels: val[0]=B, val[1]=G, val[2]=R
            uint8x8x3_t _rgb = vld3_u8(bgr);

            // widening multiply-accumulate: u8*u8 -> u16, then >> Y_shift
            uint16x8_t _y16 = vmull_u8(_rgb.val[2], _R2Y);
            _y16 = vmlal_u8(_y16, _rgb.val[1], _G2Y);
            _y16 = vmlal_u8(_y16, _rgb.val[0], _B2Y);
            _y16 = vshrq_n_u16(_y16, Y_shift);

            // widen u16 -> u32 -> f32 and store 8 floats
            float32x4_t _ylow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_y16)));
            float32x4_t _yhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_y16)));

            vst1q_f32(ptr, _ylow);
            vst1q_f32(ptr + 4, _yhigh);

            bgr += 3 * 8;
            ptr += 8;
        }
#else
        // ARMv7 path: same computation as above, hand-scheduled.
        // d16/d17/d18 hold the broadcast R2Y/G2Y/B2Y coefficients.
        if (nn > 0)
        {
            asm volatile(
                "vdup.u8    d16, %6             \n"
                "vdup.u8    d17, %7             \n"
                "vdup.u8    d18, %8             \n"
                "0:                             \n"
                "pld        [%1, #256]          \n"
                "vld3.u8    {d0-d2}, [%1]!      \n"
                "vmull.u8   q2, d2, d16         \n"
                "vmlal.u8   q2, d1, d17         \n"
                "vmlal.u8   q2, d0, d18         \n"
                "vshr.u16   q2, q2, #8          \n" // Y_shift
                "vmovl.u16  q0, d4              \n"
                "vmovl.u16  q1, d5              \n"
                "vcvt.f32.u32   q0, q0          \n"
                "vcvt.f32.u32   q1, q1          \n"
                "subs       %0, #1              \n"
                "vst1.f32   {d0-d3}, [%2]!      \n"
                "bne        0b                  \n"
                : "=r"(nn),  // %0
                "=r"(bgr), // %1
                "=r"(ptr)  // %2
                : "0"(nn),
                "1"(bgr),
                "2"(ptr),
                "r"(R2Y), // %6
                "r"(G2Y), // %7
                "r"(B2Y)  // %8
                : "cc", "memory", "q0", "q1", "q2", "q8", "q9");
        }
#endif // __aarch64__
#endif // __ARM_NEON
        // scalar tail (also the whole row on non-NEON builds)
        for (; remain > 0; remain--)
        {
            *ptr = static_cast<float>((bgr[2] * R2Y + bgr[1] * G2Y + bgr[0] * B2Y) >> Y_shift);

            bgr += 3;
            ptr++;
        }

        bgr += wgap; // skip source row padding
    }

    return 0;
}
1038 
from_bgr2rgba(const unsigned char * bgr,int w,int h,int stride,Mat & m,Allocator * allocator)1039 static int from_bgr2rgba(const unsigned char* bgr, int w, int h, int stride, Mat& m, Allocator* allocator)
1040 {
1041     m.create(w, h, 4, 4u, allocator);
1042     if (m.empty())
1043         return -100;
1044 
1045     Mat rgb_channels = m.channel_range(0, 3);
1046     from_rgb2bgr(bgr, w, h, stride, rgb_channels, allocator);
1047 
1048     Mat alpha_channel = m.channel(3);
1049     alpha_channel.fill(255.f);
1050 
1051     return 0;
1052 }
1053 
to_bgr2rgba(const Mat & m,unsigned char * rgba,int stride)1054 static void to_bgr2rgba(const Mat& m, unsigned char* rgba, int stride)
1055 {
1056     int w = m.w;
1057     int h = m.h;
1058 
1059     const int wgap = stride - w * 4;
1060     if (wgap == 0)
1061     {
1062         w = w * h;
1063         h = 1;
1064     }
1065 
1066     const float* ptr0 = m.channel(0);
1067     const float* ptr1 = m.channel(1);
1068     const float* ptr2 = m.channel(2);
1069 
1070     for (int y = 0; y < h; y++)
1071     {
1072 #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
1073 
1074 #if __ARM_NEON
1075         int nn = w >> 3;
1076         int remain = w - (nn << 3);
1077 #else
1078         int remain = w;
1079 #endif // __ARM_NEON
1080 
1081 #if __ARM_NEON
1082         uint8x8_t _a = vdup_n_u8(255);
1083         for (; nn > 0; nn--)
1084         {
1085             float32x4_t _rlow = vld1q_f32(ptr2);
1086             float32x4_t _rhigh = vld1q_f32(ptr2 + 4);
1087             float32x4_t _glow = vld1q_f32(ptr1);
1088             float32x4_t _ghigh = vld1q_f32(ptr1 + 4);
1089             float32x4_t _blow = vld1q_f32(ptr0);
1090             float32x4_t _bhigh = vld1q_f32(ptr0 + 4);
1091 
1092             int16x8_t _r16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_rlow)), vmovn_s32(vcvtq_s32_f32(_rhigh)));
1093             int16x8_t _g16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_glow)), vmovn_s32(vcvtq_s32_f32(_ghigh)));
1094             int16x8_t _b16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_blow)), vmovn_s32(vcvtq_s32_f32(_bhigh)));
1095 
1096             uint8x8x4_t _rgba;
1097             _rgba.val[0] = vqmovun_s16(_r16);
1098             _rgba.val[1] = vqmovun_s16(_g16);
1099             _rgba.val[2] = vqmovun_s16(_b16);
1100             _rgba.val[3] = _a;
1101 
1102             vst4_u8(rgba, _rgba);
1103 
1104             rgba += 4 * 8;
1105             ptr0 += 8;
1106             ptr1 += 8;
1107             ptr2 += 8;
1108         }
1109 #endif // __ARM_NEON
1110         for (; remain > 0; remain--)
1111         {
1112             rgba[0] = SATURATE_CAST_UCHAR(*ptr2);
1113             rgba[1] = SATURATE_CAST_UCHAR(*ptr1);
1114             rgba[2] = SATURATE_CAST_UCHAR(*ptr0);
1115             rgba[3] = 255;
1116 
1117             rgba += 4;
1118             ptr0++;
1119             ptr1++;
1120             ptr2++;
1121         }
1122 
1123 #undef SATURATE_CAST_UCHAR
1124         rgba += wgap;
1125     }
1126 }
1127 
static int from_gray2rgb(const unsigned char* gray, int w, int h, int stride, Mat& m, Allocator* allocator)
{
    // Replicate an 8-bit single-channel gray image into a 3-channel float Mat
    // (the same value is written to all three planes).
    // Returns 0 on success, -100 if allocation fails.
    m.create(w, h, 3, 4u, allocator);
    if (m.empty())
        return -100;

    // If source rows are contiguous (no padding), flatten the image into one
    // long row so a single inner loop covers everything.
    const int wgap = stride - w;
    if (wgap == 0)
    {
        w = w * h;
        h = 1;
    }

    float* ptr0 = m.channel(0);
    float* ptr1 = m.channel(1);
    float* ptr2 = m.channel(2);

    for (int y = 0; y < h; y++)
    {
#if __ARM_NEON
        int nn = w >> 4;            // 16 pixels per NEON iteration
        int remain = w - (nn << 4); // leftover pixels for the scalar tail
#else
        int remain = w;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
        for (; nn > 0; nn--)
        {
            // widen u8 -> u16 -> u32 -> f32 in two halves of 8
            uint8x16_t _gray = vld1q_u8(gray);
            uint16x8_t _gray16_0 = vmovl_u8(vget_low_u8(_gray));
            uint16x8_t _gray16_1 = vmovl_u8(vget_high_u8(_gray));

            float32x4_t _graylow_0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_gray16_0)));
            float32x4_t _grayhigh_0 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_gray16_0)));
            float32x4_t _graylow_1 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_gray16_1)));
            float32x4_t _grayhigh_1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_gray16_1)));

            // store the same 16 floats to each of the three channel planes
            vst1q_f32(ptr0, _graylow_0);
            vst1q_f32(ptr0 + 4, _grayhigh_0);
            vst1q_f32(ptr0 + 8, _graylow_1);
            vst1q_f32(ptr0 + 12, _grayhigh_1);

            vst1q_f32(ptr1, _graylow_0);
            vst1q_f32(ptr1 + 4, _grayhigh_0);
            vst1q_f32(ptr1 + 8, _graylow_1);
            vst1q_f32(ptr1 + 12, _grayhigh_1);

            vst1q_f32(ptr2, _graylow_0);
            vst1q_f32(ptr2 + 4, _grayhigh_0);
            vst1q_f32(ptr2 + 8, _graylow_1);
            vst1q_f32(ptr2 + 12, _grayhigh_1);

            gray += 16;
            ptr0 += 16;
            ptr1 += 16;
            ptr2 += 16;
        }
#else
        // ARMv7 path: same widening as above; the converted floats in
        // q0-q3 are stored three times, once per destination plane.
        if (nn > 0)
        {
            asm volatile(
                "0:                             \n"
                "pld        [%1, #128]          \n"
                "vld1.u8    {d0,d1}, [%1]!      \n"
                "vmovl.u8   q8, d0              \n"
                "vmovl.u8   q9, d1              \n"
                "vmovl.u16  q0, d16             \n"
                "vmovl.u16  q1, d17             \n"
                "vmovl.u16  q2, d18             \n"
                "vmovl.u16  q3, d19             \n"
                "vcvt.f32.u32   q0, q0          \n"
                "vcvt.f32.u32   q1, q1          \n"
                "vcvt.f32.u32   q2, q2          \n"
                "vcvt.f32.u32   q3, q3          \n"
                "subs       %0, #1              \n"
                "vst1.f32   {d0-d3}, [%2]!      \n"
                "vst1.f32   {d4-d7}, [%2]!      \n"
                "vst1.f32   {d0-d3}, [%3]!      \n"
                "vst1.f32   {d4-d7}, [%3]!      \n"
                "vst1.f32   {d0-d3}, [%4]!      \n"
                "vst1.f32   {d4-d7}, [%4]!      \n"
                "bne        0b                  \n"
                : "=r"(nn),   // %0
                "=r"(gray), // %1
                "=r"(ptr0), // %2
                "=r"(ptr1), // %3
                "=r"(ptr2)  // %4
                : "0"(nn),
                "1"(gray),
                "2"(ptr0),
                "3"(ptr1),
                "4"(ptr2)
                : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9");
        }
#endif // __aarch64__
#endif // __ARM_NEON
        // scalar tail (also the whole row on non-NEON builds)
        for (; remain > 0; remain--)
        {
            *ptr0 = *gray;
            *ptr1 = *gray;
            *ptr2 = *gray;

            gray++;
            ptr0++;
            ptr1++;
            ptr2++;
        }

        gray += wgap; // skip source row padding
    }

    return 0;
}
1243 
from_gray2rgba(const unsigned char * gray,int w,int h,int stride,Mat & m,Allocator * allocator)1244 static int from_gray2rgba(const unsigned char* gray, int w, int h, int stride, Mat& m, Allocator* allocator)
1245 {
1246     m.create(w, h, 4, 4u, allocator);
1247     if (m.empty())
1248         return -100;
1249 
1250     Mat rgb_channels = m.channel_range(0, 3);
1251     from_gray2rgb(gray, w, h, stride, rgb_channels, allocator);
1252 
1253     Mat alpha_channel = m.channel(3);
1254     alpha_channel.fill(255.f);
1255 
1256     return 0;
1257 }
1258 
to_gray2rgba(const Mat & m,unsigned char * rgba,int stride)1259 static void to_gray2rgba(const Mat& m, unsigned char* rgba, int stride)
1260 {
1261     int w = m.w;
1262     int h = m.h;
1263 
1264     const int wgap = stride - w * 4;
1265     if (wgap == 0)
1266     {
1267         w = w * h;
1268         h = 1;
1269     }
1270 
1271     const float* ptr = m;
1272 
1273     for (int y = 0; y < h; y++)
1274     {
1275 #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
1276 
1277 #if __ARM_NEON
1278         int nn = w >> 3;
1279         int remain = w - (nn << 3);
1280 #else
1281         int remain = w;
1282 #endif // __ARM_NEON
1283 
1284 #if __ARM_NEON
1285         uint8x8_t _a = vdup_n_u8(255);
1286         for (; nn > 0; nn--)
1287         {
1288             float32x4_t _glow = vld1q_f32(ptr);
1289             float32x4_t _ghigh = vld1q_f32(ptr + 4);
1290 
1291             int16x8_t _g16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_glow)), vmovn_s32(vcvtq_s32_f32(_ghigh)));
1292 
1293             uint8x8_t _gray = vqmovun_s16(_g16);
1294 
1295             uint8x8x4_t _rgba;
1296             _rgba.val[0] = _gray;
1297             _rgba.val[1] = _gray;
1298             _rgba.val[2] = _gray;
1299             _rgba.val[3] = _a;
1300 
1301             vst4_u8(rgba, _rgba);
1302 
1303             rgba += 4 * 8;
1304             ptr += 8;
1305         }
1306 #endif // __ARM_NEON
1307         for (; remain > 0; remain--)
1308         {
1309             unsigned char gray = SATURATE_CAST_UCHAR(*ptr);
1310             rgba[0] = gray;
1311             rgba[1] = gray;
1312             rgba[2] = gray;
1313             rgba[3] = 255;
1314 
1315             rgba += 4;
1316             ptr++;
1317         }
1318 
1319 #undef SATURATE_CAST_UCHAR
1320         rgba += wgap;
1321     }
1322 }
1323 
from_rgba2rgb(const unsigned char * rgba,int w,int h,int stride,Mat & m,Allocator * allocator)1324 static int from_rgba2rgb(const unsigned char* rgba, int w, int h, int stride, Mat& m, Allocator* allocator)
1325 {
1326     m.create(w, h, 3, 4u, allocator);
1327     if (m.empty())
1328         return -100;
1329 
1330     const int wgap = stride - w * 4;
1331     if (wgap == 0)
1332     {
1333         w = w * h;
1334         h = 1;
1335     }
1336 
1337     float* ptr0 = m.channel(0);
1338     float* ptr1 = m.channel(1);
1339     float* ptr2 = m.channel(2);
1340 
1341     for (int y = 0; y < h; y++)
1342     {
1343 #if __ARM_NEON
1344         int nn = w >> 3;
1345         int remain = w - (nn << 3);
1346 #else
1347         int remain = w;
1348 #endif // __ARM_NEON
1349 
1350 #if __ARM_NEON
1351 #if __aarch64__
1352         for (; nn > 0; nn--)
1353         {
1354             uint8x8x4_t _rgba = vld4_u8(rgba);
1355             int16x8_t _r16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[0]));
1356             int16x8_t _g16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[1]));
1357             int16x8_t _b16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[2]));
1358 
1359             float32x4_t _rlow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_r16)));
1360             float32x4_t _rhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_r16)));
1361             float32x4_t _glow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_g16)));
1362             float32x4_t _ghigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_g16)));
1363             float32x4_t _blow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_b16)));
1364             float32x4_t _bhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_b16)));
1365 
1366             vst1q_f32(ptr0, _rlow);
1367             vst1q_f32(ptr0 + 4, _rhigh);
1368             vst1q_f32(ptr1, _glow);
1369             vst1q_f32(ptr1 + 4, _ghigh);
1370             vst1q_f32(ptr2, _blow);
1371             vst1q_f32(ptr2 + 4, _bhigh);
1372 
1373             rgba += 4 * 8;
1374             ptr0 += 8;
1375             ptr1 += 8;
1376             ptr2 += 8;
1377         }
1378 #else
1379         if (nn > 0)
1380         {
1381             asm volatile(
1382                 "0:                             \n"
1383                 "pld        [%1, #256]          \n"
1384                 "vld4.u8    {d0-d3}, [%1]!      \n"
1385                 "vmovl.u8   q8, d0              \n"
1386                 "vmovl.u8   q9, d1              \n"
1387                 "vmovl.u8   q10, d2             \n"
1388                 "vmovl.u16  q0, d16             \n"
1389                 "vmovl.u16  q1, d17             \n"
1390                 "vmovl.u16  q2, d18             \n"
1391                 "vmovl.u16  q3, d19             \n"
1392                 "vmovl.u16  q8, d20             \n"
1393                 "vmovl.u16  q9, d21             \n"
1394                 "vcvt.f32.u32   q0, q0          \n"
1395                 "vcvt.f32.u32   q1, q1          \n"
1396                 "vcvt.f32.u32   q2, q2          \n"
1397                 "vcvt.f32.u32   q3, q3          \n"
1398                 "vcvt.f32.u32   q8, q8          \n"
1399                 "subs       %0, #1              \n"
1400                 "vst1.f32   {d0-d3}, [%2]!      \n"
1401                 "vcvt.f32.u32   q9, q9          \n"
1402                 "vst1.f32   {d4-d7}, [%3]!      \n"
1403                 "vst1.f32   {d16-d19}, [%4]!    \n"
1404                 "bne        0b                  \n"
1405                 : "=r"(nn),   // %0
1406                 "=r"(rgba), // %1
1407                 "=r"(ptr0), // %2
1408                 "=r"(ptr1), // %3
1409                 "=r"(ptr2)  // %4
1410                 : "0"(nn),
1411                 "1"(rgba),
1412                 "2"(ptr0),
1413                 "3"(ptr1),
1414                 "4"(ptr2)
1415                 : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9");
1416         }
1417 #endif // __aarch64__
1418 #endif // __ARM_NEON
1419         for (; remain > 0; remain--)
1420         {
1421             *ptr0 = rgba[0];
1422             *ptr1 = rgba[1];
1423             *ptr2 = rgba[2];
1424 
1425             rgba += 4;
1426             ptr0++;
1427             ptr1++;
1428             ptr2++;
1429         }
1430 
1431         rgba += wgap;
1432     }
1433 
1434     return 0;
1435 }
1436 
static int from_rgba2bgr(const unsigned char* rgba, int w, int h, int stride, Mat& m, Allocator* allocator)
{
    // Convert packed 8-bit RGBA pixels into a 3-channel planar float Mat in
    // B, G, R order (R and B swapped); the alpha byte is dropped.
    // Returns 0 on success, -100 if allocation fails.
    m.create(w, h, 3, 4u, allocator);
    if (m.empty())
        return -100;

    // If source rows are contiguous (no padding), flatten the image into one
    // long row so a single inner loop covers everything.
    const int wgap = stride - w * 4;
    if (wgap == 0)
    {
        w = w * h;
        h = 1;
    }

    float* ptr0 = m.channel(0); // B plane
    float* ptr1 = m.channel(1); // G plane
    float* ptr2 = m.channel(2); // R plane

    for (int y = 0; y < h; y++)
    {
#if __ARM_NEON
        int nn = w >> 3;            // 8 pixels per NEON iteration
        int remain = w - (nn << 3); // leftover pixels for the scalar tail
#else
        int remain = w;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
        for (; nn > 0; nn--)
        {
            // de-interleave 8 RGBA pixels; val[3] (alpha) is ignored
            uint8x8x4_t _rgba = vld4_u8(rgba);
            int16x8_t _r16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[0]));
            int16x8_t _g16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[1]));
            int16x8_t _b16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[2]));

            // widen s16 -> s32 -> f32
            float32x4_t _rlow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_r16)));
            float32x4_t _rhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_r16)));
            float32x4_t _glow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_g16)));
            float32x4_t _ghigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_g16)));
            float32x4_t _blow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_b16)));
            float32x4_t _bhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_b16)));

            // swapped store: R -> channel 2, B -> channel 0
            vst1q_f32(ptr2, _rlow);
            vst1q_f32(ptr2 + 4, _rhigh);
            vst1q_f32(ptr1, _glow);
            vst1q_f32(ptr1 + 4, _ghigh);
            vst1q_f32(ptr0, _blow);
            vst1q_f32(ptr0 + 4, _bhigh);

            rgba += 4 * 8;
            ptr0 += 8;
            ptr1 += 8;
            ptr2 += 8;
        }
#else
        // ARMv7 path: same computation as above; note the destination
        // operands are used in swapped order (%4=ptr2 gets R, %2=ptr0 gets B).
        if (nn > 0)
        {
            asm volatile(
                "0:                             \n"
                "pld        [%1, #256]          \n"
                "vld4.u8    {d0-d3}, [%1]!      \n"
                "vmovl.u8   q8, d0              \n"
                "vmovl.u8   q9, d1              \n"
                "vmovl.u8   q10, d2             \n"
                "vmovl.u16  q0, d16             \n"
                "vmovl.u16  q1, d17             \n"
                "vmovl.u16  q2, d18             \n"
                "vmovl.u16  q3, d19             \n"
                "vmovl.u16  q8, d20             \n"
                "vmovl.u16  q9, d21             \n"
                "vcvt.f32.u32   q0, q0          \n"
                "vcvt.f32.u32   q1, q1          \n"
                "vcvt.f32.u32   q2, q2          \n"
                "vcvt.f32.u32   q3, q3          \n"
                "vcvt.f32.u32   q8, q8          \n"
                "subs       %0, #1              \n"
                "vst1.f32   {d0-d3}, [%4]!      \n"
                "vcvt.f32.u32   q9, q9          \n"
                "vst1.f32   {d4-d7}, [%3]!      \n"
                "vst1.f32   {d16-d19}, [%2]!    \n"
                "bne        0b                  \n"
                : "=r"(nn),   // %0
                "=r"(rgba), // %1
                "=r"(ptr0), // %2
                "=r"(ptr1), // %3
                "=r"(ptr2)  // %4
                : "0"(nn),
                "1"(rgba),
                "2"(ptr0),
                "3"(ptr1),
                "4"(ptr2)
                : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10");
        }
#endif // __aarch64__
#endif // __ARM_NEON
        // scalar tail (also the whole row on non-NEON builds)
        for (; remain > 0; remain--)
        {
            *ptr0 = rgba[2];
            *ptr1 = rgba[1];
            *ptr2 = rgba[0];

            rgba += 4;
            ptr0++;
            ptr1++;
            ptr2++;
        }

        rgba += wgap; // skip source row padding
    }

    return 0;
}
1549 
static int from_rgba2gray(const unsigned char* rgba, int w, int h, int stride, Mat& m, Allocator* allocator)
{
    // Convert packed 8-bit RGBA pixels into a single-channel float grayscale
    // Mat. Fixed-point luma: Y = (77*R + 150*G + 29*B) >> 8; alpha is ignored.
    // Returns 0 on success, -100 if allocation fails.
    // coeffs for r g b = 0.299f, 0.587f, 0.114f
    const unsigned char Y_shift = 8; //14
    const unsigned char R2Y = 77;
    const unsigned char G2Y = 150;
    const unsigned char B2Y = 29;

    m.create(w, h, 1, 4u, allocator);
    if (m.empty())
        return -100;

    // If source rows are contiguous (no padding), flatten the image into one
    // long row so a single inner loop covers everything.
    const int wgap = stride - w * 4;
    if (wgap == 0)
    {
        w = w * h;
        h = 1;
    }

    float* ptr = m;

    for (int y = 0; y < h; y++)
    {
#if __ARM_NEON
        int nn = w >> 3;            // 8 pixels per NEON iteration
        int remain = w - (nn << 3); // leftover pixels for the scalar tail
#else
        int remain = w;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
        uint8x8_t _R2Y = vdup_n_u8(R2Y);
        uint8x8_t _G2Y = vdup_n_u8(G2Y);
        uint8x8_t _B2Y = vdup_n_u8(B2Y);
        for (; nn > 0; nn--)
        {
            // de-interleave 8 RGBA pixels; val[3] (alpha) is ignored
            uint8x8x4_t _rgba = vld4_u8(rgba);

            // widening multiply-accumulate: u8*u8 -> u16, then >> Y_shift
            uint16x8_t _y16 = vmull_u8(_rgba.val[0], _R2Y);
            _y16 = vmlal_u8(_y16, _rgba.val[1], _G2Y);
            _y16 = vmlal_u8(_y16, _rgba.val[2], _B2Y);
            _y16 = vshrq_n_u16(_y16, Y_shift);

            // widen u16 -> u32 -> f32 and store 8 floats
            float32x4_t _ylow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_y16)));
            float32x4_t _yhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_y16)));

            vst1q_f32(ptr, _ylow);
            vst1q_f32(ptr + 4, _yhigh);

            rgba += 4 * 8;
            ptr += 8;
        }
#else
        // ARMv7 path: same computation as above, hand-scheduled.
        // d16/d17/d18 hold the broadcast R2Y/G2Y/B2Y coefficients.
        if (nn > 0)
        {
            asm volatile(
                "vdup.u8    d16, %6             \n"
                "vdup.u8    d17, %7             \n"
                "vdup.u8    d18, %8             \n"
                "0:                             \n"
                "pld        [%1, #256]          \n"
                "vld4.u8    {d0-d3}, [%1]!      \n"
                "vmull.u8   q2, d0, d16         \n"
                "vmlal.u8   q2, d1, d17         \n"
                "vmlal.u8   q2, d2, d18         \n"
                "vshr.u16   q2, q2, #8          \n" // Y_shift
                "vmovl.u16  q0, d4              \n"
                "vmovl.u16  q1, d5              \n"
                "vcvt.f32.u32   q0, q0          \n"
                "vcvt.f32.u32   q1, q1          \n"
                "subs       %0, #1              \n"
                "vst1.f32   {d0-d3}, [%2]!      \n"
                "bne        0b                  \n"
                : "=r"(nn),   // %0
                "=r"(rgba), // %1
                "=r"(ptr)   // %2
                : "0"(nn),
                "1"(rgba),
                "2"(ptr),
                "r"(R2Y), // %6
                "r"(G2Y), // %7
                "r"(B2Y)  // %8
                : "cc", "memory", "q0", "q1", "q2", "q8", "q9");
        }
#endif // __aarch64__
#endif // __ARM_NEON
        // scalar tail (also the whole row on non-NEON builds)
        for (; remain > 0; remain--)
        {
            *ptr = static_cast<float>((rgba[0] * R2Y + rgba[1] * G2Y + rgba[2] * B2Y) >> Y_shift);

            rgba += 4;
            ptr++;
        }

        rgba += wgap; // skip source row padding
    }

    return 0;
}
1650 
static int from_rgba2bgra(const unsigned char* rgba, int w, int h, int stride, Mat& m, Allocator* allocator)
{
    // Convert packed 8-bit RGBA pixels into a 4-channel planar float Mat in
    // B, G, R, A order (R and B swapped, alpha preserved).
    // Returns 0 on success, -100 if allocation fails.
    m.create(w, h, 4, 4u, allocator);
    if (m.empty())
        return -100;

    // If source rows are contiguous (no padding), flatten the image into one
    // long row so a single inner loop covers everything.
    const int wgap = stride - w * 4;
    if (wgap == 0)
    {
        w = w * h;
        h = 1;
    }

    float* ptr0 = m.channel(0); // B plane
    float* ptr1 = m.channel(1); // G plane
    float* ptr2 = m.channel(2); // R plane
    float* ptr3 = m.channel(3); // A plane

    for (int y = 0; y < h; y++)
    {
#if __ARM_NEON
        int nn = w >> 3;            // 8 pixels per NEON iteration
        int remain = w - (nn << 3); // leftover pixels for the scalar tail
#else
        int remain = w;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
        for (; nn > 0; nn--)
        {
            // de-interleave 8 RGBA pixels into four u8 lanes
            uint8x8x4_t _rgba = vld4_u8(rgba);
            int16x8_t _r16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[0]));
            int16x8_t _g16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[1]));
            int16x8_t _b16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[2]));
            int16x8_t _a16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[3]));

            // widen s16 -> s32 -> f32
            float32x4_t _rlow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_r16)));
            float32x4_t _rhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_r16)));
            float32x4_t _glow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_g16)));
            float32x4_t _ghigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_g16)));
            float32x4_t _blow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_b16)));
            float32x4_t _bhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_b16)));
            float32x4_t _alow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_a16)));
            float32x4_t _ahigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_a16)));

            // swapped store: R -> channel 2, B -> channel 0, A -> channel 3
            vst1q_f32(ptr2, _rlow);
            vst1q_f32(ptr2 + 4, _rhigh);
            vst1q_f32(ptr1, _glow);
            vst1q_f32(ptr1 + 4, _ghigh);
            vst1q_f32(ptr0, _blow);
            vst1q_f32(ptr0 + 4, _bhigh);
            vst1q_f32(ptr3, _alow);
            vst1q_f32(ptr3 + 4, _ahigh);

            rgba += 4 * 8;
            ptr0 += 8;
            ptr1 += 8;
            ptr2 += 8;
            ptr3 += 8;
        }
#else
        // ARMv7 path: same computation as above; destinations are used in
        // swapped order (%4=ptr2 gets R, %2=ptr0 gets B, %5=ptr3 gets A).
        if (nn > 0)
        {
            asm volatile(
                "0:                             \n"
                "pld        [%1, #256]          \n"
                "vld4.u8    {d0-d3}, [%1]!      \n"
                "vmovl.u8   q8, d0              \n"
                "vmovl.u8   q9, d1              \n"
                "vmovl.u8   q10, d2             \n"
                "vmovl.u8   q11, d3             \n"
                "vmovl.u16  q0, d16             \n"
                "vmovl.u16  q1, d17             \n"
                "vmovl.u16  q2, d18             \n"
                "vmovl.u16  q3, d19             \n"
                "vmovl.u16  q8, d20             \n"
                "vmovl.u16  q9, d21             \n"
                "vmovl.u16  q10, d22            \n"
                "vmovl.u16  q11, d23            \n"
                "vcvt.f32.u32   q0, q0          \n"
                "vcvt.f32.u32   q1, q1          \n"
                "vcvt.f32.u32   q2, q2          \n"
                "vcvt.f32.u32   q3, q3          \n"
                "vcvt.f32.u32   q8, q8          \n"
                "subs       %0, #1              \n"
                "vst1.f32   {d0-d3}, [%4]!      \n"
                "vcvt.f32.u32   q9, q9          \n"
                "vcvt.f32.u32   q10, q10        \n"
                "vst1.f32   {d4-d7}, [%3]!      \n"
                "vcvt.f32.u32   q11, q11        \n"
                "vst1.f32   {d16-d19}, [%2]!    \n"
                "vst1.f32   {d20-d23}, [%5]!    \n"
                "bne        0b                  \n"
                : "=r"(nn),   // %0
                "=r"(rgba), // %1
                "=r"(ptr0), // %2
                "=r"(ptr1), // %3
                "=r"(ptr2), // %4
                "=r"(ptr3)  // %5
                : "0"(nn),
                "1"(rgba),
                "2"(ptr0),
                "3"(ptr1),
                "4"(ptr2),
                "5"(ptr3)
                : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
        }
#endif // __aarch64__
#endif // __ARM_NEON
        // scalar tail (also the whole row on non-NEON builds)
        for (; remain > 0; remain--)
        {
            *ptr0 = rgba[2];
            *ptr1 = rgba[1];
            *ptr2 = rgba[0];
            *ptr3 = rgba[3];

            rgba += 4;
            ptr0++;
            ptr1++;
            ptr2++;
            ptr3++;
        }

        rgba += wgap; // skip source row padding
    }

    return 0;
}
1780 
// Convert a 4-channel float Mat back to interleaved 8-bit BGRA bytes.
// Channel 0 is stored to byte 2 and channel 2 to byte 0 (so the first and
// third planes swap positions on output); channel 1 goes to byte 1 and
// channel 3 to byte 3. Each float is truncated to int and saturated to
// [0, 255]. stride is the output row size in bytes (>= w * 4).
static void to_rgba2bgra(const Mat& m, unsigned char* bgra, int stride)
{
    int w = m.w;
    int h = m.h;

    // When rows are tightly packed, flatten the image into one long row so
    // the per-row loop below runs once.
    const int wgap = stride - w * 4;
    if (wgap == 0)
    {
        w = w * h;
        h = 1;
    }

    const float* ptr0 = m.channel(0);
    const float* ptr1 = m.channel(1);
    const float* ptr2 = m.channel(2);
    const float* ptr3 = m.channel(3);

    for (int y = 0; y < h; y++)
    {
#define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);

#if __ARM_NEON
        // 8 pixels per NEON iteration; scalar tail handles the remainder.
        int nn = w >> 3;
        int remain = w - (nn << 3);
#else
        int remain = w;
#endif // __ARM_NEON

#if __ARM_NEON
        for (; nn > 0; nn--)
        {
            float32x4_t _rlow = vld1q_f32(ptr0);
            float32x4_t _rhigh = vld1q_f32(ptr0 + 4);
            float32x4_t _glow = vld1q_f32(ptr1);
            float32x4_t _ghigh = vld1q_f32(ptr1 + 4);
            float32x4_t _blow = vld1q_f32(ptr2);
            float32x4_t _bhigh = vld1q_f32(ptr2 + 4);
            float32x4_t _alow = vld1q_f32(ptr3);
            float32x4_t _ahigh = vld1q_f32(ptr3 + 4);

            // float -> s32 (truncating) -> s16; saturation to u8 happens below
            int16x8_t _r16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_rlow)), vmovn_s32(vcvtq_s32_f32(_rhigh)));
            int16x8_t _g16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_glow)), vmovn_s32(vcvtq_s32_f32(_ghigh)));
            int16x8_t _b16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_blow)), vmovn_s32(vcvtq_s32_f32(_bhigh)));
            int16x8_t _a16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_alow)), vmovn_s32(vcvtq_s32_f32(_ahigh)));

            // vqmovun_s16 clamps to [0,255]; note channel 0/2 swap on store
            uint8x8x4_t _bgra;
            _bgra.val[0] = vqmovun_s16(_b16);
            _bgra.val[1] = vqmovun_s16(_g16);
            _bgra.val[2] = vqmovun_s16(_r16);
            _bgra.val[3] = vqmovun_s16(_a16);

            vst4_u8(bgra, _bgra);

            bgra += 4 * 8;
            ptr0 += 8;
            ptr1 += 8;
            ptr2 += 8;
            ptr3 += 8;
        }
#endif // __ARM_NEON
        for (; remain > 0; remain--)
        {
            bgra[0] = SATURATE_CAST_UCHAR(*ptr2);
            bgra[1] = SATURATE_CAST_UCHAR(*ptr1);
            bgra[2] = SATURATE_CAST_UCHAR(*ptr0);
            bgra[3] = SATURATE_CAST_UCHAR(*ptr3);

            bgra += 4;
            ptr0++;
            ptr1++;
            ptr2++;
            ptr3++;
        }

#undef SATURATE_CAST_UCHAR
        bgra += wgap;
    }
}
1859 
// Convert interleaved 8-bit BGRA bytes into a single-channel float Mat.
// Gray = (77*R + 150*G + 29*B) >> 8, a fixed-point approximation of the
// 0.299/0.587/0.114 luminance weights; the alpha byte is ignored.
// stride is the input row size in bytes (>= w * 4).
// Returns 0 on success, -100 if the Mat allocation failed.
static int from_bgra2gray(const unsigned char* bgra, int w, int h, int stride, Mat& m, Allocator* allocator)
{
    // coeffs for r g b = 0.299f, 0.587f, 0.114f
    const unsigned char Y_shift = 8; //14
    const unsigned char R2Y = 77;
    const unsigned char G2Y = 150;
    const unsigned char B2Y = 29;

    m.create(w, h, 1, 4u, allocator);
    if (m.empty())
        return -100;

    // When rows are tightly packed, flatten the image into one long row.
    const int wgap = stride - w * 4;
    if (wgap == 0)
    {
        w = w * h;
        h = 1;
    }

    float* ptr = m;

    for (int y = 0; y < h; y++)
    {
#if __ARM_NEON
        // 8 pixels per NEON iteration; scalar tail handles the remainder.
        int nn = w >> 3;
        int remain = w - (nn << 3);
#else
        int remain = w;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
        uint8x8_t _R2Y = vdup_n_u8(R2Y);
        uint8x8_t _G2Y = vdup_n_u8(G2Y);
        uint8x8_t _B2Y = vdup_n_u8(B2Y);
        for (; nn > 0; nn--)
        {
            // deinterleave: val[0]=B, val[1]=G, val[2]=R, val[3]=A (unused)
            uint8x8x4_t _bgra = vld4_u8(bgra);

            uint16x8_t _y16 = vmull_u8(_bgra.val[2], _R2Y);
            _y16 = vmlal_u8(_y16, _bgra.val[1], _G2Y);
            _y16 = vmlal_u8(_y16, _bgra.val[0], _B2Y);
            _y16 = vshrq_n_u16(_y16, Y_shift);

            float32x4_t _ylow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_y16)));
            float32x4_t _yhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_y16)));

            vst1q_f32(ptr, _ylow);
            vst1q_f32(ptr + 4, _yhigh);

            bgra += 4 * 8;
            ptr += 8;
        }
#else
        if (nn > 0)
        {
            // armv7 version of the same weighted sum; d0=B, d1=G, d2=R after
            // the deinterleaving vld4.
            asm volatile(
                "vdup.u8    d16, %6             \n"
                "vdup.u8    d17, %7             \n"
                "vdup.u8    d18, %8             \n"
                "0:                             \n"
                "pld        [%1, #256]          \n"
                "vld4.u8    {d0-d3}, [%1]!      \n"
                "vmull.u8   q2, d2, d16         \n"
                "vmlal.u8   q2, d1, d17         \n"
                "vmlal.u8   q2, d0, d18         \n"
                "vshr.u16   q2, q2, #8          \n" // Y_shift
                "vmovl.u16  q0, d4              \n"
                "vmovl.u16  q1, d5              \n"
                "vcvt.f32.u32   q0, q0          \n"
                "vcvt.f32.u32   q1, q1          \n"
                "subs       %0, #1              \n"
                "vst1.f32   {d0-d3}, [%2]!      \n"
                "bne        0b                  \n"
                : "=r"(nn),   // %0
                "=r"(bgra), // %1
                "=r"(ptr)   // %2
                : "0"(nn),
                "1"(bgra),
                "2"(ptr),
                "r"(R2Y), // %6
                "r"(G2Y), // %7
                "r"(B2Y)  // %8
                : "cc", "memory", "q0", "q1", "q2", "q8", "q9");
        }
#endif // __aarch64__
#endif // __ARM_NEON
        for (; remain > 0; remain--)
        {
            *ptr = static_cast<float>((bgra[2] * R2Y + bgra[1] * G2Y + bgra[0] * B2Y) >> Y_shift);

            bgra += 4;
            ptr++;
        }

        bgra += wgap;
    }

    return 0;
}
1960 
// Convert NV21 yuv420sp (full-resolution Y plane followed by an interleaved
// V/U plane at quarter resolution) to packed RGB of the same size.
//
// Uses the fixed-point approximation derived in the scalar loop below
// (all coefficients scaled by 64):
//   R = (Y*64 + 90*(V-128)) >> 6
//   G = (Y*64 - 46*(V-128) - 22*(U-128)) >> 6
//   B = (Y*64 + 113*(U-128)) >> 6
//
// Two output rows are produced per outer iteration because each V/U pair is
// shared by a 2x2 block of Y samples.
// NOTE(review): the loops step by 2 in both directions with no odd-size
// handling, so w and h are assumed even — confirm with callers.
void yuv420sp2rgb(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb)
{
    const unsigned char* yptr = yuv420sp;
    const unsigned char* vuptr = yuv420sp + w * h; // chroma starts after the Y plane

#if __ARM_NEON
    uint8x8_t _v128 = vdup_n_u8(128);
    int8x8_t _v90 = vdup_n_s8(90);
    int8x8_t _v46 = vdup_n_s8(46);
    int8x8_t _v22 = vdup_n_s8(22);
    int8x8_t _v113 = vdup_n_s8(113);
#endif // __ARM_NEON

    for (int y = 0; y < h; y += 2)
    {
        const unsigned char* yptr0 = yptr;     // current row
        const unsigned char* yptr1 = yptr + w; // row below, shares the same chroma
        unsigned char* rgb0 = rgb;
        unsigned char* rgb1 = rgb + w * 3;

#if __ARM_NEON
        int nn = w >> 3; // 8 pixels (4 chroma pairs) per vector iteration
        int remain = w - (nn << 3);
#else
        int remain = w;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
        for (; nn > 0; nn--)
        {
            // widen Y to 16-bit with a <<6 pre-scale to match the x64 coefficients
            int16x8_t _yy0 = vreinterpretq_s16_u16(vshll_n_u8(vld1_u8(yptr0), 6));
            int16x8_t _yy1 = vreinterpretq_s16_u16(vshll_n_u8(vld1_u8(yptr1), 6));

            // center chroma around zero; vtrn with itself duplicates each V/U
            // across the two Y lanes that share it (NV21: V first, then U)
            int8x8_t _vvuu = vreinterpret_s8_u8(vsub_u8(vld1_u8(vuptr), _v128));
            int8x8x2_t _vvvvuuuu = vtrn_s8(_vvuu, _vvuu);
            int8x8_t _vv = _vvvvuuuu.val[0];
            int8x8_t _uu = _vvvvuuuu.val[1];

            int16x8_t _r0 = vmlal_s8(_yy0, _vv, _v90);
            int16x8_t _g0 = vmlsl_s8(_yy0, _vv, _v46);
            _g0 = vmlsl_s8(_g0, _uu, _v22);
            int16x8_t _b0 = vmlal_s8(_yy0, _uu, _v113);

            int16x8_t _r1 = vmlal_s8(_yy1, _vv, _v90);
            int16x8_t _g1 = vmlsl_s8(_yy1, _vv, _v46);
            _g1 = vmlsl_s8(_g1, _uu, _v22);
            int16x8_t _b1 = vmlal_s8(_yy1, _uu, _v113);

            // >>6 with unsigned saturation undoes the pre-scale and clamps to [0,255]
            uint8x8x3_t _rgb0;
            _rgb0.val[0] = vqshrun_n_s16(_r0, 6);
            _rgb0.val[1] = vqshrun_n_s16(_g0, 6);
            _rgb0.val[2] = vqshrun_n_s16(_b0, 6);

            uint8x8x3_t _rgb1;
            _rgb1.val[0] = vqshrun_n_s16(_r1, 6);
            _rgb1.val[1] = vqshrun_n_s16(_g1, 6);
            _rgb1.val[2] = vqshrun_n_s16(_b1, 6);

            vst3_u8(rgb0, _rgb0);
            vst3_u8(rgb1, _rgb1);

            yptr0 += 8;
            yptr1 += 8;
            vuptr += 8;
            rgb0 += 24;
            rgb1 += 24;
        }
#else
        if (nn > 0)
        {
            // armv7 version of the aarch64 intrinsics above. The VU chunk for
            // the next iteration is preloaded inside the loop body, so the
            // trailing "sub %3, #8" rewinds the one extra 8-byte advance.
            asm volatile(
                "pld        [%3, #128]          \n"
                "vld1.u8    {d2}, [%3]!         \n"
                "vsub.s8    d2, d2, %12         \n"
                "0:                             \n"
                "pld        [%1, #128]          \n"
                "vld1.u8    {d0}, [%1]!         \n"
                "pld        [%2, #128]          \n"
                "vld1.u8    {d1}, [%2]!         \n"
                "vshll.u8   q2, d0, #6          \n"
                "vorr       d3, d2, d2          \n"
                "vshll.u8   q3, d1, #6          \n"
                "vorr       q9, q2, q2          \n"
                "vtrn.s8    d2, d3              \n"
                "vorr       q11, q3, q3         \n"
                "vmlsl.s8   q9, d2, %14         \n"
                "vorr       q8, q2, q2          \n"
                "vmlsl.s8   q11, d2, %14        \n"
                "vorr       q10, q3, q3         \n"
                "vmlal.s8   q8, d2, %13         \n"
                "vmlal.s8   q2, d3, %16         \n"
                "vmlal.s8   q10, d2, %13        \n"
                "vmlsl.s8   q9, d3, %15         \n"
                "vmlal.s8   q3, d3, %16         \n"
                "vmlsl.s8   q11, d3, %15        \n"
                "vqshrun.s16 d24, q8, #6        \n"
                "vqshrun.s16 d26, q2, #6        \n"
                "vqshrun.s16 d4, q10, #6        \n"
                "vqshrun.s16 d25, q9, #6        \n"
                "vqshrun.s16 d6, q3, #6         \n"
                "vqshrun.s16 d5, q11, #6        \n"
                "pld        [%3, #128]          \n"
                "vld1.u8    {d2}, [%3]!         \n"
                "subs       %0, #1              \n"
                "vst3.u8    {d24-d26}, [%4]!    \n"
                "vsub.s8    d2, d2, %12         \n"
                "vst3.u8    {d4-d6}, [%5]!      \n"
                "bne        0b                  \n"
                "sub        %3, #8              \n"
                : "=r"(nn),    // %0
                "=r"(yptr0), // %1
                "=r"(yptr1), // %2
                "=r"(vuptr), // %3
                "=r"(rgb0),  // %4
                "=r"(rgb1)   // %5
                : "0"(nn),
                "1"(yptr0),
                "2"(yptr1),
                "3"(vuptr),
                "4"(rgb0),
                "5"(rgb1),
                "w"(_v128), // %12
                "w"(_v90),  // %13
                "w"(_v46),  // %14
                "w"(_v22),  // %15
                "w"(_v113)  // %16
                : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "d26");
        }
#endif // __aarch64__
#endif // __ARM_NEON

#define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
        // scalar tail: two pixels per iteration (they share one V/U pair)
        for (; remain > 0; remain -= 2)
        {
            // R = 1.164 * yy + 1.596 * vv
            // G = 1.164 * yy - 0.813 * vv - 0.391 * uu
            // B = 1.164 * yy              + 2.018 * uu

            // R = Y + (1.370705 * (V-128))
            // G = Y - (0.698001 * (V-128)) - (0.337633 * (U-128))
            // B = Y + (1.732446 * (U-128))

            // R = ((Y << 6) + 87.72512 * (V-128)) >> 6
            // G = ((Y << 6) - 44.672064 * (V-128) - 21.608512 * (U-128)) >> 6
            // B = ((Y << 6) + 110.876544 * (U-128)) >> 6

            // R = ((Y << 6) + 90 * (V-128)) >> 6
            // G = ((Y << 6) - 46 * (V-128) - 22 * (U-128)) >> 6
            // B = ((Y << 6) + 113 * (U-128)) >> 6

            // R = (yy + 90 * vv) >> 6
            // G = (yy - 46 * vv - 22 * uu) >> 6
            // B = (yy + 113 * uu) >> 6

            int v = vuptr[0] - 128;
            int u = vuptr[1] - 128;

            int ruv = 90 * v;
            int guv = -46 * v + -22 * u;
            int buv = 113 * u;

            int y00 = yptr0[0] << 6;
            rgb0[0] = SATURATE_CAST_UCHAR((y00 + ruv) >> 6);
            rgb0[1] = SATURATE_CAST_UCHAR((y00 + guv) >> 6);
            rgb0[2] = SATURATE_CAST_UCHAR((y00 + buv) >> 6);

            int y01 = yptr0[1] << 6;
            rgb0[3] = SATURATE_CAST_UCHAR((y01 + ruv) >> 6);
            rgb0[4] = SATURATE_CAST_UCHAR((y01 + guv) >> 6);
            rgb0[5] = SATURATE_CAST_UCHAR((y01 + buv) >> 6);

            int y10 = yptr1[0] << 6;
            rgb1[0] = SATURATE_CAST_UCHAR((y10 + ruv) >> 6);
            rgb1[1] = SATURATE_CAST_UCHAR((y10 + guv) >> 6);
            rgb1[2] = SATURATE_CAST_UCHAR((y10 + buv) >> 6);

            int y11 = yptr1[1] << 6;
            rgb1[3] = SATURATE_CAST_UCHAR((y11 + ruv) >> 6);
            rgb1[4] = SATURATE_CAST_UCHAR((y11 + guv) >> 6);
            rgb1[5] = SATURATE_CAST_UCHAR((y11 + buv) >> 6);

            yptr0 += 2;
            yptr1 += 2;
            vuptr += 2;
            rgb0 += 6;
            rgb1 += 6;
        }
#undef SATURATE_CAST_UCHAR

        yptr += 2 * w;
        rgb += 2 * 3 * w;
    }
}
2155 
// Convert NV12 yuv420sp (full-resolution Y plane followed by an interleaved
// U/V plane at quarter resolution) to packed RGB of the same size.
// Identical math to yuv420sp2rgb above; the only difference is the chroma
// byte order: NV12 stores U first, then V.
// NOTE(review): the loops step by 2 in both directions with no odd-size
// handling, so w and h are assumed even — confirm with callers.
void yuv420sp2rgb_nv12(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb)
{
    const unsigned char* yptr = yuv420sp;
    const unsigned char* uvptr = yuv420sp + w * h; // chroma starts after the Y plane

#if __ARM_NEON
    uint8x8_t _v128 = vdup_n_u8(128);
    int8x8_t _v90 = vdup_n_s8(90);
    int8x8_t _v46 = vdup_n_s8(46);
    int8x8_t _v22 = vdup_n_s8(22);
    int8x8_t _v113 = vdup_n_s8(113);
#endif // __ARM_NEON

    for (int y = 0; y < h; y += 2)
    {
        const unsigned char* yptr0 = yptr;     // current row
        const unsigned char* yptr1 = yptr + w; // row below, shares the same chroma
        unsigned char* rgb0 = rgb;
        unsigned char* rgb1 = rgb + w * 3;

#if __ARM_NEON
        int nn = w >> 3; // 8 pixels (4 chroma pairs) per vector iteration
        int remain = w - (nn << 3);
#else
        int remain = w;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
        for (; nn > 0; nn--)
        {
            // widen Y to 16-bit with a <<6 pre-scale to match the x64 coefficients
            int16x8_t _yy0 = vreinterpretq_s16_u16(vshll_n_u8(vld1_u8(yptr0), 6));
            int16x8_t _yy1 = vreinterpretq_s16_u16(vshll_n_u8(vld1_u8(yptr1), 6));

            // center chroma around zero; vtrn with itself duplicates each U/V
            // across the two Y lanes that share it (NV12: U first, then V)
            int8x8_t _uuvv = vreinterpret_s8_u8(vsub_u8(vld1_u8(uvptr), _v128));
            int8x8x2_t _uuuuvvvv = vtrn_s8(_uuvv, _uuvv);
            int8x8_t _uu = _uuuuvvvv.val[0];
            int8x8_t _vv = _uuuuvvvv.val[1];

            int16x8_t _r0 = vmlal_s8(_yy0, _vv, _v90);
            int16x8_t _g0 = vmlsl_s8(_yy0, _vv, _v46);
            _g0 = vmlsl_s8(_g0, _uu, _v22);
            int16x8_t _b0 = vmlal_s8(_yy0, _uu, _v113);

            int16x8_t _r1 = vmlal_s8(_yy1, _vv, _v90);
            int16x8_t _g1 = vmlsl_s8(_yy1, _vv, _v46);
            _g1 = vmlsl_s8(_g1, _uu, _v22);
            int16x8_t _b1 = vmlal_s8(_yy1, _uu, _v113);

            // >>6 with unsigned saturation undoes the pre-scale and clamps to [0,255]
            uint8x8x3_t _rgb0;
            _rgb0.val[0] = vqshrun_n_s16(_r0, 6);
            _rgb0.val[1] = vqshrun_n_s16(_g0, 6);
            _rgb0.val[2] = vqshrun_n_s16(_b0, 6);

            uint8x8x3_t _rgb1;
            _rgb1.val[0] = vqshrun_n_s16(_r1, 6);
            _rgb1.val[1] = vqshrun_n_s16(_g1, 6);
            _rgb1.val[2] = vqshrun_n_s16(_b1, 6);

            vst3_u8(rgb0, _rgb0);
            vst3_u8(rgb1, _rgb1);

            yptr0 += 8;
            yptr1 += 8;
            uvptr += 8;
            rgb0 += 24;
            rgb1 += 24;
        }
#else
        if (nn > 0)
        {
            // armv7 version; mirrors the NV21 kernel but with d2/d3 (U/V)
            // swapped in the multiply-accumulates. The UV chunk for the next
            // iteration is preloaded inside the loop body, so the trailing
            // "sub %3, #8" rewinds the one extra 8-byte advance.
            asm volatile(
                "pld        [%3, #128]          \n"
                "vld1.u8    {d2}, [%3]!         \n"
                "vsub.s8    d2, d2, %12         \n"
                "0:                             \n"
                "pld        [%1, #128]          \n"
                "vld1.u8    {d0}, [%1]!         \n"
                "pld        [%2, #128]          \n"
                "vld1.u8    {d1}, [%2]!         \n"
                "vshll.u8   q2, d0, #6          \n"
                "vorr       d3, d2, d2          \n"
                "vshll.u8   q3, d1, #6          \n"
                "vorr       q9, q2, q2          \n"
                "vtrn.s8    d2, d3              \n"
                "vorr       q11, q3, q3         \n"
                "vmlsl.s8   q9, d3, %14         \n"
                "vorr       q8, q2, q2          \n"
                "vmlsl.s8   q11, d3, %14        \n"
                "vorr       q10, q3, q3         \n"
                "vmlal.s8   q8, d3, %13         \n"
                "vmlal.s8   q2, d2, %16         \n"
                "vmlal.s8   q10, d3, %13        \n"
                "vmlsl.s8   q9, d2, %15         \n"
                "vmlal.s8   q3, d2, %16         \n"
                "vmlsl.s8   q11, d2, %15        \n"
                "vqshrun.s16 d24, q8, #6        \n"
                "vqshrun.s16 d26, q2, #6        \n"
                "vqshrun.s16 d4, q10, #6        \n"
                "vqshrun.s16 d25, q9, #6        \n"
                "vqshrun.s16 d6, q3, #6         \n"
                "vqshrun.s16 d5, q11, #6        \n"
                "pld        [%3, #128]          \n"
                "vld1.u8    {d2}, [%3]!         \n"
                "subs       %0, #1              \n"
                "vst3.u8    {d24-d26}, [%4]!    \n"
                "vsub.s8    d2, d2, %12         \n"
                "vst3.u8    {d4-d6}, [%5]!      \n"
                "bne        0b                  \n"
                "sub        %3, #8              \n"
                : "=r"(nn),    // %0
                "=r"(yptr0), // %1
                "=r"(yptr1), // %2
                "=r"(uvptr), // %3
                "=r"(rgb0),  // %4
                "=r"(rgb1)   // %5
                : "0"(nn),
                "1"(yptr0),
                "2"(yptr1),
                "3"(uvptr),
                "4"(rgb0),
                "5"(rgb1),
                "w"(_v128), // %12
                "w"(_v90),  // %13
                "w"(_v46),  // %14
                "w"(_v22),  // %15
                "w"(_v113)  // %16
                : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "d26");
        }
#endif // __aarch64__
#endif // __ARM_NEON

#define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
        // scalar tail: two pixels per iteration (they share one U/V pair)
        for (; remain > 0; remain -= 2)
        {
            // R = 1.164 * yy + 1.596 * vv
            // G = 1.164 * yy - 0.813 * vv - 0.391 * uu
            // B = 1.164 * yy              + 2.018 * uu

            // R = Y + (1.370705 * (V-128))
            // G = Y - (0.698001 * (V-128)) - (0.337633 * (U-128))
            // B = Y + (1.732446 * (U-128))

            // R = ((Y << 6) + 87.72512 * (V-128)) >> 6
            // G = ((Y << 6) - 44.672064 * (V-128) - 21.608512 * (U-128)) >> 6
            // B = ((Y << 6) + 110.876544 * (U-128)) >> 6

            // R = ((Y << 6) + 90 * (V-128)) >> 6
            // G = ((Y << 6) - 46 * (V-128) - 22 * (U-128)) >> 6
            // B = ((Y << 6) + 113 * (U-128)) >> 6

            // R = (yy + 90 * vv) >> 6
            // G = (yy - 46 * vv - 22 * uu) >> 6
            // B = (yy + 113 * uu) >> 6

            int u = uvptr[0] - 128;
            int v = uvptr[1] - 128;

            int ruv = 90 * v;
            int guv = -46 * v + -22 * u;
            int buv = 113 * u;

            int y00 = yptr0[0] << 6;
            rgb0[0] = SATURATE_CAST_UCHAR((y00 + ruv) >> 6);
            rgb0[1] = SATURATE_CAST_UCHAR((y00 + guv) >> 6);
            rgb0[2] = SATURATE_CAST_UCHAR((y00 + buv) >> 6);

            int y01 = yptr0[1] << 6;
            rgb0[3] = SATURATE_CAST_UCHAR((y01 + ruv) >> 6);
            rgb0[4] = SATURATE_CAST_UCHAR((y01 + guv) >> 6);
            rgb0[5] = SATURATE_CAST_UCHAR((y01 + buv) >> 6);

            int y10 = yptr1[0] << 6;
            rgb1[0] = SATURATE_CAST_UCHAR((y10 + ruv) >> 6);
            rgb1[1] = SATURATE_CAST_UCHAR((y10 + guv) >> 6);
            rgb1[2] = SATURATE_CAST_UCHAR((y10 + buv) >> 6);

            int y11 = yptr1[1] << 6;
            rgb1[3] = SATURATE_CAST_UCHAR((y11 + ruv) >> 6);
            rgb1[4] = SATURATE_CAST_UCHAR((y11 + guv) >> 6);
            rgb1[5] = SATURATE_CAST_UCHAR((y11 + buv) >> 6);

            yptr0 += 2;
            yptr1 += 2;
            uvptr += 2;
            rgb0 += 6;
            rgb1 += 6;
        }
#undef SATURATE_CAST_UCHAR

        yptr += 2 * w;
        rgb += 2 * 3 * w;
    }
}
2350 
// Convert NV21 yuv420sp (Y plane followed by interleaved V/U) to packed RGB
// at HALF resolution: each output pixel is built from the average of a 2x2
// block of Y samples plus that block's shared V/U pair.
// Fixed-point math (coefficients scaled by 64, same as yuv420sp2rgb):
//   R = (y + 90*v) >> 6,  G = (y - 46*v - 22*u) >> 6,  B = (y + 113*u) >> 6
// where y = mean(2x2 Y block) * 64 and v/u are centered around zero.
// NOTE(review): w and h are assumed even; no odd-size handling is performed.
void yuv420sp2rgb_half(const unsigned char* yuv, int w, int h, unsigned char* rgb)
{
    const unsigned char* puv = yuv + w * h; // interleaved V/U plane
    const unsigned char *py0 = yuv, *py1 = yuv + w;
    const int hstep = h / 2;
#if __ARM_NEON
    // 16 input Y columns (8 output pixels) per vector iteration
    const int wstep = w / 16, tailstep = (w - wstep * 16) / 2;
    uint8x8_t _u128 = vdup_n_u8(128);
    int8x8_t _s90 = vdup_n_s8(90);
    int8x8_t _sn46 = vdup_n_s8(-46);
    int8x8_t _s113 = vdup_n_s8(113);
    int8x8_t _sn22 = vdup_n_s8(-22);
    int16x8_t _s0 = vdupq_n_s16(0);
    int16x8_t _s16320 = vdupq_n_s16(16320); // 255 << 6
#else
    const int tailstep = w / 2;
#endif

    for (int i = 0; i < hstep; ++i)
    {
#if __ARM_NEON
        for (int j = 0; j < wstep; ++j)
        {
            uint8x16_t y0 = vld1q_u8(py0);
            uint8x16_t y1 = vld1q_u8(py1);

            // first 8 Y
            uint16x8_t low = vaddl_u8(vget_low_u8(y0), vget_low_u8(y1));
            uint16x4_t low_sum = vpadd_u16(vget_low_u16(low), vget_high_u16(low));

            // last 8 Y
            uint16x8_t high = vaddl_u8(vget_high_u8(y0), vget_high_u8(y1));
            uint16x4_t high_sum = vpadd_u16(vget_low_u16(high), vget_high_u16(high));

            uint16x8_t y8_sum = vcombine_u16(low_sum, high_sum);
            // y8 = (y8_sum >> 2) << 6 = y8_sum << 4;
            int16x8_t y8 = vreinterpretq_s16_u16(vshlq_n_u16(y8_sum, 4));

            // prepare uv
            uint8x8x2_t vu = vld2_u8(puv);
            int8x8_t v = vreinterpret_s8_u8(vsub_u8(vu.val[0], _u128));
            int8x8_t u = vreinterpret_s8_u8(vsub_u8(vu.val[1], _u128));

            int16x8_t r_acc = vmlal_s8(y8, v, _s90);
            int16x8_t g_acc = vmlal_s8(y8, v, _sn46);
            g_acc = vmlal_s8(g_acc, u, _sn22);
            int16x8_t b_acc = vmlal_s8(y8, u, _s113);

#define SHIFT_6_SATURATE(FROM, TO)                     \
    FROM = vmaxq_s16(vminq_s16((FROM), _s16320), _s0); \
    uint8x8_t TO = vshrn_n_u16(vreinterpretq_u16_s16((FROM)), 6);

            SHIFT_6_SATURATE(b_acc, b_out)
            SHIFT_6_SATURATE(g_acc, g_out)
            SHIFT_6_SATURATE(r_acc, r_out)
#undef SHIFT_6_SATURATE

            uint8x8x3_t _rgb;
            _rgb.val[0] = r_out;
            _rgb.val[1] = g_out;
            _rgb.val[2] = b_out;
            vst3_u8(rgb, _rgb);

            rgb += 24;
            py0 += 16;
            py1 += 16;
            puv += 16;
        }
#endif

        for (int idx = 0; idx < tailstep; ++idx)
        {
            // Average of the 2x2 Y block, pre-scaled by 64:
            // (sum of 4) >> 2 then << 6 collapses to << 4.
            // (was py1[2] — off-by-two that read the wrong column of the
            // lower row; the block is py0[0..1] x py1[0..1], matching the
            // pairwise-add in the NEON path above)
            int y = (static_cast<int>(py0[0]) + py0[1] + py1[0] + py1[1]) << 4;
            int v = static_cast<int>(puv[0]) - 128;
            int u = static_cast<int>(puv[1]) - 128;

            int ruv = 90 * v;
            int guv = -46 * v + -22 * u;
            int buv = 113 * u;

#define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
            rgb[0] = SATURATE_CAST_UCHAR((y + ruv) >> 6);
            rgb[1] = SATURATE_CAST_UCHAR((y + guv) >> 6);
            rgb[2] = SATURATE_CAST_UCHAR((y + buv) >> 6);
#undef SATURATE_CAST_UCHAR

            rgb += 3;
            py0 += 2;
            py1 += 2;
            puv += 2;
        }
        // next two row
        py0 = py1;
        py1 = py0 + w;
    }
}
2447 
from_pixels(const unsigned char * pixels,int type,int w,int h,Allocator * allocator)2448 Mat Mat::from_pixels(const unsigned char* pixels, int type, int w, int h, Allocator* allocator)
2449 {
2450     int type_from = type & PIXEL_FORMAT_MASK;
2451 
2452     if (type_from == PIXEL_RGB || type_from == PIXEL_BGR)
2453     {
2454         return Mat::from_pixels(pixels, type, w, h, w * 3, allocator);
2455     }
2456     else if (type_from == PIXEL_GRAY)
2457     {
2458         return Mat::from_pixels(pixels, type, w, h, w * 1, allocator);
2459     }
2460     else if (type_from == PIXEL_RGBA || type_from == PIXEL_BGRA)
2461     {
2462         return Mat::from_pixels(pixels, type, w, h, w * 4, allocator);
2463     }
2464 
2465     // unknown convert type
2466     NCNN_LOGE("unknown convert type %d", type);
2467     return Mat();
2468 }
2469 
from_pixels(const unsigned char * pixels,int type,int w,int h,int stride,Allocator * allocator)2470 Mat Mat::from_pixels(const unsigned char* pixels, int type, int w, int h, int stride, Allocator* allocator)
2471 {
2472     Mat m;
2473 
2474     if (type & PIXEL_CONVERT_MASK)
2475     {
2476         switch (type)
2477         {
2478         case PIXEL_RGB2BGR:
2479         case PIXEL_BGR2RGB:
2480             from_rgb2bgr(pixels, w, h, stride, m, allocator);
2481             break;
2482         case PIXEL_RGB2GRAY:
2483             from_rgb2gray(pixels, w, h, stride, m, allocator);
2484             break;
2485         case PIXEL_RGB2RGBA:
2486         case PIXEL_BGR2BGRA:
2487             from_rgb2rgba(pixels, w, h, stride, m, allocator);
2488             break;
2489         case PIXEL_BGR2GRAY:
2490             from_bgr2gray(pixels, w, h, stride, m, allocator);
2491             break;
2492         case PIXEL_BGR2RGBA:
2493         case PIXEL_RGB2BGRA:
2494             from_bgr2rgba(pixels, w, h, stride, m, allocator);
2495             break;
2496         case PIXEL_GRAY2RGB:
2497         case PIXEL_GRAY2BGR:
2498             from_gray2rgb(pixels, w, h, stride, m, allocator);
2499             break;
2500         case PIXEL_GRAY2RGBA:
2501         case PIXEL_GRAY2BGRA:
2502             from_gray2rgba(pixels, w, h, stride, m, allocator);
2503             break;
2504         case PIXEL_RGBA2RGB:
2505         case PIXEL_BGRA2BGR:
2506             from_rgba2rgb(pixels, w, h, stride, m, allocator);
2507             break;
2508         case PIXEL_RGBA2BGR:
2509         case PIXEL_BGRA2RGB:
2510             from_rgba2bgr(pixels, w, h, stride, m, allocator);
2511             break;
2512         case PIXEL_RGBA2GRAY:
2513             from_rgba2gray(pixels, w, h, stride, m, allocator);
2514             break;
2515         case PIXEL_RGBA2BGRA:
2516         case PIXEL_BGRA2RGBA:
2517             from_rgba2bgra(pixels, w, h, stride, m, allocator);
2518             break;
2519         case PIXEL_BGRA2GRAY:
2520             from_bgra2gray(pixels, w, h, stride, m, allocator);
2521             break;
2522         default:
2523             // unimplemented convert type
2524             NCNN_LOGE("unimplemented convert type %d", type);
2525             break;
2526         }
2527     }
2528     else
2529     {
2530         if (type == PIXEL_RGB || type == PIXEL_BGR)
2531             from_rgb(pixels, w, h, stride, m, allocator);
2532 
2533         if (type == PIXEL_GRAY)
2534             from_gray(pixels, w, h, stride, m, allocator);
2535 
2536         if (type == PIXEL_RGBA || type == PIXEL_BGRA)
2537             from_rgba(pixels, w, h, stride, m, allocator);
2538     }
2539 
2540     return m;
2541 }
2542 
from_pixels_resize(const unsigned char * pixels,int type,int w,int h,int target_width,int target_height,Allocator * allocator)2543 Mat Mat::from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int target_width, int target_height, Allocator* allocator)
2544 {
2545     int type_from = type & PIXEL_FORMAT_MASK;
2546 
2547     if (type_from == PIXEL_RGB || type_from == PIXEL_BGR)
2548     {
2549         return Mat::from_pixels_resize(pixels, type, w, h, w * 3, target_width, target_height, allocator);
2550     }
2551     else if (type_from == PIXEL_GRAY)
2552     {
2553         return Mat::from_pixels_resize(pixels, type, w, h, w * 1, target_width, target_height, allocator);
2554     }
2555     else if (type_from == PIXEL_RGBA || type_from == PIXEL_BGRA)
2556     {
2557         return Mat::from_pixels_resize(pixels, type, w, h, w * 4, target_width, target_height, allocator);
2558     }
2559 
2560     // unknown convert type
2561     NCNN_LOGE("unknown convert type %d", type);
2562     return Mat();
2563 }
2564 
from_pixels_resize(const unsigned char * pixels,int type,int w,int h,int stride,int target_width,int target_height,Allocator * allocator)2565 Mat Mat::from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int stride, int target_width, int target_height, Allocator* allocator)
2566 {
2567     if (w == target_width && h == target_height)
2568         return Mat::from_pixels(pixels, type, w, h, stride, allocator);
2569 
2570     int type_from = type & PIXEL_FORMAT_MASK;
2571 
2572     if (type_from == PIXEL_RGB || type_from == PIXEL_BGR)
2573     {
2574         Mat dst(target_width, target_height, (size_t)3u, 3);
2575         resize_bilinear_c3(pixels, w, h, stride, dst, target_width, target_height, target_width * 3);
2576 
2577         return Mat::from_pixels(dst, type, target_width, target_height, allocator);
2578     }
2579     else if (type_from == PIXEL_GRAY)
2580     {
2581         Mat dst(target_width, target_height, (size_t)1u, 1);
2582         resize_bilinear_c1(pixels, w, h, stride, dst, target_width, target_height, target_width * 1);
2583 
2584         return Mat::from_pixels(dst, type, target_width, target_height, allocator);
2585     }
2586     else if (type_from == PIXEL_RGBA || type_from == PIXEL_BGRA)
2587     {
2588         Mat dst(target_width, target_height, (size_t)4u, 4);
2589         resize_bilinear_c4(pixels, w, h, stride, dst, target_width, target_height, target_width * 4);
2590 
2591         return Mat::from_pixels(dst, type, target_width, target_height, allocator);
2592     }
2593 
2594     // unknown convert type
2595     NCNN_LOGE("unknown convert type %d", type);
2596     return Mat();
2597 }
2598 
from_pixels_roi(const unsigned char * pixels,int type,int w,int h,int roix,int roiy,int roiw,int roih,Allocator * allocator)2599 Mat Mat::from_pixels_roi(const unsigned char* pixels, int type, int w, int h, int roix, int roiy, int roiw, int roih, Allocator* allocator)
2600 {
2601     if (roix < 0 || roiy < 0 || roiw <= 0 || roih <= 0 || roix + roiw > w || roiy + roih > h)
2602     {
2603         NCNN_LOGE("roi %d %d %d %d out of image %d %d", roix, roiy, roiw, roih, w, h);
2604         return Mat();
2605     }
2606 
2607     int type_from = type & PIXEL_FORMAT_MASK;
2608 
2609     if (type_from == PIXEL_RGB || type_from == PIXEL_BGR)
2610     {
2611         return from_pixels(pixels + (roiy * w + roix) * 3, type, roiw, roih, w * 3, allocator);
2612     }
2613     else if (type_from == PIXEL_GRAY)
2614     {
2615         return from_pixels(pixels + (roiy * w + roix) * 1, type, roiw, roih, w * 1, allocator);
2616     }
2617     else if (type_from == PIXEL_RGBA || type_from == PIXEL_BGRA)
2618     {
2619         return from_pixels(pixels + (roiy * w + roix) * 4, type, roiw, roih, w * 4, allocator);
2620     }
2621 
2622     // unknown convert type
2623     NCNN_LOGE("unknown convert type %d", type);
2624     return Mat();
2625 }
2626 
from_pixels_roi(const unsigned char * pixels,int type,int w,int h,int stride,int roix,int roiy,int roiw,int roih,Allocator * allocator)2627 Mat Mat::from_pixels_roi(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, Allocator* allocator)
2628 {
2629     if (roix < 0 || roiy < 0 || roiw <= 0 || roih <= 0 || roix + roiw > w || roiy + roih > h)
2630     {
2631         NCNN_LOGE("roi %d %d %d %d out of image %d %d", roix, roiy, roiw, roih, w, h);
2632         return Mat();
2633     }
2634 
2635     int type_from = type & PIXEL_FORMAT_MASK;
2636 
2637     if (type_from == PIXEL_RGB || type_from == PIXEL_BGR)
2638     {
2639         return from_pixels(pixels + roiy * stride + roix * 3, type, roiw, roih, stride, allocator);
2640     }
2641     else if (type_from == PIXEL_GRAY)
2642     {
2643         return from_pixels(pixels + roiy * stride + roix * 1, type, roiw, roih, stride, allocator);
2644     }
2645     else if (type_from == PIXEL_RGBA || type_from == PIXEL_BGRA)
2646     {
2647         return from_pixels(pixels + roiy * stride + roix * 4, type, roiw, roih, stride, allocator);
2648     }
2649 
2650     // unknown convert type
2651     NCNN_LOGE("unknown convert type %d", type);
2652     return Mat();
2653 }
2654 
from_pixels_roi_resize(const unsigned char * pixels,int type,int w,int h,int roix,int roiy,int roiw,int roih,int target_width,int target_height,Allocator * allocator)2655 Mat Mat::from_pixels_roi_resize(const unsigned char* pixels, int type, int w, int h, int roix, int roiy, int roiw, int roih, int target_width, int target_height, Allocator* allocator)
2656 {
2657     if (roix < 0 || roiy < 0 || roiw <= 0 || roih <= 0 || roix + roiw > w || roiy + roih > h)
2658     {
2659         NCNN_LOGE("roi %d %d %d %d out of image %d %d", roix, roiy, roiw, roih, w, h);
2660         return Mat();
2661     }
2662 
2663     int type_from = type & PIXEL_FORMAT_MASK;
2664 
2665     if (type_from == PIXEL_RGB || type_from == PIXEL_BGR)
2666     {
2667         return from_pixels_resize(pixels + (roiy * w + roix) * 3, type, roiw, roih, w * 3, target_width, target_height, allocator);
2668     }
2669     else if (type_from == PIXEL_GRAY)
2670     {
2671         return from_pixels_resize(pixels + (roiy * w + roix) * 1, type, roiw, roih, w * 1, target_width, target_height, allocator);
2672     }
2673     else if (type_from == PIXEL_RGBA || type_from == PIXEL_BGRA)
2674     {
2675         return from_pixels_resize(pixels + (roiy * w + roix) * 4, type, roiw, roih, w * 4, target_width, target_height, allocator);
2676     }
2677 
2678     // unknown convert type
2679     NCNN_LOGE("unknown convert type %d", type);
2680     return Mat();
2681 }
2682 
from_pixels_roi_resize(const unsigned char * pixels,int type,int w,int h,int stride,int roix,int roiy,int roiw,int roih,int target_width,int target_height,Allocator * allocator)2683 Mat Mat::from_pixels_roi_resize(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, int target_width, int target_height, Allocator* allocator)
2684 {
2685     if (roix < 0 || roiy < 0 || roiw <= 0 || roih <= 0 || roix + roiw > w || roiy + roih > h)
2686     {
2687         NCNN_LOGE("roi %d %d %d %d out of image %d %d", roix, roiy, roiw, roih, w, h);
2688         return Mat();
2689     }
2690 
2691     int type_from = type & PIXEL_FORMAT_MASK;
2692 
2693     if (type_from == PIXEL_RGB || type_from == PIXEL_BGR)
2694     {
2695         return from_pixels_resize(pixels + roiy * stride + roix * 3, type, roiw, roih, stride, target_width, target_height, allocator);
2696     }
2697     else if (type_from == PIXEL_GRAY)
2698     {
2699         return from_pixels_resize(pixels + roiy * stride + roix * 1, type, roiw, roih, stride, target_width, target_height, allocator);
2700     }
2701     else if (type_from == PIXEL_RGBA || type_from == PIXEL_BGRA)
2702     {
2703         return from_pixels_resize(pixels + roiy * stride + roix * 4, type, roiw, roih, stride, target_width, target_height, allocator);
2704     }
2705 
2706     // unknown convert type
2707     NCNN_LOGE("unknown convert type %d", type);
2708     return Mat();
2709 }
2710 
to_pixels(unsigned char * pixels,int type) const2711 void Mat::to_pixels(unsigned char* pixels, int type) const
2712 {
2713     int type_to = (type & PIXEL_CONVERT_MASK) ? (type >> PIXEL_CONVERT_SHIFT) : (type & PIXEL_FORMAT_MASK);
2714 
2715     if (type_to == PIXEL_RGB || type_to == PIXEL_BGR)
2716     {
2717         to_pixels(pixels, type, w * 3);
2718     }
2719     else if (type_to == PIXEL_GRAY)
2720     {
2721         to_pixels(pixels, type, w * 1);
2722     }
2723     else if (type_to == PIXEL_RGBA || type_to == PIXEL_BGRA)
2724     {
2725         to_pixels(pixels, type, w * 4);
2726     }
2727 }
2728 
to_pixels(unsigned char * pixels,int type,int stride) const2729 void Mat::to_pixels(unsigned char* pixels, int type, int stride) const
2730 {
2731     if (type & PIXEL_CONVERT_MASK)
2732     {
2733         switch (type)
2734         {
2735         case PIXEL_RGB2BGR:
2736         case PIXEL_BGR2RGB:
2737             to_bgr2rgb(*this, pixels, stride);
2738             break;
2739         case PIXEL_RGB2RGBA:
2740         case PIXEL_BGR2BGRA:
2741             to_rgb2rgba(*this, pixels, stride);
2742             break;
2743         case PIXEL_BGR2RGBA:
2744         case PIXEL_RGB2BGRA:
2745             to_bgr2rgba(*this, pixels, stride);
2746             break;
2747         case PIXEL_GRAY2RGBA:
2748         case PIXEL_GRAY2BGRA:
2749             to_gray2rgba(*this, pixels, stride);
2750             break;
2751         case PIXEL_RGBA2BGRA:
2752         case PIXEL_BGRA2RGBA:
2753             to_rgba2bgra(*this, pixels, stride);
2754             break;
2755         default:
2756             // unimplemented convert type
2757             NCNN_LOGE("unimplemented convert type %d", type);
2758             break;
2759         }
2760     }
2761     else
2762     {
2763         if (type == PIXEL_RGB || type == PIXEL_BGR)
2764             to_rgb(*this, pixels, stride);
2765 
2766         if (type == PIXEL_GRAY)
2767             to_gray(*this, pixels, stride);
2768 
2769         if (type == PIXEL_RGBA || type == PIXEL_BGRA)
2770             to_rgba(*this, pixels, stride);
2771     }
2772 }
2773 
to_pixels_resize(unsigned char * pixels,int type,int target_width,int target_height) const2774 void Mat::to_pixels_resize(unsigned char* pixels, int type, int target_width, int target_height) const
2775 {
2776     int type_to = (type & PIXEL_CONVERT_MASK) ? (type >> PIXEL_CONVERT_SHIFT) : (type & PIXEL_FORMAT_MASK);
2777 
2778     if (type_to == PIXEL_RGB || type_to == PIXEL_BGR)
2779     {
2780         to_pixels_resize(pixels, type, target_width, target_height, target_width * 3);
2781     }
2782     else if (type_to == PIXEL_GRAY)
2783     {
2784         to_pixels_resize(pixels, type, target_width, target_height, target_width * 1);
2785     }
2786     else if (type_to == PIXEL_RGBA || type_to == PIXEL_BGRA)
2787     {
2788         to_pixels_resize(pixels, type, target_width, target_height, target_width * 4);
2789     }
2790 }
2791 
to_pixels_resize(unsigned char * pixels,int type,int target_width,int target_height,int target_stride) const2792 void Mat::to_pixels_resize(unsigned char* pixels, int type, int target_width, int target_height, int target_stride) const
2793 {
2794     if (w == target_width && h == target_height)
2795         return to_pixels(pixels, type);
2796 
2797     int type_to = (type & PIXEL_CONVERT_MASK) ? (type >> PIXEL_CONVERT_SHIFT) : (type & PIXEL_FORMAT_MASK);
2798 
2799     if (type_to == PIXEL_RGB || type_to == PIXEL_BGR)
2800     {
2801         Mat src(w, h, (size_t)3u, 3);
2802 
2803         to_pixels(src, type);
2804 
2805         resize_bilinear_c3(src, w, h, w * 3, pixels, target_width, target_height, target_stride);
2806     }
2807     else if (type_to == PIXEL_GRAY)
2808     {
2809         Mat src(w, h, (size_t)1u, 1);
2810 
2811         to_pixels(src, type);
2812 
2813         resize_bilinear_c1(src, w, h, w * 1, pixels, target_width, target_height, target_stride);
2814     }
2815     else if (type_to == PIXEL_RGBA || type_to == PIXEL_BGRA)
2816     {
2817         Mat src(w, h, (size_t)4u, 4);
2818 
2819         to_pixels(src, type);
2820 
2821         resize_bilinear_c4(src, w, h, w * 4, pixels, target_width, target_height, target_stride);
2822     }
2823 }
2824 #endif // NCNN_PIXEL
2825 
2826 } // namespace ncnn
2827