1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14
15 #include "mat.h"
16
17 #include <limits.h>
18 #include <math.h>
19 #if __ARM_NEON
20 #include <arm_neon.h>
21 #endif // __ARM_NEON
22 #include "platform.h"
23
24 namespace ncnn {
25
26 #if NCNN_PIXEL
// Convert a packed RGB888 image (h rows of w pixels, each row `stride` bytes)
// into a 3-channel planar float Mat: channel 0 = R, 1 = G, 2 = B, values 0..255.
// Returns 0 on success, -100 if the Mat allocation fails.
static int from_rgb(const unsigned char* rgb, int w, int h, int stride, Mat& m, Allocator* allocator)
{
    m.create(w, h, 3, 4u, allocator);
    if (m.empty())
        return -100;

    // padding bytes at the end of every source row
    const int wgap = stride - w * 3;
    if (wgap == 0)
    {
        // rows are contiguous - fold the whole image into one long row
        w = w * h;
        h = 1;
    }

    float* ptr0 = m.channel(0);
    float* ptr1 = m.channel(1);
    float* ptr2 = m.channel(2);

    for (int y = 0; y < h; y++)
    {
#if __ARM_NEON
        int nn = w >> 3;            // 8-pixel vector iterations
        int remain = w - (nn << 3); // tail pixels handled by the scalar loop
#else
        int remain = w;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
        for (; nn > 0; nn--)
        {
            // de-interleave 8 RGB pixels, then widen u8 -> u16 -> u32 -> f32
            uint8x8x3_t _rgb = vld3_u8(rgb);
            uint16x8_t _r16 = vmovl_u8(_rgb.val[0]);
            uint16x8_t _g16 = vmovl_u8(_rgb.val[1]);
            uint16x8_t _b16 = vmovl_u8(_rgb.val[2]);

            float32x4_t _rlow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_r16)));
            float32x4_t _rhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_r16)));
            float32x4_t _glow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_g16)));
            float32x4_t _ghigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_g16)));
            float32x4_t _blow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_b16)));
            float32x4_t _bhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_b16)));

            vst1q_f32(ptr0, _rlow);
            vst1q_f32(ptr0 + 4, _rhigh);
            vst1q_f32(ptr1, _glow);
            vst1q_f32(ptr1 + 4, _ghigh);
            vst1q_f32(ptr2, _blow);
            vst1q_f32(ptr2 + 4, _bhigh);

            rgb += 3 * 8;
            ptr0 += 8;
            ptr1 += 8;
            ptr2 += 8;
        }
#else
        if (nn > 0)
        {
            // armv7 inline assembly: same de-interleave / widen / u32->f32
            // convert as the aarch64 intrinsics path above; pointers and the
            // loop counter are updated in place via the output operands.
            asm volatile(
                "0:                             \n"
                "pld        [%1, #256]          \n"
                "vld3.u8    {d0-d2}, [%1]!      \n"
                "vmovl.u8   q8, d0              \n"
                "vmovl.u8   q9, d1              \n"
                "vmovl.u8   q10, d2             \n"
                "vmovl.u16  q0, d16             \n"
                "vmovl.u16  q1, d17             \n"
                "vmovl.u16  q2, d18             \n"
                "vmovl.u16  q3, d19             \n"
                "vmovl.u16  q8, d20             \n"
                "vmovl.u16  q9, d21             \n"
                "vcvt.f32.u32   q0, q0          \n"
                "vcvt.f32.u32   q1, q1          \n"
                "vcvt.f32.u32   q2, q2          \n"
                "vcvt.f32.u32   q3, q3          \n"
                "vcvt.f32.u32   q8, q8          \n"
                "subs       %0, #1              \n"
                "vst1.f32   {d0-d3}, [%2]!      \n"
                "vcvt.f32.u32   q9, q9          \n"
                "vst1.f32   {d4-d7}, [%3]!      \n"
                "vst1.f32   {d16-d19}, [%4]!    \n"
                "bne        0b                  \n"
                : "=r"(nn),   // %0
                "=r"(rgb),  // %1
                "=r"(ptr0), // %2
                "=r"(ptr1), // %3
                "=r"(ptr2)  // %4
                : "0"(nn),
                "1"(rgb),
                "2"(ptr0),
                "3"(ptr1),
                "4"(ptr2)
                : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10");
        }
#endif // __aarch64__
#endif // __ARM_NEON
        // scalar tail: copy the remaining pixels one at a time
        for (; remain > 0; remain--)
        {
            *ptr0 = rgb[0];
            *ptr1 = rgb[1];
            *ptr2 = rgb[2];

            rgb += 3;
            ptr0++;
            ptr1++;
            ptr2++;
        }

        rgb += wgap;
    }

    return 0;
}
139
to_rgb(const Mat & m,unsigned char * rgb,int stride)140 static void to_rgb(const Mat& m, unsigned char* rgb, int stride)
141 {
142 int w = m.w;
143 int h = m.h;
144
145 const int wgap = stride - w * 3;
146 if (wgap == 0)
147 {
148 w = w * h;
149 h = 1;
150 }
151
152 const float* ptr0 = m.channel(0);
153 const float* ptr1 = m.channel(1);
154 const float* ptr2 = m.channel(2);
155
156 for (int y = 0; y < h; y++)
157 {
158 #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
159
160 #if __ARM_NEON
161 int nn = w >> 3;
162 int remain = w - (nn << 3);
163 #else
164 int remain = w;
165 #endif // __ARM_NEON
166
167 #if __ARM_NEON
168 for (; nn > 0; nn--)
169 {
170 float32x4_t _rlow = vld1q_f32(ptr0);
171 float32x4_t _rhigh = vld1q_f32(ptr0 + 4);
172 float32x4_t _glow = vld1q_f32(ptr1);
173 float32x4_t _ghigh = vld1q_f32(ptr1 + 4);
174 float32x4_t _blow = vld1q_f32(ptr2);
175 float32x4_t _bhigh = vld1q_f32(ptr2 + 4);
176
177 int16x8_t _r16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_rlow)), vmovn_s32(vcvtq_s32_f32(_rhigh)));
178 int16x8_t _g16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_glow)), vmovn_s32(vcvtq_s32_f32(_ghigh)));
179 int16x8_t _b16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_blow)), vmovn_s32(vcvtq_s32_f32(_bhigh)));
180
181 uint8x8x3_t _rgb;
182 _rgb.val[0] = vqmovun_s16(_r16);
183 _rgb.val[1] = vqmovun_s16(_g16);
184 _rgb.val[2] = vqmovun_s16(_b16);
185
186 vst3_u8(rgb, _rgb);
187
188 rgb += 3 * 8;
189 ptr0 += 8;
190 ptr1 += 8;
191 ptr2 += 8;
192 }
193 #endif // __ARM_NEON
194 for (; remain > 0; remain--)
195 {
196 rgb[0] = SATURATE_CAST_UCHAR(*ptr0);
197 rgb[1] = SATURATE_CAST_UCHAR(*ptr1);
198 rgb[2] = SATURATE_CAST_UCHAR(*ptr2);
199
200 rgb += 3;
201 ptr0++;
202 ptr1++;
203 ptr2++;
204 }
205
206 #undef SATURATE_CAST_UCHAR
207 rgb += wgap;
208 }
209 }
210
// Convert an 8-bit single-channel grayscale image (h rows of w pixels, each
// row `stride` bytes) into a 1-channel float Mat, values 0..255.
// Returns 0 on success, -100 if the Mat allocation fails.
static int from_gray(const unsigned char* gray, int w, int h, int stride, Mat& m, Allocator* allocator)
{
    m.create(w, h, 1, 4u, allocator);
    if (m.empty())
        return -100;

    // padding bytes at the end of every source row
    const int wgap = stride - w;
    if (wgap == 0)
    {
        // rows are contiguous - fold the whole image into one long row
        w = w * h;
        h = 1;
    }

    float* ptr = m;

    for (int y = 0; y < h; y++)
    {
#if __ARM_NEON
        int nn = w >> 4;            // 16-pixel vector iterations
        int remain = w - (nn << 4); // tail pixels handled by the scalar loop
#else
        int remain = w;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
        for (; nn > 0; nn--)
        {
            // widen 16 gray bytes: u8 -> u16 -> u32 -> f32
            uint8x16_t _gray = vld1q_u8(gray);
            uint16x8_t _gray16_0 = vmovl_u8(vget_low_u8(_gray));
            uint16x8_t _gray16_1 = vmovl_u8(vget_high_u8(_gray));

            float32x4_t _graylow_0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_gray16_0)));
            float32x4_t _grayhigh_0 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_gray16_0)));
            float32x4_t _graylow_1 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_gray16_1)));
            float32x4_t _grayhigh_1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_gray16_1)));

            vst1q_f32(ptr, _graylow_0);
            vst1q_f32(ptr + 4, _grayhigh_0);
            vst1q_f32(ptr + 8, _graylow_1);
            vst1q_f32(ptr + 12, _grayhigh_1);

            gray += 16;
            ptr += 16;
        }
#else
        if (nn > 0)
        {
            // armv7 inline assembly: same widen / u32->f32 convert as the
            // aarch64 intrinsics path above.
            asm volatile(
                "0:                             \n"
                "pld        [%1, #128]          \n"
                "vld1.u8    {d0,d1}, [%1]!      \n"
                "vmovl.u8   q8, d0              \n"
                "vmovl.u8   q9, d1              \n"
                "vmovl.u16  q0, d16             \n"
                "vmovl.u16  q1, d17             \n"
                "vmovl.u16  q2, d18             \n"
                "vmovl.u16  q3, d19             \n"
                "vcvt.f32.u32   q0, q0          \n"
                "vcvt.f32.u32   q1, q1          \n"
                "vcvt.f32.u32   q2, q2          \n"
                "vcvt.f32.u32   q3, q3          \n"
                "subs       %0, #1              \n"
                "vst1.f32   {d0-d3}, [%2]!      \n"
                "vst1.f32   {d4-d7}, [%2]!      \n"
                "bne        0b                  \n"
                : "=r"(nn),  // %0
                "=r"(gray), // %1
                "=r"(ptr)   // %2
                : "0"(nn),
                "1"(gray),
                "2"(ptr)
                : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9");
        }
#endif // __aarch64__
#endif // __ARM_NEON
        // scalar tail
        for (; remain > 0; remain--)
        {
            *ptr = *gray;

            gray++;
            ptr++;
        }

        gray += wgap;
    }

    return 0;
}
300
to_gray(const Mat & m,unsigned char * gray,int stride)301 static void to_gray(const Mat& m, unsigned char* gray, int stride)
302 {
303 int w = m.w;
304 int h = m.h;
305
306 const int wgap = stride - w;
307 if (wgap == 0)
308 {
309 w = w * h;
310 h = 1;
311 }
312
313 const float* ptr = m;
314
315 for (int y = 0; y < h; y++)
316 {
317 #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
318
319 #if __ARM_NEON
320 int nn = w >> 3;
321 int remain = w - (nn << 3);
322 #else
323 int remain = w;
324 #endif // __ARM_NEON
325
326 #if __ARM_NEON
327 for (; nn > 0; nn--)
328 {
329 float32x4_t _glow = vld1q_f32(ptr);
330 float32x4_t _ghigh = vld1q_f32(ptr + 4);
331
332 int16x8_t _g16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_glow)), vmovn_s32(vcvtq_s32_f32(_ghigh)));
333
334 uint8x8_t _gray = vqmovun_s16(_g16);
335
336 vst1_u8(gray, _gray);
337
338 gray += 8;
339 ptr += 8;
340 }
341 #endif // __ARM_NEON
342 for (; remain > 0; remain--)
343 {
344 *gray = SATURATE_CAST_UCHAR(*ptr);
345
346 gray++;
347 ptr++;
348 }
349
350 #undef SATURATE_CAST_UCHAR
351 gray += wgap;
352 }
353 }
354
// Convert a packed RGBA8888 image (h rows of w pixels, each row `stride`
// bytes) into a 4-channel planar float Mat: channels 0..3 = R,G,B,A, 0..255.
// Returns 0 on success, -100 if the Mat allocation fails.
static int from_rgba(const unsigned char* rgba, int w, int h, int stride, Mat& m, Allocator* allocator)
{
    m.create(w, h, 4, 4u, allocator);
    if (m.empty())
        return -100;

    // padding bytes at the end of every source row
    const int wgap = stride - w * 4;
    if (wgap == 0)
    {
        // rows are contiguous - fold the whole image into one long row
        w = w * h;
        h = 1;
    }

    float* ptr0 = m.channel(0);
    float* ptr1 = m.channel(1);
    float* ptr2 = m.channel(2);
    float* ptr3 = m.channel(3);

    for (int y = 0; y < h; y++)
    {
#if __ARM_NEON
        int nn = w >> 3;            // 8-pixel vector iterations
        int remain = w - (nn << 3); // tail pixels handled by the scalar loop
#else
        int remain = w;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
        for (; nn > 0; nn--)
        {
            // de-interleave 8 RGBA pixels; values are <= 255 so widening
            // through the signed s16/s32 path is loss-free
            uint8x8x4_t _rgba = vld4_u8(rgba);
            int16x8_t _r16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[0]));
            int16x8_t _g16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[1]));
            int16x8_t _b16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[2]));
            int16x8_t _a16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[3]));

            float32x4_t _rlow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_r16)));
            float32x4_t _rhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_r16)));
            float32x4_t _glow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_g16)));
            float32x4_t _ghigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_g16)));
            float32x4_t _blow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_b16)));
            float32x4_t _bhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_b16)));
            float32x4_t _alow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_a16)));
            float32x4_t _ahigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_a16)));

            vst1q_f32(ptr0, _rlow);
            vst1q_f32(ptr0 + 4, _rhigh);
            vst1q_f32(ptr1, _glow);
            vst1q_f32(ptr1 + 4, _ghigh);
            vst1q_f32(ptr2, _blow);
            vst1q_f32(ptr2 + 4, _bhigh);
            vst1q_f32(ptr3, _alow);
            vst1q_f32(ptr3 + 4, _ahigh);

            rgba += 4 * 8;
            ptr0 += 8;
            ptr1 += 8;
            ptr2 += 8;
            ptr3 += 8;
        }
#else
        if (nn > 0)
        {
            // armv7 inline assembly: same de-interleave / widen / convert as
            // the aarch64 intrinsics path above.
            asm volatile(
                "0:                             \n"
                "pld        [%1, #256]          \n"
                "vld4.u8    {d0-d3}, [%1]!      \n"
                "vmovl.u8   q8, d0              \n"
                "vmovl.u8   q9, d1              \n"
                "vmovl.u8   q10, d2             \n"
                "vmovl.u8   q11, d3             \n"
                "vmovl.u16  q0, d16             \n"
                "vmovl.u16  q1, d17             \n"
                "vmovl.u16  q2, d18             \n"
                "vmovl.u16  q3, d19             \n"
                "vmovl.u16  q8, d20             \n"
                "vmovl.u16  q9, d21             \n"
                "vmovl.u16  q10, d22            \n"
                "vmovl.u16  q11, d23            \n"
                "vcvt.f32.u32   q0, q0          \n"
                "vcvt.f32.u32   q1, q1          \n"
                "vcvt.f32.u32   q2, q2          \n"
                "vcvt.f32.u32   q3, q3          \n"
                "vcvt.f32.u32   q8, q8          \n"
                "vcvt.f32.u32   q9, q9          \n"
                "subs       %0, #1              \n"
                "vst1.f32   {d0-d3}, [%2]!      \n"
                "vcvt.f32.u32   q10, q10        \n"
                "vcvt.f32.u32   q11, q11        \n"
                "vst1.f32   {d4-d7}, [%3]!      \n"
                "vst1.f32   {d16-d19}, [%4]!    \n"
                "vst1.f32   {d20-d23}, [%5]!    \n"
                "bne        0b                  \n"
                : "=r"(nn),   // %0
                "=r"(rgba), // %1
                "=r"(ptr0), // %2
                "=r"(ptr1), // %3
                "=r"(ptr2), // %4
                "=r"(ptr3)  // %5
                : "0"(nn),
                "1"(rgba),
                "2"(ptr0),
                "3"(ptr1),
                "4"(ptr2),
                "5"(ptr3)
                : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
        }
#endif // __aarch64__
#endif // __ARM_NEON
        // scalar tail
        for (; remain > 0; remain--)
        {
            *ptr0 = rgba[0];
            *ptr1 = rgba[1];
            *ptr2 = rgba[2];
            *ptr3 = rgba[3];

            rgba += 4;
            ptr0++;
            ptr1++;
            ptr2++;
            ptr3++;
        }

        rgba += wgap;
    }

    return 0;
}
484
to_rgba(const Mat & m,unsigned char * rgba,int stride)485 static void to_rgba(const Mat& m, unsigned char* rgba, int stride)
486 {
487 int w = m.w;
488 int h = m.h;
489
490 const int wgap = stride - w * 4;
491 if (wgap == 0)
492 {
493 w = w * h;
494 h = 1;
495 }
496
497 const float* ptr0 = m.channel(0);
498 const float* ptr1 = m.channel(1);
499 const float* ptr2 = m.channel(2);
500 const float* ptr3 = m.channel(3);
501
502 for (int y = 0; y < h; y++)
503 {
504 #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
505
506 #if __ARM_NEON
507 int nn = w >> 3;
508 int remain = w - (nn << 3);
509 #else
510 int remain = w;
511 #endif // __ARM_NEON
512
513 #if __ARM_NEON
514 for (; nn > 0; nn--)
515 {
516 float32x4_t _rlow = vld1q_f32(ptr0);
517 float32x4_t _rhigh = vld1q_f32(ptr0 + 4);
518 float32x4_t _glow = vld1q_f32(ptr1);
519 float32x4_t _ghigh = vld1q_f32(ptr1 + 4);
520 float32x4_t _blow = vld1q_f32(ptr2);
521 float32x4_t _bhigh = vld1q_f32(ptr2 + 4);
522 float32x4_t _alow = vld1q_f32(ptr3);
523 float32x4_t _ahigh = vld1q_f32(ptr3 + 4);
524
525 int16x8_t _r16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_rlow)), vmovn_s32(vcvtq_s32_f32(_rhigh)));
526 int16x8_t _g16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_glow)), vmovn_s32(vcvtq_s32_f32(_ghigh)));
527 int16x8_t _b16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_blow)), vmovn_s32(vcvtq_s32_f32(_bhigh)));
528 int16x8_t _a16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_alow)), vmovn_s32(vcvtq_s32_f32(_ahigh)));
529
530 uint8x8x4_t _rgba;
531 _rgba.val[0] = vqmovun_s16(_r16);
532 _rgba.val[1] = vqmovun_s16(_g16);
533 _rgba.val[2] = vqmovun_s16(_b16);
534 _rgba.val[3] = vqmovun_s16(_a16);
535
536 vst4_u8(rgba, _rgba);
537
538 rgba += 4 * 8;
539 ptr0 += 8;
540 ptr1 += 8;
541 ptr2 += 8;
542 ptr3 += 8;
543 }
544 #endif // __ARM_NEON
545 for (; remain > 0; remain--)
546 {
547 rgba[0] = SATURATE_CAST_UCHAR(*ptr0);
548 rgba[1] = SATURATE_CAST_UCHAR(*ptr1);
549 rgba[2] = SATURATE_CAST_UCHAR(*ptr2);
550 rgba[3] = SATURATE_CAST_UCHAR(*ptr3);
551
552 rgba += 4;
553 ptr0++;
554 ptr1++;
555 ptr2++;
556 ptr3++;
557 }
558
559 #undef SATURATE_CAST_UCHAR
560 rgba += wgap;
561 }
562 }
563
// Convert a packed RGB888 image into a 3-channel planar float Mat with the
// channel order swapped: channel 0 = B, 1 = G, 2 = R (i.e. planar BGR).
// Returns 0 on success, -100 if the Mat allocation fails.
static int from_rgb2bgr(const unsigned char* rgb, int w, int h, int stride, Mat& m, Allocator* allocator)
{
    m.create(w, h, 3, 4u, allocator);
    if (m.empty())
        return -100;

    // padding bytes at the end of every source row
    const int wgap = stride - w * 3;
    if (wgap == 0)
    {
        // rows are contiguous - fold the whole image into one long row
        w = w * h;
        h = 1;
    }

    float* ptr0 = m.channel(0);
    float* ptr1 = m.channel(1);
    float* ptr2 = m.channel(2);

    for (int y = 0; y < h; y++)
    {
#if __ARM_NEON
        int nn = w >> 3;            // 8-pixel vector iterations
        int remain = w - (nn << 3); // tail pixels handled by the scalar loop
#else
        int remain = w;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
        for (; nn > 0; nn--)
        {
            // de-interleave 8 RGB pixels, widen to f32, then store with R and
            // B destinations swapped (R -> ptr2, B -> ptr0)
            uint8x8x3_t _rgb = vld3_u8(rgb);
            uint16x8_t _r16 = vmovl_u8(_rgb.val[0]);
            uint16x8_t _g16 = vmovl_u8(_rgb.val[1]);
            uint16x8_t _b16 = vmovl_u8(_rgb.val[2]);

            float32x4_t _rlow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_r16)));
            float32x4_t _rhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_r16)));
            float32x4_t _glow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_g16)));
            float32x4_t _ghigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_g16)));
            float32x4_t _blow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_b16)));
            float32x4_t _bhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_b16)));

            vst1q_f32(ptr2, _rlow);
            vst1q_f32(ptr2 + 4, _rhigh);
            vst1q_f32(ptr1, _glow);
            vst1q_f32(ptr1 + 4, _ghigh);
            vst1q_f32(ptr0, _blow);
            vst1q_f32(ptr0 + 4, _bhigh);

            rgb += 3 * 8;
            ptr0 += 8;
            ptr1 += 8;
            ptr2 += 8;
        }
#else
        if (nn > 0)
        {
            // armv7 inline assembly: identical to from_rgb except the store
            // operands %2 and %4 are exchanged, which performs the R/B swap.
            asm volatile(
                "0:                             \n"
                "pld        [%1, #256]          \n"
                "vld3.u8    {d0-d2}, [%1]!      \n"
                "vmovl.u8   q8, d0              \n"
                "vmovl.u8   q9, d1              \n"
                "vmovl.u8   q10, d2             \n"
                "vmovl.u16  q0, d16             \n"
                "vmovl.u16  q1, d17             \n"
                "vmovl.u16  q2, d18             \n"
                "vmovl.u16  q3, d19             \n"
                "vmovl.u16  q8, d20             \n"
                "vmovl.u16  q9, d21             \n"
                "vcvt.f32.u32   q0, q0          \n"
                "vcvt.f32.u32   q1, q1          \n"
                "vcvt.f32.u32   q2, q2          \n"
                "vcvt.f32.u32   q3, q3          \n"
                "vcvt.f32.u32   q8, q8          \n"
                "subs       %0, #1              \n"
                "vst1.f32   {d0-d3}, [%4]!      \n"
                "vcvt.f32.u32   q9, q9          \n"
                "vst1.f32   {d4-d7}, [%3]!      \n"
                "vst1.f32   {d16-d19}, [%2]!    \n"
                "bne        0b                  \n"
                : "=r"(nn),   // %0
                "=r"(rgb),  // %1
                "=r"(ptr0), // %2
                "=r"(ptr1), // %3
                "=r"(ptr2)  // %4
                : "0"(nn),
                "1"(rgb),
                "2"(ptr0),
                "3"(ptr1),
                "4"(ptr2)
                : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10");
        }
#endif // __aarch64__
#endif // __ARM_NEON
        // scalar tail: swap R and B while copying
        for (; remain > 0; remain--)
        {
            *ptr0 = rgb[2];
            *ptr1 = rgb[1];
            *ptr2 = rgb[0];

            rgb += 3;
            ptr0++;
            ptr1++;
            ptr2++;
        }

        rgb += wgap;
    }

    return 0;
}
676
to_bgr2rgb(const Mat & m,unsigned char * rgb,int stride)677 static void to_bgr2rgb(const Mat& m, unsigned char* rgb, int stride)
678 {
679 int w = m.w;
680 int h = m.h;
681
682 const int wgap = stride - w * 3;
683 if (wgap == 0)
684 {
685 w = w * h;
686 h = 1;
687 }
688
689 const float* ptr0 = m.channel(0);
690 const float* ptr1 = m.channel(1);
691 const float* ptr2 = m.channel(2);
692
693 for (int y = 0; y < h; y++)
694 {
695 #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
696
697 #if __ARM_NEON
698 int nn = w >> 3;
699 int remain = w - (nn << 3);
700 #else
701 int remain = w;
702 #endif // __ARM_NEON
703
704 #if __ARM_NEON
705 for (; nn > 0; nn--)
706 {
707 float32x4_t _rlow = vld1q_f32(ptr2);
708 float32x4_t _rhigh = vld1q_f32(ptr2 + 4);
709 float32x4_t _glow = vld1q_f32(ptr1);
710 float32x4_t _ghigh = vld1q_f32(ptr1 + 4);
711 float32x4_t _blow = vld1q_f32(ptr0);
712 float32x4_t _bhigh = vld1q_f32(ptr0 + 4);
713
714 int16x8_t _r16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_rlow)), vmovn_s32(vcvtq_s32_f32(_rhigh)));
715 int16x8_t _g16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_glow)), vmovn_s32(vcvtq_s32_f32(_ghigh)));
716 int16x8_t _b16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_blow)), vmovn_s32(vcvtq_s32_f32(_bhigh)));
717
718 uint8x8x3_t _rgb;
719 _rgb.val[0] = vqmovun_s16(_r16);
720 _rgb.val[1] = vqmovun_s16(_g16);
721 _rgb.val[2] = vqmovun_s16(_b16);
722
723 vst3_u8(rgb, _rgb);
724
725 rgb += 3 * 8;
726 ptr0 += 8;
727 ptr1 += 8;
728 ptr2 += 8;
729 }
730 #endif // __ARM_NEON
731 for (; remain > 0; remain--)
732 {
733 rgb[2] = SATURATE_CAST_UCHAR(*ptr0);
734 rgb[1] = SATURATE_CAST_UCHAR(*ptr1);
735 rgb[0] = SATURATE_CAST_UCHAR(*ptr2);
736
737 rgb += 3;
738 ptr0++;
739 ptr1++;
740 ptr2++;
741 }
742
743 #undef SATURATE_CAST_UCHAR
744 rgb += wgap;
745 }
746 }
747
// Convert a packed RGB888 image into a 1-channel float grayscale Mat using
// fixed-point luma coefficients: Y = (77*R + 150*G + 29*B) >> 8, which
// approximates Y = 0.299 R + 0.587 G + 0.114 B.
// Returns 0 on success, -100 if the Mat allocation fails.
static int from_rgb2gray(const unsigned char* rgb, int w, int h, int stride, Mat& m, Allocator* allocator)
{
    // coeffs for r g b = 0.299f, 0.587f, 0.114f
    const unsigned char Y_shift = 8; //14
    const unsigned char R2Y = 77;
    const unsigned char G2Y = 150;
    const unsigned char B2Y = 29;

    m.create(w, h, 1, 4u, allocator);
    if (m.empty())
        return -100;

    // padding bytes at the end of every source row
    const int wgap = stride - w * 3;
    if (wgap == 0)
    {
        // rows are contiguous - fold the whole image into one long row
        w = w * h;
        h = 1;
    }

    float* ptr = m;

    for (int y = 0; y < h; y++)
    {
#if __ARM_NEON
        int nn = w >> 3;            // 8-pixel vector iterations
        int remain = w - (nn << 3); // tail pixels handled by the scalar loop
#else
        int remain = w;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
        uint8x8_t _R2Y = vdup_n_u8(R2Y);
        uint8x8_t _G2Y = vdup_n_u8(G2Y);
        uint8x8_t _B2Y = vdup_n_u8(B2Y);
        for (; nn > 0; nn--)
        {
            // weighted sum in u16 (max 255*(77+150+29) fits), then shift
            uint8x8x3_t _rgb = vld3_u8(rgb);

            uint16x8_t _y16 = vmull_u8(_rgb.val[0], _R2Y);
            _y16 = vmlal_u8(_y16, _rgb.val[1], _G2Y);
            _y16 = vmlal_u8(_y16, _rgb.val[2], _B2Y);
            _y16 = vshrq_n_u16(_y16, Y_shift);

            float32x4_t _ylow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_y16)));
            float32x4_t _yhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_y16)));

            vst1q_f32(ptr, _ylow);
            vst1q_f32(ptr + 4, _yhigh);

            rgb += 3 * 8;
            ptr += 8;
        }
#else
        if (nn > 0)
        {
            // armv7 inline assembly: same multiply-accumulate / shift /
            // convert as the aarch64 intrinsics path above.
            asm volatile(
                "vdup.u8    d16, %6             \n"
                "vdup.u8    d17, %7             \n"
                "vdup.u8    d18, %8             \n"
                "0:                             \n"
                "pld        [%1, #256]          \n"
                "vld3.u8    {d0-d2}, [%1]!      \n"
                "vmull.u8   q2, d0, d16         \n"
                "vmlal.u8   q2, d1, d17         \n"
                "vmlal.u8   q2, d2, d18         \n"
                "vshr.u16   q2, q2, #8          \n" // Y_shift
                "vmovl.u16  q0, d4              \n"
                "vmovl.u16  q1, d5              \n"
                "vcvt.f32.u32   q0, q0          \n"
                "vcvt.f32.u32   q1, q1          \n"
                "subs       %0, #1              \n"
                "vst1.f32   {d0-d3}, [%2]!      \n"
                "bne        0b                  \n"
                : "=r"(nn),  // %0
                "=r"(rgb), // %1
                "=r"(ptr)  // %2
                : "0"(nn),
                "1"(rgb),
                "2"(ptr),
                "r"(R2Y), // %6
                "r"(G2Y), // %7
                "r"(B2Y)  // %8
                : "cc", "memory", "q0", "q1", "q2", "q8", "q9");
        }
#endif // __aarch64__
#endif // __ARM_NEON
        // scalar tail: same fixed-point luma formula
        for (; remain > 0; remain--)
        {
            *ptr = static_cast<float>((rgb[0] * R2Y + rgb[1] * G2Y + rgb[2] * B2Y) >> Y_shift);

            rgb += 3;
            ptr++;
        }

        rgb += wgap;
    }

    return 0;
}
848
from_rgb2rgba(const unsigned char * rgb,int w,int h,int stride,Mat & m,Allocator * allocator)849 static int from_rgb2rgba(const unsigned char* rgb, int w, int h, int stride, Mat& m, Allocator* allocator)
850 {
851 m.create(w, h, 4, 4u, allocator);
852 if (m.empty())
853 return -100;
854
855 Mat rgb_channels = m.channel_range(0, 3);
856 from_rgb(rgb, w, h, stride, rgb_channels, allocator);
857
858 Mat alpha_channel = m.channel(3);
859 alpha_channel.fill(255.f);
860
861 return 0;
862 }
863
to_rgb2rgba(const Mat & m,unsigned char * rgba,int stride)864 static void to_rgb2rgba(const Mat& m, unsigned char* rgba, int stride)
865 {
866 int w = m.w;
867 int h = m.h;
868
869 const int wgap = stride - w * 4;
870 if (wgap == 0)
871 {
872 w = w * h;
873 h = 1;
874 }
875
876 const float* ptr0 = m.channel(0);
877 const float* ptr1 = m.channel(1);
878 const float* ptr2 = m.channel(2);
879
880 for (int y = 0; y < h; y++)
881 {
882 #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
883
884 #if __ARM_NEON
885 int nn = w >> 3;
886 int remain = w - (nn << 3);
887 #else
888 int remain = w;
889 #endif // __ARM_NEON
890
891 #if __ARM_NEON
892 uint8x8_t _a = vdup_n_u8(255);
893 for (; nn > 0; nn--)
894 {
895 float32x4_t _rlow = vld1q_f32(ptr0);
896 float32x4_t _rhigh = vld1q_f32(ptr0 + 4);
897 float32x4_t _glow = vld1q_f32(ptr1);
898 float32x4_t _ghigh = vld1q_f32(ptr1 + 4);
899 float32x4_t _blow = vld1q_f32(ptr2);
900 float32x4_t _bhigh = vld1q_f32(ptr2 + 4);
901
902 int16x8_t _r16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_rlow)), vmovn_s32(vcvtq_s32_f32(_rhigh)));
903 int16x8_t _g16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_glow)), vmovn_s32(vcvtq_s32_f32(_ghigh)));
904 int16x8_t _b16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_blow)), vmovn_s32(vcvtq_s32_f32(_bhigh)));
905
906 uint8x8x4_t _rgba;
907 _rgba.val[0] = vqmovun_s16(_r16);
908 _rgba.val[1] = vqmovun_s16(_g16);
909 _rgba.val[2] = vqmovun_s16(_b16);
910 _rgba.val[3] = _a;
911
912 vst4_u8(rgba, _rgba);
913
914 rgba += 4 * 8;
915 ptr0 += 8;
916 ptr1 += 8;
917 ptr2 += 8;
918 }
919 #endif // __ARM_NEON
920 for (; remain > 0; remain--)
921 {
922 rgba[0] = SATURATE_CAST_UCHAR(*ptr0);
923 rgba[1] = SATURATE_CAST_UCHAR(*ptr1);
924 rgba[2] = SATURATE_CAST_UCHAR(*ptr2);
925 rgba[3] = 255;
926
927 rgba += 4;
928 ptr0++;
929 ptr1++;
930 ptr2++;
931 }
932
933 #undef SATURATE_CAST_UCHAR
934 rgba += wgap;
935 }
936 }
937
// Convert a packed BGR888 image into a 1-channel float grayscale Mat using
// fixed-point luma coefficients: Y = (77*R + 150*G + 29*B) >> 8, which
// approximates Y = 0.299 R + 0.587 G + 0.114 B. Identical to from_rgb2gray
// except the R and B source bytes are read in swapped positions.
// Returns 0 on success, -100 if the Mat allocation fails.
static int from_bgr2gray(const unsigned char* bgr, int w, int h, int stride, Mat& m, Allocator* allocator)
{
    // coeffs for r g b = 0.299f, 0.587f, 0.114f
    const unsigned char Y_shift = 8; //14
    const unsigned char R2Y = 77;
    const unsigned char G2Y = 150;
    const unsigned char B2Y = 29;

    m.create(w, h, 1, 4u, allocator);
    if (m.empty())
        return -100;

    // padding bytes at the end of every source row
    const int wgap = stride - w * 3;
    if (wgap == 0)
    {
        // rows are contiguous - fold the whole image into one long row
        w = w * h;
        h = 1;
    }

    float* ptr = m;

    for (int y = 0; y < h; y++)
    {
#if __ARM_NEON
        int nn = w >> 3;            // 8-pixel vector iterations
        int remain = w - (nn << 3); // tail pixels handled by the scalar loop
#else
        int remain = w;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
        uint8x8_t _R2Y = vdup_n_u8(R2Y);
        uint8x8_t _G2Y = vdup_n_u8(G2Y);
        uint8x8_t _B2Y = vdup_n_u8(B2Y);
        for (; nn > 0; nn--)
        {
            // val[2] holds R and val[0] holds B because the input is BGR
            uint8x8x3_t _rgb = vld3_u8(bgr);

            uint16x8_t _y16 = vmull_u8(_rgb.val[2], _R2Y);
            _y16 = vmlal_u8(_y16, _rgb.val[1], _G2Y);
            _y16 = vmlal_u8(_y16, _rgb.val[0], _B2Y);
            _y16 = vshrq_n_u16(_y16, Y_shift);

            float32x4_t _ylow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_y16)));
            float32x4_t _yhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_y16)));

            vst1q_f32(ptr, _ylow);
            vst1q_f32(ptr + 4, _yhigh);

            bgr += 3 * 8;
            ptr += 8;
        }
#else
        if (nn > 0)
        {
            // armv7 inline assembly: identical to from_rgb2gray except d0/d2
            // (B and R lanes) are exchanged in the multiply-accumulate.
            asm volatile(
                "vdup.u8    d16, %6             \n"
                "vdup.u8    d17, %7             \n"
                "vdup.u8    d18, %8             \n"
                "0:                             \n"
                "pld        [%1, #256]          \n"
                "vld3.u8    {d0-d2}, [%1]!      \n"
                "vmull.u8   q2, d2, d16         \n"
                "vmlal.u8   q2, d1, d17         \n"
                "vmlal.u8   q2, d0, d18         \n"
                "vshr.u16   q2, q2, #8          \n" // Y_shift
                "vmovl.u16  q0, d4              \n"
                "vmovl.u16  q1, d5              \n"
                "vcvt.f32.u32   q0, q0          \n"
                "vcvt.f32.u32   q1, q1          \n"
                "subs       %0, #1              \n"
                "vst1.f32   {d0-d3}, [%2]!      \n"
                "bne        0b                  \n"
                : "=r"(nn),  // %0
                "=r"(bgr), // %1
                "=r"(ptr)  // %2
                : "0"(nn),
                "1"(bgr),
                "2"(ptr),
                "r"(R2Y), // %6
                "r"(G2Y), // %7
                "r"(B2Y)  // %8
                : "cc", "memory", "q0", "q1", "q2", "q8", "q9");
        }
#endif // __aarch64__
#endif // __ARM_NEON
        // scalar tail: bgr[2] is R, bgr[0] is B
        for (; remain > 0; remain--)
        {
            *ptr = static_cast<float>((bgr[2] * R2Y + bgr[1] * G2Y + bgr[0] * B2Y) >> Y_shift);

            bgr += 3;
            ptr++;
        }

        bgr += wgap;
    }

    return 0;
}
1038
from_bgr2rgba(const unsigned char * bgr,int w,int h,int stride,Mat & m,Allocator * allocator)1039 static int from_bgr2rgba(const unsigned char* bgr, int w, int h, int stride, Mat& m, Allocator* allocator)
1040 {
1041 m.create(w, h, 4, 4u, allocator);
1042 if (m.empty())
1043 return -100;
1044
1045 Mat rgb_channels = m.channel_range(0, 3);
1046 from_rgb2bgr(bgr, w, h, stride, rgb_channels, allocator);
1047
1048 Mat alpha_channel = m.channel(3);
1049 alpha_channel.fill(255.f);
1050
1051 return 0;
1052 }
1053
to_bgr2rgba(const Mat & m,unsigned char * rgba,int stride)1054 static void to_bgr2rgba(const Mat& m, unsigned char* rgba, int stride)
1055 {
1056 int w = m.w;
1057 int h = m.h;
1058
1059 const int wgap = stride - w * 4;
1060 if (wgap == 0)
1061 {
1062 w = w * h;
1063 h = 1;
1064 }
1065
1066 const float* ptr0 = m.channel(0);
1067 const float* ptr1 = m.channel(1);
1068 const float* ptr2 = m.channel(2);
1069
1070 for (int y = 0; y < h; y++)
1071 {
1072 #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
1073
1074 #if __ARM_NEON
1075 int nn = w >> 3;
1076 int remain = w - (nn << 3);
1077 #else
1078 int remain = w;
1079 #endif // __ARM_NEON
1080
1081 #if __ARM_NEON
1082 uint8x8_t _a = vdup_n_u8(255);
1083 for (; nn > 0; nn--)
1084 {
1085 float32x4_t _rlow = vld1q_f32(ptr2);
1086 float32x4_t _rhigh = vld1q_f32(ptr2 + 4);
1087 float32x4_t _glow = vld1q_f32(ptr1);
1088 float32x4_t _ghigh = vld1q_f32(ptr1 + 4);
1089 float32x4_t _blow = vld1q_f32(ptr0);
1090 float32x4_t _bhigh = vld1q_f32(ptr0 + 4);
1091
1092 int16x8_t _r16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_rlow)), vmovn_s32(vcvtq_s32_f32(_rhigh)));
1093 int16x8_t _g16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_glow)), vmovn_s32(vcvtq_s32_f32(_ghigh)));
1094 int16x8_t _b16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_blow)), vmovn_s32(vcvtq_s32_f32(_bhigh)));
1095
1096 uint8x8x4_t _rgba;
1097 _rgba.val[0] = vqmovun_s16(_r16);
1098 _rgba.val[1] = vqmovun_s16(_g16);
1099 _rgba.val[2] = vqmovun_s16(_b16);
1100 _rgba.val[3] = _a;
1101
1102 vst4_u8(rgba, _rgba);
1103
1104 rgba += 4 * 8;
1105 ptr0 += 8;
1106 ptr1 += 8;
1107 ptr2 += 8;
1108 }
1109 #endif // __ARM_NEON
1110 for (; remain > 0; remain--)
1111 {
1112 rgba[0] = SATURATE_CAST_UCHAR(*ptr2);
1113 rgba[1] = SATURATE_CAST_UCHAR(*ptr1);
1114 rgba[2] = SATURATE_CAST_UCHAR(*ptr0);
1115 rgba[3] = 255;
1116
1117 rgba += 4;
1118 ptr0++;
1119 ptr1++;
1120 ptr2++;
1121 }
1122
1123 #undef SATURATE_CAST_UCHAR
1124 rgba += wgap;
1125 }
1126 }
1127
// Convert an 8-bit grayscale image into a 3-channel planar float Mat by
// replicating the gray value into all three channels.
// Returns 0 on success, -100 if the Mat allocation fails.
static int from_gray2rgb(const unsigned char* gray, int w, int h, int stride, Mat& m, Allocator* allocator)
{
    m.create(w, h, 3, 4u, allocator);
    if (m.empty())
        return -100;

    // padding bytes at the end of every source row
    const int wgap = stride - w;
    if (wgap == 0)
    {
        // rows are contiguous - fold the whole image into one long row
        w = w * h;
        h = 1;
    }

    float* ptr0 = m.channel(0);
    float* ptr1 = m.channel(1);
    float* ptr2 = m.channel(2);

    for (int y = 0; y < h; y++)
    {
#if __ARM_NEON
        int nn = w >> 4;            // 16-pixel vector iterations
        int remain = w - (nn << 4); // tail pixels handled by the scalar loop
#else
        int remain = w;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
        for (; nn > 0; nn--)
        {
            // widen 16 gray bytes to f32 and store the same values to all
            // three destination channels
            uint8x16_t _gray = vld1q_u8(gray);
            uint16x8_t _gray16_0 = vmovl_u8(vget_low_u8(_gray));
            uint16x8_t _gray16_1 = vmovl_u8(vget_high_u8(_gray));

            float32x4_t _graylow_0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_gray16_0)));
            float32x4_t _grayhigh_0 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_gray16_0)));
            float32x4_t _graylow_1 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_gray16_1)));
            float32x4_t _grayhigh_1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_gray16_1)));

            vst1q_f32(ptr0, _graylow_0);
            vst1q_f32(ptr0 + 4, _grayhigh_0);
            vst1q_f32(ptr0 + 8, _graylow_1);
            vst1q_f32(ptr0 + 12, _grayhigh_1);

            vst1q_f32(ptr1, _graylow_0);
            vst1q_f32(ptr1 + 4, _grayhigh_0);
            vst1q_f32(ptr1 + 8, _graylow_1);
            vst1q_f32(ptr1 + 12, _grayhigh_1);

            vst1q_f32(ptr2, _graylow_0);
            vst1q_f32(ptr2 + 4, _grayhigh_0);
            vst1q_f32(ptr2 + 8, _graylow_1);
            vst1q_f32(ptr2 + 12, _grayhigh_1);

            gray += 16;
            ptr0 += 16;
            ptr1 += 16;
            ptr2 += 16;
        }
#else
        if (nn > 0)
        {
            // armv7 inline assembly: widen/convert once, then store the same
            // q registers to all three channel pointers.
            asm volatile(
                "0:                             \n"
                "pld        [%1, #128]          \n"
                "vld1.u8    {d0,d1}, [%1]!      \n"
                "vmovl.u8   q8, d0              \n"
                "vmovl.u8   q9, d1              \n"
                "vmovl.u16  q0, d16             \n"
                "vmovl.u16  q1, d17             \n"
                "vmovl.u16  q2, d18             \n"
                "vmovl.u16  q3, d19             \n"
                "vcvt.f32.u32   q0, q0          \n"
                "vcvt.f32.u32   q1, q1          \n"
                "vcvt.f32.u32   q2, q2          \n"
                "vcvt.f32.u32   q3, q3          \n"
                "subs       %0, #1              \n"
                "vst1.f32   {d0-d3}, [%2]!      \n"
                "vst1.f32   {d4-d7}, [%2]!      \n"
                "vst1.f32   {d0-d3}, [%3]!      \n"
                "vst1.f32   {d4-d7}, [%3]!      \n"
                "vst1.f32   {d0-d3}, [%4]!      \n"
                "vst1.f32   {d4-d7}, [%4]!      \n"
                "bne        0b                  \n"
                : "=r"(nn),   // %0
                "=r"(gray), // %1
                "=r"(ptr0), // %2
                "=r"(ptr1), // %3
                "=r"(ptr2)  // %4
                : "0"(nn),
                "1"(gray),
                "2"(ptr0),
                "3"(ptr1),
                "4"(ptr2)
                : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9");
        }
#endif // __aarch64__
#endif // __ARM_NEON
        // scalar tail: replicate each gray value into all three channels
        for (; remain > 0; remain--)
        {
            *ptr0 = *gray;
            *ptr1 = *gray;
            *ptr2 = *gray;

            gray++;
            ptr0++;
            ptr1++;
            ptr2++;
        }

        gray += wgap;
    }

    return 0;
}
1243
from_gray2rgba(const unsigned char * gray,int w,int h,int stride,Mat & m,Allocator * allocator)1244 static int from_gray2rgba(const unsigned char* gray, int w, int h, int stride, Mat& m, Allocator* allocator)
1245 {
1246 m.create(w, h, 4, 4u, allocator);
1247 if (m.empty())
1248 return -100;
1249
1250 Mat rgb_channels = m.channel_range(0, 3);
1251 from_gray2rgb(gray, w, h, stride, rgb_channels, allocator);
1252
1253 Mat alpha_channel = m.channel(3);
1254 alpha_channel.fill(255.f);
1255
1256 return 0;
1257 }
1258
// Convert a single-channel float gray Mat to packed 8-bit RGBA pixels.
// Each float is saturated to [0, 255] and replicated into R, G and B;
// alpha is a constant 255. stride is the destination row size in bytes;
// tightly packed rows (stride == w * 4) are processed as one long row.
static void to_gray2rgba(const Mat& m, unsigned char* rgba, int stride)
{
    int w = m.w;
    int h = m.h;

    // per-row padding in bytes; zero means the rows can be flattened
    const int wgap = stride - w * 4;
    if (wgap == 0)
    {
        w = w * h;
        h = 1;
    }

    const float* ptr = m;

    for (int y = 0; y < h; y++)
    {
#define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);

#if __ARM_NEON
        int nn = w >> 3; // 8 pixels per NEON iteration
        int remain = w - (nn << 3);
#else
        int remain = w;
#endif // __ARM_NEON

#if __ARM_NEON
        uint8x8_t _a = vdup_n_u8(255); // constant alpha lanes
        for (; nn > 0; nn--)
        {
            float32x4_t _glow = vld1q_f32(ptr);
            float32x4_t _ghigh = vld1q_f32(ptr + 4);

            // f32 -> s32 -> s16, then saturating narrow to u8
            int16x8_t _g16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_glow)), vmovn_s32(vcvtq_s32_f32(_ghigh)));

            uint8x8_t _gray = vqmovun_s16(_g16);

            uint8x8x4_t _rgba;
            _rgba.val[0] = _gray;
            _rgba.val[1] = _gray;
            _rgba.val[2] = _gray;
            _rgba.val[3] = _a;

            // interleaved store of 8 RGBA pixels
            vst4_u8(rgba, _rgba);

            rgba += 4 * 8;
            ptr += 8;
        }
#endif // __ARM_NEON
        for (; remain > 0; remain--)
        {
            unsigned char gray = SATURATE_CAST_UCHAR(*ptr);
            rgba[0] = gray;
            rgba[1] = gray;
            rgba[2] = gray;
            rgba[3] = 255;

            rgba += 4;
            ptr++;
        }

#undef SATURATE_CAST_UCHAR
        rgba += wgap; // skip the row padding
    }
}
1323
from_rgba2rgb(const unsigned char * rgba,int w,int h,int stride,Mat & m,Allocator * allocator)1324 static int from_rgba2rgb(const unsigned char* rgba, int w, int h, int stride, Mat& m, Allocator* allocator)
1325 {
1326 m.create(w, h, 3, 4u, allocator);
1327 if (m.empty())
1328 return -100;
1329
1330 const int wgap = stride - w * 4;
1331 if (wgap == 0)
1332 {
1333 w = w * h;
1334 h = 1;
1335 }
1336
1337 float* ptr0 = m.channel(0);
1338 float* ptr1 = m.channel(1);
1339 float* ptr2 = m.channel(2);
1340
1341 for (int y = 0; y < h; y++)
1342 {
1343 #if __ARM_NEON
1344 int nn = w >> 3;
1345 int remain = w - (nn << 3);
1346 #else
1347 int remain = w;
1348 #endif // __ARM_NEON
1349
1350 #if __ARM_NEON
1351 #if __aarch64__
1352 for (; nn > 0; nn--)
1353 {
1354 uint8x8x4_t _rgba = vld4_u8(rgba);
1355 int16x8_t _r16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[0]));
1356 int16x8_t _g16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[1]));
1357 int16x8_t _b16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[2]));
1358
1359 float32x4_t _rlow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_r16)));
1360 float32x4_t _rhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_r16)));
1361 float32x4_t _glow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_g16)));
1362 float32x4_t _ghigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_g16)));
1363 float32x4_t _blow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_b16)));
1364 float32x4_t _bhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_b16)));
1365
1366 vst1q_f32(ptr0, _rlow);
1367 vst1q_f32(ptr0 + 4, _rhigh);
1368 vst1q_f32(ptr1, _glow);
1369 vst1q_f32(ptr1 + 4, _ghigh);
1370 vst1q_f32(ptr2, _blow);
1371 vst1q_f32(ptr2 + 4, _bhigh);
1372
1373 rgba += 4 * 8;
1374 ptr0 += 8;
1375 ptr1 += 8;
1376 ptr2 += 8;
1377 }
1378 #else
1379 if (nn > 0)
1380 {
1381 asm volatile(
1382 "0: \n"
1383 "pld [%1, #256] \n"
1384 "vld4.u8 {d0-d3}, [%1]! \n"
1385 "vmovl.u8 q8, d0 \n"
1386 "vmovl.u8 q9, d1 \n"
1387 "vmovl.u8 q10, d2 \n"
1388 "vmovl.u16 q0, d16 \n"
1389 "vmovl.u16 q1, d17 \n"
1390 "vmovl.u16 q2, d18 \n"
1391 "vmovl.u16 q3, d19 \n"
1392 "vmovl.u16 q8, d20 \n"
1393 "vmovl.u16 q9, d21 \n"
1394 "vcvt.f32.u32 q0, q0 \n"
1395 "vcvt.f32.u32 q1, q1 \n"
1396 "vcvt.f32.u32 q2, q2 \n"
1397 "vcvt.f32.u32 q3, q3 \n"
1398 "vcvt.f32.u32 q8, q8 \n"
1399 "subs %0, #1 \n"
1400 "vst1.f32 {d0-d3}, [%2]! \n"
1401 "vcvt.f32.u32 q9, q9 \n"
1402 "vst1.f32 {d4-d7}, [%3]! \n"
1403 "vst1.f32 {d16-d19}, [%4]! \n"
1404 "bne 0b \n"
1405 : "=r"(nn), // %0
1406 "=r"(rgba), // %1
1407 "=r"(ptr0), // %2
1408 "=r"(ptr1), // %3
1409 "=r"(ptr2) // %4
1410 : "0"(nn),
1411 "1"(rgba),
1412 "2"(ptr0),
1413 "3"(ptr1),
1414 "4"(ptr2)
1415 : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9");
1416 }
1417 #endif // __aarch64__
1418 #endif // __ARM_NEON
1419 for (; remain > 0; remain--)
1420 {
1421 *ptr0 = rgba[0];
1422 *ptr1 = rgba[1];
1423 *ptr2 = rgba[2];
1424
1425 rgba += 4;
1426 ptr0++;
1427 ptr1++;
1428 ptr2++;
1429 }
1430
1431 rgba += wgap;
1432 }
1433
1434 return 0;
1435 }
1436
// Convert packed 8-bit RGBA pixels to a 3-channel planar float Mat in BGR
// order (channel 0 = B, 1 = G, 2 = R). The alpha byte is dropped. stride is
// the source row size in bytes; tightly packed rows (stride == w * 4) are
// processed as one long row. Returns 0 on success, -100 on allocation failure.
static int from_rgba2bgr(const unsigned char* rgba, int w, int h, int stride, Mat& m, Allocator* allocator)
{
    m.create(w, h, 3, 4u, allocator);
    if (m.empty())
        return -100;

    // per-row padding in bytes; zero means the rows can be flattened
    const int wgap = stride - w * 4;
    if (wgap == 0)
    {
        w = w * h;
        h = 1;
    }

    float* ptr0 = m.channel(0);
    float* ptr1 = m.channel(1);
    float* ptr2 = m.channel(2);

    for (int y = 0; y < h; y++)
    {
#if __ARM_NEON
        int nn = w >> 3; // 8 pixels per NEON iteration
        int remain = w - (nn << 3);
#else
        int remain = w;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
        for (; nn > 0; nn--)
        {
            // deinterleave 8 RGBA pixels, widen u8 -> u16 -> s32 -> f32
            uint8x8x4_t _rgba = vld4_u8(rgba);
            int16x8_t _r16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[0]));
            int16x8_t _g16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[1]));
            int16x8_t _b16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[2]));

            float32x4_t _rlow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_r16)));
            float32x4_t _rhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_r16)));
            float32x4_t _glow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_g16)));
            float32x4_t _ghigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_g16)));
            float32x4_t _blow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_b16)));
            float32x4_t _bhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_b16)));

            // store R into channel 2 and B into channel 0 (BGR layout)
            vst1q_f32(ptr2, _rlow);
            vst1q_f32(ptr2 + 4, _rhigh);
            vst1q_f32(ptr1, _glow);
            vst1q_f32(ptr1 + 4, _ghigh);
            vst1q_f32(ptr0, _blow);
            vst1q_f32(ptr0 + 4, _bhigh);

            rgba += 4 * 8;
            ptr0 += 8;
            ptr1 += 8;
            ptr2 += 8;
        }
#else
        if (nn > 0)
        {
            // same widening pipeline as from_rgba2rgb, but R (q0/q1) is
            // stored through %4 (ptr2) and B (q8/q9) through %2 (ptr0)
            asm volatile(
                "0:                             \n"
                "pld        [%1, #256]          \n"
                "vld4.u8    {d0-d3}, [%1]!      \n"
                "vmovl.u8   q8, d0              \n"
                "vmovl.u8   q9, d1              \n"
                "vmovl.u8   q10, d2             \n"
                "vmovl.u16  q0, d16             \n"
                "vmovl.u16  q1, d17             \n"
                "vmovl.u16  q2, d18             \n"
                "vmovl.u16  q3, d19             \n"
                "vmovl.u16  q8, d20             \n"
                "vmovl.u16  q9, d21             \n"
                "vcvt.f32.u32 q0, q0            \n"
                "vcvt.f32.u32 q1, q1            \n"
                "vcvt.f32.u32 q2, q2            \n"
                "vcvt.f32.u32 q3, q3            \n"
                "vcvt.f32.u32 q8, q8            \n"
                "subs       %0, #1              \n"
                "vst1.f32   {d0-d3}, [%4]!      \n"
                "vcvt.f32.u32 q9, q9            \n"
                "vst1.f32   {d4-d7}, [%3]!      \n"
                "vst1.f32   {d16-d19}, [%2]!    \n"
                "bne        0b                  \n"
                : "=r"(nn),   // %0
                "=r"(rgba), // %1
                "=r"(ptr0), // %2
                "=r"(ptr1), // %3
                "=r"(ptr2)  // %4
                : "0"(nn),
                "1"(rgba),
                "2"(ptr0),
                "3"(ptr1),
                "4"(ptr2)
                : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10");
        }
#endif // __aarch64__
#endif // __ARM_NEON
        for (; remain > 0; remain--)
        {
            // swap R and B while deinterleaving
            *ptr0 = rgba[2];
            *ptr1 = rgba[1];
            *ptr2 = rgba[0];

            rgba += 4;
            ptr0++;
            ptr1++;
            ptr2++;
        }

        rgba += wgap; // skip the row padding
    }

    return 0;
}
1549
// Convert packed 8-bit RGBA pixels to a single-channel float gray Mat using
// the BT.601 luma weights in Q8 fixed point: 77/256 ~= 0.299, 150/256 ~= 0.587,
// 29/256 ~= 0.114. The alpha byte is ignored. stride is the source row size
// in bytes. Returns 0 on success, -100 on allocation failure.
static int from_rgba2gray(const unsigned char* rgba, int w, int h, int stride, Mat& m, Allocator* allocator)
{
    // coeffs for r g b = 0.299f, 0.587f, 0.114f
    const unsigned char Y_shift = 8; //14
    const unsigned char R2Y = 77;
    const unsigned char G2Y = 150;
    const unsigned char B2Y = 29;

    m.create(w, h, 1, 4u, allocator);
    if (m.empty())
        return -100;

    // per-row padding in bytes; zero means the rows can be flattened
    const int wgap = stride - w * 4;
    if (wgap == 0)
    {
        w = w * h;
        h = 1;
    }

    float* ptr = m;

    for (int y = 0; y < h; y++)
    {
#if __ARM_NEON
        int nn = w >> 3; // 8 pixels per NEON iteration
        int remain = w - (nn << 3);
#else
        int remain = w;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
        uint8x8_t _R2Y = vdup_n_u8(R2Y);
        uint8x8_t _G2Y = vdup_n_u8(G2Y);
        uint8x8_t _B2Y = vdup_n_u8(B2Y);
        for (; nn > 0; nn--)
        {
            uint8x8x4_t _rgba = vld4_u8(rgba);

            // y = (77*R + 150*G + 29*B) >> 8, all in u16 lanes
            uint16x8_t _y16 = vmull_u8(_rgba.val[0], _R2Y);
            _y16 = vmlal_u8(_y16, _rgba.val[1], _G2Y);
            _y16 = vmlal_u8(_y16, _rgba.val[2], _B2Y);
            _y16 = vshrq_n_u16(_y16, Y_shift);

            float32x4_t _ylow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_y16)));
            float32x4_t _yhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_y16)));

            vst1q_f32(ptr, _ylow);
            vst1q_f32(ptr + 4, _yhigh);

            rgba += 4 * 8;
            ptr += 8;
        }
#else
        if (nn > 0)
        {
            // d16/d17/d18 hold the broadcast R2Y/G2Y/B2Y coefficients;
            // q2 accumulates the weighted sum before the >>8 and f32 convert
            asm volatile(
                "vdup.u8    d16, %6             \n"
                "vdup.u8    d17, %7             \n"
                "vdup.u8    d18, %8             \n"
                "0:                             \n"
                "pld        [%1, #256]          \n"
                "vld4.u8    {d0-d3}, [%1]!      \n"
                "vmull.u8   q2, d0, d16         \n"
                "vmlal.u8   q2, d1, d17         \n"
                "vmlal.u8   q2, d2, d18         \n"
                "vshr.u16   q2, q2, #8          \n" // Y_shift
                "vmovl.u16  q0, d4              \n"
                "vmovl.u16  q1, d5              \n"
                "vcvt.f32.u32 q0, q0            \n"
                "vcvt.f32.u32 q1, q1            \n"
                "subs       %0, #1              \n"
                "vst1.f32   {d0-d3}, [%2]!      \n"
                "bne        0b                  \n"
                : "=r"(nn),   // %0
                "=r"(rgba), // %1
                "=r"(ptr)   // %2
                : "0"(nn),
                "1"(rgba),
                "2"(ptr),
                "r"(R2Y), // %6
                "r"(G2Y), // %7
                "r"(B2Y)  // %8
                : "cc", "memory", "q0", "q1", "q2", "q8", "q9");
        }
#endif // __aarch64__
#endif // __ARM_NEON
        for (; remain > 0; remain--)
        {
            *ptr = static_cast<float>((rgba[0] * R2Y + rgba[1] * G2Y + rgba[2] * B2Y) >> Y_shift);

            rgba += 4;
            ptr++;
        }

        rgba += wgap; // skip the row padding
    }

    return 0;
}
1650
// Convert packed 8-bit RGBA pixels to a 4-channel planar float Mat in BGRA
// order (channel 0 = B, 1 = G, 2 = R, 3 = A). stride is the source row size
// in bytes; tightly packed rows (stride == w * 4) are processed as one long
// row. Returns 0 on success, -100 on allocation failure.
static int from_rgba2bgra(const unsigned char* rgba, int w, int h, int stride, Mat& m, Allocator* allocator)
{
    m.create(w, h, 4, 4u, allocator);
    if (m.empty())
        return -100;

    // per-row padding in bytes; zero means the rows can be flattened
    const int wgap = stride - w * 4;
    if (wgap == 0)
    {
        w = w * h;
        h = 1;
    }

    float* ptr0 = m.channel(0);
    float* ptr1 = m.channel(1);
    float* ptr2 = m.channel(2);
    float* ptr3 = m.channel(3);

    for (int y = 0; y < h; y++)
    {
#if __ARM_NEON
        int nn = w >> 3; // 8 pixels per NEON iteration
        int remain = w - (nn << 3);
#else
        int remain = w;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
        for (; nn > 0; nn--)
        {
            // deinterleave 8 RGBA pixels, widen u8 -> u16 -> s32 -> f32
            uint8x8x4_t _rgba = vld4_u8(rgba);
            int16x8_t _r16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[0]));
            int16x8_t _g16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[1]));
            int16x8_t _b16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[2]));
            int16x8_t _a16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[3]));

            float32x4_t _rlow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_r16)));
            float32x4_t _rhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_r16)));
            float32x4_t _glow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_g16)));
            float32x4_t _ghigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_g16)));
            float32x4_t _blow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_b16)));
            float32x4_t _bhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_b16)));
            float32x4_t _alow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_a16)));
            float32x4_t _ahigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_a16)));

            // store R into channel 2 and B into channel 0 (BGRA layout)
            vst1q_f32(ptr2, _rlow);
            vst1q_f32(ptr2 + 4, _rhigh);
            vst1q_f32(ptr1, _glow);
            vst1q_f32(ptr1 + 4, _ghigh);
            vst1q_f32(ptr0, _blow);
            vst1q_f32(ptr0 + 4, _bhigh);
            vst1q_f32(ptr3, _alow);
            vst1q_f32(ptr3 + 4, _ahigh);

            rgba += 4 * 8;
            ptr0 += 8;
            ptr1 += 8;
            ptr2 += 8;
            ptr3 += 8;
        }
#else
        if (nn > 0)
        {
            // R goes to %4 (ptr2), G to %3 (ptr1), B to %2 (ptr0),
            // A to %5 (ptr3)
            asm volatile(
                "0:                             \n"
                "pld        [%1, #256]          \n"
                "vld4.u8    {d0-d3}, [%1]!      \n"
                "vmovl.u8   q8, d0              \n"
                "vmovl.u8   q9, d1              \n"
                "vmovl.u8   q10, d2             \n"
                "vmovl.u8   q11, d3             \n"
                "vmovl.u16  q0, d16             \n"
                "vmovl.u16  q1, d17             \n"
                "vmovl.u16  q2, d18             \n"
                "vmovl.u16  q3, d19             \n"
                "vmovl.u16  q8, d20             \n"
                "vmovl.u16  q9, d21             \n"
                "vmovl.u16  q10, d22            \n"
                "vmovl.u16  q11, d23            \n"
                "vcvt.f32.u32 q0, q0            \n"
                "vcvt.f32.u32 q1, q1            \n"
                "vcvt.f32.u32 q2, q2            \n"
                "vcvt.f32.u32 q3, q3            \n"
                "vcvt.f32.u32 q8, q8            \n"
                "subs       %0, #1              \n"
                "vst1.f32   {d0-d3}, [%4]!      \n"
                "vcvt.f32.u32 q9, q9            \n"
                "vcvt.f32.u32 q10, q10          \n"
                "vst1.f32   {d4-d7}, [%3]!      \n"
                "vcvt.f32.u32 q11, q11          \n"
                "vst1.f32   {d16-d19}, [%2]!    \n"
                "vst1.f32   {d20-d23}, [%5]!    \n"
                "bne        0b                  \n"
                : "=r"(nn),   // %0
                "=r"(rgba), // %1
                "=r"(ptr0), // %2
                "=r"(ptr1), // %3
                "=r"(ptr2), // %4
                "=r"(ptr3)  // %5
                : "0"(nn),
                "1"(rgba),
                "2"(ptr0),
                "3"(ptr1),
                "4"(ptr2),
                "5"(ptr3)
                : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
        }
#endif // __aarch64__
#endif // __ARM_NEON
        for (; remain > 0; remain--)
        {
            // swap R and B while deinterleaving; alpha passes through
            *ptr0 = rgba[2];
            *ptr1 = rgba[1];
            *ptr2 = rgba[0];
            *ptr3 = rgba[3];

            rgba += 4;
            ptr0++;
            ptr1++;
            ptr2++;
            ptr3++;
        }

        rgba += wgap; // skip the row padding
    }

    return 0;
}
1780
// Convert a 4-channel planar float Mat (channels stored as R, G, B, A) to
// packed 8-bit BGRA pixels, saturating each value to [0, 255].
// stride is the destination row size in bytes; tightly packed rows
// (stride == w * 4) are processed as one long row.
static void to_rgba2bgra(const Mat& m, unsigned char* bgra, int stride)
{
    int w = m.w;
    int h = m.h;

    // per-row padding in bytes; zero means the rows can be flattened
    const int wgap = stride - w * 4;
    if (wgap == 0)
    {
        w = w * h;
        h = 1;
    }

    const float* ptr0 = m.channel(0);
    const float* ptr1 = m.channel(1);
    const float* ptr2 = m.channel(2);
    const float* ptr3 = m.channel(3);

    for (int y = 0; y < h; y++)
    {
#define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);

#if __ARM_NEON
        int nn = w >> 3; // 8 pixels per NEON iteration
        int remain = w - (nn << 3);
#else
        int remain = w;
#endif // __ARM_NEON

#if __ARM_NEON
        for (; nn > 0; nn--)
        {
            float32x4_t _rlow = vld1q_f32(ptr0);
            float32x4_t _rhigh = vld1q_f32(ptr0 + 4);
            float32x4_t _glow = vld1q_f32(ptr1);
            float32x4_t _ghigh = vld1q_f32(ptr1 + 4);
            float32x4_t _blow = vld1q_f32(ptr2);
            float32x4_t _bhigh = vld1q_f32(ptr2 + 4);
            float32x4_t _alow = vld1q_f32(ptr3);
            float32x4_t _ahigh = vld1q_f32(ptr3 + 4);

            // f32 -> s32 -> s16; vqmovun below saturates s16 -> u8
            int16x8_t _r16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_rlow)), vmovn_s32(vcvtq_s32_f32(_rhigh)));
            int16x8_t _g16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_glow)), vmovn_s32(vcvtq_s32_f32(_ghigh)));
            int16x8_t _b16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_blow)), vmovn_s32(vcvtq_s32_f32(_bhigh)));
            int16x8_t _a16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_alow)), vmovn_s32(vcvtq_s32_f32(_ahigh)));

            // interleave in B, G, R, A order for the packed store
            uint8x8x4_t _bgra;
            _bgra.val[0] = vqmovun_s16(_b16);
            _bgra.val[1] = vqmovun_s16(_g16);
            _bgra.val[2] = vqmovun_s16(_r16);
            _bgra.val[3] = vqmovun_s16(_a16);

            vst4_u8(bgra, _bgra);

            bgra += 4 * 8;
            ptr0 += 8;
            ptr1 += 8;
            ptr2 += 8;
            ptr3 += 8;
        }
#endif // __ARM_NEON
        for (; remain > 0; remain--)
        {
            // swap R and B while interleaving; alpha passes through
            bgra[0] = SATURATE_CAST_UCHAR(*ptr2);
            bgra[1] = SATURATE_CAST_UCHAR(*ptr1);
            bgra[2] = SATURATE_CAST_UCHAR(*ptr0);
            bgra[3] = SATURATE_CAST_UCHAR(*ptr3);

            bgra += 4;
            ptr0++;
            ptr1++;
            ptr2++;
            ptr3++;
        }

#undef SATURATE_CAST_UCHAR
        bgra += wgap; // skip the row padding
    }
}
1859
// Convert packed 8-bit BGRA pixels to a single-channel float gray Mat using
// the BT.601 luma weights in Q8 fixed point: 77/256 ~= 0.299, 150/256 ~= 0.587,
// 29/256 ~= 0.114. The alpha byte is ignored. Identical to from_rgba2gray
// except R and B are read from swapped byte positions. stride is the source
// row size in bytes. Returns 0 on success, -100 on allocation failure.
static int from_bgra2gray(const unsigned char* bgra, int w, int h, int stride, Mat& m, Allocator* allocator)
{
    // coeffs for r g b = 0.299f, 0.587f, 0.114f
    const unsigned char Y_shift = 8; //14
    const unsigned char R2Y = 77;
    const unsigned char G2Y = 150;
    const unsigned char B2Y = 29;

    m.create(w, h, 1, 4u, allocator);
    if (m.empty())
        return -100;

    // per-row padding in bytes; zero means the rows can be flattened
    const int wgap = stride - w * 4;
    if (wgap == 0)
    {
        w = w * h;
        h = 1;
    }

    float* ptr = m;

    for (int y = 0; y < h; y++)
    {
#if __ARM_NEON
        int nn = w >> 3; // 8 pixels per NEON iteration
        int remain = w - (nn << 3);
#else
        int remain = w;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
        uint8x8_t _R2Y = vdup_n_u8(R2Y);
        uint8x8_t _G2Y = vdup_n_u8(G2Y);
        uint8x8_t _B2Y = vdup_n_u8(B2Y);
        for (; nn > 0; nn--)
        {
            uint8x8x4_t _bgra = vld4_u8(bgra);

            // y = (77*R + 150*G + 29*B) >> 8; val[2]=R, val[0]=B here
            uint16x8_t _y16 = vmull_u8(_bgra.val[2], _R2Y);
            _y16 = vmlal_u8(_y16, _bgra.val[1], _G2Y);
            _y16 = vmlal_u8(_y16, _bgra.val[0], _B2Y);
            _y16 = vshrq_n_u16(_y16, Y_shift);

            float32x4_t _ylow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_y16)));
            float32x4_t _yhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_y16)));

            vst1q_f32(ptr, _ylow);
            vst1q_f32(ptr + 4, _yhigh);

            bgra += 4 * 8;
            ptr += 8;
        }
#else
        if (nn > 0)
        {
            // d16/d17/d18 hold R2Y/G2Y/B2Y; note d2 (= R plane of BGRA)
            // is multiplied by R2Y and d0 (= B plane) by B2Y
            asm volatile(
                "vdup.u8    d16, %6             \n"
                "vdup.u8    d17, %7             \n"
                "vdup.u8    d18, %8             \n"
                "0:                             \n"
                "pld        [%1, #256]          \n"
                "vld4.u8    {d0-d3}, [%1]!      \n"
                "vmull.u8   q2, d2, d16         \n"
                "vmlal.u8   q2, d1, d17         \n"
                "vmlal.u8   q2, d0, d18         \n"
                "vshr.u16   q2, q2, #8          \n" // Y_shift
                "vmovl.u16  q0, d4              \n"
                "vmovl.u16  q1, d5              \n"
                "vcvt.f32.u32 q0, q0            \n"
                "vcvt.f32.u32 q1, q1            \n"
                "subs       %0, #1              \n"
                "vst1.f32   {d0-d3}, [%2]!      \n"
                "bne        0b                  \n"
                : "=r"(nn),   // %0
                "=r"(bgra), // %1
                "=r"(ptr)   // %2
                : "0"(nn),
                "1"(bgra),
                "2"(ptr),
                "r"(R2Y), // %6
                "r"(G2Y), // %7
                "r"(B2Y)  // %8
                : "cc", "memory", "q0", "q1", "q2", "q8", "q9");
        }
#endif // __aarch64__
#endif // __ARM_NEON
        for (; remain > 0; remain--)
        {
            *ptr = static_cast<float>((bgra[2] * R2Y + bgra[1] * G2Y + bgra[0] * B2Y) >> Y_shift);

            bgra += 4;
            ptr++;
        }

        bgra += wgap; // skip the row padding
    }

    return 0;
}
1960
// Convert NV21 (Y plane followed by an interleaved V/U plane at half
// resolution) to packed 8-bit RGB. Two image rows are produced per outer
// iteration since each V/U pair covers a 2x2 block of Y samples.
// Fixed-point Q6 approximation of the conversion:
//   R = ((Y << 6) +  90 * (V-128)) >> 6
//   G = ((Y << 6) -  46 * (V-128) - 22 * (U-128)) >> 6
//   B = ((Y << 6) + 113 * (U-128)) >> 6
// NOTE(review): the scalar tail steps 2 pixels at a time — assumes w and h
// are even; TODO confirm with callers.
void yuv420sp2rgb(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb)
{
    const unsigned char* yptr = yuv420sp;
    const unsigned char* vuptr = yuv420sp + w * h; // interleaved V,U pairs

#if __ARM_NEON
    uint8x8_t _v128 = vdup_n_u8(128);
    int8x8_t _v90 = vdup_n_s8(90);
    int8x8_t _v46 = vdup_n_s8(46);
    int8x8_t _v22 = vdup_n_s8(22);
    int8x8_t _v113 = vdup_n_s8(113);
#endif // __ARM_NEON

    for (int y = 0; y < h; y += 2)
    {
        const unsigned char* yptr0 = yptr;     // even row
        const unsigned char* yptr1 = yptr + w; // odd row
        unsigned char* rgb0 = rgb;
        unsigned char* rgb1 = rgb + w * 3;

#if __ARM_NEON
        int nn = w >> 3; // 8 pixels (of each row) per NEON iteration
        int remain = w - (nn << 3);
#else
        int remain = w;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
        for (; nn > 0; nn--)
        {
            // Y << 6 as signed 16-bit for both rows
            int16x8_t _yy0 = vreinterpretq_s16_u16(vshll_n_u8(vld1_u8(yptr0), 6));
            int16x8_t _yy1 = vreinterpretq_s16_u16(vshll_n_u8(vld1_u8(yptr1), 6));

            // center V/U around zero, then duplicate each chroma sample
            // across its two horizontal pixels via vtrn
            int8x8_t _vvuu = vreinterpret_s8_u8(vsub_u8(vld1_u8(vuptr), _v128));
            int8x8x2_t _vvvvuuuu = vtrn_s8(_vvuu, _vvuu);
            int8x8_t _vv = _vvvvuuuu.val[0];
            int8x8_t _uu = _vvvvuuuu.val[1];

            int16x8_t _r0 = vmlal_s8(_yy0, _vv, _v90);
            int16x8_t _g0 = vmlsl_s8(_yy0, _vv, _v46);
            _g0 = vmlsl_s8(_g0, _uu, _v22);
            int16x8_t _b0 = vmlal_s8(_yy0, _uu, _v113);

            int16x8_t _r1 = vmlal_s8(_yy1, _vv, _v90);
            int16x8_t _g1 = vmlsl_s8(_yy1, _vv, _v46);
            _g1 = vmlsl_s8(_g1, _uu, _v22);
            int16x8_t _b1 = vmlal_s8(_yy1, _uu, _v113);

            // saturating shift back from Q6 to u8
            uint8x8x3_t _rgb0;
            _rgb0.val[0] = vqshrun_n_s16(_r0, 6);
            _rgb0.val[1] = vqshrun_n_s16(_g0, 6);
            _rgb0.val[2] = vqshrun_n_s16(_b0, 6);

            uint8x8x3_t _rgb1;
            _rgb1.val[0] = vqshrun_n_s16(_r1, 6);
            _rgb1.val[1] = vqshrun_n_s16(_g1, 6);
            _rgb1.val[2] = vqshrun_n_s16(_b1, 6);

            vst3_u8(rgb0, _rgb0);
            vst3_u8(rgb1, _rgb1);

            yptr0 += 8;
            yptr1 += 8;
            vuptr += 8;
            rgb0 += 24;
            rgb1 += 24;
        }
#else
        if (nn > 0)
        {
            // d2 holds (VU - 128), loaded one iteration ahead of use; the
            // trailing "sub %3, #8" undoes the final over-advance of vuptr
            asm volatile(
                "pld        [%3, #128]          \n"
                "vld1.u8    {d2}, [%3]!         \n"
                "vsub.s8    d2, d2, %12         \n"
                "0:                             \n"
                "pld        [%1, #128]          \n"
                "vld1.u8    {d0}, [%1]!         \n"
                "pld        [%2, #128]          \n"
                "vld1.u8    {d1}, [%2]!         \n"
                "vshll.u8   q2, d0, #6          \n"
                "vorr       d3, d2, d2          \n"
                "vshll.u8   q3, d1, #6          \n"
                "vorr       q9, q2, q2          \n"
                "vtrn.s8    d2, d3              \n"
                "vorr       q11, q3, q3         \n"
                "vmlsl.s8   q9, d2, %14         \n"
                "vorr       q8, q2, q2          \n"
                "vmlsl.s8   q11, d2, %14        \n"
                "vorr       q10, q3, q3         \n"
                "vmlal.s8   q8, d2, %13         \n"
                "vmlal.s8   q2, d3, %16         \n"
                "vmlal.s8   q10, d2, %13        \n"
                "vmlsl.s8   q9, d3, %15         \n"
                "vmlal.s8   q3, d3, %16         \n"
                "vmlsl.s8   q11, d3, %15        \n"
                "vqshrun.s16 d24, q8, #6        \n"
                "vqshrun.s16 d26, q2, #6        \n"
                "vqshrun.s16 d4, q10, #6        \n"
                "vqshrun.s16 d25, q9, #6        \n"
                "vqshrun.s16 d6, q3, #6         \n"
                "vqshrun.s16 d5, q11, #6        \n"
                "pld        [%3, #128]          \n"
                "vld1.u8    {d2}, [%3]!         \n"
                "subs       %0, #1              \n"
                "vst3.u8    {d24-d26}, [%4]!    \n"
                "vsub.s8    d2, d2, %12         \n"
                "vst3.u8    {d4-d6}, [%5]!      \n"
                "bne        0b                  \n"
                "sub        %3, #8              \n"
                : "=r"(nn),    // %0
                "=r"(yptr0), // %1
                "=r"(yptr1), // %2
                "=r"(vuptr), // %3
                "=r"(rgb0),  // %4
                "=r"(rgb1)   // %5
                : "0"(nn),
                "1"(yptr0),
                "2"(yptr1),
                "3"(vuptr),
                "4"(rgb0),
                "5"(rgb1),
                "w"(_v128), // %12
                "w"(_v90),  // %13
                "w"(_v46),  // %14
                "w"(_v22),  // %15
                "w"(_v113)  // %16
                : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "d26");
        }
#endif // __aarch64__
#endif // __ARM_NEON

#define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
        // scalar tail: one V/U pair drives a 2x2 block of output pixels
        for (; remain > 0; remain -= 2)
        {
            // R = 1.164 * yy + 1.596 * vv
            // G = 1.164 * yy - 0.813 * vv - 0.391 * uu
            // B = 1.164 * yy + 2.018 * uu

            // R = Y + (1.370705 * (V-128))
            // G = Y - (0.698001 * (V-128)) - (0.337633 * (U-128))
            // B = Y + (1.732446 * (U-128))

            // R = ((Y << 6) + 87.72512 * (V-128)) >> 6
            // G = ((Y << 6) - 44.672064 * (V-128) - 21.608512 * (U-128)) >> 6
            // B = ((Y << 6) + 110.876544 * (U-128)) >> 6

            // R = ((Y << 6) + 90 * (V-128)) >> 6
            // G = ((Y << 6) - 46 * (V-128) - 22 * (U-128)) >> 6
            // B = ((Y << 6) + 113 * (U-128)) >> 6

            // R = (yy + 90 * vv) >> 6
            // G = (yy - 46 * vv - 22 * uu) >> 6
            // B = (yy + 113 * uu) >> 6

            int v = vuptr[0] - 128; // NV21: V comes first
            int u = vuptr[1] - 128;

            int ruv = 90 * v;
            int guv = -46 * v + -22 * u;
            int buv = 113 * u;

            int y00 = yptr0[0] << 6;
            rgb0[0] = SATURATE_CAST_UCHAR((y00 + ruv) >> 6);
            rgb0[1] = SATURATE_CAST_UCHAR((y00 + guv) >> 6);
            rgb0[2] = SATURATE_CAST_UCHAR((y00 + buv) >> 6);

            int y01 = yptr0[1] << 6;
            rgb0[3] = SATURATE_CAST_UCHAR((y01 + ruv) >> 6);
            rgb0[4] = SATURATE_CAST_UCHAR((y01 + guv) >> 6);
            rgb0[5] = SATURATE_CAST_UCHAR((y01 + buv) >> 6);

            int y10 = yptr1[0] << 6;
            rgb1[0] = SATURATE_CAST_UCHAR((y10 + ruv) >> 6);
            rgb1[1] = SATURATE_CAST_UCHAR((y10 + guv) >> 6);
            rgb1[2] = SATURATE_CAST_UCHAR((y10 + buv) >> 6);

            int y11 = yptr1[1] << 6;
            rgb1[3] = SATURATE_CAST_UCHAR((y11 + ruv) >> 6);
            rgb1[4] = SATURATE_CAST_UCHAR((y11 + guv) >> 6);
            rgb1[5] = SATURATE_CAST_UCHAR((y11 + buv) >> 6);

            yptr0 += 2;
            yptr1 += 2;
            vuptr += 2;
            rgb0 += 6;
            rgb1 += 6;
        }
#undef SATURATE_CAST_UCHAR

        // advance past the two rows just produced
        yptr += 2 * w;
        rgb += 2 * 3 * w;
    }
}
2155
// Convert NV12 (Y plane followed by an interleaved U/V plane at half
// resolution) to packed 8-bit RGB. Identical to yuv420sp2rgb (NV21) except
// the chroma byte order is U first, V second. Two rows are produced per
// outer iteration. Fixed-point Q6 coefficients:
//   R = ((Y << 6) +  90 * (V-128)) >> 6
//   G = ((Y << 6) -  46 * (V-128) - 22 * (U-128)) >> 6
//   B = ((Y << 6) + 113 * (U-128)) >> 6
// NOTE(review): the scalar tail steps 2 pixels at a time — assumes w and h
// are even; TODO confirm with callers.
void yuv420sp2rgb_nv12(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb)
{
    const unsigned char* yptr = yuv420sp;
    const unsigned char* uvptr = yuv420sp + w * h; // interleaved U,V pairs

#if __ARM_NEON
    uint8x8_t _v128 = vdup_n_u8(128);
    int8x8_t _v90 = vdup_n_s8(90);
    int8x8_t _v46 = vdup_n_s8(46);
    int8x8_t _v22 = vdup_n_s8(22);
    int8x8_t _v113 = vdup_n_s8(113);
#endif // __ARM_NEON

    for (int y = 0; y < h; y += 2)
    {
        const unsigned char* yptr0 = yptr;     // even row
        const unsigned char* yptr1 = yptr + w; // odd row
        unsigned char* rgb0 = rgb;
        unsigned char* rgb1 = rgb + w * 3;

#if __ARM_NEON
        int nn = w >> 3; // 8 pixels (of each row) per NEON iteration
        int remain = w - (nn << 3);
#else
        int remain = w;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
        for (; nn > 0; nn--)
        {
            // Y << 6 as signed 16-bit for both rows
            int16x8_t _yy0 = vreinterpretq_s16_u16(vshll_n_u8(vld1_u8(yptr0), 6));
            int16x8_t _yy1 = vreinterpretq_s16_u16(vshll_n_u8(vld1_u8(yptr1), 6));

            // center U/V around zero, then duplicate each chroma sample
            // across its two horizontal pixels via vtrn (U first for NV12)
            int8x8_t _uuvv = vreinterpret_s8_u8(vsub_u8(vld1_u8(uvptr), _v128));
            int8x8x2_t _uuuuvvvv = vtrn_s8(_uuvv, _uuvv);
            int8x8_t _uu = _uuuuvvvv.val[0];
            int8x8_t _vv = _uuuuvvvv.val[1];

            int16x8_t _r0 = vmlal_s8(_yy0, _vv, _v90);
            int16x8_t _g0 = vmlsl_s8(_yy0, _vv, _v46);
            _g0 = vmlsl_s8(_g0, _uu, _v22);
            int16x8_t _b0 = vmlal_s8(_yy0, _uu, _v113);

            int16x8_t _r1 = vmlal_s8(_yy1, _vv, _v90);
            int16x8_t _g1 = vmlsl_s8(_yy1, _vv, _v46);
            _g1 = vmlsl_s8(_g1, _uu, _v22);
            int16x8_t _b1 = vmlal_s8(_yy1, _uu, _v113);

            // saturating shift back from Q6 to u8
            uint8x8x3_t _rgb0;
            _rgb0.val[0] = vqshrun_n_s16(_r0, 6);
            _rgb0.val[1] = vqshrun_n_s16(_g0, 6);
            _rgb0.val[2] = vqshrun_n_s16(_b0, 6);

            uint8x8x3_t _rgb1;
            _rgb1.val[0] = vqshrun_n_s16(_r1, 6);
            _rgb1.val[1] = vqshrun_n_s16(_g1, 6);
            _rgb1.val[2] = vqshrun_n_s16(_b1, 6);

            vst3_u8(rgb0, _rgb0);
            vst3_u8(rgb1, _rgb1);

            yptr0 += 8;
            yptr1 += 8;
            uvptr += 8;
            rgb0 += 24;
            rgb1 += 24;
        }
#else
        if (nn > 0)
        {
            // d2 holds (UV - 128), loaded one iteration ahead; after vtrn,
            // d2 = U lanes and d3 = V lanes (swapped roles vs the NV21
            // routine). The trailing "sub %3, #8" undoes the final
            // over-advance of uvptr.
            asm volatile(
                "pld        [%3, #128]          \n"
                "vld1.u8    {d2}, [%3]!         \n"
                "vsub.s8    d2, d2, %12         \n"
                "0:                             \n"
                "pld        [%1, #128]          \n"
                "vld1.u8    {d0}, [%1]!         \n"
                "pld        [%2, #128]          \n"
                "vld1.u8    {d1}, [%2]!         \n"
                "vshll.u8   q2, d0, #6          \n"
                "vorr       d3, d2, d2          \n"
                "vshll.u8   q3, d1, #6          \n"
                "vorr       q9, q2, q2          \n"
                "vtrn.s8    d2, d3              \n"
                "vorr       q11, q3, q3         \n"
                "vmlsl.s8   q9, d3, %14         \n"
                "vorr       q8, q2, q2          \n"
                "vmlsl.s8   q11, d3, %14        \n"
                "vorr       q10, q3, q3         \n"
                "vmlal.s8   q8, d3, %13         \n"
                "vmlal.s8   q2, d2, %16         \n"
                "vmlal.s8   q10, d3, %13        \n"
                "vmlsl.s8   q9, d2, %15         \n"
                "vmlal.s8   q3, d2, %16         \n"
                "vmlsl.s8   q11, d2, %15        \n"
                "vqshrun.s16 d24, q8, #6        \n"
                "vqshrun.s16 d26, q2, #6        \n"
                "vqshrun.s16 d4, q10, #6        \n"
                "vqshrun.s16 d25, q9, #6        \n"
                "vqshrun.s16 d6, q3, #6         \n"
                "vqshrun.s16 d5, q11, #6        \n"
                "pld        [%3, #128]          \n"
                "vld1.u8    {d2}, [%3]!         \n"
                "subs       %0, #1              \n"
                "vst3.u8    {d24-d26}, [%4]!    \n"
                "vsub.s8    d2, d2, %12         \n"
                "vst3.u8    {d4-d6}, [%5]!      \n"
                "bne        0b                  \n"
                "sub        %3, #8              \n"
                : "=r"(nn),    // %0
                "=r"(yptr0), // %1
                "=r"(yptr1), // %2
                "=r"(uvptr), // %3
                "=r"(rgb0),  // %4
                "=r"(rgb1)   // %5
                : "0"(nn),
                "1"(yptr0),
                "2"(yptr1),
                "3"(uvptr),
                "4"(rgb0),
                "5"(rgb1),
                "w"(_v128), // %12
                "w"(_v90),  // %13
                "w"(_v46),  // %14
                "w"(_v22),  // %15
                "w"(_v113)  // %16
                : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "d26");
        }
#endif // __aarch64__
#endif // __ARM_NEON

#define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
        // scalar tail: one U/V pair drives a 2x2 block of output pixels
        for (; remain > 0; remain -= 2)
        {
            // R = 1.164 * yy + 1.596 * vv
            // G = 1.164 * yy - 0.813 * vv - 0.391 * uu
            // B = 1.164 * yy + 2.018 * uu

            // R = Y + (1.370705 * (V-128))
            // G = Y - (0.698001 * (V-128)) - (0.337633 * (U-128))
            // B = Y + (1.732446 * (U-128))

            // R = ((Y << 6) + 87.72512 * (V-128)) >> 6
            // G = ((Y << 6) - 44.672064 * (V-128) - 21.608512 * (U-128)) >> 6
            // B = ((Y << 6) + 110.876544 * (U-128)) >> 6

            // R = ((Y << 6) + 90 * (V-128)) >> 6
            // G = ((Y << 6) - 46 * (V-128) - 22 * (U-128)) >> 6
            // B = ((Y << 6) + 113 * (U-128)) >> 6

            // R = (yy + 90 * vv) >> 6
            // G = (yy - 46 * vv - 22 * uu) >> 6
            // B = (yy + 113 * uu) >> 6

            int u = uvptr[0] - 128; // NV12: U comes first
            int v = uvptr[1] - 128;

            int ruv = 90 * v;
            int guv = -46 * v + -22 * u;
            int buv = 113 * u;

            int y00 = yptr0[0] << 6;
            rgb0[0] = SATURATE_CAST_UCHAR((y00 + ruv) >> 6);
            rgb0[1] = SATURATE_CAST_UCHAR((y00 + guv) >> 6);
            rgb0[2] = SATURATE_CAST_UCHAR((y00 + buv) >> 6);

            int y01 = yptr0[1] << 6;
            rgb0[3] = SATURATE_CAST_UCHAR((y01 + ruv) >> 6);
            rgb0[4] = SATURATE_CAST_UCHAR((y01 + guv) >> 6);
            rgb0[5] = SATURATE_CAST_UCHAR((y01 + buv) >> 6);

            int y10 = yptr1[0] << 6;
            rgb1[0] = SATURATE_CAST_UCHAR((y10 + ruv) >> 6);
            rgb1[1] = SATURATE_CAST_UCHAR((y10 + guv) >> 6);
            rgb1[2] = SATURATE_CAST_UCHAR((y10 + buv) >> 6);

            int y11 = yptr1[1] << 6;
            rgb1[3] = SATURATE_CAST_UCHAR((y11 + ruv) >> 6);
            rgb1[4] = SATURATE_CAST_UCHAR((y11 + guv) >> 6);
            rgb1[5] = SATURATE_CAST_UCHAR((y11 + buv) >> 6);

            yptr0 += 2;
            yptr1 += 2;
            uvptr += 2;
            rgb0 += 6;
            rgb1 += 6;
        }
#undef SATURATE_CAST_UCHAR

        // advance past the two rows just produced
        yptr += 2 * w;
        rgb += 2 * 3 * w;
    }
}
2350
// Convert NV21 (Y plane followed by interleaved V/U) to packed 8-bit RGB at
// half resolution: each output pixel averages a 2x2 block of Y samples and
// uses that block's single V/U pair. Fixed-point Q6 coefficients:
//   R = (y + 90 * (V-128)) >> 6
//   G = (y - 46 * (V-128) - 22 * (U-128)) >> 6
//   B = (y + 113 * (U-128)) >> 6
// where y = (sum of the four Y samples) << 4 == ((sum >> 2) << 6).
// Assumes w and h are even — TODO confirm with callers.
void yuv420sp2rgb_half(const unsigned char* yuv, int w, int h, unsigned char* rgb)
{
    const unsigned char* puv = yuv + w * h;         // interleaved V,U plane
    const unsigned char *py0 = yuv, *py1 = yuv + w; // two consecutive Y rows
    const int hstep = h / 2;
#if __ARM_NEON
    const int wstep = w / 16, tailstep = (w - wstep * 16) / 2;
    uint8x8_t _u128 = vdup_n_u8(128);
    int8x8_t _s90 = vdup_n_s8(90);
    int8x8_t _sn46 = vdup_n_s8(-46);
    int8x8_t _s113 = vdup_n_s8(113);
    int8x8_t _sn22 = vdup_n_s8(-22);
    int16x8_t _s0 = vdupq_n_s16(0);
    int16x8_t _s16320 = vdupq_n_s16(16320); // 255 << 6
#else
    const int tailstep = w / 2;
#endif

    for (int i = 0; i < hstep; ++i)
    {
#if __ARM_NEON
        // 16 Y columns -> 8 output pixels per iteration
        for (int j = 0; j < wstep; ++j)
        {
            uint8x16_t y0 = vld1q_u8(py0);
            uint8x16_t y1 = vld1q_u8(py1);

            // first 8 Y: vertical add then horizontal pairwise add -> 2x2 sums
            uint16x8_t low = vaddl_u8(vget_low_u8(y0), vget_low_u8(y1));
            uint16x4_t low_sum = vpadd_u16(vget_low_u16(low), vget_high_u16(low));

            // last 8 Y
            uint16x8_t high = vaddl_u8(vget_high_u8(y0), vget_high_u8(y1));
            uint16x4_t high_sum = vpadd_u16(vget_low_u16(high), vget_high_u16(high));

            uint16x8_t y8_sum = vcombine_u16(low_sum, high_sum);
            // y8 = (y8_sum >> 2) << 6 = y8_sum << 4;
            int16x8_t y8 = vreinterpretq_s16_u16(vshlq_n_u16(y8_sum, 4));

            // prepare uv (NV21: V first, then U), centered around zero
            uint8x8x2_t vu = vld2_u8(puv);
            int8x8_t v = vreinterpret_s8_u8(vsub_u8(vu.val[0], _u128));
            int8x8_t u = vreinterpret_s8_u8(vsub_u8(vu.val[1], _u128));

            int16x8_t r_acc = vmlal_s8(y8, v, _s90);
            int16x8_t g_acc = vmlal_s8(y8, v, _sn46);
            g_acc = vmlal_s8(g_acc, u, _sn22);
            int16x8_t b_acc = vmlal_s8(y8, u, _s113);

            // clamp to [0, 255 << 6] then shift back from Q6 to u8
#define SHIFT_6_SATURATE(FROM, TO)                         \
    FROM = vmaxq_s16(vminq_s16((FROM), _s16320), _s0);     \
    uint8x8_t TO = vshrn_n_u16(vreinterpretq_u16_s16((FROM)), 6);

            SHIFT_6_SATURATE(b_acc, b_out)
            SHIFT_6_SATURATE(g_acc, g_out)
            SHIFT_6_SATURATE(r_acc, r_out)
#undef SHIFT_6_SATURATE

            uint8x8x3_t _rgb;
            _rgb.val[0] = r_out;
            _rgb.val[1] = g_out;
            _rgb.val[2] = b_out;
            vst3_u8(rgb, _rgb);

            rgb += 24;
            py0 += 16;
            py1 += 16;
            puv += 16;
        }
#endif

        for (int idx = 0; idx < tailstep; ++idx)
        {
            // average the 2x2 Y block in Q6: sum << 4 == (sum >> 2) << 6.
            // fix: the bottom-left sample is py1[0], not py1[2] — the old
            // index read the wrong pixel (and one byte past the Y row for
            // the last block), disagreeing with the NEON path above.
            int y = (static_cast<int>(py0[0]) + py0[1] + py1[0] + py1[1]) << 4;
            int v = static_cast<int>(puv[0]) - 128;
            int u = static_cast<int>(puv[1]) - 128;

            int ruv = 90 * v;
            int guv = -46 * v + -22 * u;
            int buv = 113 * u;

#define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
            rgb[0] = SATURATE_CAST_UCHAR((y + ruv) >> 6);
            rgb[1] = SATURATE_CAST_UCHAR((y + guv) >> 6);
            rgb[2] = SATURATE_CAST_UCHAR((y + buv) >> 6);
#undef SATURATE_CAST_UCHAR

            rgb += 3;
            py0 += 2;
            py1 += 2;
            puv += 2;
        }
        // next two rows
        py0 = py1;
        py1 = py0 + w;
    }
}
2447
from_pixels(const unsigned char * pixels,int type,int w,int h,Allocator * allocator)2448 Mat Mat::from_pixels(const unsigned char* pixels, int type, int w, int h, Allocator* allocator)
2449 {
2450 int type_from = type & PIXEL_FORMAT_MASK;
2451
2452 if (type_from == PIXEL_RGB || type_from == PIXEL_BGR)
2453 {
2454 return Mat::from_pixels(pixels, type, w, h, w * 3, allocator);
2455 }
2456 else if (type_from == PIXEL_GRAY)
2457 {
2458 return Mat::from_pixels(pixels, type, w, h, w * 1, allocator);
2459 }
2460 else if (type_from == PIXEL_RGBA || type_from == PIXEL_BGRA)
2461 {
2462 return Mat::from_pixels(pixels, type, w, h, w * 4, allocator);
2463 }
2464
2465 // unknown convert type
2466 NCNN_LOGE("unknown convert type %d", type);
2467 return Mat();
2468 }
2469
from_pixels(const unsigned char * pixels,int type,int w,int h,int stride,Allocator * allocator)2470 Mat Mat::from_pixels(const unsigned char* pixels, int type, int w, int h, int stride, Allocator* allocator)
2471 {
2472 Mat m;
2473
2474 if (type & PIXEL_CONVERT_MASK)
2475 {
2476 switch (type)
2477 {
2478 case PIXEL_RGB2BGR:
2479 case PIXEL_BGR2RGB:
2480 from_rgb2bgr(pixels, w, h, stride, m, allocator);
2481 break;
2482 case PIXEL_RGB2GRAY:
2483 from_rgb2gray(pixels, w, h, stride, m, allocator);
2484 break;
2485 case PIXEL_RGB2RGBA:
2486 case PIXEL_BGR2BGRA:
2487 from_rgb2rgba(pixels, w, h, stride, m, allocator);
2488 break;
2489 case PIXEL_BGR2GRAY:
2490 from_bgr2gray(pixels, w, h, stride, m, allocator);
2491 break;
2492 case PIXEL_BGR2RGBA:
2493 case PIXEL_RGB2BGRA:
2494 from_bgr2rgba(pixels, w, h, stride, m, allocator);
2495 break;
2496 case PIXEL_GRAY2RGB:
2497 case PIXEL_GRAY2BGR:
2498 from_gray2rgb(pixels, w, h, stride, m, allocator);
2499 break;
2500 case PIXEL_GRAY2RGBA:
2501 case PIXEL_GRAY2BGRA:
2502 from_gray2rgba(pixels, w, h, stride, m, allocator);
2503 break;
2504 case PIXEL_RGBA2RGB:
2505 case PIXEL_BGRA2BGR:
2506 from_rgba2rgb(pixels, w, h, stride, m, allocator);
2507 break;
2508 case PIXEL_RGBA2BGR:
2509 case PIXEL_BGRA2RGB:
2510 from_rgba2bgr(pixels, w, h, stride, m, allocator);
2511 break;
2512 case PIXEL_RGBA2GRAY:
2513 from_rgba2gray(pixels, w, h, stride, m, allocator);
2514 break;
2515 case PIXEL_RGBA2BGRA:
2516 case PIXEL_BGRA2RGBA:
2517 from_rgba2bgra(pixels, w, h, stride, m, allocator);
2518 break;
2519 case PIXEL_BGRA2GRAY:
2520 from_bgra2gray(pixels, w, h, stride, m, allocator);
2521 break;
2522 default:
2523 // unimplemented convert type
2524 NCNN_LOGE("unimplemented convert type %d", type);
2525 break;
2526 }
2527 }
2528 else
2529 {
2530 if (type == PIXEL_RGB || type == PIXEL_BGR)
2531 from_rgb(pixels, w, h, stride, m, allocator);
2532
2533 if (type == PIXEL_GRAY)
2534 from_gray(pixels, w, h, stride, m, allocator);
2535
2536 if (type == PIXEL_RGBA || type == PIXEL_BGRA)
2537 from_rgba(pixels, w, h, stride, m, allocator);
2538 }
2539
2540 return m;
2541 }
2542
from_pixels_resize(const unsigned char * pixels,int type,int w,int h,int target_width,int target_height,Allocator * allocator)2543 Mat Mat::from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int target_width, int target_height, Allocator* allocator)
2544 {
2545 int type_from = type & PIXEL_FORMAT_MASK;
2546
2547 if (type_from == PIXEL_RGB || type_from == PIXEL_BGR)
2548 {
2549 return Mat::from_pixels_resize(pixels, type, w, h, w * 3, target_width, target_height, allocator);
2550 }
2551 else if (type_from == PIXEL_GRAY)
2552 {
2553 return Mat::from_pixels_resize(pixels, type, w, h, w * 1, target_width, target_height, allocator);
2554 }
2555 else if (type_from == PIXEL_RGBA || type_from == PIXEL_BGRA)
2556 {
2557 return Mat::from_pixels_resize(pixels, type, w, h, w * 4, target_width, target_height, allocator);
2558 }
2559
2560 // unknown convert type
2561 NCNN_LOGE("unknown convert type %d", type);
2562 return Mat();
2563 }
2564
from_pixels_resize(const unsigned char * pixels,int type,int w,int h,int stride,int target_width,int target_height,Allocator * allocator)2565 Mat Mat::from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int stride, int target_width, int target_height, Allocator* allocator)
2566 {
2567 if (w == target_width && h == target_height)
2568 return Mat::from_pixels(pixels, type, w, h, stride, allocator);
2569
2570 int type_from = type & PIXEL_FORMAT_MASK;
2571
2572 if (type_from == PIXEL_RGB || type_from == PIXEL_BGR)
2573 {
2574 Mat dst(target_width, target_height, (size_t)3u, 3);
2575 resize_bilinear_c3(pixels, w, h, stride, dst, target_width, target_height, target_width * 3);
2576
2577 return Mat::from_pixels(dst, type, target_width, target_height, allocator);
2578 }
2579 else if (type_from == PIXEL_GRAY)
2580 {
2581 Mat dst(target_width, target_height, (size_t)1u, 1);
2582 resize_bilinear_c1(pixels, w, h, stride, dst, target_width, target_height, target_width * 1);
2583
2584 return Mat::from_pixels(dst, type, target_width, target_height, allocator);
2585 }
2586 else if (type_from == PIXEL_RGBA || type_from == PIXEL_BGRA)
2587 {
2588 Mat dst(target_width, target_height, (size_t)4u, 4);
2589 resize_bilinear_c4(pixels, w, h, stride, dst, target_width, target_height, target_width * 4);
2590
2591 return Mat::from_pixels(dst, type, target_width, target_height, allocator);
2592 }
2593
2594 // unknown convert type
2595 NCNN_LOGE("unknown convert type %d", type);
2596 return Mat();
2597 }
2598
from_pixels_roi(const unsigned char * pixels,int type,int w,int h,int roix,int roiy,int roiw,int roih,Allocator * allocator)2599 Mat Mat::from_pixels_roi(const unsigned char* pixels, int type, int w, int h, int roix, int roiy, int roiw, int roih, Allocator* allocator)
2600 {
2601 if (roix < 0 || roiy < 0 || roiw <= 0 || roih <= 0 || roix + roiw > w || roiy + roih > h)
2602 {
2603 NCNN_LOGE("roi %d %d %d %d out of image %d %d", roix, roiy, roiw, roih, w, h);
2604 return Mat();
2605 }
2606
2607 int type_from = type & PIXEL_FORMAT_MASK;
2608
2609 if (type_from == PIXEL_RGB || type_from == PIXEL_BGR)
2610 {
2611 return from_pixels(pixels + (roiy * w + roix) * 3, type, roiw, roih, w * 3, allocator);
2612 }
2613 else if (type_from == PIXEL_GRAY)
2614 {
2615 return from_pixels(pixels + (roiy * w + roix) * 1, type, roiw, roih, w * 1, allocator);
2616 }
2617 else if (type_from == PIXEL_RGBA || type_from == PIXEL_BGRA)
2618 {
2619 return from_pixels(pixels + (roiy * w + roix) * 4, type, roiw, roih, w * 4, allocator);
2620 }
2621
2622 // unknown convert type
2623 NCNN_LOGE("unknown convert type %d", type);
2624 return Mat();
2625 }
2626
from_pixels_roi(const unsigned char * pixels,int type,int w,int h,int stride,int roix,int roiy,int roiw,int roih,Allocator * allocator)2627 Mat Mat::from_pixels_roi(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, Allocator* allocator)
2628 {
2629 if (roix < 0 || roiy < 0 || roiw <= 0 || roih <= 0 || roix + roiw > w || roiy + roih > h)
2630 {
2631 NCNN_LOGE("roi %d %d %d %d out of image %d %d", roix, roiy, roiw, roih, w, h);
2632 return Mat();
2633 }
2634
2635 int type_from = type & PIXEL_FORMAT_MASK;
2636
2637 if (type_from == PIXEL_RGB || type_from == PIXEL_BGR)
2638 {
2639 return from_pixels(pixels + roiy * stride + roix * 3, type, roiw, roih, stride, allocator);
2640 }
2641 else if (type_from == PIXEL_GRAY)
2642 {
2643 return from_pixels(pixels + roiy * stride + roix * 1, type, roiw, roih, stride, allocator);
2644 }
2645 else if (type_from == PIXEL_RGBA || type_from == PIXEL_BGRA)
2646 {
2647 return from_pixels(pixels + roiy * stride + roix * 4, type, roiw, roih, stride, allocator);
2648 }
2649
2650 // unknown convert type
2651 NCNN_LOGE("unknown convert type %d", type);
2652 return Mat();
2653 }
2654
from_pixels_roi_resize(const unsigned char * pixels,int type,int w,int h,int roix,int roiy,int roiw,int roih,int target_width,int target_height,Allocator * allocator)2655 Mat Mat::from_pixels_roi_resize(const unsigned char* pixels, int type, int w, int h, int roix, int roiy, int roiw, int roih, int target_width, int target_height, Allocator* allocator)
2656 {
2657 if (roix < 0 || roiy < 0 || roiw <= 0 || roih <= 0 || roix + roiw > w || roiy + roih > h)
2658 {
2659 NCNN_LOGE("roi %d %d %d %d out of image %d %d", roix, roiy, roiw, roih, w, h);
2660 return Mat();
2661 }
2662
2663 int type_from = type & PIXEL_FORMAT_MASK;
2664
2665 if (type_from == PIXEL_RGB || type_from == PIXEL_BGR)
2666 {
2667 return from_pixels_resize(pixels + (roiy * w + roix) * 3, type, roiw, roih, w * 3, target_width, target_height, allocator);
2668 }
2669 else if (type_from == PIXEL_GRAY)
2670 {
2671 return from_pixels_resize(pixels + (roiy * w + roix) * 1, type, roiw, roih, w * 1, target_width, target_height, allocator);
2672 }
2673 else if (type_from == PIXEL_RGBA || type_from == PIXEL_BGRA)
2674 {
2675 return from_pixels_resize(pixels + (roiy * w + roix) * 4, type, roiw, roih, w * 4, target_width, target_height, allocator);
2676 }
2677
2678 // unknown convert type
2679 NCNN_LOGE("unknown convert type %d", type);
2680 return Mat();
2681 }
2682
from_pixels_roi_resize(const unsigned char * pixels,int type,int w,int h,int stride,int roix,int roiy,int roiw,int roih,int target_width,int target_height,Allocator * allocator)2683 Mat Mat::from_pixels_roi_resize(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, int target_width, int target_height, Allocator* allocator)
2684 {
2685 if (roix < 0 || roiy < 0 || roiw <= 0 || roih <= 0 || roix + roiw > w || roiy + roih > h)
2686 {
2687 NCNN_LOGE("roi %d %d %d %d out of image %d %d", roix, roiy, roiw, roih, w, h);
2688 return Mat();
2689 }
2690
2691 int type_from = type & PIXEL_FORMAT_MASK;
2692
2693 if (type_from == PIXEL_RGB || type_from == PIXEL_BGR)
2694 {
2695 return from_pixels_resize(pixels + roiy * stride + roix * 3, type, roiw, roih, stride, target_width, target_height, allocator);
2696 }
2697 else if (type_from == PIXEL_GRAY)
2698 {
2699 return from_pixels_resize(pixels + roiy * stride + roix * 1, type, roiw, roih, stride, target_width, target_height, allocator);
2700 }
2701 else if (type_from == PIXEL_RGBA || type_from == PIXEL_BGRA)
2702 {
2703 return from_pixels_resize(pixels + roiy * stride + roix * 4, type, roiw, roih, stride, target_width, target_height, allocator);
2704 }
2705
2706 // unknown convert type
2707 NCNN_LOGE("unknown convert type %d", type);
2708 return Mat();
2709 }
2710
to_pixels(unsigned char * pixels,int type) const2711 void Mat::to_pixels(unsigned char* pixels, int type) const
2712 {
2713 int type_to = (type & PIXEL_CONVERT_MASK) ? (type >> PIXEL_CONVERT_SHIFT) : (type & PIXEL_FORMAT_MASK);
2714
2715 if (type_to == PIXEL_RGB || type_to == PIXEL_BGR)
2716 {
2717 to_pixels(pixels, type, w * 3);
2718 }
2719 else if (type_to == PIXEL_GRAY)
2720 {
2721 to_pixels(pixels, type, w * 1);
2722 }
2723 else if (type_to == PIXEL_RGBA || type_to == PIXEL_BGRA)
2724 {
2725 to_pixels(pixels, type, w * 4);
2726 }
2727 }
2728
to_pixels(unsigned char * pixels,int type,int stride) const2729 void Mat::to_pixels(unsigned char* pixels, int type, int stride) const
2730 {
2731 if (type & PIXEL_CONVERT_MASK)
2732 {
2733 switch (type)
2734 {
2735 case PIXEL_RGB2BGR:
2736 case PIXEL_BGR2RGB:
2737 to_bgr2rgb(*this, pixels, stride);
2738 break;
2739 case PIXEL_RGB2RGBA:
2740 case PIXEL_BGR2BGRA:
2741 to_rgb2rgba(*this, pixels, stride);
2742 break;
2743 case PIXEL_BGR2RGBA:
2744 case PIXEL_RGB2BGRA:
2745 to_bgr2rgba(*this, pixels, stride);
2746 break;
2747 case PIXEL_GRAY2RGBA:
2748 case PIXEL_GRAY2BGRA:
2749 to_gray2rgba(*this, pixels, stride);
2750 break;
2751 case PIXEL_RGBA2BGRA:
2752 case PIXEL_BGRA2RGBA:
2753 to_rgba2bgra(*this, pixels, stride);
2754 break;
2755 default:
2756 // unimplemented convert type
2757 NCNN_LOGE("unimplemented convert type %d", type);
2758 break;
2759 }
2760 }
2761 else
2762 {
2763 if (type == PIXEL_RGB || type == PIXEL_BGR)
2764 to_rgb(*this, pixels, stride);
2765
2766 if (type == PIXEL_GRAY)
2767 to_gray(*this, pixels, stride);
2768
2769 if (type == PIXEL_RGBA || type == PIXEL_BGRA)
2770 to_rgba(*this, pixels, stride);
2771 }
2772 }
2773
to_pixels_resize(unsigned char * pixels,int type,int target_width,int target_height) const2774 void Mat::to_pixels_resize(unsigned char* pixels, int type, int target_width, int target_height) const
2775 {
2776 int type_to = (type & PIXEL_CONVERT_MASK) ? (type >> PIXEL_CONVERT_SHIFT) : (type & PIXEL_FORMAT_MASK);
2777
2778 if (type_to == PIXEL_RGB || type_to == PIXEL_BGR)
2779 {
2780 to_pixels_resize(pixels, type, target_width, target_height, target_width * 3);
2781 }
2782 else if (type_to == PIXEL_GRAY)
2783 {
2784 to_pixels_resize(pixels, type, target_width, target_height, target_width * 1);
2785 }
2786 else if (type_to == PIXEL_RGBA || type_to == PIXEL_BGRA)
2787 {
2788 to_pixels_resize(pixels, type, target_width, target_height, target_width * 4);
2789 }
2790 }
2791
to_pixels_resize(unsigned char * pixels,int type,int target_width,int target_height,int target_stride) const2792 void Mat::to_pixels_resize(unsigned char* pixels, int type, int target_width, int target_height, int target_stride) const
2793 {
2794 if (w == target_width && h == target_height)
2795 return to_pixels(pixels, type);
2796
2797 int type_to = (type & PIXEL_CONVERT_MASK) ? (type >> PIXEL_CONVERT_SHIFT) : (type & PIXEL_FORMAT_MASK);
2798
2799 if (type_to == PIXEL_RGB || type_to == PIXEL_BGR)
2800 {
2801 Mat src(w, h, (size_t)3u, 3);
2802
2803 to_pixels(src, type);
2804
2805 resize_bilinear_c3(src, w, h, w * 3, pixels, target_width, target_height, target_stride);
2806 }
2807 else if (type_to == PIXEL_GRAY)
2808 {
2809 Mat src(w, h, (size_t)1u, 1);
2810
2811 to_pixels(src, type);
2812
2813 resize_bilinear_c1(src, w, h, w * 1, pixels, target_width, target_height, target_stride);
2814 }
2815 else if (type_to == PIXEL_RGBA || type_to == PIXEL_BGRA)
2816 {
2817 Mat src(w, h, (size_t)4u, 4);
2818
2819 to_pixels(src, type);
2820
2821 resize_bilinear_c4(src, w, h, w * 4, pixels, target_width, target_height, target_stride);
2822 }
2823 }
2824 #endif // NCNN_PIXEL
2825
2826 } // namespace ncnn
2827