1 /*
2  *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS. All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "libyuv/row.h"
12 
13 #ifdef __cplusplus
14 namespace libyuv {
15 extern "C" {
16 #endif
17 
18 // This module is for GCC Neon armv8 64 bit.
19 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
20 
// Read 8 Y, 4 U and 4 V from 422.
// Operands: %0 = Y pointer, %1 = U pointer, %2 = V pointer; each pointer is
// post-incremented by the number of bytes loaded from it.
// Result: v0.8b = 8 Y bytes, v1.s[0] = 4 U bytes, v1.s[1] = 4 V bytes.
#define READYUV422                                                             \
    MEMACCESS(0)                                                               \
    "ld1        {v0.8b}, [%0], #8              \n"                             \
    MEMACCESS(1)                                                               \
    "ld1        {v1.s}[0], [%1], #4            \n"                             \
    MEMACCESS(2)                                                               \
    "ld1        {v1.s}[1], [%2], #4            \n"
29 
// Read 8 Y, 2 U and 2 V from 411.
// Operands: %0 = Y pointer, %1 = U pointer, %2 = V pointer; each pointer is
// post-incremented by the number of bytes loaded from it.
// The 2 U bytes land in v2.h[0] and the 2 V bytes in v2.h[1]; zip1 then
// duplicates every byte so that v1.8b = u0 u0 u1 u1 v0 v0 v1 v1.
// Result: v0.8b = 8 Y bytes, v1.8b = 4 U + 4 V bytes. Overwrites v2.
#define READYUV411                                                             \
    MEMACCESS(0)                                                               \
    "ld1        {v0.8b}, [%0], #8              \n"                             \
    MEMACCESS(1)                                                               \
    "ld1        {v2.h}[0], [%1], #2            \n"                             \
    MEMACCESS(2)                                                               \
    "ld1        {v2.h}[1], [%2], #2            \n"                             \
    "zip1       v1.8b, v2.8b, v2.8b            \n"
39 
// Read 8 Y, 8 U and 8 V from 444.
// Operands: %0 = Y pointer, %1 = U pointer, %2 = V pointer; each pointer is
// post-incremented by 8.
// The 8 U bytes (v1.d[0]) and 8 V bytes (v1.d[1]) are reduced to 4 + 4 by
// summing adjacent byte pairs (uaddlp) and halving with rounding (rshrn #1).
// Result: v0.8b = 8 Y bytes, v1.8b = 4 averaged U + 4 averaged V bytes.
#define READYUV444                                                             \
    MEMACCESS(0)                                                               \
    "ld1        {v0.8b}, [%0], #8              \n"                             \
    MEMACCESS(1)                                                               \
    "ld1        {v1.d}[0], [%1], #8            \n"                             \
    MEMACCESS(2)                                                               \
    "ld1        {v1.d}[1], [%2], #8            \n"                             \
    "uaddlp     v1.8h, v1.16b                  \n"                             \
    "rshrn      v1.8b, v1.8h, #1               \n"
50 
// Read 8 Y, and set 4 U and 4 V to 128.
// Operand: %0 = Y pointer, post-incremented by 8.
// Result: v0.8b = 8 Y bytes, v1.8b = eight bytes of 128.
#define READYUV400                                                             \
    MEMACCESS(0)                                                               \
    "ld1        {v0.8b}, [%0], #8              \n"                             \
    "movi       v1.8b , #128                   \n"
56 
// Read 8 Y and 4 UV from NV12.
// Operands: %0 = Y pointer, %1 = interleaved UV pointer; both are
// post-incremented by 8.
// uzp1 gathers the even-indexed bytes (U) and uzp2 the odd-indexed bytes (V)
// of the 8 interleaved bytes; ins then places the 4 V bytes after the 4 U.
// Result: v0.8b = 8 Y bytes, v1.8b = 4 U + 4 V bytes. Overwrites v2 and v3.
#define READNV12                                                               \
    MEMACCESS(0)                                                               \
    "ld1        {v0.8b}, [%0], #8              \n"                             \
    MEMACCESS(1)                                                               \
    "ld1        {v2.8b}, [%1], #8              \n"                             \
    "uzp1       v1.8b, v2.8b, v2.8b            \n"                             \
    "uzp2       v3.8b, v2.8b, v2.8b            \n"                             \
    "ins        v1.s[1], v3.s[0]               \n"
66 
// Read 8 Y and 4 VU from NV21.
// Operands: %0 = Y pointer, %1 = interleaved VU pointer; both are
// post-incremented by 8.
// Same as READNV12 with the roles swapped: the even-indexed bytes (V) go to
// v3 and the odd-indexed bytes (U) to v1, then V is inserted after U.
// Result: v0.8b = 8 Y bytes, v1.8b = 4 U + 4 V bytes. Overwrites v2 and v3.
#define READNV21                                                               \
    MEMACCESS(0)                                                               \
    "ld1        {v0.8b}, [%0], #8              \n"                             \
    MEMACCESS(1)                                                               \
    "ld1        {v2.8b}, [%1], #8              \n"                             \
    "uzp1       v3.8b, v2.8b, v2.8b            \n"                             \
    "uzp2       v1.8b, v2.8b, v2.8b            \n"                             \
    "ins        v1.s[1], v3.s[0]               \n"
76 
// Read 8 YUY2 pixels (16 bytes).
// Operand: %0 = source pointer, post-incremented by 16.
// ld2 de-interleaves the bytes: even-indexed bytes (Y) into v0 and
// odd-indexed bytes (alternating U, V) into v1. The uzp pair then splits
// U from V and ins packs them back as 4 U followed by 4 V.
// Result: v0.8b = 8 Y bytes, v1.8b = 4 U + 4 V bytes. Overwrites v3.
#define READYUY2                                                               \
    MEMACCESS(0)                                                               \
    "ld2        {v0.8b, v1.8b}, [%0], #16      \n"                             \
    "uzp2       v3.8b, v1.8b, v1.8b            \n"                             \
    "uzp1       v1.8b, v1.8b, v1.8b            \n"                             \
    "ins        v1.s[1], v3.s[0]               \n"
84 
// Read 8 UYVY pixels (16 bytes).
// Operand: %0 = source pointer, post-incremented by 16.
// ld2 de-interleaves the bytes: even-indexed bytes (alternating U, V) into
// v2 and odd-indexed bytes (Y) into v3. Y is copied to v0 (orr with itself)
// before v3 is reused as scratch for the V bytes.
// Result: v0.8b = 8 Y bytes, v1.8b = 4 U + 4 V bytes. Overwrites v2 and v3.
#define READUYVY                                                               \
    MEMACCESS(0)                                                               \
    "ld2        {v2.8b, v3.8b}, [%0], #16      \n"                             \
    "orr        v0.8b, v3.8b, v3.8b            \n"                             \
    "uzp1       v1.8b, v2.8b, v2.8b            \n"                             \
    "uzp2       v3.8b, v2.8b, v2.8b            \n"                             \
    "ins        v1.s[1], v3.s[0]               \n"
93 
// Loads the constants used by YUV422TORGB. Emit once, before the pixel loop.
// Requires the named asm operands [kUVBiasBGR] and [kYToRgb] to hold the
// addresses of the bias and Y-scale tables.
//   v24, v25, v26 = first, second and third 16-bit entries of kUVBiasBGR,
//                   each replicated to all 8 lanes (B, G and R bias).
//   v31           = first 32-bit entry of kYToRgb replicated to all 4 lanes.
//   v27..v30      = 128, 102, 25, 52 in every 16-bit lane (U/V multipliers).
// A single ld3r replaces three post-indexed ld1r loads: it reads the same
// three halfwords but does not write back to the address register, which the
// callers pass as an input-only operand and therefore must not be modified.
#define YUV422TORGB_SETUP_REG                                                  \
    "ld3r       {v24.8h, v25.8h, v26.8h}, [%[kUVBiasBGR]] \n"                  \
    "ld1r       {v31.4s}, [%[kYToRgb]]         \n"                             \
    "movi       v27.8h, #128                   \n"                             \
    "movi       v28.8h, #102                   \n"                             \
    "movi       v29.8h, #25                    \n"                             \
    "movi       v30.8h, #52                    \n"
103 
// Converts 8 pixels of YUV to 8-bit R, G and B.
// Inputs:  v0.8b = 8 Y bytes; v1.8b = 4 U bytes followed by 4 V bytes;
//          constants in v24-v31 as loaded by YUV422TORGB_SETUP_REG.
// Outputs: the registers named by vR, vG and vB (.8b) receive R, G and B.
// Scratch: v0, v1, v2, v3, v5, v6, v7 are overwritten.
// Steps:
//   Y  - widened to 32 bits, multiplied by v31, then narrowed with an
//        unsigned-saturating shift right by 16.
//   UV - every byte is duplicated (u*256 + u), which turns 4 U / 4 V into
//        8 U / 8 V; the halves are then widened to 16 bits.
//   B = v24 + Y + U*v27,  G = v25 + Y - (V*v30 + U*v29),  R = v26 + Y + V*v28
//        using saturating adds/subs, then an unsigned-saturating narrowing
//        shift right by 6.
// The last line deliberately has no line-continuation backslash so the macro
// cannot absorb whatever line follows it.
#define YUV422TORGB(vR, vG, vB)                                                \
    "uxtl       v0.8h, v0.8b                   \n" /* Extract Y    */          \
    "shll       v2.8h, v1.8b, #8               \n" /* Replicate UV */          \
    "ushll2     v3.4s, v0.8h, #0               \n" /* Y */                     \
    "ushll      v0.4s, v0.4h, #0               \n"                             \
    "mul        v3.4s, v3.4s, v31.4s           \n"                             \
    "mul        v0.4s, v0.4s, v31.4s           \n"                             \
    "sqshrun    v0.4h, v0.4s, #16              \n"                             \
    "sqshrun2   v0.8h, v3.4s, #16              \n" /* Y */                     \
    "uaddw      v1.8h, v2.8h, v1.8b            \n" /* Replicate UV */          \
    "mov        v2.d[0], v1.d[1]               \n" /* Extract V */             \
    "uxtl       v2.8h, v2.8b                   \n"                             \
    "uxtl       v1.8h, v1.8b                   \n" /* Extract U */             \
    "mul        v3.8h, v1.8h, v27.8h           \n"                             \
    "mul        v5.8h, v1.8h, v29.8h           \n"                             \
    "mul        v6.8h, v2.8h, v30.8h           \n"                             \
    "mul        v7.8h, v2.8h, v28.8h           \n"                             \
    "sqadd      v6.8h, v6.8h, v5.8h            \n"                             \
    "sqadd      " #vB ".8h, v24.8h, v0.8h      \n" /* B */                     \
    "sqadd      " #vG ".8h, v25.8h, v0.8h      \n" /* G */                     \
    "sqadd      " #vR ".8h, v26.8h, v0.8h      \n" /* R */                     \
    "sqadd      " #vB ".8h, " #vB ".8h, v3.8h  \n" /* B */                     \
    "sqsub      " #vG ".8h, " #vG ".8h, v6.8h  \n" /* G */                     \
    "sqadd      " #vR ".8h, " #vR ".8h, v7.8h  \n" /* R */                     \
    "sqshrun    " #vB ".8b, " #vB ".8h, #6     \n" /* B */                     \
    "sqshrun    " #vG ".8b, " #vG ".8h, #6     \n" /* G */                     \
    "sqshrun    " #vR ".8b, " #vR ".8h, #6     \n" /* R */
131 
132 // YUV to RGB conversion constants.
133 // Y contribution to R,G,B.  Scale and bias.
134 #define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
135 #define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
136 
137 // U and V contributions to R,G,B.
138 #define UB -128 /* -min(128, round(2.018 * 64)) */
139 #define UG 25 /* -round(-0.391 * 64) */
140 #define VG 52 /* -round(-0.813 * 64) */
141 #define VR -102 /* -round(1.596 * 64) */
142 
143 // Bias values to subtract 16 from Y and 128 from U and V.
144 #define BB (UB * 128            - YGB)
145 #define BG (UG * 128 + VG * 128 - YGB)
146 #define BR            (VR * 128 - YGB)
147 
148 static vec16 kUVBiasBGR = { BB, BG, BR, 0, 0, 0, 0, 0 };
149 static vec32 kYToRgb = { 0x0101 * YG, 0, 0, 0 };
150 
151 #undef YG
152 #undef YGB
153 #undef UB
154 #undef UG
155 #undef VG
156 #undef VR
157 #undef BB
158 #undef BG
159 #undef BR
160 
// Loads the RGB-to-UV constants into v20-v25. The first five registers hold
// a small integer in every 16-bit lane; v25 has every byte set to 0x80, so
// each 16-bit lane reads as 0x8080.
#define RGBTOUV_SETUP_REG                                                      \
    "movi       v20.8h, #56, lsl #0  \n"  /* UB/VR coefficient (0.875) / 2 */  \
    "movi       v21.8h, #37, lsl #0  \n"  /* UG coefficient (-0.5781) / 2  */  \
    "movi       v22.8h, #19, lsl #0  \n"  /* UR coefficient (-0.2969) / 2  */  \
    "movi       v23.8h, #9,  lsl #0  \n"  /* VB coefficient (-0.1406) / 2  */  \
    "movi       v24.8h, #47, lsl #0  \n"  /* VG coefficient (-0.7344) / 2  */  \
    "movi       v25.16b, #0x80       \n"  /* 128.5 (0x8080 in 16-bit)      */
168 
169 
#ifdef HAS_I444TOARGBROW_NEON
// Converts a row of planar Y, U and V (one U and one V byte per Y byte) to
// 4-byte pixels stored in the order B, G, R, 255.
// Each loop iteration consumes 8 Y, 8 U and 8 V bytes and writes 32 bytes.
// The loop body always runs once and repeats while the count left after
// subtracting 8 is greater than zero.
// NOTE(review): presumably callers pass a positive multiple of 8 as width,
// since a partial group is still read and written in full - confirm.
void I444ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_argb,
                        int width) {
  asm volatile (
    YUV422TORGB_SETUP_REG
  "1:                                          \n"
    READYUV444
    YUV422TORGB(v22, v21, v20)
    "subs       %w4, %w4, #8                 \n"
    "movi       v23.8b, #255                   \n" /* A */
    MEMACCESS(3)
    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
    "b.gt       1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_argb),  // %3
      "+r"(width)      // %4
    : [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
    // v31 is written by YUV422TORGB_SETUP_REG, so it must be listed too.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
#endif  // HAS_I444TOARGBROW_NEON
198 
#ifdef HAS_I422TOARGBROW_NEON
// Converts a row of planar Y, U and V (one U and one V byte per two Y bytes)
// to 4-byte pixels stored in the order B, G, R, 255.
// Each loop iteration consumes 8 Y, 4 U and 4 V bytes and writes 32 bytes.
// The loop body always runs once and repeats while the count left after
// subtracting 8 is greater than zero.
// NOTE(review): presumably callers pass a positive multiple of 8 as width,
// since a partial group is still read and written in full - confirm.
void I422ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_argb,
                        int width) {
  asm volatile (
    YUV422TORGB_SETUP_REG
  "1:                                          \n"
    READYUV422
    YUV422TORGB(v22, v21, v20)
    "subs       %w4, %w4, #8                   \n"
    "movi       v23.8b, #255                   \n" /* A */
    MEMACCESS(3)
    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
    "b.gt       1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_argb),  // %3
      "+r"(width)      // %4
    : [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
    // v31 is written by YUV422TORGB_SETUP_REG, so it must be listed too.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
#endif  // HAS_I422TOARGBROW_NEON
227 
#ifdef HAS_I411TOARGBROW_NEON
// Converts a row of planar Y, U and V (one U and one V byte per four Y
// bytes) to 4-byte pixels stored in the order B, G, R, 255.
// Each loop iteration consumes 8 Y, 2 U and 2 V bytes and writes 32 bytes.
// The loop body always runs once and repeats while the count left after
// subtracting 8 is greater than zero.
// NOTE(review): presumably callers pass a positive multiple of 8 as width,
// since a partial group is still read and written in full - confirm.
void I411ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_argb,
                        int width) {
  asm volatile (
    YUV422TORGB_SETUP_REG
  "1:                                          \n"
    READYUV411
    YUV422TORGB(v22, v21, v20)
    "subs       %w4, %w4, #8                   \n"
    "movi       v23.8b, #255                   \n" /* A */
    MEMACCESS(3)
    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
    "b.gt       1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_argb),  // %3
      "+r"(width)      // %4
    : [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
    // v31 is written by YUV422TORGB_SETUP_REG, so it must be listed too.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
#endif  // HAS_I411TOARGBROW_NEON
256 
#ifdef HAS_I422TOBGRAROW_NEON
// Converts a row of planar Y, U and V (one U and one V byte per two Y bytes)
// to 4-byte pixels stored in the order 255, R, G, B.
// Each loop iteration consumes 8 Y, 4 U and 4 V bytes and writes 32 bytes.
// The loop body always runs once and repeats while the count left after
// subtracting 8 is greater than zero.
// NOTE(review): presumably callers pass a positive multiple of 8 as width,
// since a partial group is still read and written in full - confirm.
void I422ToBGRARow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_bgra,
                        int width) {
  asm volatile (
    YUV422TORGB_SETUP_REG
  "1:                                          \n"
    READYUV422
    YUV422TORGB(v21, v22, v23)
    "subs       %w4, %w4, #8                   \n"
    "movi       v20.8b, #255                   \n" /* A */
    MEMACCESS(3)
    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
    "b.gt       1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_bgra),  // %3
      "+r"(width)      // %4
    : [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
    // v31 is written by YUV422TORGB_SETUP_REG, so it must be listed too.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
#endif  // HAS_I422TOBGRAROW_NEON
285 
#ifdef HAS_I422TOABGRROW_NEON
// Converts a row of planar Y, U and V (one U and one V byte per two Y bytes)
// to 4-byte pixels stored in the order R, G, B, 255.
// Each loop iteration consumes 8 Y, 4 U and 4 V bytes and writes 32 bytes.
// The loop body always runs once and repeats while the count left after
// subtracting 8 is greater than zero.
// NOTE(review): presumably callers pass a positive multiple of 8 as width,
// since a partial group is still read and written in full - confirm.
void I422ToABGRRow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_abgr,
                        int width) {
  asm volatile (
    YUV422TORGB_SETUP_REG
  "1:                                          \n"
    READYUV422
    YUV422TORGB(v20, v21, v22)
    "subs       %w4, %w4, #8                   \n"
    "movi       v23.8b, #255                   \n" /* A */
    MEMACCESS(3)
    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
    "b.gt       1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_abgr),  // %3
      "+r"(width)      // %4
    : [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
    // v31 is written by YUV422TORGB_SETUP_REG, so it must be listed too.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
#endif  // HAS_I422TOABGRROW_NEON
314 
#ifdef HAS_I422TORGBAROW_NEON
// Converts a row of planar Y, U and V (one U and one V byte per two Y bytes)
// to 4-byte pixels stored in the order 255, B, G, R.
// Each loop iteration consumes 8 Y, 4 U and 4 V bytes and writes 32 bytes.
// The loop body always runs once and repeats while the count left after
// subtracting 8 is greater than zero.
// NOTE(review): presumably callers pass a positive multiple of 8 as width,
// since a partial group is still read and written in full - confirm.
void I422ToRGBARow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_rgba,
                        int width) {
  asm volatile (
    YUV422TORGB_SETUP_REG
  "1:                                          \n"
    READYUV422
    YUV422TORGB(v23, v22, v21)
    "subs       %w4, %w4, #8                   \n"
    "movi       v20.8b, #255                   \n" /* A */
    MEMACCESS(3)
    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
    "b.gt       1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_rgba),  // %3
      "+r"(width)      // %4
    : [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
    // v31 is written by YUV422TORGB_SETUP_REG, so it must be listed too.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
#endif  // HAS_I422TORGBAROW_NEON
343 
#ifdef HAS_I422TORGB24ROW_NEON
// Converts a row of planar Y, U and V (one U and one V byte per two Y bytes)
// to 3-byte pixels stored in the order B, G, R.
// Each loop iteration consumes 8 Y, 4 U and 4 V bytes and writes 24 bytes.
// The loop body always runs once and repeats while the count left after
// subtracting 8 is greater than zero.
// NOTE(review): presumably callers pass a positive multiple of 8 as width,
// since a partial group is still read and written in full - confirm.
void I422ToRGB24Row_NEON(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
                         uint8* dst_rgb24,
                         int width) {
  asm volatile (
    YUV422TORGB_SETUP_REG
  "1:                                          \n"
    READYUV422
    YUV422TORGB(v22, v21, v20)
    "subs       %w4, %w4, #8                   \n"
    MEMACCESS(3)
    "st3        {v20.8b,v21.8b,v22.8b}, [%3], #24     \n"
    "b.gt       1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_rgb24), // %3
      "+r"(width)      // %4
    : [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
    // v31 is written by YUV422TORGB_SETUP_REG, so it must be listed too.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
#endif  // HAS_I422TORGB24ROW_NEON
371 
#ifdef HAS_I422TORAWROW_NEON
// Converts a row of planar Y, U and V (one U and one V byte per two Y bytes)
// to 3-byte pixels stored in the order R, G, B.
// Each loop iteration consumes 8 Y, 4 U and 4 V bytes and writes 24 bytes.
// The loop body always runs once and repeats while the count left after
// subtracting 8 is greater than zero.
// NOTE(review): presumably callers pass a positive multiple of 8 as width,
// since a partial group is still read and written in full - confirm.
void I422ToRAWRow_NEON(const uint8* src_y,
                       const uint8* src_u,
                       const uint8* src_v,
                       uint8* dst_raw,
                       int width) {
  asm volatile (
    YUV422TORGB_SETUP_REG
  "1:                                          \n"
    READYUV422
    YUV422TORGB(v20, v21, v22)
    "subs       %w4, %w4, #8                   \n"
    MEMACCESS(3)
    "st3        {v20.8b,v21.8b,v22.8b}, [%3], #24     \n"
    "b.gt       1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_raw),   // %3
      "+r"(width)      // %4
    : [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
    // v31 is written by YUV422TORGB_SETUP_REG, so it must be listed too.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
#endif  // HAS_I422TORAWROW_NEON
399 
// Packs 8 pixels into 16-bit RGB565.
// Inputs:  v20.8b = B, v21.8b = G, v22.8b = R.
// Output:  v0.8h, with R in bits 15-11, G in bits 10-5 and B in bits 4-0.
// Each channel is first moved to the top byte of a 16-bit lane (shll #8);
// the sri instructions then shift G and B right and insert them below R.
// Overwrites v20 and v21 with their widened values.
#define ARGBTORGB565                                                           \
    "shll       v0.8h,  v22.8b, #8             \n"  /* R                    */ \
    "shll       v20.8h, v20.8b, #8             \n"  /* B                    */ \
    "shll       v21.8h, v21.8b, #8             \n"  /* G                    */ \
    "sri        v0.8h,  v21.8h, #5             \n"  /* RG                   */ \
    "sri        v0.8h,  v20.8h, #11            \n"  /* RGB                  */
406 
#ifdef HAS_I422TORGB565ROW_NEON
// Converts a row of planar Y, U and V (one U and one V byte per two Y bytes)
// to 16-bit RGB565 pixels.
// Each loop iteration consumes 8 Y, 4 U and 4 V bytes and writes 16 bytes.
// The loop body always runs once and repeats while the count left after
// subtracting 8 is greater than zero.
// NOTE(review): presumably callers pass a positive multiple of 8 as width,
// since a partial group is still read and written in full - confirm.
void I422ToRGB565Row_NEON(const uint8* src_y,
                          const uint8* src_u,
                          const uint8* src_v,
                          uint8* dst_rgb565,
                          int width) {
  asm volatile (
    YUV422TORGB_SETUP_REG
  "1:                                          \n"
    READYUV422
    YUV422TORGB(v22, v21, v20)
    "subs       %w4, %w4, #8                   \n"
    ARGBTORGB565
    MEMACCESS(3)
    "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels RGB565.
    "b.gt       1b                             \n"
    : "+r"(src_y),    // %0
      "+r"(src_u),    // %1
      "+r"(src_v),    // %2
      "+r"(dst_rgb565),  // %3
      "+r"(width)     // %4
    : [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
    // v31 is written by YUV422TORGB_SETUP_REG, so it must be listed too.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
#endif  // HAS_I422TORGB565ROW_NEON
435 
// Packs 8 pixels into 16-bit ARGB1555.
// Inputs:  v20.8b = B, v21.8b = G, v22.8b = R, v23.8b = A.
// Output:  v0.8h, with A in bit 15, R in bits 14-10, G in bits 9-5 and B in
//          bits 4-0.
// Each channel is first moved to the top byte of a 16-bit lane (shll #8);
// the sri instructions then shift R, G and B right and insert them below A.
// Overwrites v20, v21 and v22 with their widened values.
#define ARGBTOARGB1555                                                         \
    "shll       v0.8h,  v23.8b, #8             \n"  /* A                    */ \
    "shll       v22.8h, v22.8b, #8             \n"  /* R                    */ \
    "shll       v20.8h, v20.8b, #8             \n"  /* B                    */ \
    "shll       v21.8h, v21.8b, #8             \n"  /* G                    */ \
    "sri        v0.8h,  v22.8h, #1             \n"  /* AR                   */ \
    "sri        v0.8h,  v21.8h, #6             \n"  /* ARG                  */ \
    "sri        v0.8h,  v20.8h, #11            \n"  /* ARGB                 */
444 
#ifdef HAS_I422TOARGB1555ROW_NEON
// Converts a row of planar Y, U and V (one U and one V byte per two Y bytes)
// to 16-bit ARGB1555 pixels; the alpha input is 255, so the alpha bit is set.
// Each loop iteration consumes 8 Y, 4 U and 4 V bytes and writes 16 bytes.
// The loop body always runs once and repeats while the count left after
// subtracting 8 is greater than zero.
// NOTE(review): presumably callers pass a positive multiple of 8 as width,
// since a partial group is still read and written in full - confirm.
void I422ToARGB1555Row_NEON(const uint8* src_y,
                            const uint8* src_u,
                            const uint8* src_v,
                            uint8* dst_argb1555,
                            int width) {
  asm volatile (
    YUV422TORGB_SETUP_REG
  "1:                                          \n"
    READYUV422
    YUV422TORGB(v22, v21, v20)
    "subs       %w4, %w4, #8                   \n"
    "movi       v23.8b, #255                   \n"
    ARGBTOARGB1555
    MEMACCESS(3)
    "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels ARGB1555.
    "b.gt       1b                             \n"
    : "+r"(src_y),    // %0
      "+r"(src_u),    // %1
      "+r"(src_v),    // %2
      "+r"(dst_argb1555),  // %3
      "+r"(width)     // %4
    : [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
    // v31 is written by YUV422TORGB_SETUP_REG, so it must be listed too.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
#endif  // HAS_I422TOARGB1555ROW_NEON
474 
// Packs 8 pixels into 16-bit ARGB4444 (two bytes per pixel).
// B and R are shifted down to the low nibble; G and A keep only their high
// nibble (bic with the 0x0f mask in v4). The B|G bytes and R|A bytes are then
// interleaved by zip1 so each pixel is stored as the byte pair (GB, AR).
// Output: v0 (16 bytes). Overwrites v1 and v20-v23.
#define ARGBTOARGB4444                                                         \
    /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f        */ \
    "ushr       v20.8b, v20.8b, #4             \n"  /* B                    */ \
    "bic        v21.8b, v21.8b, v4.8b          \n"  /* G                    */ \
    "ushr       v22.8b, v22.8b, #4             \n"  /* R                    */ \
    "bic        v23.8b, v23.8b, v4.8b          \n"  /* A                    */ \
    "orr        v0.8b,  v20.8b, v21.8b         \n"  /* BG                   */ \
    "orr        v1.8b,  v22.8b, v23.8b         \n"  /* RA                   */ \
    "zip1       v0.16b, v0.16b, v1.16b         \n"  /* BGRA                 */
484 
#ifdef HAS_I422TOARGB4444ROW_NEON
// Converts a row of planar Y, U and V (one U and one V byte per two Y bytes)
// to 16-bit ARGB4444 pixels with the alpha input set to 255.
// Each loop iteration consumes 8 Y, 4 U and 4 V bytes and writes 16 bytes.
// v4 holds the 0x0f nibble mask for ARGBTOARGB4444; it is loaded once before
// the loop and is not touched by YUV422TORGB.
// The loop body always runs once and repeats while the count left after
// subtracting 8 is greater than zero.
// NOTE(review): presumably callers pass a positive multiple of 8 as width,
// since a partial group is still read and written in full - confirm.
void I422ToARGB4444Row_NEON(const uint8* src_y,
                            const uint8* src_u,
                            const uint8* src_v,
                            uint8* dst_argb4444,
                            int width) {
  asm volatile (
    YUV422TORGB_SETUP_REG
    "movi       v4.16b, #0x0f                  \n"  // bits to clear with vbic.
  "1:                                          \n"
    READYUV422
    YUV422TORGB(v22, v21, v20)
    "subs       %w4, %w4, #8                   \n"
    "movi       v23.8b, #255                   \n"
    ARGBTOARGB4444
    MEMACCESS(3)
    "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels ARGB4444.
    "b.gt       1b                             \n"
    : "+r"(src_y),    // %0
      "+r"(src_u),    // %1
      "+r"(src_v),    // %2
      "+r"(dst_argb4444),  // %3
      "+r"(width)     // %4
    : [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
    // v31 is written by YUV422TORGB_SETUP_REG, so it must be listed too.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
#endif  // HAS_I422TOARGB4444ROW_NEON
515 
#ifdef HAS_I400TOARGBROW_NEON
// Converts a row of Y-only data to 4-byte pixels stored in the order
// B, G, R, 255, running the Y values through the same conversion as the
// colour formats with U and V fixed at 128.
// Each loop iteration consumes 8 Y bytes and writes 32 bytes.
// The loop body always runs once and repeats while the count left after
// subtracting 8 is greater than zero.
// NOTE(review): presumably callers pass a positive multiple of 8 as width,
// since a partial group is still read and written in full - confirm.
void I400ToARGBRow_NEON(const uint8* src_y,
                        uint8* dst_argb,
                        int width) {
  // The counter is only accessed through its 32-bit view (%w2), so the int
  // parameter is used directly, as in the other row functions.
  asm volatile (
    YUV422TORGB_SETUP_REG
  "1:                                          \n"
    READYUV400
    YUV422TORGB(v22, v21, v20)
    "subs       %w2, %w2, #8                   \n"
    "movi       v23.8b, #255                   \n"
    MEMACCESS(1)
    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32     \n"
    "b.gt       1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2
    : [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
    // v31 is written by YUV422TORGB_SETUP_REG, so it must be listed too.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
#endif  // HAS_I400TOARGBROW_NEON
541 
#ifdef HAS_J400TOARGBROW_NEON
// Expands a row of Y-only data to 4-byte pixels by copying each Y byte
// unchanged into the first three bytes of the pixel and storing 255 as the
// fourth. No scaling or bias is applied.
// Each loop iteration consumes 8 Y bytes and writes 32 bytes.
// The loop body always runs once and repeats while the count left after
// subtracting 8 is greater than zero.
// NOTE(review): presumably callers pass a positive multiple of 8 as width,
// since a partial group is still read and written in full - confirm.
void J400ToARGBRow_NEON(const uint8* src_y,
                        uint8* dst_argb,
                        int width) {
  asm volatile (
    // The constant fourth byte is loaded once, outside the loop.
    "movi       v23.8b, #255                   \n"
  "1:                                          \n"
    MEMACCESS(0)
    "ld1        {v20.8b}, [%0], #8             \n"
    // orr of a register with itself is a plain copy.
    "orr        v21.8b, v20.8b, v20.8b         \n"
    "orr        v22.8b, v20.8b, v20.8b         \n"
    "subs       %w2, %w2, #8                   \n"
    MEMACCESS(1)
    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32     \n"
    "b.gt       1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2
    :
    : "cc", "memory", "v20", "v21", "v22", "v23"
  );
}
#endif  // HAS_J400TOARGBROW_NEON
565 
#ifdef HAS_NV12TOARGBROW_NEON
// Converts a row of Y plus interleaved UV (NV12) to 4-byte pixels stored in
// the order B, G, R, 255.
// Each loop iteration consumes 8 Y bytes and 8 UV bytes and writes 32 bytes.
// The loop body always runs once and repeats while the count left after
// subtracting 8 is greater than zero.
// NOTE(review): presumably callers pass a positive multiple of 8 as width,
// since a partial group is still read and written in full - confirm.
void NV12ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_uv,
                        uint8* dst_argb,
                        int width) {
  asm volatile (
    YUV422TORGB_SETUP_REG
  "1:                                          \n"
    READNV12
    YUV422TORGB(v22, v21, v20)
    "subs       %w3, %w3, #8                   \n"
    "movi       v23.8b, #255                   \n"
    MEMACCESS(2)
    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32     \n"
    "b.gt       1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(src_uv),    // %1
      "+r"(dst_argb),  // %2
      "+r"(width)      // %3
    : [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
    // v31 is written by YUV422TORGB_SETUP_REG, so it must be listed too.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
#endif  // HAS_NV12TOARGBROW_NEON
592 
#ifdef HAS_NV21TOARGBROW_NEON
// Converts a row of Y plus interleaved VU (NV21) to 4-byte pixels stored in
// the order B, G, R, 255.
// Each loop iteration consumes 8 Y bytes and 8 VU bytes and writes 32 bytes.
// The loop body always runs once and repeats while the count left after
// subtracting 8 is greater than zero.
// NOTE(review): presumably callers pass a positive multiple of 8 as width,
// since a partial group is still read and written in full - confirm.
void NV21ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_uv,
                        uint8* dst_argb,
                        int width) {
  asm volatile (
    YUV422TORGB_SETUP_REG
  "1:                                          \n"
    READNV21
    YUV422TORGB(v22, v21, v20)
    "subs       %w3, %w3, #8                   \n"
    "movi       v23.8b, #255                   \n"
    MEMACCESS(2)
    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32     \n"
    "b.gt       1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(src_uv),    // %1
      "+r"(dst_argb),  // %2
      "+r"(width)      // %3
    : [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
    // v31 is written by YUV422TORGB_SETUP_REG, so it must be listed too.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
#endif  // HAS_NV21TOARGBROW_NEON
619 
#ifdef HAS_NV12TORGB565ROW_NEON
// Converts a row of Y plus interleaved UV (NV12) to 16-bit RGB565 pixels.
// Each loop iteration consumes 8 Y bytes and 8 UV bytes and writes 16 bytes.
// The loop body always runs once and repeats while the count left after
// subtracting 8 is greater than zero.
// NOTE(review): presumably callers pass a positive multiple of 8 as width,
// since a partial group is still read and written in full - confirm.
void NV12ToRGB565Row_NEON(const uint8* src_y,
                          const uint8* src_uv,
                          uint8* dst_rgb565,
                          int width) {
  asm volatile (
    YUV422TORGB_SETUP_REG
  "1:                                          \n"
    READNV12
    YUV422TORGB(v22, v21, v20)
    "subs       %w3, %w3, #8                   \n"
    ARGBTORGB565
    MEMACCESS(2)
    "st1        {v0.8h}, [%2], #16             \n"  // store 8 pixels RGB565.
    "b.gt       1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(src_uv),    // %1
      "+r"(dst_rgb565),  // %2
      "+r"(width)      // %3
    : [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
    // v31 is written by YUV422TORGB_SETUP_REG, so it must be listed too.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
#endif  // HAS_NV12TORGB565ROW_NEON
646 
#ifdef HAS_NV21TORGB565ROW_NEON
// Converts a row of Y plus interleaved VU (NV21) to 16-bit RGB565 pixels.
// Each loop iteration consumes 8 Y bytes and 8 VU bytes and writes 16 bytes.
// The loop body always runs once and repeats while the count left after
// subtracting 8 is greater than zero.
// NOTE(review): presumably callers pass a positive multiple of 8 as width,
// since a partial group is still read and written in full - confirm.
void NV21ToRGB565Row_NEON(const uint8* src_y,
                          const uint8* src_uv,
                          uint8* dst_rgb565,
                          int width) {
  asm volatile (
    YUV422TORGB_SETUP_REG
  "1:                                          \n"
    READNV21
    YUV422TORGB(v22, v21, v20)
    "subs       %w3, %w3, #8                   \n"
    ARGBTORGB565
    MEMACCESS(2)
    "st1        {v0.8h}, [%2], #16             \n"  // store 8 pixels RGB565.
    "b.gt       1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(src_uv),    // %1
      "+r"(dst_rgb565),  // %2
      "+r"(width)      // %3
    : [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
    // v31 is written by YUV422TORGB_SETUP_REG, so it must be listed too.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
#endif  // HAS_NV21TORGB565ROW_NEON
673 
674 #ifdef HAS_YUY2TOARGBROW_NEON
// Converts one row of YUY2 to ARGB, 8 pixels per loop iteration.
//   src_yuy2 (%0): source row, read through the READYUY2 macro.
//   dst_argb (%1): destination; 32 bytes (8 pixels x 4 bytes) per iteration.
//   width: pixel count; copied into the 64 bit local width64 (%2).
// v20, v21, v22 are stored as the first three bytes of each pixel and v23 is
// set to 255 and stored as the fourth byte.
// The loop body always runs at least once; a width that is not a multiple of
// 8 is effectively rounded up.
// NOTE(review): READYUY2 and YUV422TORGB are defined outside this section.
YUY2ToARGBRow_NEON(const uint8 * src_yuy2,uint8 * dst_argb,int width)675 void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
676                         uint8* dst_argb,
677                         int width) {
678   int64 width64 = (int64)(width);  // only the low 32 bits (%w2) are used.
679   asm volatile (
680     YUV422TORGB_SETUP_REG
681   "1:                                          \n"
682     READYUY2
683     YUV422TORGB(v22, v21, v20)
684     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
685     "movi       v23.8b, #255                   \n"  // fourth byte = 255.
686     MEMACCESS(1)
687     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32      \n"
688     "b.gt       1b                             \n"  // loop while count > 0.
689     : "+r"(src_yuy2),  // %0
690       "+r"(dst_argb),  // %1
691       "+r"(width64)    // %2
692     : [kUVBiasBGR]"r"(&kUVBiasBGR),
693       [kYToRgb]"r"(&kYToRgb)
694     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
695       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
696   );
697 }
698 #endif  // HAS_YUY2TOARGBROW_NEON
699 
700 #ifdef HAS_UYVYTOARGBROW_NEON
// Converts one row of UYVY to ARGB, 8 pixels per loop iteration.
//   src_uyvy (%0): source row, read through the READUYVY macro.
//   dst_argb (%1): destination; 32 bytes (8 pixels x 4 bytes) per iteration.
//   width: pixel count; copied into the 64 bit local width64 (%2).
// v20, v21, v22 are stored as the first three bytes of each pixel and v23 is
// set to 255 and stored as the fourth byte.
// The loop body always runs at least once; a width that is not a multiple of
// 8 is effectively rounded up.
// NOTE(review): READUYVY and YUV422TORGB are defined outside this section.
UYVYToARGBRow_NEON(const uint8 * src_uyvy,uint8 * dst_argb,int width)701 void UYVYToARGBRow_NEON(const uint8* src_uyvy,
702                         uint8* dst_argb,
703                         int width) {
704   int64 width64 = (int64)(width);  // only the low 32 bits (%w2) are used.
705   asm volatile (
706     YUV422TORGB_SETUP_REG
707   "1:                                          \n"
708     READUYVY
709     YUV422TORGB(v22, v21, v20)
710     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
711     "movi       v23.8b, #255                   \n"  // fourth byte = 255.
712     MEMACCESS(1)
713     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32      \n"
714     "b.gt       1b                             \n"  // loop while count > 0.
715     : "+r"(src_uyvy),  // %0
716       "+r"(dst_argb),  // %1
717       "+r"(width64)    // %2
718     : [kUVBiasBGR]"r"(&kUVBiasBGR),
719       [kYToRgb]"r"(&kYToRgb)
720     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
721       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
722   );
723 }
724 #endif  // HAS_UYVYTOARGBROW_NEON
725 
726 // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
727 #ifdef HAS_SPLITUVROW_NEON
// De-interleaves a row of UV byte pairs into separate U and V rows.
//   src_uv (%0): interleaved source; 32 bytes are read per iteration.
//   dst_u (%1):  receives the even bytes, 16 per iteration.
//   dst_v (%2):  receives the odd bytes, 16 per iteration.
//   width (%3):  number of UV pairs, decremented by 16 per iteration.
// The loop body always runs at least once; a width that is not a multiple of
// 16 is effectively rounded up.
SplitUVRow_NEON(const uint8 * src_uv,uint8 * dst_u,uint8 * dst_v,int width)728 void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
729                      int width) {
730   asm volatile (
731   "1:                                          \n"
732     MEMACCESS(0)
733     "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pairs of UV
734     "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
735     MEMACCESS(1)
736     "st1        {v0.16b}, [%1], #16            \n"  // store U
737     MEMACCESS(2)
738     "st1        {v1.16b}, [%2], #16            \n"  // store V
739     "b.gt       1b                             \n"  // loop while count > 0
740     : "+r"(src_uv),  // %0
741       "+r"(dst_u),   // %1
742       "+r"(dst_v),   // %2
743       "+r"(width)    // %3  // Output registers
744     :                       // Input registers
745     : "cc", "memory", "v0", "v1"  // Clobber List
746   );
747 }
748 #endif  // HAS_SPLITUVROW_NEON
749 
750 // Reads 16 U's and V's and writes out 16 pairs of UV.
751 #ifdef HAS_MERGEUVROW_NEON
// Interleaves separate U and V rows into a single row of UV byte pairs.
//   src_u (%0):  source U bytes, 16 read per iteration.
//   src_v (%1):  source V bytes, 16 read per iteration.
//   dst_uv (%2): interleaved destination; 32 bytes written per iteration.
//   width (%3):  number of UV pairs, decremented by 16 per iteration.
// The loop body always runs at least once; a width that is not a multiple of
// 16 is effectively rounded up.
MergeUVRow_NEON(const uint8 * src_u,const uint8 * src_v,uint8 * dst_uv,int width)752 void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
753                      int width) {
754   asm volatile (
755   "1:                                          \n"
756     MEMACCESS(0)
757     "ld1        {v0.16b}, [%0], #16            \n"  // load U
758     MEMACCESS(1)
759     "ld1        {v1.16b}, [%1], #16            \n"  // load V
760     "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
761     MEMACCESS(2)
762     "st2        {v0.16b,v1.16b}, [%2], #32     \n"  // store 16 pairs of UV
763     "b.gt       1b                             \n"  // loop while count > 0
764     :
765       "+r"(src_u),   // %0
766       "+r"(src_v),   // %1
767       "+r"(dst_uv),  // %2
768       "+r"(width)    // %3  // Output registers
769     :                       // Input registers
770     : "cc", "memory", "v0", "v1"  // Clobber List
771   );
772 }
773 #endif  // HAS_MERGEUVROW_NEON
774 
775 // Copy multiple of 32 bytes.  Each loop loads and stores four 8 byte registers with ld1/st1.
776 #ifdef HAS_COPYROW_NEON
// Copies bytes from src to dst, 32 bytes per loop iteration.
//   src (%0):   source pointer, advanced by 32 per iteration.
//   dst (%1):   destination pointer, advanced by 32 per iteration.
//   count (%2): byte count, decremented by 32 per iteration.
// The loop body always runs at least once; a count that is not a multiple of
// 32 is effectively rounded up.
CopyRow_NEON(const uint8 * src,uint8 * dst,int count)777 void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
778   asm volatile (
779   "1:                                          \n"
780     MEMACCESS(0)
781     "ld1        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32       \n"  // load 32
782     "subs       %w2, %w2, #32                  \n"  // 32 processed per loop
783     MEMACCESS(1)
784     "st1        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32       \n"  // store 32
785     "b.gt       1b                             \n"  // loop while count > 0
786   : "+r"(src),   // %0
787     "+r"(dst),   // %1
788     "+r"(count)  // %2  // Output registers
789   :                     // Input registers
790   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
791   );
792 }
793 #endif  // HAS_COPYROW_NEON
794 
795 // SetRow writes 'count' bytes using an 8 bit value repeated.
// Fills dst with the byte value v8, 16 bytes per loop iteration.
//   dst (%0):   destination pointer, advanced by 16 per iteration.
//   v8 (%2):    byte value, replicated into all 16 lanes of v0 before the loop.
//   count (%1): byte count, decremented by 16 per iteration.
// The loop body always runs at least once; a count that is not a multiple of
// 16 is effectively rounded up.
SetRow_NEON(uint8 * dst,uint8 v8,int count)796 void SetRow_NEON(uint8* dst, uint8 v8, int count) {
797   asm volatile (
798     "dup        v0.16b, %w2                    \n"  // duplicate 16 bytes
799   "1:                                          \n"
800     "subs      %w1, %w1, #16                   \n"  // 16 bytes per loop
801     MEMACCESS(0)
802     "st1        {v0.16b}, [%0], #16            \n"  // store
803     "b.gt      1b                              \n"  // loop while count > 0
804   : "+r"(dst),   // %0
805     "+r"(count)  // %1
806   : "r"(v8)      // %2
807   : "cc", "memory", "v0"
808   );
809 }
810 
// Fills dst with the 32 bit value v32, 4 values (16 bytes) per loop iteration.
//   dst (%0):   destination pointer, advanced by 16 bytes per iteration.
//   v32 (%2):   32 bit value, replicated into all 4 lanes of v0 before the loop.
//   count (%1): number of 32 bit values, decremented by 4 per iteration.
// The loop body always runs at least once; a count that is not a multiple of
// 4 is effectively rounded up.
ARGBSetRow_NEON(uint8 * dst,uint32 v32,int count)811 void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
812   asm volatile (
813     "dup        v0.4s, %w2                     \n"  // duplicate 4 ints
814   "1:                                          \n"
815     "subs      %w1, %w1, #4                    \n"  // 4 ints per loop
816     MEMACCESS(0)
817     "st1        {v0.16b}, [%0], #16            \n"  // store
818     "b.gt      1b                              \n"  // loop while count > 0
819   : "+r"(dst),   // %0
820     "+r"(count)  // %1
821   : "r"(v32)     // %2
822   : "cc", "memory", "v0"
823   );
824 }
825 
826 #ifdef HAS_MIRRORROW_NEON
// Writes the bytes of src to dst in reverse order, 16 bytes per iteration.
//   src (%0): source row; moved to src + width - 16 before the loop, then
//             stepped backwards by 16 via the post-index register %3 (-16).
//   dst (%1): destination row, written forwards.
//   width:    byte count; widened to 64 bits (width64, %2) because it is
//             added to the 64 bit src pointer.
// rev64 reverses the bytes inside each 64 bit half; storing the upper half
// before the lower half completes the 16 byte reversal.
// The loop body always runs at least once; a width that is not a multiple of
// 16 is effectively rounded up.
MirrorRow_NEON(const uint8 * src,uint8 * dst,int width)827 void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
828   int64 width64 = (int64) width;
829   asm volatile (
830     // Start at end of source row.
831     "add        %0, %0, %2                     \n"
832     "sub        %0, %0, #16                    \n"
833 
834   "1:                                          \n"
835     MEMACCESS(0)
836     "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
837     "subs       %2, %2, #16                   \n"  // 16 pixels per loop.
838     "rev64      v0.16b, v0.16b                 \n"
839     MEMACCESS(1)
840     "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
841     MEMACCESS(1)
842     "st1        {v0.D}[0], [%1], #8            \n"
843     "b.gt       1b                             \n"
844   : "+r"(src),   // %0
845     "+r"(dst),   // %1
846     "+r"(width64)  // %2
847   : "r"((ptrdiff_t)-16)    // %3
848   : "cc", "memory", "v0"
849   );
850 }
851 #endif  // HAS_MIRRORROW_NEON
852 
853 #ifdef HAS_MIRRORUVROW_NEON
// Reverses a row of interleaved UV pairs and splits it into separate U and V
// rows, 8 pairs per loop iteration.
//   src_uv (%0): interleaved source; moved to src_uv + width * 2 - 16 before
//                the loop, then stepped backwards by 16 via %4 (-16).
//   dst_u (%1):  receives the reversed even bytes, 8 per iteration.
//   dst_v (%2):  receives the reversed odd bytes, 8 per iteration.
//   width:       number of UV pairs; widened to 64 bits (width64, %3) because
//                it is added to the 64 bit source pointer.
// The loop body always runs at least once; a width that is not a multiple of
// 8 is effectively rounded up.
MirrorUVRow_NEON(const uint8 * src_uv,uint8 * dst_u,uint8 * dst_v,int width)854 void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
855                       int width) {
856   int64 width64 = (int64) width;
857   asm volatile (
858     // Start at end of source row.
859     "add        %0, %0, %3, lsl #1             \n"
860     "sub        %0, %0, #16                    \n"
861 
862   "1:                                          \n"
863     MEMACCESS(0)
864     "ld2        {v0.8b, v1.8b}, [%0], %4       \n"  // src -= 16
865     "subs       %3, %3, #8                     \n"  // 8 pixels per loop.
866     "rev64      v0.8b, v0.8b                   \n"
867     "rev64      v1.8b, v1.8b                   \n"
868     MEMACCESS(1)
869     "st1        {v0.8b}, [%1], #8              \n"  // dst += 8
870     MEMACCESS(2)
871     "st1        {v1.8b}, [%2], #8              \n"
872     "b.gt       1b                             \n"
873   : "+r"(src_uv),  // %0
874     "+r"(dst_u),   // %1
875     "+r"(dst_v),   // %2
876     "+r"(width64)    // %3
877   : "r"((ptrdiff_t)-16)      // %4
878   : "cc", "memory", "v0", "v1"
879   );
880 }
881 #endif  // HAS_MIRRORUVROW_NEON
882 
883 #ifdef HAS_ARGBMIRRORROW_NEON
// Writes the 4 byte pixels of src to dst in reverse order, 4 pixels
// (16 bytes) per loop iteration.  Bytes inside each pixel keep their order.
//   src (%0): source row; moved to src + width * 4 - 16 before the loop, then
//             stepped backwards by 16 via the post-index register %3 (-16).
//   dst (%1): destination row, written forwards.
//   width:    pixel count; widened to 64 bits (width64, %2) because it is
//             added to the 64 bit src pointer.
// rev64 on .4s swaps the two 32 bit pixels inside each 64 bit half; storing
// the upper half before the lower half completes the 4 pixel reversal.
// The loop body always runs at least once; a width that is not a multiple of
// 4 is effectively rounded up.
ARGBMirrorRow_NEON(const uint8 * src,uint8 * dst,int width)884 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
885   int64 width64 = (int64) width;
886   asm volatile (
887     // Start at end of source row.
888     "add        %0, %0, %2, lsl #2             \n"
889     "sub        %0, %0, #16                    \n"
890 
891   "1:                                          \n"
892     MEMACCESS(0)
893     "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
894     "subs       %2, %2, #4                     \n"  // 4 pixels per loop.
895     "rev64      v0.4s, v0.4s                   \n"
896     MEMACCESS(1)
897     "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
898     MEMACCESS(1)
899     "st1        {v0.D}[0], [%1], #8            \n"
900     "b.gt       1b                             \n"
901   : "+r"(src),   // %0
902     "+r"(dst),   // %1
903     "+r"(width64)  // %2
904   : "r"((ptrdiff_t)-16)    // %3
905   : "cc", "memory", "v0"
906   );
907 }
908 #endif  // HAS_ARGBMIRRORROW_NEON
909 
910 #ifdef HAS_RGB24TOARGBROW_NEON
// Expands 3 byte RGB24 pixels to 4 byte ARGB pixels, 8 pixels per iteration.
// The three source bytes are copied in the same order and a fourth byte of
// 255 (alpha, held in v4) is appended to each pixel.
//   src_rgb24 (%0): source; 24 bytes read per iteration.
//   dst_argb (%1):  destination; 32 bytes written per iteration.
//   pix (%2):       pixel count, decremented by 8 per iteration.
// The loop body always runs at least once; a pix that is not a multiple of 8
// is effectively rounded up.
RGB24ToARGBRow_NEON(const uint8 * src_rgb24,uint8 * dst_argb,int pix)911 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
912   asm volatile (
913     "movi       v4.8b, #255                    \n"  // Alpha
914   "1:                                          \n"
915     MEMACCESS(0)
916     "ld3        {v1.8b,v2.8b,v3.8b}, [%0], #24 \n"  // load 8 pixels of RGB24.
917     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
918     MEMACCESS(1)
919     "st4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n"  // store 8 ARGB pixels
920     "b.gt       1b                             \n"
921   : "+r"(src_rgb24),  // %0
922     "+r"(dst_argb),   // %1
923     "+r"(pix)         // %2
924   :
925   : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
926   );
927 }
928 #endif  // HAS_RGB24TOARGBROW_NEON
929 
930 #ifdef HAS_RAWTOARGBROW_NEON
// Expands 3 byte RAW pixels to 4 byte ARGB pixels, 8 pixels per iteration.
// The three source bytes are written in reversed order (third, second,
// first) followed by an alpha byte of 255 held in v5.
//   src_raw (%0):  source; 24 bytes read per iteration.
//   dst_argb (%1): destination; 32 bytes written per iteration.
//   pix (%2):      pixel count, decremented by 8 per iteration.
// st4 needs four consecutive registers, so the second and first channels are
// copied into v3 and v4 to follow v2.
// The loop body always runs at least once; a pix that is not a multiple of 8
// is effectively rounded up.
RAWToARGBRow_NEON(const uint8 * src_raw,uint8 * dst_argb,int pix)931 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
932   asm volatile (
933     "movi       v5.8b, #255                    \n"  // Alpha
934   "1:                                          \n"
935     MEMACCESS(0)
936     "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
937     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
938     "orr        v3.8b, v1.8b, v1.8b            \n"  // move g
939     "orr        v4.8b, v0.8b, v0.8b            \n"  // move r
940     MEMACCESS(1)
941     "st4        {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n"  // store b g r a
942     "b.gt       1b                             \n"
943   : "+r"(src_raw),   // %0
944     "+r"(dst_argb),  // %1
945     "+r"(pix)        // %2
946   :
947   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"  // Clobber List
948   );
949 }
950 #endif  // HAS_RAWTOARGBROW_NEON
951 
// Expands 8 RGB565 pixels held in v0.8h to 8 bit channels.
// Each 5 or 6 bit field is shifted to the top of a byte and its upper bits
// are replicated into the vacated lower bits.
// Result (low 8 bytes of each register): v0 = B, v1 = G, v2 = R.
// Scratch registers: v4 and v6.  Alpha is not produced by this macro.
952 #define RGB565TOARGB                                                           \
953     "shrn       v6.8b, v0.8h, #5               \n"  /* G xxGGGGGG           */ \
954     "shl        v6.8b, v6.8b, #2               \n"  /* G GGGGGG00 upper 6   */ \
955     "ushr       v4.8b, v6.8b, #6               \n"  /* G 000000GG lower 2   */ \
956     "orr        v1.8b, v4.8b, v6.8b            \n"  /* G                    */ \
957     "xtn        v2.8b, v0.8h                   \n"  /* B xxxBBBBB           */ \
958     "ushr       v0.8h, v0.8h, #11              \n"  /* R 000RRRRR           */ \
959     "xtn2       v2.16b,v0.8h                   \n"  /* R in upper part      */ \
960     "shl        v2.16b, v2.16b, #3             \n"  /* R,B BBBBB000 upper 5 */ \
961     "ushr       v0.16b, v2.16b, #5             \n"  /* R,B 00000BBB lower 3 */ \
962     "orr        v0.16b, v0.16b, v2.16b         \n"  /* R,B                  */ \
963     "dup        v2.2D, v0.D[1]                 \n"  /* R                    */
964 
965 #ifdef HAS_RGB565TOARGBROW_NEON
// Converts RGB565 pixels to ARGB, 8 pixels per loop iteration.
//   src_rgb565 (%0): source; 16 bytes (8 pixels) read per iteration.
//   dst_argb (%1):   destination; 32 bytes written per iteration.
//   pix (%2):        pixel count, decremented by 8 per iteration.
// RGB565TOARGB leaves B, G, R in v0, v1, v2; v3 is preset to 255 and stored
// as the alpha byte.  v4 and v6 are scratch registers of the macro.
// The loop body always runs at least once; a pix that is not a multiple of 8
// is effectively rounded up.
RGB565ToARGBRow_NEON(const uint8 * src_rgb565,uint8 * dst_argb,int pix)966 void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
967   asm volatile (
968     "movi       v3.8b, #255                    \n"  // Alpha
969   "1:                                          \n"
970     MEMACCESS(0)
971     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
972     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
973     RGB565TOARGB
974     MEMACCESS(1)
975     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
976     "b.gt       1b                             \n"
977   : "+r"(src_rgb565),  // %0
978     "+r"(dst_argb),    // %1
979     "+r"(pix)          // %2
980   :
981   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6"  // Clobber List
982   );
983 }
984 #endif  // HAS_RGB565TOARGBROW_NEON
985 
// Expands 8 ARGB1555 pixels held in v0.8h to 8 bit channels.
// The 5 bit color fields are shifted to the top of a byte and their upper
// 3 bits are replicated into the lower bits; the single alpha bit is
// sign-extended (sshr #15) so it becomes 0x00 or 0xFF.
// Result (low 8 bytes of each register): v0 = B, v1 = G, v2 = R, v3 = A.
// Only v0-v3 are used; all four are overwritten.
986 #define ARGB1555TOARGB                                                         \
987     "ushr       v2.8h, v0.8h, #10              \n"  /* R xxxRRRRR           */ \
988     "shl        v2.8h, v2.8h, #3               \n"  /* R RRRRR000 upper 5   */ \
989     "xtn        v3.8b, v2.8h                   \n"  /* RRRRR000 AAAAAAAA    */ \
990                                                                                \
991     "sshr       v2.8h, v0.8h, #15              \n"  /* A AAAAAAAA           */ \
992     "xtn2       v3.16b, v2.8h                  \n"                             \
993                                                                                \
994     "xtn        v2.8b, v0.8h                   \n"  /* B xxxBBBBB           */ \
995     "shrn2      v2.16b,v0.8h, #5               \n"  /* G xxxGGGGG           */ \
996                                                                                \
997     "ushr       v1.16b, v3.16b, #5             \n"  /* R,A 00000RRR lower 3 */ \
998     "shl        v0.16b, v2.16b, #3             \n"  /* B,G BBBBB000 upper 5 */ \
999     "ushr       v2.16b, v0.16b, #5             \n"  /* B,G 00000BBB lower 3 */ \
1000                                                                                \
1001     "orr        v0.16b, v0.16b, v2.16b         \n"  /* B,G                  */ \
1002     "orr        v2.16b, v1.16b, v3.16b         \n"  /* R,A                  */ \
1003     "dup        v1.2D, v0.D[1]                 \n"                             \
1004     "dup        v3.2D, v2.D[1]                 \n"
1005 
1006 // RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
// Expands 8 RGB555 pixels held in v0.8h to 8 bit channels.
// Each 5 bit field is shifted to the top of a byte and its upper 3 bits are
// replicated into the lower bits.  The top bit of each pixel is discarded.
// Result (low 8 bytes of each register): v0 = B, v1 = G, v2 = R.
// v3 is used as scratch; no alpha value is produced.
// The last line deliberately has no trailing backslash so the macro ends
// there and cannot absorb whatever line follows it.
1007 #define RGB555TOARGB                                                           \
1008     "ushr       v2.8h, v0.8h, #10              \n"  /* R xxxRRRRR           */ \
1009     "shl        v2.8h, v2.8h, #3               \n"  /* R RRRRR000 upper 5   */ \
1010     "xtn        v3.8b, v2.8h                   \n"  /* RRRRR000             */ \
1011                                                                                \
1012     "xtn        v2.8b, v0.8h                   \n"  /* B xxxBBBBB           */ \
1013     "shrn2      v2.16b,v0.8h, #5               \n"  /* G xxxGGGGG           */ \
1014                                                                                \
1015     "ushr       v1.16b, v3.16b, #5             \n"  /* R   00000RRR lower 3 */ \
1016     "shl        v0.16b, v2.16b, #3             \n"  /* B,G BBBBB000 upper 5 */ \
1017     "ushr       v2.16b, v0.16b, #5             \n"  /* B,G 00000BBB lower 3 */ \
1018                                                                                \
1019     "orr        v0.16b, v0.16b, v2.16b         \n"  /* B,G                  */ \
1020     "orr        v2.16b, v1.16b, v3.16b         \n"  /* R                    */ \
1021     "dup        v1.2D, v0.D[1]                 \n"  /* G */
1022 
1023 #ifdef HAS_ARGB1555TOARGBROW_NEON
// Converts ARGB1555 pixels to ARGB, 8 pixels per loop iteration.
//   src_argb1555 (%0): source; 16 bytes (8 pixels) read per iteration.
//   dst_argb (%1):     destination; 32 bytes written per iteration.
//   pix (%2):          pixel count, decremented by 8 per iteration.
// ARGB1555TOARGB leaves B, G, R, A in v0, v1, v2, v3.
// NOTE(review): the macro overwrites v3 on every iteration, so the initial
// "movi v3.8b, #255" does not affect the stored alpha -- confirm it can go.
// The loop body always runs at least once; a pix that is not a multiple of 8
// is effectively rounded up.
ARGB1555ToARGBRow_NEON(const uint8 * src_argb1555,uint8 * dst_argb,int pix)1024 void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
1025                             int pix) {
1026   asm volatile (
1027     "movi       v3.8b, #255                    \n"  // Alpha
1028   "1:                                          \n"
1029     MEMACCESS(0)
1030     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
1031     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1032     ARGB1555TOARGB
1033     MEMACCESS(1)
1034     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
1035     "b.gt       1b                             \n"
1036   : "+r"(src_argb1555),  // %0
1037     "+r"(dst_argb),    // %1
1038     "+r"(pix)          // %2
1039   :
1040   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
1041   );
1042 }
1043 #endif  // HAS_ARGB1555TOARGBROW_NEON
1044 
// Expands 8 ARGB4444 pixels held in v0.8h to 8 bit channels.
// The high byte of each pixel (A and R nibbles) is narrowed into the low half
// of v1 and the low byte (G and B nibbles) into the high half.  Every nibble
// is then duplicated into both halves of its own byte.
// Result (low 8 bytes of each register): v0 = B, v1 = G, v2 = R, v3 = A.
// Only v0-v3 are used; all four are overwritten.
1045 #define ARGB4444TOARGB                                                         \
1046     "shrn       v1.8b,  v0.8h, #8              \n"  /* v1(l) AR             */ \
1047     "xtn2       v1.16b, v0.8h                  \n"  /* v1(h) GB             */ \
1048     "shl        v2.16b, v1.16b, #4             \n"  /* B,R BBBB0000         */ \
1049     "ushr       v3.16b, v1.16b, #4             \n"  /* G,A 0000GGGG         */ \
1050     "ushr       v0.16b, v2.16b, #4             \n"  /* B,R 0000BBBB         */ \
1051     "shl        v1.16b, v3.16b, #4             \n"  /* G,A GGGG0000         */ \
1052     "orr        v2.16b, v0.16b, v2.16b         \n"  /* B,R BBBBBBBB         */ \
1053     "orr        v3.16b, v1.16b, v3.16b         \n"  /* G,A GGGGGGGG         */ \
1054     "dup        v0.2D, v2.D[1]                 \n"                             \
1055     "dup        v1.2D, v3.D[1]                 \n"
1056 
1057 #ifdef HAS_ARGB4444TOARGBROW_NEON
// Converts ARGB4444 pixels to ARGB, 8 pixels per loop iteration.
//   src_argb4444 (%0): source; 16 bytes (8 pixels) read per iteration.
//   dst_argb (%1):     destination; 32 bytes written per iteration.
//   pix (%2):          pixel count, decremented by 8 per iteration.
// ARGB4444TOARGB leaves B, G, R, A in v0, v1, v2, v3.
// NOTE(review): v4 is in the clobber list but is not used by the loop or by
// the macro.
// The loop body always runs at least once; a pix that is not a multiple of 8
// is effectively rounded up.
ARGB4444ToARGBRow_NEON(const uint8 * src_argb4444,uint8 * dst_argb,int pix)1058 void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
1059                             int pix) {
1060   asm volatile (
1061   "1:                                          \n"
1062     MEMACCESS(0)
1063     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
1064     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1065     ARGB4444TOARGB
1066     MEMACCESS(1)
1067     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
1068     "b.gt       1b                             \n"
1069   : "+r"(src_argb4444),  // %0
1070     "+r"(dst_argb),    // %1
1071     "+r"(pix)          // %2
1072   :
1073   : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
1074   );
1075 }
1076 #endif  // HAS_ARGB4444TOARGBROW_NEON
1077 
1078 #ifdef HAS_ARGBTORGB24ROW_NEON
// Packs 4 byte ARGB pixels into 3 byte RGB24 pixels, 8 pixels per iteration.
// The first three bytes of each pixel are kept in order; the fourth byte
// (loaded into v4) is dropped.
//   src_argb (%0):  source; 32 bytes read per iteration.
//   dst_rgb24 (%1): destination; 24 bytes written per iteration.
//   pix (%2):       pixel count, decremented by 8 per iteration.
// The loop body always runs at least once; a pix that is not a multiple of 8
// is effectively rounded up.
ARGBToRGB24Row_NEON(const uint8 * src_argb,uint8 * dst_rgb24,int pix)1079 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
1080   asm volatile (
1081   "1:                                          \n"
1082     MEMACCESS(0)
1083     "ld4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load 8 ARGB pixels
1084     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1085     MEMACCESS(1)
1086     "st3        {v1.8b,v2.8b,v3.8b}, [%1], #24 \n"  // store 8 pixels of RGB24.
1087     "b.gt       1b                             \n"
1088   : "+r"(src_argb),   // %0
1089     "+r"(dst_rgb24),  // %1
1090     "+r"(pix)         // %2
1091   :
1092   : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
1093   );
1094 }
1095 #endif  // HAS_ARGBTORGB24ROW_NEON
1096 
1097 #ifdef HAS_ARGBTORAWROW_NEON
// Packs 4 byte ARGB pixels into 3 byte RAW pixels, 8 pixels per iteration.
// The first three bytes of each pixel are written in reversed order (third,
// second, first); the fourth byte is dropped.
//   src_argb (%0): source; 32 bytes read per iteration.
//   dst_raw (%1):  destination; 24 bytes written per iteration.
//   pix (%2):      pixel count, decremented by 8 per iteration.
// st3 needs three consecutive registers, so g is copied over the loaded
// alpha in v4 and b is copied into v5 to follow r in v3.
// The loop body always runs at least once; a pix that is not a multiple of 8
// is effectively rounded up.
ARGBToRAWRow_NEON(const uint8 * src_argb,uint8 * dst_raw,int pix)1098 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
1099   asm volatile (
1100   "1:                                          \n"
1101     MEMACCESS(0)
1102     "ld4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load b g r a
1103     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1104     "orr        v4.8b, v2.8b, v2.8b            \n"  // mov g
1105     "orr        v5.8b, v1.8b, v1.8b            \n"  // mov b
1106     MEMACCESS(1)
1107     "st3        {v3.8b,v4.8b,v5.8b}, [%1], #24 \n"  // store r g b
1108     "b.gt       1b                             \n"
1109   : "+r"(src_argb),  // %0
1110     "+r"(dst_raw),   // %1
1111     "+r"(pix)        // %2
1112   :
1113   : "cc", "memory", "v1", "v2", "v3", "v4", "v5"  // Clobber List
1114   );
1115 }
1116 #endif  // HAS_ARGBTORAWROW_NEON
1117 
1118 #ifdef HAS_YUY2TOYROW_NEON
// Extracts the Y samples from a YUY2 row, 16 pixels per loop iteration.
// ld2 splits the 32 source bytes into even bytes (v0) and odd bytes (v1);
// the even bytes are stored as Y.
//   src_yuy2 (%0): source; 32 bytes read per iteration.
//   dst_y (%1):    destination; 16 bytes written per iteration.
//   pix (%2):      pixel count, decremented by 16 per iteration.
// The loop body always runs at least once; a pix that is not a multiple of
// 16 is effectively rounded up.
YUY2ToYRow_NEON(const uint8 * src_yuy2,uint8 * dst_y,int pix)1119 void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
1120   asm volatile (
1121   "1:                                          \n"
1122     MEMACCESS(0)
1123     "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pixels of YUY2.
1124     "subs       %w2, %w2, #16                  \n"  // 16 processed per loop.
1125     MEMACCESS(1)
1126     "st1        {v0.16b}, [%1], #16            \n"  // store 16 pixels of Y.
1127     "b.gt       1b                             \n"
1128   : "+r"(src_yuy2),  // %0
1129     "+r"(dst_y),     // %1
1130     "+r"(pix)        // %2
1131   :
1132   : "cc", "memory", "v0", "v1"  // Clobber List
1133   );
1134 }
1135 #endif  // HAS_YUY2TOYROW_NEON
1136 
1137 #ifdef HAS_UYVYTOYROW_NEON
// Extracts the Y samples from a UYVY row, 16 pixels per loop iteration.
// ld2 splits the 32 source bytes into even bytes (v0) and odd bytes (v1);
// the odd bytes are stored as Y.
//   src_uyvy (%0): source; 32 bytes read per iteration.
//   dst_y (%1):    destination; 16 bytes written per iteration.
//   pix (%2):      pixel count, decremented by 16 per iteration.
// The loop body always runs at least once; a pix that is not a multiple of
// 16 is effectively rounded up.
UYVYToYRow_NEON(const uint8 * src_uyvy,uint8 * dst_y,int pix)1138 void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
1139   asm volatile (
1140   "1:                                          \n"
1141     MEMACCESS(0)
1142     "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pixels of UYVY.
1143     "subs       %w2, %w2, #16                  \n"  // 16 processed per loop.
1144     MEMACCESS(1)
1145     "st1        {v1.16b}, [%1], #16            \n"  // store 16 pixels of Y.
1146     "b.gt       1b                             \n"
1147   : "+r"(src_uyvy),  // %0
1148     "+r"(dst_y),     // %1
1149     "+r"(pix)        // %2
1150   :
1151   : "cc", "memory", "v0", "v1"  // Clobber List
1152   );
1153 }
1154 #endif  // HAS_UYVYTOYROW_NEON
1155 
1156 #ifdef HAS_YUY2TOUV422ROW_NEON
// Extracts the U and V samples from a YUY2 row, 16 pixels (8 U and 8 V) per
// loop iteration.  ld4 splits every 4 byte group into v0..v3; byte 1 (v1) is
// stored as U and byte 3 (v3) as V.
//   src_yuy2 (%0): source; 32 bytes read per iteration.
//   dst_u (%1):    destination U; 8 bytes written per iteration.
//   dst_v (%2):    destination V; 8 bytes written per iteration.
//   pix (%3):      pixel count, decremented by 16 per iteration.
// The loop body always runs at least once; a pix that is not a multiple of
// 16 is effectively rounded up.
YUY2ToUV422Row_NEON(const uint8 * src_yuy2,uint8 * dst_u,uint8 * dst_v,int pix)1157 void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
1158                          int pix) {
1159   asm volatile (
1160   "1:                                          \n"
1161     MEMACCESS(0)
1162     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 YUY2 pixels
1163     "subs       %w3, %w3, #16                  \n"  // 16 pixels = 8 UVs.
1164     MEMACCESS(1)
1165     "st1        {v1.8b}, [%1], #8              \n"  // store 8 U.
1166     MEMACCESS(2)
1167     "st1        {v3.8b}, [%2], #8              \n"  // store 8 V.
1168     "b.gt       1b                             \n"
1169   : "+r"(src_yuy2),  // %0
1170     "+r"(dst_u),     // %1
1171     "+r"(dst_v),     // %2
1172     "+r"(pix)        // %3
1173   :
1174   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
1175   );
1176 }
1177 #endif  // HAS_YUY2TOUV422ROW_NEON
1178 
1179 #ifdef HAS_UYVYTOUV422ROW_NEON
// Extracts the U and V samples from a UYVY row, 16 pixels (8 U and 8 V) per
// loop iteration.  ld4 splits every 4 byte group into v0..v3; byte 0 (v0) is
// stored as U and byte 2 (v2) as V.
//   src_uyvy (%0): source; 32 bytes read per iteration.
//   dst_u (%1):    destination U; 8 bytes written per iteration.
//   dst_v (%2):    destination V; 8 bytes written per iteration.
//   pix (%3):      pixel count, decremented by 16 per iteration.
// The loop body always runs at least once; a pix that is not a multiple of
// 16 is effectively rounded up.
UYVYToUV422Row_NEON(const uint8 * src_uyvy,uint8 * dst_u,uint8 * dst_v,int pix)1180 void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
1181                          int pix) {
1182   asm volatile (
1183   "1:                                          \n"
1184     MEMACCESS(0)
1185     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 UYVY pixels
1186     "subs       %w3, %w3, #16                  \n"  // 16 pixels = 8 UVs.
1187     MEMACCESS(1)
1188     "st1        {v0.8b}, [%1], #8              \n"  // store 8 U.
1189     MEMACCESS(2)
1190     "st1        {v2.8b}, [%2], #8              \n"  // store 8 V.
1191     "b.gt       1b                             \n"
1192   : "+r"(src_uyvy),  // %0
1193     "+r"(dst_u),     // %1
1194     "+r"(dst_v),     // %2
1195     "+r"(pix)        // %3
1196   :
1197   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
1198   );
1199 }
1200 #endif  // HAS_UYVYTOUV422ROW_NEON
1201 
1202 #ifdef HAS_YUY2TOUVROW_NEON
// Extracts U and V from two adjacent YUY2 rows and averages them vertically,
// 16 pixels (8 U and 8 V) per loop iteration.
//   src_yuy2 (%0): first source row; 32 bytes read per iteration.
//   stride_yuy2:   byte offset from the first row to the second row
//                  (src_yuy2b, %1).
//   dst_u (%2):    destination U; 8 bytes written per iteration.
//   dst_v (%3):    destination V; 8 bytes written per iteration.
//   pix (%4):      pixel count, decremented by 16 per iteration.
// urhadd computes the rounded average of the two rows for each sample.
// The loop body always runs at least once; a pix that is not a multiple of
// 16 is effectively rounded up.
YUY2ToUVRow_NEON(const uint8 * src_yuy2,int stride_yuy2,uint8 * dst_u,uint8 * dst_v,int pix)1203 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
1204                       uint8* dst_u, uint8* dst_v, int pix) {
1205   const uint8* src_yuy2b = src_yuy2 + stride_yuy2;
1206   asm volatile (
1207   "1:                                          \n"
1208     MEMACCESS(0)
1209     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
1210     "subs       %w4, %w4, #16                  \n"  // 16 pixels = 8 UVs.
1211     MEMACCESS(1)
1212     "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
1213     "urhadd     v1.8b, v1.8b, v5.8b            \n"  // average rows of U
1214     "urhadd     v3.8b, v3.8b, v7.8b            \n"  // average rows of V
1215     MEMACCESS(2)
1216     "st1        {v1.8b}, [%2], #8              \n"  // store 8 U.
1217     MEMACCESS(3)
1218     "st1        {v3.8b}, [%3], #8              \n"  // store 8 V.
1219     "b.gt       1b                             \n"
1220   : "+r"(src_yuy2),     // %0
1221     "+r"(src_yuy2b),    // %1
1222     "+r"(dst_u),        // %2
1223     "+r"(dst_v),        // %3
1224     "+r"(pix)           // %4
1225   :
1226   : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
1227     "v5", "v6", "v7"  // Clobber List
1228   );
1229 }
1230 #endif  // HAS_YUY2TOUVROW_NEON
1231 
1232 #ifdef HAS_UYVYTOUVROW_NEON
// Extracts U and V from two adjacent UYVY rows and averages them vertically,
// 16 pixels (8 U and 8 V) per loop iteration.
//   src_uyvy (%0): first source row; 32 bytes read per iteration.
//   stride_uyvy:   byte offset from the first row to the second row
//                  (src_uyvyb, %1).
//   dst_u (%2):    destination U; 8 bytes written per iteration.
//   dst_v (%3):    destination V; 8 bytes written per iteration.
//   pix (%4):      pixel count, decremented by 16 per iteration.
// urhadd computes the rounded average of the two rows for each sample.
// The loop body always runs at least once; a pix that is not a multiple of
// 16 is effectively rounded up.
UYVYToUVRow_NEON(const uint8 * src_uyvy,int stride_uyvy,uint8 * dst_u,uint8 * dst_v,int pix)1233 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
1234                       uint8* dst_u, uint8* dst_v, int pix) {
1235   const uint8* src_uyvyb = src_uyvy + stride_uyvy;
1236   asm volatile (
1237   "1:                                          \n"
1238     MEMACCESS(0)
1239     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
1240     "subs       %w4, %w4, #16                  \n"  // 16 pixels = 8 UVs.
1241     MEMACCESS(1)
1242     "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
1243     "urhadd     v0.8b, v0.8b, v4.8b            \n"  // average rows of U
1244     "urhadd     v2.8b, v2.8b, v6.8b            \n"  // average rows of V
1245     MEMACCESS(2)
1246     "st1        {v0.8b}, [%2], #8              \n"  // store 8 U.
1247     MEMACCESS(3)
1248     "st1        {v2.8b}, [%3], #8              \n"  // store 8 V.
1249     "b.gt       1b                             \n"
1250   : "+r"(src_uyvy),     // %0
1251     "+r"(src_uyvyb),    // %1
1252     "+r"(dst_u),        // %2
1253     "+r"(dst_v),        // %3
1254     "+r"(pix)           // %4
1255   :
1256   : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
1257     "v5", "v6", "v7"  // Clobber List
1258   );
1259 }
1260 #endif  // HAS_UYVYTOUVROW_NEON
1261 
1262 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
1263 #ifdef HAS_ARGBSHUFFLEROW_NEON
// Reorders the bytes of 4 byte pixels with a 16 entry lookup table, 4 pixels
// (16 bytes) per loop iteration.
//   src_argb (%0): source; 16 bytes read per iteration.
//   dst_argb (%1): destination; 16 bytes written per iteration.
//   shuffler (%3): 16 byte index table, loaded once into v2 before the loop.
//                  Output byte i is source byte shuffler[i] of the same
//                  16 byte group (tbl).
//   pix (%2):      pixel count, decremented by 4 per iteration.
// The loop body always runs at least once; a pix that is not a multiple of 4
// is effectively rounded up.
ARGBShuffleRow_NEON(const uint8 * src_argb,uint8 * dst_argb,const uint8 * shuffler,int pix)1264 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
1265                          const uint8* shuffler, int pix) {
1266   asm volatile (
1267     MEMACCESS(3)
1268     "ld1        {v2.16b}, [%3]                 \n"  // shuffler
1269   "1:                                          \n"
1270     MEMACCESS(0)
1271     "ld1        {v0.16b}, [%0], #16            \n"  // load 4 pixels.
1272     "subs       %w2, %w2, #4                   \n"  // 4 processed per loop
1273     "tbl        v1.16b, {v0.16b}, v2.16b       \n"  // look up 4 pixels
1274     MEMACCESS(1)
1275     "st1        {v1.16b}, [%1], #16            \n"  // store 4.
1276     "b.gt       1b                             \n"
1277   : "+r"(src_argb),  // %0
1278     "+r"(dst_argb),  // %1
1279     "+r"(pix)        // %2
1280   : "r"(shuffler)    // %3
1281   : "cc", "memory", "v0", "v1", "v2"  // Clobber List
1282   );
1283 }
1284 #endif  // HAS_ARGBSHUFFLEROW_NEON
1285 
1286 #ifdef HAS_I422TOYUY2ROW_NEON
// Interleaves separate Y, U and V rows into YUY2, 16 pixels per iteration.
// ld2 splits 16 Ys into even (v0) and odd (v1) samples; the odd samples are
// moved to v2 so v1 can take U and v3 can take V.  st4 then writes each
// 4 byte group as Y(even), U, Y(odd), V.
//   src_y (%0):    source Y; 16 bytes read per iteration.
//   src_u (%1):    source U; 8 bytes read per iteration.
//   src_v (%2):    source V; 8 bytes read per iteration.
//   dst_yuy2 (%3): destination; 32 bytes written per iteration.
//   width (%4):    pixel count, decremented by 16 per iteration.
// The loop body always runs at least once; a width that is not a multiple of
// 16 is effectively rounded up.
I422ToYUY2Row_NEON(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,uint8 * dst_yuy2,int width)1287 void I422ToYUY2Row_NEON(const uint8* src_y,
1288                         const uint8* src_u,
1289                         const uint8* src_v,
1290                         uint8* dst_yuy2, int width) {
1291   asm volatile (
1292   "1:                                          \n"
1293     MEMACCESS(0)
1294     "ld2        {v0.8b, v1.8b}, [%0], #16      \n"  // load 16 Ys
1295     "orr        v2.8b, v1.8b, v1.8b            \n"  // move odd Ys to v2
1296     MEMACCESS(1)
1297     "ld1        {v1.8b}, [%1], #8              \n"  // load 8 Us
1298     MEMACCESS(2)
1299     "ld1        {v3.8b}, [%2], #8              \n"  // load 8 Vs
1300     "subs       %w4, %w4, #16                  \n"  // 16 pixels
1301     MEMACCESS(3)
1302     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
1303     "b.gt       1b                             \n"
1304   : "+r"(src_y),     // %0
1305     "+r"(src_u),     // %1
1306     "+r"(src_v),     // %2
1307     "+r"(dst_yuy2),  // %3
1308     "+r"(width)      // %4
1309   :
1310   : "cc", "memory", "v0", "v1", "v2", "v3"
1311   );
1312 }
1313 #endif  // HAS_I422TOYUY2ROW_NEON
1314 
1315 #ifdef HAS_I422TOUYVYROW_NEON
// Interleaves separate Y, U and V rows into UYVY, 16 pixels per iteration.
// ld2 splits 16 Ys into even (v1) and odd (v2) samples; the odd samples are
// moved to v3 so v0 can take U and v2 can take V.  st4 then writes each
// 4 byte group as U, Y(even), V, Y(odd).
//   src_y (%0):    source Y; 16 bytes read per iteration.
//   src_u (%1):    source U; 8 bytes read per iteration.
//   src_v (%2):    source V; 8 bytes read per iteration.
//   dst_uyvy (%3): destination; 32 bytes written per iteration.
//   width (%4):    pixel count, decremented by 16 per iteration.
// The loop body always runs at least once; a width that is not a multiple of
// 16 is effectively rounded up.
I422ToUYVYRow_NEON(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,uint8 * dst_uyvy,int width)1316 void I422ToUYVYRow_NEON(const uint8* src_y,
1317                         const uint8* src_u,
1318                         const uint8* src_v,
1319                         uint8* dst_uyvy, int width) {
1320   asm volatile (
1321   "1:                                          \n"
1322     MEMACCESS(0)
1323     "ld2        {v1.8b,v2.8b}, [%0], #16       \n"  // load 16 Ys
1324     "orr        v3.8b, v2.8b, v2.8b            \n"  // move odd Ys to v3
1325     MEMACCESS(1)
1326     "ld1        {v0.8b}, [%1], #8              \n"  // load 8 Us
1327     MEMACCESS(2)
1328     "ld1        {v2.8b}, [%2], #8              \n"  // load 8 Vs
1329     "subs       %w4, %w4, #16                  \n"  // 16 pixels
1330     MEMACCESS(3)
1331     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
1332     "b.gt       1b                             \n"
1333   : "+r"(src_y),     // %0
1334     "+r"(src_u),     // %1
1335     "+r"(src_v),     // %2
1336     "+r"(dst_uyvy),  // %3
1337     "+r"(width)      // %4
1338   :
1339   : "cc", "memory", "v0", "v1", "v2", "v3"
1340   );
1341 }
1342 #endif  // HAS_I422TOUYVYROW_NEON
1343 
1344 #ifdef HAS_ARGBTORGB565ROW_NEON
// Converts ARGB pixels to RGB565, 8 pixels per loop iteration.
//   src_argb (%0):   source; 32 bytes are de-interleaved into v20..v23 per
//                    iteration.
//   dst_rgb565 (%1): destination; 16 bytes (v0) written per iteration.
//   pix (%2):        pixel count, decremented by 8 per iteration.
// The loop body always runs at least once; a pix that is not a multiple of 8
// is effectively rounded up.
// NOTE(review): ARGBTORGB565 is defined outside this section; presumably it
// packs v20/v21/v22 into v0 -- confirm against the macro definition.
ARGBToRGB565Row_NEON(const uint8 * src_argb,uint8 * dst_rgb565,int pix)1345 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
1346   asm volatile (
1347   "1:                                          \n"
1348     MEMACCESS(0)
1349     "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
1350     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1351     ARGBTORGB565
1352     MEMACCESS(1)
1353     "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels RGB565.
1354     "b.gt       1b                             \n"
1355   : "+r"(src_argb),  // %0
1356     "+r"(dst_rgb565),  // %1
1357     "+r"(pix)        // %2
1358   :
1359   : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
1360   );
1361 }
1362 #endif  // HAS_ARGBTORGB565ROW_NEON
1363 
#ifdef HAS_ARGBTORGB565DITHERROW_NEON
// Converts ARGB to RGB565 with ordered dithering, 8 pixels per loop iteration.
//
//   src_argb  source pixels, 4 bytes each; advanced by the asm.
//   dst_rgb   destination, 2 bytes per pixel; advanced by the asm.
//   dither4   four dither bytes packed in one 32-bit word. The word is
//             broadcast to every 32-bit lane of v1, so byte (i % 4) is added
//             to pixel i of each group.
//   width     pixel count; counted down by the asm.
//
// The dither byte is added with unsigned saturation to the first three
// de-interleaved channels (v20-v22); v23 is left untouched. Packing is done
// by the ARGBTORGB565 macro (defined elsewhere in this file), which is
// expected to leave the result in v0. The loop is bottom-tested, so at least
// one group of 8 pixels is always processed.
//
// src_argb and width are modified inside the asm, so they must be declared
// as read-write ("+r") operands; declaring them input-only would let the
// compiler assume their registers are unchanged afterwards.
void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
                                const uint32 dither4, int width) {
  asm volatile (
    "dup        v1.4s, %w3                     \n"  // dither4
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    "uqadd      v20.8b, v20.8b, v1.8b          \n"  // dither channel 0
    "uqadd      v21.8b, v21.8b, v1.8b          \n"  // dither channel 1
    "uqadd      v22.8b, v22.8b, v1.8b          \n"  // dither channel 2
    ARGBTORGB565
    MEMACCESS(1)
    "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels RGB565.
    "b.gt       1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_rgb),   // %1
    "+r"(width)      // %2
  : "r"(dither4)     // %3
  : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"
  );
}
#endif  // HAS_ARGBTORGB565DITHERROW_NEON
1388 
1389 #ifdef HAS_ARGBTOARGB1555ROW_NEON
ARGBToARGB1555Row_NEON(const uint8 * src_argb,uint8 * dst_argb1555,int pix)1390 void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
1391                             int pix) {
1392   asm volatile (
1393   "1:                                          \n"
1394     MEMACCESS(0)
1395     "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
1396     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1397     ARGBTOARGB1555
1398     MEMACCESS(1)
1399     "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels ARGB1555.
1400     "b.gt       1b                             \n"
1401   : "+r"(src_argb),  // %0
1402     "+r"(dst_argb1555),  // %1
1403     "+r"(pix)        // %2
1404   :
1405   : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
1406   );
1407 }
1408 #endif  // HAS_ARGBTOARGB1555ROW_NEON
1409 
1410 #ifdef HAS_ARGBTOARGB4444ROW_NEON
ARGBToARGB4444Row_NEON(const uint8 * src_argb,uint8 * dst_argb4444,int pix)1411 void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
1412                             int pix) {
1413   asm volatile (
1414     "movi       v4.16b, #0x0f                  \n"  // bits to clear with vbic.
1415   "1:                                          \n"
1416     MEMACCESS(0)
1417     "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
1418     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1419     ARGBTOARGB4444
1420     MEMACCESS(1)
1421     "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels ARGB4444.
1422     "b.gt       1b                             \n"
1423   : "+r"(src_argb),      // %0
1424     "+r"(dst_argb4444),  // %1
1425     "+r"(pix)            // %2
1426   :
1427   : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"
1428   );
1429 }
1430 #endif  // HAS_ARGBTOARGB4444ROW_NEON
1431 
1432 #ifdef HAS_ARGBTOYROW_NEON
ARGBToYRow_NEON(const uint8 * src_argb,uint8 * dst_y,int pix)1433 void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
1434   asm volatile (
1435     "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
1436     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
1437     "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
1438     "movi       v7.8b, #16                     \n"  // Add 16 constant
1439   "1:                                          \n"
1440     MEMACCESS(0)
1441     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
1442     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1443     "umull      v3.8h, v0.8b, v4.8b            \n"  // B
1444     "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
1445     "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
1446     "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
1447     "uqadd      v0.8b, v0.8b, v7.8b            \n"
1448     MEMACCESS(1)
1449     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
1450     "b.gt       1b                             \n"
1451   : "+r"(src_argb),  // %0
1452     "+r"(dst_y),     // %1
1453     "+r"(pix)        // %2
1454   :
1455   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
1456   );
1457 }
1458 #endif  // HAS_ARGBTOYROW_NEON
1459 
1460 #ifdef HAS_ARGBTOYJROW_NEON
ARGBToYJRow_NEON(const uint8 * src_argb,uint8 * dst_y,int pix)1461 void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
1462   asm volatile (
1463     "movi       v4.8b, #15                     \n"  // B * 0.11400 coefficient
1464     "movi       v5.8b, #75                     \n"  // G * 0.58700 coefficient
1465     "movi       v6.8b, #38                     \n"  // R * 0.29900 coefficient
1466   "1:                                          \n"
1467     MEMACCESS(0)
1468     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
1469     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1470     "umull      v3.8h, v0.8b, v4.8b            \n"  // B
1471     "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
1472     "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
1473     "sqrshrun   v0.8b, v3.8h, #7               \n"  // 15 bit to 8 bit Y
1474     MEMACCESS(1)
1475     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
1476     "b.gt       1b                             \n"
1477   : "+r"(src_argb),  // %0
1478     "+r"(dst_y),     // %1
1479     "+r"(pix)        // %2
1480   :
1481   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
1482   );
1483 }
1484 #endif  // HAS_ARGBTOYJROW_NEON
1485 
1486 // 8x1 pixels.
1487 #ifdef HAS_ARGBTOUV444ROW_NEON
ARGBToUV444Row_NEON(const uint8 * src_argb,uint8 * dst_u,uint8 * dst_v,int pix)1488 void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
1489                          int pix) {
1490   asm volatile (
1491     "movi       v24.8b, #112                   \n"  // UB / VR 0.875 coefficient
1492     "movi       v25.8b, #74                    \n"  // UG -0.5781 coefficient
1493     "movi       v26.8b, #38                    \n"  // UR -0.2969 coefficient
1494     "movi       v27.8b, #18                    \n"  // VB -0.1406 coefficient
1495     "movi       v28.8b, #94                    \n"  // VG -0.7344 coefficient
1496     "movi       v29.16b,#0x80                  \n"  // 128.5
1497   "1:                                          \n"
1498     MEMACCESS(0)
1499     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
1500     "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
1501     "umull      v4.8h, v0.8b, v24.8b           \n"  // B
1502     "umlsl      v4.8h, v1.8b, v25.8b           \n"  // G
1503     "umlsl      v4.8h, v2.8b, v26.8b           \n"  // R
1504     "add        v4.8h, v4.8h, v29.8h           \n"  // +128 -> unsigned
1505 
1506     "umull      v3.8h, v2.8b, v24.8b           \n"  // R
1507     "umlsl      v3.8h, v1.8b, v28.8b           \n"  // G
1508     "umlsl      v3.8h, v0.8b, v27.8b           \n"  // B
1509     "add        v3.8h, v3.8h, v29.8h           \n"  // +128 -> unsigned
1510 
1511     "uqshrn     v0.8b, v4.8h, #8               \n"  // 16 bit to 8 bit U
1512     "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
1513 
1514     MEMACCESS(1)
1515     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U.
1516     MEMACCESS(2)
1517     "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V.
1518     "b.gt       1b                             \n"
1519   : "+r"(src_argb),  // %0
1520     "+r"(dst_u),     // %1
1521     "+r"(dst_v),     // %2
1522     "+r"(pix)        // %3
1523   :
1524   : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
1525     "v24", "v25", "v26", "v27", "v28", "v29"
1526   );
1527 }
1528 #endif  // HAS_ARGBTOUV444ROW_NEON
1529 
1530 // 16x1 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
1531 #ifdef HAS_ARGBTOUV422ROW_NEON
ARGBToUV422Row_NEON(const uint8 * src_argb,uint8 * dst_u,uint8 * dst_v,int pix)1532 void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
1533                          int pix) {
1534   asm volatile (
1535     RGBTOUV_SETUP_REG
1536   "1:                                          \n"
1537     MEMACCESS(0)
1538     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
1539 
1540     "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
1541     "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
1542     "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
1543 
1544     "subs       %w3, %w3, #16                  \n"  // 16 processed per loop.
1545     "mul        v3.8h, v0.8h, v20.8h           \n"  // B
1546     "mls        v3.8h, v1.8h, v21.8h           \n"  // G
1547     "mls        v3.8h, v2.8h, v22.8h           \n"  // R
1548     "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
1549 
1550     "mul        v4.8h, v2.8h, v20.8h           \n"  // R
1551     "mls        v4.8h, v1.8h, v24.8h           \n"  // G
1552     "mls        v4.8h, v0.8h, v23.8h           \n"  // B
1553     "add        v4.8h, v4.8h, v25.8h           \n"  // +128 -> unsigned
1554 
1555     "uqshrn     v0.8b, v3.8h, #8               \n"  // 16 bit to 8 bit U
1556     "uqshrn     v1.8b, v4.8h, #8               \n"  // 16 bit to 8 bit V
1557 
1558     MEMACCESS(1)
1559     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U.
1560     MEMACCESS(2)
1561     "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V.
1562     "b.gt       1b                             \n"
1563   : "+r"(src_argb),  // %0
1564     "+r"(dst_u),     // %1
1565     "+r"(dst_v),     // %2
1566     "+r"(pix)        // %3
1567   :
1568   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1569     "v20", "v21", "v22", "v23", "v24", "v25"
1570   );
1571 }
1572 #endif  // HAS_ARGBTOUV422ROW_NEON
1573 
1574 // 32x1 pixels -> 8x1.  pix is number of argb pixels. e.g. 32.
1575 #ifdef HAS_ARGBTOUV411ROW_NEON
ARGBToUV411Row_NEON(const uint8 * src_argb,uint8 * dst_u,uint8 * dst_v,int pix)1576 void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
1577                          int pix) {
1578   asm volatile (
1579     RGBTOUV_SETUP_REG
1580   "1:                                          \n"
1581     MEMACCESS(0)
1582     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
1583     "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
1584     "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
1585     "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
1586     MEMACCESS(0)
1587     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n"  // load next 16.
1588     "uaddlp     v4.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
1589     "uaddlp     v5.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
1590     "uaddlp     v6.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.
1591 
1592     "addp       v0.8h, v0.8h, v4.8h            \n"  // B 16 shorts -> 8 shorts.
1593     "addp       v1.8h, v1.8h, v5.8h            \n"  // G 16 shorts -> 8 shorts.
1594     "addp       v2.8h, v2.8h, v6.8h            \n"  // R 16 shorts -> 8 shorts.
1595 
1596     "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
1597     "urshr      v1.8h, v1.8h, #1               \n"
1598     "urshr      v2.8h, v2.8h, #1               \n"
1599 
1600     "subs       %w3, %w3, #32                  \n"  // 32 processed per loop.
1601     "mul        v3.8h, v0.8h, v20.8h           \n"  // B
1602     "mls        v3.8h, v1.8h, v21.8h           \n"  // G
1603     "mls        v3.8h, v2.8h, v22.8h           \n"  // R
1604     "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
1605     "mul        v4.8h, v2.8h, v20.8h           \n"  // R
1606     "mls        v4.8h, v1.8h, v24.8h           \n"  // G
1607     "mls        v4.8h, v0.8h, v23.8h           \n"  // B
1608     "add        v4.8h, v4.8h, v25.8h           \n"  // +128 -> unsigned
1609     "uqshrn     v0.8b, v3.8h, #8               \n"  // 16 bit to 8 bit U
1610     "uqshrn     v1.8b, v4.8h, #8               \n"  // 16 bit to 8 bit V
1611     MEMACCESS(1)
1612     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U.
1613     MEMACCESS(2)
1614     "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V.
1615     "b.gt       1b                             \n"
1616   : "+r"(src_argb),  // %0
1617     "+r"(dst_u),     // %1
1618     "+r"(dst_v),     // %2
1619     "+r"(pix)        // %3
1620   :
1621   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1622     "v20", "v21", "v22", "v23", "v24", "v25"
1623   );
1624 }
1625 #endif  // HAS_ARGBTOUV411ROW_NEON
1626 
1627 // 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
// Shared U/V kernel for the two-row RGB-to-UV functions. BLUE, GREEN and RED
// name the registers (e.g. v0.8h) holding 8 lanes of 16-bit channel values.
// Coefficients are read from v20-v24 and the bias from v25; v3 and v4 are
// scratch. On exit v0.8b holds 8 U bytes and v1.8b holds 8 V bytes. The two
// narrowing stores come last, so the inputs may alias v0/v1.
#define RGBTOUV(BLUE, GREEN, RED) \
    "mul        v3.8h, " #BLUE ",v20.8h        \n"  /* U  = B * v20          */ \
    "mul        v4.8h, " #RED ",v20.8h         \n"  /* V  = R * v20          */ \
    "mls        v3.8h, " #GREEN ",v21.8h       \n"  /* U -= G * v21          */ \
    "mls        v4.8h, " #GREEN ",v24.8h       \n"  /* V -= G * v24          */ \
    "mls        v3.8h, " #RED ",v22.8h         \n"  /* U -= R * v22          */ \
    "mls        v4.8h, " #BLUE ",v23.8h        \n"  /* V -= B * v23          */ \
    "add        v3.8h, v3.8h, v25.8h           \n"  /* U += bias             */ \
    "add        v4.8h, v4.8h, v25.8h           \n"  /* V += bias             */ \
    "uqshrn     v0.8b, v3.8h, #8               \n"  /* U: keep high byte     */ \
    "uqshrn     v1.8b, v4.8h, #8               \n"  /* V: keep high byte     */
1639 
1640 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
1641 // TODO(fbarchard): consider ptrdiff_t for all strides.
1642 
1643 #ifdef HAS_ARGBTOUVROW_NEON
ARGBToUVRow_NEON(const uint8 * src_argb,int src_stride_argb,uint8 * dst_u,uint8 * dst_v,int pix)1644 void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
1645                       uint8* dst_u, uint8* dst_v, int pix) {
1646   const uint8* src_argb_1 = src_argb + src_stride_argb;
1647   asm volatile (
1648     RGBTOUV_SETUP_REG
1649   "1:                                          \n"
1650     MEMACCESS(0)
1651     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
1652     "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
1653     "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
1654     "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
1655 
1656     MEMACCESS(1)
1657     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
1658     "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
1659     "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
1660     "uadalp     v2.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.
1661 
1662     "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
1663     "urshr      v1.8h, v1.8h, #1               \n"
1664     "urshr      v2.8h, v2.8h, #1               \n"
1665 
1666     "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
1667     RGBTOUV(v0.8h, v1.8h, v2.8h)
1668     MEMACCESS(2)
1669     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1670     MEMACCESS(3)
1671     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1672     "b.gt       1b                             \n"
1673   : "+r"(src_argb),  // %0
1674     "+r"(src_argb_1),  // %1
1675     "+r"(dst_u),     // %2
1676     "+r"(dst_v),     // %3
1677     "+r"(pix)        // %4
1678   :
1679   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1680     "v20", "v21", "v22", "v23", "v24", "v25"
1681   );
1682 }
1683 #endif  // HAS_ARGBTOUVROW_NEON
1684 
1685 // TODO(fbarchard): Subsample match C code.
1686 #ifdef HAS_ARGBTOUVJROW_NEON
ARGBToUVJRow_NEON(const uint8 * src_argb,int src_stride_argb,uint8 * dst_u,uint8 * dst_v,int pix)1687 void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
1688                        uint8* dst_u, uint8* dst_v, int pix) {
1689   const uint8* src_argb_1 = src_argb + src_stride_argb;
1690   asm volatile (
1691     "movi       v20.8h, #63, lsl #0            \n"  // UB/VR coeff (0.500) / 2
1692     "movi       v21.8h, #42, lsl #0            \n"  // UG coeff (-0.33126) / 2
1693     "movi       v22.8h, #21, lsl #0            \n"  // UR coeff (-0.16874) / 2
1694     "movi       v23.8h, #10, lsl #0            \n"  // VB coeff (-0.08131) / 2
1695     "movi       v24.8h, #53, lsl #0            \n"  // VG coeff (-0.41869) / 2
1696     "movi       v25.16b, #0x80                 \n"  // 128.5 (0x8080 in 16-bit)
1697   "1:                                          \n"
1698     MEMACCESS(0)
1699     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
1700     "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
1701     "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
1702     "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
1703     MEMACCESS(1)
1704     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64  \n"  // load next 16
1705     "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
1706     "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
1707     "uadalp     v2.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.
1708 
1709     "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
1710     "urshr      v1.8h, v1.8h, #1               \n"
1711     "urshr      v2.8h, v2.8h, #1               \n"
1712 
1713     "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
1714     RGBTOUV(v0.8h, v1.8h, v2.8h)
1715     MEMACCESS(2)
1716     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1717     MEMACCESS(3)
1718     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1719     "b.gt       1b                             \n"
1720   : "+r"(src_argb),  // %0
1721     "+r"(src_argb_1),  // %1
1722     "+r"(dst_u),     // %2
1723     "+r"(dst_v),     // %3
1724     "+r"(pix)        // %4
1725   :
1726   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1727     "v20", "v21", "v22", "v23", "v24", "v25"
1728   );
1729 }
1730 #endif  // HAS_ARGBTOUVJROW_NEON
1731 
1732 #ifdef HAS_BGRATOUVROW_NEON
BGRAToUVRow_NEON(const uint8 * src_bgra,int src_stride_bgra,uint8 * dst_u,uint8 * dst_v,int pix)1733 void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
1734                       uint8* dst_u, uint8* dst_v, int pix) {
1735   const uint8* src_bgra_1 = src_bgra + src_stride_bgra;
1736   asm volatile (
1737     RGBTOUV_SETUP_REG
1738   "1:                                          \n"
1739     MEMACCESS(0)
1740     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
1741     "uaddlp     v0.8h, v3.16b                  \n"  // B 16 bytes -> 8 shorts.
1742     "uaddlp     v3.8h, v2.16b                  \n"  // G 16 bytes -> 8 shorts.
1743     "uaddlp     v2.8h, v1.16b                  \n"  // R 16 bytes -> 8 shorts.
1744     MEMACCESS(1)
1745     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more
1746     "uadalp     v0.8h, v7.16b                  \n"  // B 16 bytes -> 8 shorts.
1747     "uadalp     v3.8h, v6.16b                  \n"  // G 16 bytes -> 8 shorts.
1748     "uadalp     v2.8h, v5.16b                  \n"  // R 16 bytes -> 8 shorts.
1749 
1750     "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
1751     "urshr      v1.8h, v3.8h, #1               \n"
1752     "urshr      v2.8h, v2.8h, #1               \n"
1753 
1754     "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
1755     RGBTOUV(v0.8h, v1.8h, v2.8h)
1756     MEMACCESS(2)
1757     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1758     MEMACCESS(3)
1759     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1760     "b.gt       1b                             \n"
1761   : "+r"(src_bgra),  // %0
1762     "+r"(src_bgra_1),  // %1
1763     "+r"(dst_u),     // %2
1764     "+r"(dst_v),     // %3
1765     "+r"(pix)        // %4
1766   :
1767   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1768     "v20", "v21", "v22", "v23", "v24", "v25"
1769   );
1770 }
1771 #endif  // HAS_BGRATOUVROW_NEON
1772 
1773 #ifdef HAS_ABGRTOUVROW_NEON
ABGRToUVRow_NEON(const uint8 * src_abgr,int src_stride_abgr,uint8 * dst_u,uint8 * dst_v,int pix)1774 void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
1775                       uint8* dst_u, uint8* dst_v, int pix) {
1776   const uint8* src_abgr_1 = src_abgr + src_stride_abgr;
1777   asm volatile (
1778     RGBTOUV_SETUP_REG
1779   "1:                                          \n"
1780     MEMACCESS(0)
1781     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
1782     "uaddlp     v3.8h, v2.16b                  \n"  // B 16 bytes -> 8 shorts.
1783     "uaddlp     v2.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
1784     "uaddlp     v1.8h, v0.16b                  \n"  // R 16 bytes -> 8 shorts.
1785     MEMACCESS(1)
1786     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
1787     "uadalp     v3.8h, v6.16b                  \n"  // B 16 bytes -> 8 shorts.
1788     "uadalp     v2.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
1789     "uadalp     v1.8h, v4.16b                  \n"  // R 16 bytes -> 8 shorts.
1790 
1791     "urshr      v0.8h, v3.8h, #1               \n"  // 2x average
1792     "urshr      v2.8h, v2.8h, #1               \n"
1793     "urshr      v1.8h, v1.8h, #1               \n"
1794 
1795     "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
1796     RGBTOUV(v0.8h, v2.8h, v1.8h)
1797     MEMACCESS(2)
1798     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1799     MEMACCESS(3)
1800     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1801     "b.gt       1b                             \n"
1802   : "+r"(src_abgr),  // %0
1803     "+r"(src_abgr_1),  // %1
1804     "+r"(dst_u),     // %2
1805     "+r"(dst_v),     // %3
1806     "+r"(pix)        // %4
1807   :
1808   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1809     "v20", "v21", "v22", "v23", "v24", "v25"
1810   );
1811 }
1812 #endif  // HAS_ABGRTOUVROW_NEON
1813 
1814 #ifdef HAS_RGBATOUVROW_NEON
RGBAToUVRow_NEON(const uint8 * src_rgba,int src_stride_rgba,uint8 * dst_u,uint8 * dst_v,int pix)1815 void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
1816                       uint8* dst_u, uint8* dst_v, int pix) {
1817   const uint8* src_rgba_1 = src_rgba + src_stride_rgba;
1818   asm volatile (
1819     RGBTOUV_SETUP_REG
1820   "1:                                          \n"
1821     MEMACCESS(0)
1822     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
1823     "uaddlp     v0.8h, v1.16b                  \n"  // B 16 bytes -> 8 shorts.
1824     "uaddlp     v1.8h, v2.16b                  \n"  // G 16 bytes -> 8 shorts.
1825     "uaddlp     v2.8h, v3.16b                  \n"  // R 16 bytes -> 8 shorts.
1826     MEMACCESS(1)
1827     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
1828     "uadalp     v0.8h, v5.16b                  \n"  // B 16 bytes -> 8 shorts.
1829     "uadalp     v1.8h, v6.16b                  \n"  // G 16 bytes -> 8 shorts.
1830     "uadalp     v2.8h, v7.16b                  \n"  // R 16 bytes -> 8 shorts.
1831 
1832     "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
1833     "urshr      v1.8h, v1.8h, #1               \n"
1834     "urshr      v2.8h, v2.8h, #1               \n"
1835 
1836     "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
1837     RGBTOUV(v0.8h, v1.8h, v2.8h)
1838     MEMACCESS(2)
1839     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1840     MEMACCESS(3)
1841     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1842     "b.gt       1b                             \n"
1843   : "+r"(src_rgba),  // %0
1844     "+r"(src_rgba_1),  // %1
1845     "+r"(dst_u),     // %2
1846     "+r"(dst_v),     // %3
1847     "+r"(pix)        // %4
1848   :
1849   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1850     "v20", "v21", "v22", "v23", "v24", "v25"
1851   );
1852 }
1853 #endif  // HAS_RGBATOUVROW_NEON
1854 
1855 #ifdef HAS_RGB24TOUVROW_NEON
RGB24ToUVRow_NEON(const uint8 * src_rgb24,int src_stride_rgb24,uint8 * dst_u,uint8 * dst_v,int pix)1856 void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
1857                        uint8* dst_u, uint8* dst_v, int pix) {
1858   const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
1859   asm volatile (
1860     RGBTOUV_SETUP_REG
1861   "1:                                          \n"
1862     MEMACCESS(0)
1863     "ld3        {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 pixels.
1864     "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
1865     "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
1866     "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
1867     MEMACCESS(1)
1868     "ld3        {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 16 more.
1869     "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
1870     "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
1871     "uadalp     v2.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.
1872 
1873     "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
1874     "urshr      v1.8h, v1.8h, #1               \n"
1875     "urshr      v2.8h, v2.8h, #1               \n"
1876 
1877     "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
1878     RGBTOUV(v0.8h, v1.8h, v2.8h)
1879     MEMACCESS(2)
1880     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1881     MEMACCESS(3)
1882     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1883     "b.gt       1b                             \n"
1884   : "+r"(src_rgb24),  // %0
1885     "+r"(src_rgb24_1),  // %1
1886     "+r"(dst_u),     // %2
1887     "+r"(dst_v),     // %3
1888     "+r"(pix)        // %4
1889   :
1890   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1891     "v20", "v21", "v22", "v23", "v24", "v25"
1892   );
1893 }
1894 #endif  // HAS_RGB24TOUVROW_NEON
1895 
1896 #ifdef HAS_RAWTOUVROW_NEON
RAWToUVRow_NEON(const uint8 * src_raw,int src_stride_raw,uint8 * dst_u,uint8 * dst_v,int pix)1897 void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
1898                      uint8* dst_u, uint8* dst_v, int pix) {
1899   const uint8* src_raw_1 = src_raw + src_stride_raw;
1900   asm volatile (
1901     RGBTOUV_SETUP_REG
1902   "1:                                          \n"
1903     MEMACCESS(0)
1904     "ld3        {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 8 RAW pixels.
1905     "uaddlp     v2.8h, v2.16b                  \n"  // B 16 bytes -> 8 shorts.
1906     "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
1907     "uaddlp     v0.8h, v0.16b                  \n"  // R 16 bytes -> 8 shorts.
1908     MEMACCESS(1)
1909     "ld3        {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 8 more RAW pixels
1910     "uadalp     v2.8h, v6.16b                  \n"  // B 16 bytes -> 8 shorts.
1911     "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
1912     "uadalp     v0.8h, v4.16b                  \n"  // R 16 bytes -> 8 shorts.
1913 
1914     "urshr      v2.8h, v2.8h, #1               \n"  // 2x average
1915     "urshr      v1.8h, v1.8h, #1               \n"
1916     "urshr      v0.8h, v0.8h, #1               \n"
1917 
1918     "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
1919     RGBTOUV(v2.8h, v1.8h, v0.8h)
1920     MEMACCESS(2)
1921     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1922     MEMACCESS(3)
1923     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1924     "b.gt       1b                             \n"
1925   : "+r"(src_raw),  // %0
1926     "+r"(src_raw_1),  // %1
1927     "+r"(dst_u),     // %2
1928     "+r"(dst_v),     // %3
1929     "+r"(pix)        // %4
1930   :
1931   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1932     "v20", "v21", "v22", "v23", "v24", "v25"
1933   );
1934 }
1935 #endif  // HAS_RAWTOUVROW_NEON
1936 
1937 // 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
1938 #ifdef HAS_RGB565TOUVROW_NEON
// Produces one row of U and one row of V from two rows of RGB565 pixels.
// The second source row starts src_stride_rgb565 bytes after src_rgb565.
// Each loop iteration reads 16 pixels (32 bytes) from each row, sums every
// 2x2 block per channel, and writes 8 U bytes to dst_u and 8 V bytes to dst_v.
// Per output sample, in wrapping 16-bit arithmetic, with b/g/r being the
// rounded half of the 2x2 sum (i.e. 2x the block average):
//   U = sat8((56 * b - 37 * g - 19 * r + 0x8080) >> 8)
//   V = sat8((56 * r - 47 * g -  9 * b + 0x8080) >> 8)
// pix is reduced by 16 per iteration and the loop repeats while the result
// is > 0; the body always runs at least once.
// NOTE(review): RGB565TOARGB is defined outside this block; judging by the
// comments below it leaves B, G, R in v0, v1, v2 -- confirm against the macro.
RGB565ToUVRow_NEON(const uint8 * src_rgb565,int src_stride_rgb565,uint8 * dst_u,uint8 * dst_v,int pix)1939 void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
1940                         uint8* dst_u, uint8* dst_v, int pix) {
1941   const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
1942   asm volatile (
1943     "movi       v22.8h, #56, lsl #0            \n"  // UB / VR coeff (0.875) / 2
1944     "movi       v23.8h, #37, lsl #0            \n"  // UG coeff (-0.5781) / 2
1945     "movi       v24.8h, #19, lsl #0            \n"  // UR coeff (-0.2969) / 2
1946     "movi       v25.8h, #9 , lsl #0            \n"  // VB coeff (-0.1406) / 2
1947     "movi       v26.8h, #47, lsl #0            \n"  // VG coeff (-0.7344) / 2
1948     "movi       v27.16b, #0x80                 \n"  // 128.5 (0x8080 in 16-bit)
1949   "1:                                          \n"
1950     MEMACCESS(0)
1951     "ld1        {v0.16b}, [%0], #16            \n"  // row 0: load 8 RGB565 pixels.
1952     RGB565TOARGB
1953     "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 pair sums.
1954     "uaddlp     v18.4h, v1.8b                  \n"  // G 8 bytes -> 4 pair sums.
1955     "uaddlp     v20.4h, v2.8b                  \n"  // R 8 bytes -> 4 pair sums.
1956     MEMACCESS(0)
1957     "ld1        {v0.16b}, [%0], #16            \n"  // row 0: next 8 RGB565 pixels.
1958     RGB565TOARGB
1959     "uaddlp     v17.4h, v0.8b                  \n"  // B 8 bytes -> 4 pair sums.
1960     "uaddlp     v19.4h, v1.8b                  \n"  // G 8 bytes -> 4 pair sums.
1961     "uaddlp     v21.4h, v2.8b                  \n"  // R 8 bytes -> 4 pair sums.
1962 
1963     MEMACCESS(1)
1964     "ld1        {v0.16b}, [%1], #16            \n"  // row 1: load 8 RGB565 pixels.
1965     RGB565TOARGB
1966     "uadalp     v16.4h, v0.8b                  \n"  // B: add row 1 pair sums.
1967     "uadalp     v18.4h, v1.8b                  \n"  // G: add row 1 pair sums.
1968     "uadalp     v20.4h, v2.8b                  \n"  // R: add row 1 pair sums.
1969     MEMACCESS(1)
1970     "ld1        {v0.16b}, [%1], #16            \n"  // row 1: next 8 RGB565 pixels.
1971     RGB565TOARGB
1972     "uadalp     v17.4h, v0.8b                  \n"  // B: add row 1 pair sums.
1973     "uadalp     v19.4h, v1.8b                  \n"  // G: add row 1 pair sums.
1974     "uadalp     v21.4h, v2.8b                  \n"  // R: add row 1 pair sums.
1975 
1976     "ins        v16.D[1], v17.D[0]             \n"  // B: merge both halves into 8 shorts.
1977     "ins        v18.D[1], v19.D[0]             \n"  // G: merge both halves into 8 shorts.
1978     "ins        v20.D[1], v21.D[0]             \n"  // R: merge both halves into 8 shorts.
1979 
1980     "urshr      v4.8h, v16.8h, #1              \n"  // 2x average (2x2 sum / 2, rounded)
1981     "urshr      v5.8h, v18.8h, #1              \n"
1982     "urshr      v6.8h, v20.8h, #1              \n"
1983 
1984     "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
1985     "mul        v16.8h, v4.8h, v22.8h          \n"  // B
1986     "mls        v16.8h, v5.8h, v23.8h          \n"  // G
1987     "mls        v16.8h, v6.8h, v24.8h          \n"  // R
1988     "add        v16.8h, v16.8h, v27.8h         \n"  // +128 -> unsigned
1989     "mul        v17.8h, v6.8h, v22.8h          \n"  // R
1990     "mls        v17.8h, v5.8h, v26.8h          \n"  // G
1991     "mls        v17.8h, v4.8h, v25.8h          \n"  // B
1992     "add        v17.8h, v17.8h, v27.8h         \n"  // +128 -> unsigned
1993     "uqshrn     v0.8b, v16.8h, #8              \n"  // 16 bit to 8 bit U
1994     "uqshrn     v1.8b, v17.8h, #8              \n"  // 16 bit to 8 bit V
1995     MEMACCESS(2)
1996     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1997     MEMACCESS(3)
1998     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1999     "b.gt       1b                             \n"  // flags from subs above: loop while pix > 0.
2000   : "+r"(src_rgb565),  // %0
2001     "+r"(src_rgb565_1),  // %1
2002     "+r"(dst_u),     // %2
2003     "+r"(dst_v),     // %3
2004     "+r"(pix)        // %4
2005   :
2006   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2007     "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
2008     "v25", "v26", "v27"
2009   );
2010 }
2011 #endif  // HAS_RGB565TOUVROW_NEON
2012 
2013 // 16x2 pixels -> 8x1.  pix is number of ARGB1555 pixels. e.g. 16.
2014 #ifdef HAS_ARGB1555TOUVROW_NEON
// Produces one row of U and one row of V from two rows of ARGB1555 pixels.
// The second source row starts src_stride_argb1555 bytes after src_argb1555.
// Each loop iteration reads 16 pixels (32 bytes) from each row, sums every
// 2x2 block per channel, and writes 8 U bytes to dst_u and 8 V bytes to dst_v.
// pix is reduced by 16 per iteration and the loop repeats while the result
// is > 0; the body always runs at least once.
// NOTE(review): v20-v25 are read below but never written in this block;
// presumably RGBTOUV_SETUP_REG (defined elsewhere) loads the U/V coefficients
// and the 0x8080 bias into them -- confirm against the macro.
// NOTE(review): RGB555TOARGB is defined elsewhere; judging by the comments
// below it leaves B, G, R in v0, v1, v2 -- confirm against the macro.
ARGB1555ToUVRow_NEON(const uint8 * src_argb1555,int src_stride_argb1555,uint8 * dst_u,uint8 * dst_v,int pix)2015 void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
2016                         uint8* dst_u, uint8* dst_v, int pix) {
2017   const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
2018   asm volatile (
2019     RGBTOUV_SETUP_REG
2020   "1:                                          \n"
2021     MEMACCESS(0)
2022     "ld1        {v0.16b}, [%0], #16            \n"  // row 0: load 8 ARGB1555 pixels.
2023     RGB555TOARGB
2024     "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 pair sums.
2025     "uaddlp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 pair sums.
2026     "uaddlp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 pair sums.
2027     MEMACCESS(0)
2028     "ld1        {v0.16b}, [%0], #16            \n"  // row 0: next 8 ARGB1555 pixels.
2029     RGB555TOARGB
2030     "uaddlp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 pair sums.
2031     "uaddlp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 pair sums.
2032     "uaddlp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 pair sums.
2033 
2034     MEMACCESS(1)
2035     "ld1        {v0.16b}, [%1], #16            \n"  // row 1: load 8 ARGB1555 pixels.
2036     RGB555TOARGB
2037     "uadalp     v16.4h, v0.8b                  \n"  // B: add row 1 pair sums.
2038     "uadalp     v17.4h, v1.8b                  \n"  // G: add row 1 pair sums.
2039     "uadalp     v18.4h, v2.8b                  \n"  // R: add row 1 pair sums.
2040     MEMACCESS(1)
2041     "ld1        {v0.16b}, [%1], #16            \n"  // row 1: next 8 ARGB1555 pixels.
2042     RGB555TOARGB
2043     "uadalp     v26.4h, v0.8b                  \n"  // B: add row 1 pair sums.
2044     "uadalp     v27.4h, v1.8b                  \n"  // G: add row 1 pair sums.
2045     "uadalp     v28.4h, v2.8b                  \n"  // R: add row 1 pair sums.
2046 
2047     "ins        v16.D[1], v26.D[0]             \n"  // B: merge both halves into 8 shorts.
2048     "ins        v17.D[1], v27.D[0]             \n"  // G: merge both halves into 8 shorts.
2049     "ins        v18.D[1], v28.D[0]             \n"  // R: merge both halves into 8 shorts.
2050 
2051     "urshr      v4.8h, v16.8h, #1              \n"  // 2x average (2x2 sum / 2, rounded)
2052     "urshr      v5.8h, v17.8h, #1              \n"
2053     "urshr      v6.8h, v18.8h, #1              \n"
2054 
2055     "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
2056     "mul        v2.8h, v4.8h, v20.8h           \n"  // B
2057     "mls        v2.8h, v5.8h, v21.8h           \n"  // G
2058     "mls        v2.8h, v6.8h, v22.8h           \n"  // R
2059     "add        v2.8h, v2.8h, v25.8h           \n"  // +128 -> unsigned
2060     "mul        v3.8h, v6.8h, v20.8h           \n"  // R
2061     "mls        v3.8h, v5.8h, v24.8h           \n"  // G
2062     "mls        v3.8h, v4.8h, v23.8h           \n"  // B
2063     "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
2064     "uqshrn     v0.8b, v2.8h, #8               \n"  // 16 bit to 8 bit U
2065     "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
2066     MEMACCESS(2)
2067     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
2068     MEMACCESS(3)
2069     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
2070     "b.gt       1b                             \n"  // flags from subs above: loop while pix > 0.
2071   : "+r"(src_argb1555),  // %0
2072     "+r"(src_argb1555_1),  // %1
2073     "+r"(dst_u),     // %2
2074     "+r"(dst_v),     // %3
2075     "+r"(pix)        // %4
2076   :
2077   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
2078     "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
2079     "v26", "v27", "v28"
2080   );
2081 }
2082 #endif  // HAS_ARGB1555TOUVROW_NEON
2083 
2084 // 16x2 pixels -> 8x1.  pix is number of ARGB4444 pixels. e.g. 16.
2085 #ifdef HAS_ARGB4444TOUVROW_NEON
// Produces one row of U and one row of V from two rows of ARGB4444 pixels.
// The second source row starts src_stride_argb4444 bytes after src_argb4444.
// Each loop iteration reads 16 pixels (32 bytes) from each row, sums every
// 2x2 block per channel, and writes 8 U bytes to dst_u and 8 V bytes to dst_v.
// pix is reduced by 16 per iteration and the loop repeats while the result
// is > 0; the body always runs at least once.
// NOTE(review): v20-v25 are read below but never written in this block;
// presumably RGBTOUV_SETUP_REG (defined elsewhere) loads the U/V coefficients
// and the 0x8080 bias into them -- confirm against the macro.
// NOTE(review): ARGB4444TOARGB is defined elsewhere; judging by the comments
// below it leaves B, G, R in v0, v1, v2 -- confirm against the macro.
ARGB4444ToUVRow_NEON(const uint8 * src_argb4444,int src_stride_argb4444,uint8 * dst_u,uint8 * dst_v,int pix)2086 void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
2087                           uint8* dst_u, uint8* dst_v, int pix) {
2088   const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
2089   asm volatile (
2090     RGBTOUV_SETUP_REG
2091   "1:                                          \n"
2092     MEMACCESS(0)
2093     "ld1        {v0.16b}, [%0], #16            \n"  // row 0: load 8 ARGB4444 pixels.
2094     ARGB4444TOARGB
2095     "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 pair sums.
2096     "uaddlp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 pair sums.
2097     "uaddlp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 pair sums.
2098     MEMACCESS(0)
2099     "ld1        {v0.16b}, [%0], #16            \n"  // row 0: next 8 ARGB4444 pixels.
2100     ARGB4444TOARGB
2101     "uaddlp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 pair sums.
2102     "uaddlp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 pair sums.
2103     "uaddlp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 pair sums.
2104 
2105     MEMACCESS(1)
2106     "ld1        {v0.16b}, [%1], #16            \n"  // row 1: load 8 ARGB4444 pixels.
2107     ARGB4444TOARGB
2108     "uadalp     v16.4h, v0.8b                  \n"  // B: add row 1 pair sums.
2109     "uadalp     v17.4h, v1.8b                  \n"  // G: add row 1 pair sums.
2110     "uadalp     v18.4h, v2.8b                  \n"  // R: add row 1 pair sums.
2111     MEMACCESS(1)
2112     "ld1        {v0.16b}, [%1], #16            \n"  // row 1: next 8 ARGB4444 pixels.
2113     ARGB4444TOARGB
2114     "uadalp     v26.4h, v0.8b                  \n"  // B: add row 1 pair sums.
2115     "uadalp     v27.4h, v1.8b                  \n"  // G: add row 1 pair sums.
2116     "uadalp     v28.4h, v2.8b                  \n"  // R: add row 1 pair sums.
2117 
2118     "ins        v16.D[1], v26.D[0]             \n"  // B: merge both halves into 8 shorts.
2119     "ins        v17.D[1], v27.D[0]             \n"  // G: merge both halves into 8 shorts.
2120     "ins        v18.D[1], v28.D[0]             \n"  // R: merge both halves into 8 shorts.
2121 
2122     "urshr      v4.8h, v16.8h, #1              \n"  // 2x average (2x2 sum / 2, rounded)
2123     "urshr      v5.8h, v17.8h, #1              \n"
2124     "urshr      v6.8h, v18.8h, #1              \n"
2125 
2126     "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
2127     "mul        v2.8h, v4.8h, v20.8h           \n"  // B
2128     "mls        v2.8h, v5.8h, v21.8h           \n"  // G
2129     "mls        v2.8h, v6.8h, v22.8h           \n"  // R
2130     "add        v2.8h, v2.8h, v25.8h           \n"  // +128 -> unsigned
2131     "mul        v3.8h, v6.8h, v20.8h           \n"  // R
2132     "mls        v3.8h, v5.8h, v24.8h           \n"  // G
2133     "mls        v3.8h, v4.8h, v23.8h           \n"  // B
2134     "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
2135     "uqshrn     v0.8b, v2.8h, #8               \n"  // 16 bit to 8 bit U
2136     "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
2137     MEMACCESS(2)
2138     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
2139     MEMACCESS(3)
2140     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
2141     "b.gt       1b                             \n"  // flags from subs above: loop while pix > 0.
2142   : "+r"(src_argb4444),  // %0
2143     "+r"(src_argb4444_1),  // %1
2144     "+r"(dst_u),     // %2
2145     "+r"(dst_v),     // %3
2146     "+r"(pix)        // %4
2147   :
2148   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
2149     "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
2150     "v26", "v27", "v28"
2151 
2152   );
2153 }
2154 #endif  // HAS_ARGB4444TOUVROW_NEON
2155 
2156 #ifdef HAS_RGB565TOYROW_NEON
// Converts a row of RGB565 pixels to 8-bit luma, 8 pixels (16 source bytes)
// per iteration:
//   Y = sat8(sat8((13 * B + 65 * G + 33 * R + 64) >> 7) + 16)
// pix is reduced by 8 per iteration and the loop repeats while the result
// is > 0; the body always runs at least once.
// NOTE(review): RGB565TOARGB is defined elsewhere; judging by the comments
// below it leaves B, G, R in v0, v1, v2. v4 and v6 appear only in the clobber
// list, so they are presumably scratch registers of that macro -- confirm.
RGB565ToYRow_NEON(const uint8 * src_rgb565,uint8 * dst_y,int pix)2157 void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
2158   asm volatile (
2159     "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient
2160     "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient
2161     "movi       v26.8b, #33                    \n"  // R * 0.2578 coefficient
2162     "movi       v27.8b, #16                    \n"  // Add 16 constant
2163   "1:                                          \n"
2164     MEMACCESS(0)
2165     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
2166     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2167     RGB565TOARGB
2168     "umull      v3.8h, v0.8b, v24.8b           \n"  // B
2169     "umlal      v3.8h, v1.8b, v25.8b           \n"  // G
2170     "umlal      v3.8h, v2.8b, v26.8b           \n"  // R
2171     "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y (rounded >> 7)
2172     "uqadd      v0.8b, v0.8b, v27.8b           \n"  // + 16, saturating
2173     MEMACCESS(1)
2174     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
2175     "b.gt       1b                             \n"  // flags from subs above: loop while pix > 0.
2176   : "+r"(src_rgb565),  // %0
2177     "+r"(dst_y),       // %1
2178     "+r"(pix)          // %2
2179   :
2180   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6",
2181     "v24", "v25", "v26", "v27"
2182   );
2183 }
2184 #endif  // HAS_RGB565TOYROW_NEON
2185 
2186 #ifdef HAS_ARGB1555TOYROW_NEON
// Converts a row of ARGB1555 pixels to 8-bit luma, 8 pixels (16 source bytes)
// per iteration:
//   Y = sat8(sat8((13 * B + 65 * G + 33 * R + 64) >> 7) + 16)
// pix is reduced by 8 per iteration and the loop repeats while the result
// is > 0; the body always runs at least once.
// NOTE(review): ARGB1555TOARGB is defined elsewhere; judging by the comments
// below it leaves B, G, R in v0, v1, v2. The coefficients live in v4-v7 for
// the whole loop, so the macro must not write those registers -- confirm.
ARGB1555ToYRow_NEON(const uint8 * src_argb1555,uint8 * dst_y,int pix)2187 void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
2188   asm volatile (
2189     "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
2190     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
2191     "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
2192     "movi       v7.8b, #16                     \n"  // Add 16 constant
2193   "1:                                          \n"
2194     MEMACCESS(0)
2195     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
2196     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2197     ARGB1555TOARGB
2198     "umull      v3.8h, v0.8b, v4.8b            \n"  // B
2199     "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
2200     "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
2201     "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y (rounded >> 7)
2202     "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating
2203     MEMACCESS(1)
2204     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
2205     "b.gt       1b                             \n"  // flags from subs above: loop while pix > 0.
2206   : "+r"(src_argb1555),  // %0
2207     "+r"(dst_y),         // %1
2208     "+r"(pix)            // %2
2209   :
2210   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
2211   );
2212 }
2213 #endif  // HAS_ARGB1555TOYROW_NEON
2214 
2215 #ifdef HAS_ARGB4444TOYROW_NEON
// Converts a row of ARGB4444 pixels to 8-bit luma, 8 pixels (16 source bytes)
// per iteration:
//   Y = sat8(sat8((13 * B + 65 * G + 33 * R + 64) >> 7) + 16)
// pix is reduced by 8 per iteration and the loop repeats while the result
// is > 0; the body always runs at least once.
// NOTE(review): ARGB4444TOARGB is defined elsewhere; judging by the comments
// below it leaves B, G, R in v0, v1, v2 -- confirm against the macro.
ARGB4444ToYRow_NEON(const uint8 * src_argb4444,uint8 * dst_y,int pix)2216 void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
2217   asm volatile (
2218     "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient
2219     "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient
2220     "movi       v26.8b, #33                    \n"  // R * 0.2578 coefficient
2221     "movi       v27.8b, #16                    \n"  // Add 16 constant
2222   "1:                                          \n"
2223     MEMACCESS(0)
2224     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
2225     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2226     ARGB4444TOARGB
2227     "umull      v3.8h, v0.8b, v24.8b           \n"  // B
2228     "umlal      v3.8h, v1.8b, v25.8b           \n"  // G
2229     "umlal      v3.8h, v2.8b, v26.8b           \n"  // R
2230     "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y (rounded >> 7)
2231     "uqadd      v0.8b, v0.8b, v27.8b           \n"  // + 16, saturating
2232     MEMACCESS(1)
2233     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
2234     "b.gt       1b                             \n"  // flags from subs above: loop while pix > 0.
2235   : "+r"(src_argb4444),  // %0
2236     "+r"(dst_y),         // %1
2237     "+r"(pix)            // %2
2238   :
2239   : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"
2240   );
2241 }
2242 #endif  // HAS_ARGB4444TOYROW_NEON
2243 
2244 #ifdef HAS_BGRATOYROW_NEON
// Converts a row of 4-byte BGRA pixels to 8-bit luma, 8 pixels (32 source
// bytes) per iteration. ld4 de-interleaves pixel bytes 0..3 into v0..v3;
// bytes 1, 2, 3 are used as R, G, B and byte 0 is ignored:
//   Y = sat8(sat8((33 * R + 65 * G + 13 * B + 64) >> 7) + 16)
// pix is reduced by 8 per iteration and the loop repeats while the result
// is > 0; the body always runs at least once.
BGRAToYRow_NEON(const uint8 * src_bgra,uint8 * dst_y,int pix)2245 void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
2246   asm volatile (
2247     "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
2248     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
2249     "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
2250     "movi       v7.8b, #16                     \n"  // Add 16 constant
2251   "1:                                          \n"
2252     MEMACCESS(0)
2253     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
2254     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2255     "umull      v16.8h, v1.8b, v4.8b           \n"  // R
2256     "umlal      v16.8h, v2.8b, v5.8b           \n"  // G
2257     "umlal      v16.8h, v3.8b, v6.8b           \n"  // B
2258     "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y (rounded >> 7)
2259     "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating
2260     MEMACCESS(1)
2261     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
2262     "b.gt       1b                             \n"  // flags from subs above: loop while pix > 0.
2263   : "+r"(src_bgra),  // %0
2264     "+r"(dst_y),     // %1
2265     "+r"(pix)        // %2
2266   :
2267   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2268   );
2269 }
2270 #endif  // HAS_BGRATOYROW_NEON
2271 
2272 #ifdef HAS_ABGRTOYROW_NEON
// Converts a row of 4-byte ABGR pixels to 8-bit luma, 8 pixels (32 source
// bytes) per iteration. ld4 de-interleaves pixel bytes 0..3 into v0..v3;
// bytes 0, 1, 2 are used as R, G, B and byte 3 is ignored:
//   Y = sat8(sat8((33 * R + 65 * G + 13 * B + 64) >> 7) + 16)
// pix is reduced by 8 per iteration and the loop repeats while the result
// is > 0; the body always runs at least once.
ABGRToYRow_NEON(const uint8 * src_abgr,uint8 * dst_y,int pix)2273 void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
2274   asm volatile (
2275     "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
2276     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
2277     "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
2278     "movi       v7.8b, #16                     \n"  // Add 16 constant
2279   "1:                                          \n"
2280     MEMACCESS(0)
2281     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
2282     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2283     "umull      v16.8h, v0.8b, v4.8b           \n"  // R
2284     "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
2285     "umlal      v16.8h, v2.8b, v6.8b           \n"  // B
2286     "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y (rounded >> 7)
2287     "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating
2288     MEMACCESS(1)
2289     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
2290     "b.gt       1b                             \n"  // flags from subs above: loop while pix > 0.
2291   : "+r"(src_abgr),  // %0
2292     "+r"(dst_y),     // %1
2293     "+r"(pix)        // %2
2294   :
2295   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2296   );
2297 }
2298 #endif  // HAS_ABGRTOYROW_NEON
2299 
2300 #ifdef HAS_RGBATOYROW_NEON
// Converts a row of 4-byte RGBA pixels to 8-bit luma, 8 pixels (32 source
// bytes) per iteration. ld4 de-interleaves pixel bytes 0..3 into v0..v3;
// bytes 1, 2, 3 are used as B, G, R and byte 0 is ignored:
//   Y = sat8(sat8((13 * B + 65 * G + 33 * R + 64) >> 7) + 16)
// pix is reduced by 8 per iteration and the loop repeats while the result
// is > 0; the body always runs at least once.
RGBAToYRow_NEON(const uint8 * src_rgba,uint8 * dst_y,int pix)2301 void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
2302   asm volatile (
2303     "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
2304     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
2305     "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
2306     "movi       v7.8b, #16                     \n"  // Add 16 constant
2307   "1:                                          \n"
2308     MEMACCESS(0)
2309     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
2310     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2311     "umull      v16.8h, v1.8b, v4.8b           \n"  // B
2312     "umlal      v16.8h, v2.8b, v5.8b           \n"  // G
2313     "umlal      v16.8h, v3.8b, v6.8b           \n"  // R
2314     "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y (rounded >> 7)
2315     "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating
2316     MEMACCESS(1)
2317     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
2318     "b.gt       1b                             \n"  // flags from subs above: loop while pix > 0.
2319   : "+r"(src_rgba),  // %0
2320     "+r"(dst_y),     // %1
2321     "+r"(pix)        // %2
2322   :
2323   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2324   );
2325 }
2326 #endif  // HAS_RGBATOYROW_NEON
2327 
2328 #ifdef HAS_RGB24TOYROW_NEON
// Converts a row of 3-byte RGB24 pixels to 8-bit luma, 8 pixels (24 source
// bytes) per iteration. ld3 de-interleaves pixel bytes 0..2 into v0..v2,
// which are used as B, G, R:
//   Y = sat8(sat8((13 * B + 65 * G + 33 * R + 64) >> 7) + 16)
// pix is reduced by 8 per iteration and the loop repeats while the result
// is > 0; the body always runs at least once.
// v3 is listed as clobbered although this block does not write it.
RGB24ToYRow_NEON(const uint8 * src_rgb24,uint8 * dst_y,int pix)2329 void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
2330   asm volatile (
2331     "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
2332     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
2333     "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
2334     "movi       v7.8b, #16                     \n"  // Add 16 constant
2335   "1:                                          \n"
2336     MEMACCESS(0)
2337     "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
2338     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2339     "umull      v16.8h, v0.8b, v4.8b           \n"  // B
2340     "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
2341     "umlal      v16.8h, v2.8b, v6.8b           \n"  // R
2342     "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y (rounded >> 7)
2343     "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating
2344     MEMACCESS(1)
2345     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
2346     "b.gt       1b                             \n"  // flags from subs above: loop while pix > 0.
2347   : "+r"(src_rgb24),  // %0
2348     "+r"(dst_y),      // %1
2349     "+r"(pix)         // %2
2350   :
2351   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2352   );
2353 }
2354 #endif  // HAS_RGB24TOYROW_NEON
2355 
2356 #ifdef HAS_RAWTOYROW_NEON
// Converts a row of 3-byte RAW pixels to 8-bit luma, 8 pixels (24 source
// bytes) per iteration. ld3 de-interleaves pixel bytes 0..2 into v0..v2.
// v0 is multiplied by the R coefficient (33) and v2 by the B coefficient (13),
// i.e. the bytes are treated as R, G, B:
//   Y = sat8(sat8((33 * R + 65 * G + 13 * B + 64) >> 7) + 16)
// pix is reduced by 8 per iteration and the loop repeats while the result
// is > 0; the body always runs at least once.
// v3 is listed as clobbered although this block does not write it.
RAWToYRow_NEON(const uint8 * src_raw,uint8 * dst_y,int pix)2357 void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
2358   asm volatile (
2359     "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
2360     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
2361     "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
2362     "movi       v7.8b, #16                     \n"  // Add 16 constant
2363   "1:                                          \n"
2364     MEMACCESS(0)
2365     "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
2366     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2367     "umull      v16.8h, v0.8b, v4.8b           \n"  // R
2368     "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
2369     "umlal      v16.8h, v2.8b, v6.8b           \n"  // B
2370     "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y (rounded >> 7)
2371     "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating
2372     MEMACCESS(1)
2373     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
2374     "b.gt       1b                             \n"  // flags from subs above: loop while pix > 0.
2375   : "+r"(src_raw),  // %0
2376     "+r"(dst_y),    // %1
2377     "+r"(pix)       // %2
2378   :
2379   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2380   );
2381 }
2382 #endif  // HAS_RAWTOYROW_NEON
2383 
2384 // Bilinear filter 16x2 -> 16x1
2385 #ifdef HAS_INTERPOLATEROW_NEON
InterpolateRow_NEON(uint8 * dst_ptr,const uint8 * src_ptr,ptrdiff_t src_stride,int dst_width,int source_y_fraction)2386 void InterpolateRow_NEON(uint8* dst_ptr,
2387                          const uint8* src_ptr, ptrdiff_t src_stride,
2388                          int dst_width, int source_y_fraction) {
2389   int y1_fraction = source_y_fraction;
2390   int y0_fraction = 256 - y1_fraction;
2391   const uint8* src_ptr1 = src_ptr + src_stride;
2392   asm volatile (
2393     "cmp        %w4, #0                        \n"
2394     "b.eq       100f                           \n"
2395     "cmp        %w4, #64                       \n"
2396     "b.eq       75f                            \n"
2397     "cmp        %w4, #128                      \n"
2398     "b.eq       50f                            \n"
2399     "cmp        %w4, #192                      \n"
2400     "b.eq       25f                            \n"
2401 
2402     "dup        v5.16b, %w4                    \n"
2403     "dup        v4.16b, %w5                    \n"
2404     // General purpose row blend.
2405   "1:                                          \n"
2406     MEMACCESS(1)
2407     "ld1        {v0.16b}, [%1], #16            \n"
2408     MEMACCESS(2)
2409     "ld1        {v1.16b}, [%2], #16            \n"
2410     "subs       %w3, %w3, #16                  \n"
2411     "umull      v2.8h, v0.8b,  v4.8b           \n"
2412     "umull2     v3.8h, v0.16b, v4.16b          \n"
2413     "umlal      v2.8h, v1.8b,  v5.8b           \n"
2414     "umlal2     v3.8h, v1.16b, v5.16b          \n"
2415     "rshrn      v0.8b,  v2.8h, #8              \n"
2416     "rshrn2     v0.16b, v3.8h, #8              \n"
2417     MEMACCESS(0)
2418     "st1        {v0.16b}, [%0], #16            \n"
2419     "b.gt       1b                             \n"
2420     "b          99f                            \n"
2421 
2422     // Blend 25 / 75.
2423   "25:                                         \n"
2424     MEMACCESS(1)
2425     "ld1        {v0.16b}, [%1], #16            \n"
2426     MEMACCESS(2)
2427     "ld1        {v1.16b}, [%2], #16            \n"
2428     "subs       %w3, %w3, #16                  \n"
2429     "urhadd     v0.16b, v0.16b, v1.16b         \n"
2430     "urhadd     v0.16b, v0.16b, v1.16b         \n"
2431     MEMACCESS(0)
2432     "st1        {v0.16b}, [%0], #16            \n"
2433     "b.gt       25b                            \n"
2434     "b          99f                            \n"
2435 
2436     // Blend 50 / 50.
2437   "50:                                         \n"
2438     MEMACCESS(1)
2439     "ld1        {v0.16b}, [%1], #16            \n"
2440     MEMACCESS(2)
2441     "ld1        {v1.16b}, [%2], #16            \n"
2442     "subs       %w3, %w3, #16                  \n"
2443     "urhadd     v0.16b, v0.16b, v1.16b         \n"
2444     MEMACCESS(0)
2445     "st1        {v0.16b}, [%0], #16            \n"
2446     "b.gt       50b                            \n"
2447     "b          99f                            \n"
2448 
2449     // Blend 75 / 25.
2450   "75:                                         \n"
2451     MEMACCESS(1)
2452     "ld1        {v1.16b}, [%1], #16            \n"
2453     MEMACCESS(2)
2454     "ld1        {v0.16b}, [%2], #16            \n"
2455     "subs       %w3, %w3, #16                  \n"
2456     "urhadd     v0.16b, v0.16b, v1.16b         \n"
2457     "urhadd     v0.16b, v0.16b, v1.16b         \n"
2458     MEMACCESS(0)
2459     "st1        {v0.16b}, [%0], #16            \n"
2460     "b.gt       75b                            \n"
2461     "b          99f                            \n"
2462 
2463     // Blend 100 / 0 - Copy row unchanged.
2464   "100:                                        \n"
2465     MEMACCESS(1)
2466     "ld1        {v0.16b}, [%1], #16            \n"
2467     "subs       %w3, %w3, #16                  \n"
2468     MEMACCESS(0)
2469     "st1        {v0.16b}, [%0], #16            \n"
2470     "b.gt       100b                           \n"
2471 
2472   "99:                                         \n"
2473   : "+r"(dst_ptr),          // %0
2474     "+r"(src_ptr),          // %1
2475     "+r"(src_ptr1),         // %2
2476     "+r"(dst_width),        // %3
2477     "+r"(y1_fraction),      // %4
2478     "+r"(y0_fraction)       // %5
2479   :
2480   : "cc", "memory", "v0", "v1", "v3", "v4", "v5"
2481   );
2482 }
2483 #endif  // HAS_INTERPOLATEROW_NEON
2484 
2485 // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
2486 #ifdef HAS_ARGBBLENDROW_NEON
// Alpha-blends src_argb0 over src_argb1 into dst_argb for width pixels.
// ld4 de-interleaves pixel bytes 0..3 into four registers; byte 3 of each
// src_argb0 pixel is used as the alpha (a). For each of bytes 0..2:
//   dst = sat8(s + sat8(d - sat8((d * a + 128) >> 8)))
// where s comes from src_argb0 and d from src_argb1. Byte 3 of every output
// pixel is set to 255.
// Control flow: the "8:" loop handles 8 pixels per iteration while at least 8
// remain; the "1:" loop then handles the remaining 0..7 pixels one at a time.
// A width of 0 stores nothing.
ARGBBlendRow_NEON(const uint8 * src_argb0,const uint8 * src_argb1,uint8 * dst_argb,int width)2487 void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
2488                        uint8* dst_argb, int width) {
2489   asm volatile (
2490     "subs       %w3, %w3, #8                   \n"  // width -= 8 up front.
2491     "b.lt       89f                            \n"  // fewer than 8 pixels: skip to tail.
2492     // Blend 8 pixels.
2493   "8:                                          \n"
2494     MEMACCESS(0)
2495     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB0 pixels
2496     MEMACCESS(1)
2497     "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 ARGB1 pixels
2498     "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
2499     "umull      v16.8h, v4.8b, v3.8b           \n"  // db * a
2500     "umull      v17.8h, v5.8b, v3.8b           \n"  // dg * a
2501     "umull      v18.8h, v6.8b, v3.8b           \n"  // dr * a
2502     "uqrshrn    v16.8b, v16.8h, #8             \n"  // db >>= 8
2503     "uqrshrn    v17.8b, v17.8h, #8             \n"  // dg >>= 8
2504     "uqrshrn    v18.8b, v18.8h, #8             \n"  // dr >>= 8
2505     "uqsub      v4.8b, v4.8b, v16.8b           \n"  // db - (db * a / 256)
2506     "uqsub      v5.8b, v5.8b, v17.8b           \n"  // dg - (dg * a / 256)
2507     "uqsub      v6.8b, v6.8b, v18.8b           \n"  // dr - (dr * a / 256)
2508     "uqadd      v0.8b, v0.8b, v4.8b            \n"  // + sb
2509     "uqadd      v1.8b, v1.8b, v5.8b            \n"  // + sg
2510     "uqadd      v2.8b, v2.8b, v6.8b            \n"  // + sr
2511     "movi       v3.8b, #255                    \n"  // a = 255
2512     MEMACCESS(2)
2513     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
2514     "b.ge       8b                             \n"  // another full group of 8 remains.
2515 
2516   "89:                                         \n"
2517     "adds       %w3, %w3, #8-1                 \n"  // width = remaining pixels - 1.
2518     "b.lt       99f                            \n"  // nothing left.
2519 
2520     // Blend 1 pixels.
2521   "1:                                          \n"
2522     MEMACCESS(0)
2523     "ld4        {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n"  // load 1 pixel ARGB0.
2524     MEMACCESS(1)
2525     "ld4        {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n"  // load 1 pixel ARGB1.
2526     "subs       %w3, %w3, #1                   \n"  // 1 processed per loop.
2527     "umull      v16.8h, v4.8b, v3.8b           \n"  // db * a
2528     "umull      v17.8h, v5.8b, v3.8b           \n"  // dg * a
2529     "umull      v18.8h, v6.8b, v3.8b           \n"  // dr * a
2530     "uqrshrn    v16.8b, v16.8h, #8             \n"  // db >>= 8
2531     "uqrshrn    v17.8b, v17.8h, #8             \n"  // dg >>= 8
2532     "uqrshrn    v18.8b, v18.8h, #8             \n"  // dr >>= 8
2533     "uqsub      v4.8b, v4.8b, v16.8b           \n"  // db - (db * a / 256)
2534     "uqsub      v5.8b, v5.8b, v17.8b           \n"  // dg - (dg * a / 256)
2535     "uqsub      v6.8b, v6.8b, v18.8b           \n"  // dr - (dr * a / 256)
2536     "uqadd      v0.8b, v0.8b, v4.8b            \n"  // + sb
2537     "uqadd      v1.8b, v1.8b, v5.8b            \n"  // + sg
2538     "uqadd      v2.8b, v2.8b, v6.8b            \n"  // + sr
2539     "movi       v3.8b, #255                    \n"  // a = 255
2540     MEMACCESS(2)
2541     "st4        {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n"  // store 1 pixel.
2542     "b.ge       1b                             \n"  // more single pixels remain.
2543 
2544   "99:                                         \n"
2545 
2546   : "+r"(src_argb0),    // %0
2547     "+r"(src_argb1),    // %1
2548     "+r"(dst_argb),     // %2
2549     "+r"(width)         // %3
2550   :
2551   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2552     "v16", "v17", "v18"
2553   );
2554 }
2555 #endif  // HAS_ARGBBLENDROW_NEON
2556 
2557 // Attenuate 8 pixels at a time.
2558 #ifdef HAS_ARGBATTENUATEROW_NEON
// Multiplies the three color bytes of each 4-byte pixel by that pixel's
// alpha, 8 pixels (32 bytes) per iteration. ld4 de-interleaves pixel bytes
// 0..3 into v0..v3; byte 3 is the alpha (a) and is stored back unchanged.
// For each of bytes 0..2:
//   c = sat8((c * a + 128) >> 8)
// width is reduced by 8 per iteration and the loop repeats while the result
// is > 0; the body always runs at least once.
ARGBAttenuateRow_NEON(const uint8 * src_argb,uint8 * dst_argb,int width)2559 void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
2560   asm volatile (
2561     // Attenuate 8 pixels.
2562   "1:                                          \n"
2563     MEMACCESS(0)
2564     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels
2565     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2566     "umull      v4.8h, v0.8b, v3.8b            \n"  // b * a
2567     "umull      v5.8h, v1.8b, v3.8b            \n"  // g * a
2568     "umull      v6.8h, v2.8b, v3.8b            \n"  // r * a
2569     "uqrshrn    v0.8b, v4.8h, #8               \n"  // b >>= 8 (rounded)
2570     "uqrshrn    v1.8b, v5.8h, #8               \n"  // g >>= 8 (rounded)
2571     "uqrshrn    v2.8b, v6.8h, #8               \n"  // r >>= 8 (rounded)
2572     MEMACCESS(1)
2573     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
2574     "b.gt       1b                             \n"  // flags from subs above: loop while width > 0.
2575   : "+r"(src_argb),   // %0
2576     "+r"(dst_argb),   // %1
2577     "+r"(width)       // %2
2578   :
2579   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
2580   );
2581 }
2582 #endif  // HAS_ARGBATTENUATEROW_NEON
2583 
2584 // Quantize 8 ARGB pixels (32 bytes).
2585 // dst = (dst * scale >> 16) * interval_size + interval_offset;
2586 #ifdef HAS_ARGBQUANTIZEROW_NEON
ARGBQuantizeRow_NEON(uint8 * dst_argb,int scale,int interval_size,int interval_offset,int width)2587 void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
2588                           int interval_offset, int width) {
2589   asm volatile (
2590     "dup        v4.8h, %w2                     \n"
2591     "ushr       v4.8h, v4.8h, #1               \n"  // scale >>= 1
2592     "dup        v5.8h, %w3                     \n"  // interval multiply.
2593     "dup        v6.8h, %w4                     \n"  // interval add
2594 
2595     // 8 pixel loop.
2596   "1:                                          \n"
2597     MEMACCESS(0)
2598     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0]  \n"  // load 8 pixels of ARGB.
2599     "subs       %w1, %w1, #8                   \n"  // 8 processed per loop.
2600     "uxtl       v0.8h, v0.8b                   \n"  // b (0 .. 255)
2601     "uxtl       v1.8h, v1.8b                   \n"
2602     "uxtl       v2.8h, v2.8b                   \n"
2603     "sqdmulh    v0.8h, v0.8h, v4.8h            \n"  // b * scale
2604     "sqdmulh    v1.8h, v1.8h, v4.8h            \n"  // g
2605     "sqdmulh    v2.8h, v2.8h, v4.8h            \n"  // r
2606     "mul        v0.8h, v0.8h, v5.8h            \n"  // b * interval_size
2607     "mul        v1.8h, v1.8h, v5.8h            \n"  // g
2608     "mul        v2.8h, v2.8h, v5.8h            \n"  // r
2609     "add        v0.8h, v0.8h, v6.8h            \n"  // b + interval_offset
2610     "add        v1.8h, v1.8h, v6.8h            \n"  // g
2611     "add        v2.8h, v2.8h, v6.8h            \n"  // r
2612     "uqxtn      v0.8b, v0.8h                   \n"
2613     "uqxtn      v1.8b, v1.8h                   \n"
2614     "uqxtn      v2.8b, v2.8h                   \n"
2615     MEMACCESS(0)
2616     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 ARGB pixels
2617     "b.gt       1b                             \n"
2618   : "+r"(dst_argb),       // %0
2619     "+r"(width)           // %1
2620   : "r"(scale),           // %2
2621     "r"(interval_size),   // %3
2622     "r"(interval_offset)  // %4
2623   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
2624   );
2625 }
2626 #endif  // HAS_ARGBQUANTIZEROW_NEON
2627 
// Shade 8 pixels at a time by specified value.
// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8.
// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
2631 #ifdef HAS_ARGBSHADEROW_NEON
ARGBShadeRow_NEON(const uint8 * src_argb,uint8 * dst_argb,int width,uint32 value)2632 void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
2633                        uint32 value) {
2634   asm volatile (
2635     "dup        v0.4s, %w3                     \n"  // duplicate scale value.
2636     "zip1       v0.8b, v0.8b, v0.8b            \n"  // v0.8b aarrggbb.
2637     "ushr       v0.8h, v0.8h, #1               \n"  // scale / 2.
2638 
2639     // 8 pixel loop.
2640   "1:                                          \n"
2641     MEMACCESS(0)
2642     "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
2643     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2644     "uxtl       v4.8h, v4.8b                   \n"  // b (0 .. 255)
2645     "uxtl       v5.8h, v5.8b                   \n"
2646     "uxtl       v6.8h, v6.8b                   \n"
2647     "uxtl       v7.8h, v7.8b                   \n"
2648     "sqrdmulh   v4.8h, v4.8h, v0.h[0]          \n"  // b * scale * 2
2649     "sqrdmulh   v5.8h, v5.8h, v0.h[1]          \n"  // g
2650     "sqrdmulh   v6.8h, v6.8h, v0.h[2]          \n"  // r
2651     "sqrdmulh   v7.8h, v7.8h, v0.h[3]          \n"  // a
2652     "uqxtn      v4.8b, v4.8h                   \n"
2653     "uqxtn      v5.8b, v5.8h                   \n"
2654     "uqxtn      v6.8b, v6.8h                   \n"
2655     "uqxtn      v7.8b, v7.8h                   \n"
2656     MEMACCESS(1)
2657     "st4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // store 8 ARGB pixels
2658     "b.gt       1b                             \n"
2659   : "+r"(src_argb),       // %0
2660     "+r"(dst_argb),       // %1
2661     "+r"(width)           // %2
2662   : "r"(value)            // %3
2663   : "cc", "memory", "v0", "v4", "v5", "v6", "v7"
2664   );
2665 }
2666 #endif  // HAS_ARGBSHADEROW_NEON
2667 
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
// Similar to ARGBToYJ but stores ARGB.
// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
2671 #ifdef HAS_ARGBGRAYROW_NEON
ARGBGrayRow_NEON(const uint8 * src_argb,uint8 * dst_argb,int width)2672 void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
2673   asm volatile (
2674     "movi       v24.8b, #15                    \n"  // B * 0.11400 coefficient
2675     "movi       v25.8b, #75                    \n"  // G * 0.58700 coefficient
2676     "movi       v26.8b, #38                    \n"  // R * 0.29900 coefficient
2677   "1:                                          \n"
2678     MEMACCESS(0)
2679     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
2680     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2681     "umull      v4.8h, v0.8b, v24.8b           \n"  // B
2682     "umlal      v4.8h, v1.8b, v25.8b           \n"  // G
2683     "umlal      v4.8h, v2.8b, v26.8b           \n"  // R
2684     "sqrshrun   v0.8b, v4.8h, #7               \n"  // 15 bit to 8 bit B
2685     "orr        v1.8b, v0.8b, v0.8b            \n"  // G
2686     "orr        v2.8b, v0.8b, v0.8b            \n"  // R
2687     MEMACCESS(1)
2688     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 pixels.
2689     "b.gt       1b                             \n"
2690   : "+r"(src_argb),  // %0
2691     "+r"(dst_argb),  // %1
2692     "+r"(width)      // %2
2693   :
2694   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"
2695   );
2696 }
2697 #endif  // HAS_ARGBGRAYROW_NEON
2698 
2699 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
2700 //    b = (r * 35 + g * 68 + b * 17) >> 7
2701 //    g = (r * 45 + g * 88 + b * 22) >> 7
2702 //    r = (r * 50 + g * 98 + b * 24) >> 7
2703 
2704 #ifdef HAS_ARGBSEPIAROW_NEON
ARGBSepiaRow_NEON(uint8 * dst_argb,int width)2705 void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
2706   asm volatile (
2707     "movi       v20.8b, #17                    \n"  // BB coefficient
2708     "movi       v21.8b, #68                    \n"  // BG coefficient
2709     "movi       v22.8b, #35                    \n"  // BR coefficient
2710     "movi       v24.8b, #22                    \n"  // GB coefficient
2711     "movi       v25.8b, #88                    \n"  // GG coefficient
2712     "movi       v26.8b, #45                    \n"  // GR coefficient
2713     "movi       v28.8b, #24                    \n"  // BB coefficient
2714     "movi       v29.8b, #98                    \n"  // BG coefficient
2715     "movi       v30.8b, #50                    \n"  // BR coefficient
2716   "1:                                          \n"
2717     MEMACCESS(0)
2718     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n"  // load 8 ARGB pixels.
2719     "subs       %w1, %w1, #8                   \n"  // 8 processed per loop.
2720     "umull      v4.8h, v0.8b, v20.8b           \n"  // B to Sepia B
2721     "umlal      v4.8h, v1.8b, v21.8b           \n"  // G
2722     "umlal      v4.8h, v2.8b, v22.8b           \n"  // R
2723     "umull      v5.8h, v0.8b, v24.8b           \n"  // B to Sepia G
2724     "umlal      v5.8h, v1.8b, v25.8b           \n"  // G
2725     "umlal      v5.8h, v2.8b, v26.8b           \n"  // R
2726     "umull      v6.8h, v0.8b, v28.8b           \n"  // B to Sepia R
2727     "umlal      v6.8h, v1.8b, v29.8b           \n"  // G
2728     "umlal      v6.8h, v2.8b, v30.8b           \n"  // R
2729     "uqshrn     v0.8b, v4.8h, #7               \n"  // 16 bit to 8 bit B
2730     "uqshrn     v1.8b, v5.8h, #7               \n"  // 16 bit to 8 bit G
2731     "uqshrn     v2.8b, v6.8h, #7               \n"  // 16 bit to 8 bit R
2732     MEMACCESS(0)
2733     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 pixels.
2734     "b.gt       1b                             \n"
2735   : "+r"(dst_argb),  // %0
2736     "+r"(width)      // %1
2737   :
2738   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2739     "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"
2740   );
2741 }
2742 #endif  // HAS_ARGBSEPIAROW_NEON
2743 
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// TODO(fbarchard): Was same as Sepia except matrix is provided.  This function
// needs to saturate.  Consider doing a non-saturating version.
2747 #ifdef HAS_ARGBCOLORMATRIXROW_NEON
ARGBColorMatrixRow_NEON(const uint8 * src_argb,uint8 * dst_argb,const int8 * matrix_argb,int width)2748 void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
2749                              const int8* matrix_argb, int width) {
2750   asm volatile (
2751     MEMACCESS(3)
2752     "ld1        {v2.16b}, [%3]                 \n"  // load 3 ARGB vectors.
2753     "sxtl       v0.8h, v2.8b                   \n"  // B,G coefficients s16.
2754     "sxtl2      v1.8h, v2.16b                  \n"  // R,A coefficients s16.
2755 
2756   "1:                                          \n"
2757     MEMACCESS(0)
2758     "ld4        {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n"  // load 8 pixels.
2759     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2760     "uxtl       v16.8h, v16.8b                 \n"  // b (0 .. 255) 16 bit
2761     "uxtl       v17.8h, v17.8b                 \n"  // g
2762     "uxtl       v18.8h, v18.8b                 \n"  // r
2763     "uxtl       v19.8h, v19.8b                 \n"  // a
2764     "mul        v22.8h, v16.8h, v0.h[0]        \n"  // B = B * Matrix B
2765     "mul        v23.8h, v16.8h, v0.h[4]        \n"  // G = B * Matrix G
2766     "mul        v24.8h, v16.8h, v1.h[0]        \n"  // R = B * Matrix R
2767     "mul        v25.8h, v16.8h, v1.h[4]        \n"  // A = B * Matrix A
2768     "mul        v4.8h, v17.8h, v0.h[1]         \n"  // B += G * Matrix B
2769     "mul        v5.8h, v17.8h, v0.h[5]         \n"  // G += G * Matrix G
2770     "mul        v6.8h, v17.8h, v1.h[1]         \n"  // R += G * Matrix R
2771     "mul        v7.8h, v17.8h, v1.h[5]         \n"  // A += G * Matrix A
2772     "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
2773     "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
2774     "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
2775     "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
2776     "mul        v4.8h, v18.8h, v0.h[2]         \n"  // B += R * Matrix B
2777     "mul        v5.8h, v18.8h, v0.h[6]         \n"  // G += R * Matrix G
2778     "mul        v6.8h, v18.8h, v1.h[2]         \n"  // R += R * Matrix R
2779     "mul        v7.8h, v18.8h, v1.h[6]         \n"  // A += R * Matrix A
2780     "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
2781     "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
2782     "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
2783     "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
2784     "mul        v4.8h, v19.8h, v0.h[3]         \n"  // B += A * Matrix B
2785     "mul        v5.8h, v19.8h, v0.h[7]         \n"  // G += A * Matrix G
2786     "mul        v6.8h, v19.8h, v1.h[3]         \n"  // R += A * Matrix R
2787     "mul        v7.8h, v19.8h, v1.h[7]         \n"  // A += A * Matrix A
2788     "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
2789     "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
2790     "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
2791     "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
2792     "sqshrun    v16.8b, v22.8h, #6             \n"  // 16 bit to 8 bit B
2793     "sqshrun    v17.8b, v23.8h, #6             \n"  // 16 bit to 8 bit G
2794     "sqshrun    v18.8b, v24.8h, #6             \n"  // 16 bit to 8 bit R
2795     "sqshrun    v19.8b, v25.8h, #6             \n"  // 16 bit to 8 bit A
2796     MEMACCESS(1)
2797     "st4        {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n"  // store 8 pixels.
2798     "b.gt       1b                             \n"
2799   : "+r"(src_argb),   // %0
2800     "+r"(dst_argb),   // %1
2801     "+r"(width)       // %2
2802   : "r"(matrix_argb)  // %3
2803   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
2804     "v18", "v19", "v22", "v23", "v24", "v25"
2805   );
2806 }
2807 #endif  // HAS_ARGBCOLORMATRIXROW_NEON
2808 
2809 // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
2810 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
2811 #ifdef HAS_ARGBMULTIPLYROW_NEON
ARGBMultiplyRow_NEON(const uint8 * src_argb0,const uint8 * src_argb1,uint8 * dst_argb,int width)2812 void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
2813                           uint8* dst_argb, int width) {
2814   asm volatile (
2815     // 8 pixel loop.
2816   "1:                                          \n"
2817     MEMACCESS(0)
2818     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
2819     MEMACCESS(1)
2820     "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
2821     "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
2822     "umull      v0.8h, v0.8b, v4.8b            \n"  // multiply B
2823     "umull      v1.8h, v1.8b, v5.8b            \n"  // multiply G
2824     "umull      v2.8h, v2.8b, v6.8b            \n"  // multiply R
2825     "umull      v3.8h, v3.8b, v7.8b            \n"  // multiply A
2826     "rshrn      v0.8b, v0.8h, #8               \n"  // 16 bit to 8 bit B
2827     "rshrn      v1.8b, v1.8h, #8               \n"  // 16 bit to 8 bit G
2828     "rshrn      v2.8b, v2.8h, #8               \n"  // 16 bit to 8 bit R
2829     "rshrn      v3.8b, v3.8h, #8               \n"  // 16 bit to 8 bit A
2830     MEMACCESS(2)
2831     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
2832     "b.gt       1b                             \n"
2833 
2834   : "+r"(src_argb0),  // %0
2835     "+r"(src_argb1),  // %1
2836     "+r"(dst_argb),   // %2
2837     "+r"(width)       // %3
2838   :
2839   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
2840   );
2841 }
2842 #endif  // HAS_ARGBMULTIPLYROW_NEON
2843 
2844 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
2845 #ifdef HAS_ARGBADDROW_NEON
ARGBAddRow_NEON(const uint8 * src_argb0,const uint8 * src_argb1,uint8 * dst_argb,int width)2846 void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
2847                      uint8* dst_argb, int width) {
2848   asm volatile (
2849     // 8 pixel loop.
2850   "1:                                          \n"
2851     MEMACCESS(0)
2852     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
2853     MEMACCESS(1)
2854     "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
2855     "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
2856     "uqadd      v0.8b, v0.8b, v4.8b            \n"
2857     "uqadd      v1.8b, v1.8b, v5.8b            \n"
2858     "uqadd      v2.8b, v2.8b, v6.8b            \n"
2859     "uqadd      v3.8b, v3.8b, v7.8b            \n"
2860     MEMACCESS(2)
2861     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
2862     "b.gt       1b                             \n"
2863 
2864   : "+r"(src_argb0),  // %0
2865     "+r"(src_argb1),  // %1
2866     "+r"(dst_argb),   // %2
2867     "+r"(width)       // %3
2868   :
2869   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
2870   );
2871 }
2872 #endif  // HAS_ARGBADDROW_NEON
2873 
2874 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
2875 #ifdef HAS_ARGBSUBTRACTROW_NEON
ARGBSubtractRow_NEON(const uint8 * src_argb0,const uint8 * src_argb1,uint8 * dst_argb,int width)2876 void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
2877                           uint8* dst_argb, int width) {
2878   asm volatile (
2879     // 8 pixel loop.
2880   "1:                                          \n"
2881     MEMACCESS(0)
2882     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
2883     MEMACCESS(1)
2884     "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
2885     "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
2886     "uqsub      v0.8b, v0.8b, v4.8b            \n"
2887     "uqsub      v1.8b, v1.8b, v5.8b            \n"
2888     "uqsub      v2.8b, v2.8b, v6.8b            \n"
2889     "uqsub      v3.8b, v3.8b, v7.8b            \n"
2890     MEMACCESS(2)
2891     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
2892     "b.gt       1b                             \n"
2893 
2894   : "+r"(src_argb0),  // %0
2895     "+r"(src_argb1),  // %1
2896     "+r"(dst_argb),   // %2
2897     "+r"(width)       // %3
2898   :
2899   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
2900   );
2901 }
2902 #endif  // HAS_ARGBSUBTRACTROW_NEON
2903 
2904 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
2905 // A = 255
2906 // R = Sobel
2907 // G = Sobel
2908 // B = Sobel
2909 #ifdef HAS_SOBELROW_NEON
SobelRow_NEON(const uint8 * src_sobelx,const uint8 * src_sobely,uint8 * dst_argb,int width)2910 void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
2911                      uint8* dst_argb, int width) {
2912   asm volatile (
2913     "movi       v3.8b, #255                    \n"  // alpha
2914     // 8 pixel loop.
2915   "1:                                          \n"
2916     MEMACCESS(0)
2917     "ld1        {v0.8b}, [%0], #8              \n"  // load 8 sobelx.
2918     MEMACCESS(1)
2919     "ld1        {v1.8b}, [%1], #8              \n"  // load 8 sobely.
2920     "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
2921     "uqadd      v0.8b, v0.8b, v1.8b            \n"  // add
2922     "orr        v1.8b, v0.8b, v0.8b            \n"
2923     "orr        v2.8b, v0.8b, v0.8b            \n"
2924     MEMACCESS(2)
2925     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
2926     "b.gt       1b                             \n"
2927   : "+r"(src_sobelx),  // %0
2928     "+r"(src_sobely),  // %1
2929     "+r"(dst_argb),    // %2
2930     "+r"(width)        // %3
2931   :
2932   : "cc", "memory", "v0", "v1", "v2", "v3"
2933   );
2934 }
2935 #endif  // HAS_SOBELROW_NEON
2936 
2937 // Adds Sobel X and Sobel Y and stores Sobel into plane.
2938 #ifdef HAS_SOBELTOPLANEROW_NEON
SobelToPlaneRow_NEON(const uint8 * src_sobelx,const uint8 * src_sobely,uint8 * dst_y,int width)2939 void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
2940                           uint8* dst_y, int width) {
2941   asm volatile (
2942     // 16 pixel loop.
2943   "1:                                          \n"
2944     MEMACCESS(0)
2945     "ld1        {v0.16b}, [%0], #16            \n"  // load 16 sobelx.
2946     MEMACCESS(1)
2947     "ld1        {v1.16b}, [%1], #16            \n"  // load 16 sobely.
2948     "subs       %w3, %w3, #16                  \n"  // 16 processed per loop.
2949     "uqadd      v0.16b, v0.16b, v1.16b         \n"  // add
2950     MEMACCESS(2)
2951     "st1        {v0.16b}, [%2], #16            \n"  // store 16 pixels.
2952     "b.gt       1b                             \n"
2953   : "+r"(src_sobelx),  // %0
2954     "+r"(src_sobely),  // %1
2955     "+r"(dst_y),       // %2
2956     "+r"(width)        // %3
2957   :
2958   : "cc", "memory", "v0", "v1"
2959   );
2960 }
2961 #endif  // HAS_SOBELTOPLANEROW_NEON
2962 
2963 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
2964 // A = 255
2965 // R = Sobel X
2966 // G = Sobel
2967 // B = Sobel Y
2968 #ifdef HAS_SOBELXYROW_NEON
SobelXYRow_NEON(const uint8 * src_sobelx,const uint8 * src_sobely,uint8 * dst_argb,int width)2969 void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
2970                      uint8* dst_argb, int width) {
2971   asm volatile (
2972     "movi       v3.8b, #255                    \n"  // alpha
2973     // 8 pixel loop.
2974   "1:                                          \n"
2975     MEMACCESS(0)
2976     "ld1        {v2.8b}, [%0], #8              \n"  // load 8 sobelx.
2977     MEMACCESS(1)
2978     "ld1        {v0.8b}, [%1], #8              \n"  // load 8 sobely.
2979     "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
2980     "uqadd      v1.8b, v0.8b, v2.8b            \n"  // add
2981     MEMACCESS(2)
2982     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
2983     "b.gt       1b                             \n"
2984   : "+r"(src_sobelx),  // %0
2985     "+r"(src_sobely),  // %1
2986     "+r"(dst_argb),    // %2
2987     "+r"(width)        // %3
2988   :
2989   : "cc", "memory", "v0", "v1", "v2", "v3"
2990   );
2991 }
2992 #endif  // HAS_SOBELXYROW_NEON
2993 
2994 // SobelX as a matrix is
2995 // -1  0  1
2996 // -2  0  2
2997 // -1  0  1
2998 #ifdef HAS_SOBELXROW_NEON
SobelXRow_NEON(const uint8 * src_y0,const uint8 * src_y1,const uint8 * src_y2,uint8 * dst_sobelx,int width)2999 void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
3000                     const uint8* src_y2, uint8* dst_sobelx, int width) {
3001   asm volatile (
3002   "1:                                          \n"
3003     MEMACCESS(0)
3004     "ld1        {v0.8b}, [%0],%5               \n"  // top
3005     MEMACCESS(0)
3006     "ld1        {v1.8b}, [%0],%6               \n"
3007     "usubl      v0.8h, v0.8b, v1.8b            \n"
3008     MEMACCESS(1)
3009     "ld1        {v2.8b}, [%1],%5               \n"  // center * 2
3010     MEMACCESS(1)
3011     "ld1        {v3.8b}, [%1],%6               \n"
3012     "usubl      v1.8h, v2.8b, v3.8b            \n"
3013     "add        v0.8h, v0.8h, v1.8h            \n"
3014     "add        v0.8h, v0.8h, v1.8h            \n"
3015     MEMACCESS(2)
3016     "ld1        {v2.8b}, [%2],%5               \n"  // bottom
3017     MEMACCESS(2)
3018     "ld1        {v3.8b}, [%2],%6               \n"
3019     "subs       %w4, %w4, #8                   \n"  // 8 pixels
3020     "usubl      v1.8h, v2.8b, v3.8b            \n"
3021     "add        v0.8h, v0.8h, v1.8h            \n"
3022     "abs        v0.8h, v0.8h                   \n"
3023     "uqxtn      v0.8b, v0.8h                   \n"
3024     MEMACCESS(3)
3025     "st1        {v0.8b}, [%3], #8              \n"  // store 8 sobelx
3026     "b.gt       1b                             \n"
3027   : "+r"(src_y0),      // %0
3028     "+r"(src_y1),      // %1
3029     "+r"(src_y2),      // %2
3030     "+r"(dst_sobelx),  // %3
3031     "+r"(width)        // %4
3032   : "r"(2LL),          // %5
3033     "r"(6LL)           // %6
3034   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
3035   );
3036 }
3037 #endif  // HAS_SOBELXROW_NEON
3038 
3039 // SobelY as a matrix is
3040 // -1 -2 -1
3041 //  0  0  0
3042 //  1  2  1
3043 #ifdef HAS_SOBELYROW_NEON
SobelYRow_NEON(const uint8 * src_y0,const uint8 * src_y1,uint8 * dst_sobely,int width)3044 void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
3045                     uint8* dst_sobely, int width) {
3046   asm volatile (
3047   "1:                                          \n"
3048     MEMACCESS(0)
3049     "ld1        {v0.8b}, [%0],%4               \n"  // left
3050     MEMACCESS(1)
3051     "ld1        {v1.8b}, [%1],%4               \n"
3052     "usubl      v0.8h, v0.8b, v1.8b            \n"
3053     MEMACCESS(0)
3054     "ld1        {v2.8b}, [%0],%4               \n"  // center * 2
3055     MEMACCESS(1)
3056     "ld1        {v3.8b}, [%1],%4               \n"
3057     "usubl      v1.8h, v2.8b, v3.8b            \n"
3058     "add        v0.8h, v0.8h, v1.8h            \n"
3059     "add        v0.8h, v0.8h, v1.8h            \n"
3060     MEMACCESS(0)
3061     "ld1        {v2.8b}, [%0],%5               \n"  // right
3062     MEMACCESS(1)
3063     "ld1        {v3.8b}, [%1],%5               \n"
3064     "subs       %w3, %w3, #8                   \n"  // 8 pixels
3065     "usubl      v1.8h, v2.8b, v3.8b            \n"
3066     "add        v0.8h, v0.8h, v1.8h            \n"
3067     "abs        v0.8h, v0.8h                   \n"
3068     "uqxtn      v0.8b, v0.8h                   \n"
3069     MEMACCESS(2)
3070     "st1        {v0.8b}, [%2], #8              \n"  // store 8 sobely
3071     "b.gt       1b                             \n"
3072   : "+r"(src_y0),      // %0
3073     "+r"(src_y1),      // %1
3074     "+r"(dst_sobely),  // %2
3075     "+r"(width)        // %3
3076   : "r"(1LL),          // %4
3077     "r"(6LL)           // %5
3078   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
3079   );
3080 }
3081 #endif  // HAS_SOBELYROW_NEON
3082 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
3083 
3084 #ifdef __cplusplus
3085 }  // extern "C"
3086 }  // namespace libyuv
3087 #endif
3088