1 /*
2  *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS. All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "libyuv/row.h"
12 
13 #ifdef __cplusplus
14 namespace libyuv {
15 extern "C" {
16 #endif
17 
18 // This module is for GCC Neon armv8 64 bit.
19 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
20 
// READYUV422: read 8 Y, 4 U and 4 V from planar 4:2:2.
//   %0 = Y pointer (post-incremented by 8)
//   %1 = U pointer (post-incremented by 4)
//   %2 = V pointer (post-incremented by 4)
// Result: v0.8b = Y0..Y7, v1.s[0] = U0..U3, v1.s[1] = V0..V3.
#define READYUV422                               \
  "ld1        {v0.8b}, [%0], #8              \n" \
  "ld1        {v1.s}[0], [%1], #4            \n" \
  "ld1        {v1.s}[1], [%2], #4            \n"
26 
// READYUV444: read 8 Y, 8 U and 8 V from planar 4:4:4.
//   %0 = Y pointer, %1 = U pointer, %2 = V pointer (each post-incremented
//   by 8).
// The 8 U bytes land in v1.d[0] and the 8 V bytes in v1.d[1]; adjacent pairs
// are then summed (uaddlp) and halved with rounding (rshrn #1), leaving
// v1.8b = 4 averaged U followed by 4 averaged V, the same layout the other
// READ* macros produce.
// Result: v0.8b = Y0..Y7, v1.8b = U'0..U'3, V'0..V'3.
#define READYUV444                               \
  "ld1        {v0.8b}, [%0], #8              \n" \
  "ld1        {v1.d}[0], [%1], #8            \n" \
  "ld1        {v1.d}[1], [%2], #8            \n" \
  "uaddlp     v1.8h, v1.16b                  \n" \
  "rshrn      v1.8b, v1.8h, #1               \n"
34 
// READYUV400: read 8 Y and set 4 U and 4 V to 128.
//   %0 = Y pointer (post-incremented by 8).
// Result: v0.8b = Y0..Y7, every byte of v1.8b = 128.
#define READYUV400                               \
  "ld1        {v0.8b}, [%0], #8              \n" \
  "movi       v1.8b , #128                   \n"
39 
// READNV12: read 8 Y and 4 interleaved UV pairs from NV12.
//   %0 = Y pointer (post-incremented by 8)
//   %1 = UV pointer (post-incremented by 8)
// v2 receives the 8 interleaved bytes; uzp1 gathers the even bytes (first of
// each pair) into v1, uzp2 gathers the odd bytes into v3, and the odd bytes
// are then inserted into the upper 32 bits of v1.
// Result: v0.8b = Y0..Y7, v1.8b = U0..U3, V0..V3. Uses v2 and v3 as scratch.
#define READNV12                                 \
  "ld1        {v0.8b}, [%0], #8              \n" \
  "ld1        {v2.8b}, [%1], #8              \n" \
  "uzp1       v1.8b, v2.8b, v2.8b            \n" \
  "uzp2       v3.8b, v2.8b, v2.8b            \n" \
  "ins        v1.s[1], v3.s[0]               \n"
47 
// READNV21: read 8 Y and 4 interleaved VU pairs from NV21.
//   %0 = Y pointer (post-incremented by 8)
//   %1 = VU pointer (post-incremented by 8)
// Same as READNV12 except the roles of the even and odd bytes are swapped:
// the even bytes (first of each pair) go to v3 and the odd bytes to v1, then
// the even bytes are inserted into the upper 32 bits of v1.
// Result: v0.8b = Y0..Y7, v1.8b = U0..U3, V0..V3. Uses v2 and v3 as scratch.
#define READNV21                                 \
  "ld1        {v0.8b}, [%0], #8              \n" \
  "ld1        {v2.8b}, [%1], #8              \n" \
  "uzp1       v3.8b, v2.8b, v2.8b            \n" \
  "uzp2       v1.8b, v2.8b, v2.8b            \n" \
  "ins        v1.s[1], v3.s[0]               \n"
55 
// READYUY2: read 8 pixels (16 bytes) of packed YUY2.
//   %0 = source pointer (post-incremented by 16).
// ld2 de-interleaves the stream: even bytes (Y) go to v0, odd bytes
// (alternating U, V) go to v1. The chroma bytes are then split with
// uzp1/uzp2 and recombined so that U comes first.
// Result: v0.8b = Y0..Y7, v1.8b = U0..U3, V0..V3. Uses v3 as scratch.
#define READYUY2                                 \
  "ld2        {v0.8b, v1.8b}, [%0], #16      \n" \
  "uzp2       v3.8b, v1.8b, v1.8b            \n" \
  "uzp1       v1.8b, v1.8b, v1.8b            \n" \
  "ins        v1.s[1], v3.s[0]               \n"
62 
// READUYVY: read 8 pixels (16 bytes) of packed UYVY.
//   %0 = source pointer (post-incremented by 16).
// ld2 de-interleaves the stream: even bytes (alternating U, V) go to v2, odd
// bytes (Y) go to v3. Y is copied to v0 (orr with itself is a register
// move), then the chroma bytes are split with uzp1/uzp2 and recombined.
// Result: v0.8b = Y0..Y7, v1.8b = U0..U3, V0..V3. Uses v2 and v3 as scratch.
#define READUYVY                                 \
  "ld2        {v2.8b, v3.8b}, [%0], #16      \n" \
  "orr        v0.8b, v3.8b, v3.8b            \n" \
  "uzp1       v1.8b, v2.8b, v2.8b            \n" \
  "uzp2       v3.8b, v2.8b, v2.8b            \n" \
  "ins        v1.s[1], v3.s[0]               \n"
70 
// YUVTORGB_SETUP: load the constants consumed by YUVTORGB.
//   v24, v25, v26 <- the first three 16-bit entries at %[kUVBiasBGR], each
//                    broadcast to all 8 lanes (YUVTORGB adds them to the
//                    B, G and R channels respectively).
//   v31           <- the 32-bit value at %[kYToRgb], broadcast to 4 lanes.
//   v27, v28      <- 16 halfwords at %[kUVToRB], de-interleaved (even
//                    elements in v27, odd elements in v28).
//   v29, v30      <- 16 halfwords at %[kUVToG], de-interleaved likewise.
// A single ld3r is used for the bias load rather than three post-incremented
// ld1r loads: callers pass %[kUVBiasBGR] as an input-only operand, and an
// asm statement must not modify the register of an input-only operand.
#define YUVTORGB_SETUP                                      \
  "ld3r       {v24.8h, v25.8h, v26.8h}, [%[kUVBiasBGR]] \n" \
  "ld1r       {v31.4s}, [%[kYToRgb]]                    \n" \
  "ld2        {v27.8h, v28.8h}, [%[kUVToRB]]            \n" \
  "ld2        {v29.8h, v30.8h}, [%[kUVToG]]             \n"
78 
// YUVTORGB(vR, vG, vB): convert 8 pixels of YUV to 8-bit R, G and B.
// Inputs:  v0.8b = 8 Y, v1.8b = 4 U followed by 4 V (as left by the READ*
//          macros), constants in v24..v31 (as left by YUVTORGB_SETUP).
// Outputs: the registers named by vR, vG and vB each hold 8 bytes.
// Scratch: v0, v1, v2, v3, v5, v6, v7.
// Each chroma byte is duplicated so that one U/V sample covers two adjacent
// pixels. All channel sums use saturating 16-bit arithmetic and are finally
// shifted right by 6 and narrowed to unsigned 8-bit with saturation.
#define YUVTORGB(vR, vG, vB)                                                \
  "uxtl       v0.8h, v0.8b                   \n" /* Y: u8 -> u16         */ \
  "shll       v2.8h, v1.8b, #8               \n" /* UV << 8              */ \
  "ushll2     v3.4s, v0.8h, #0               \n" /* Y[4..7] -> u32       */ \
  "ushll      v0.4s, v0.4h, #0               \n" /* Y[0..3] -> u32       */ \
  "mul        v3.4s, v3.4s, v31.4s           \n" /* Y * kYToRgb          */ \
  "mul        v0.4s, v0.4s, v31.4s           \n"                            \
  "sqshrun    v0.4h, v0.4s, #16              \n" /* >> 16, narrow to u16 */ \
  "sqshrun2   v0.8h, v3.4s, #16              \n"                            \
  "uaddw      v1.8h, v2.8h, v1.8b            \n" /* duplicate each UV    */ \
  "mov        v2.d[0], v1.d[1]               \n" /* V bytes -> v2        */ \
  "uxtl       v2.8h, v2.8b                   \n" /* V: u8 -> u16         */ \
  "uxtl       v1.8h, v1.8b                   \n" /* U: u8 -> u16         */ \
  "mul        v3.8h, v1.8h, v27.8h           \n" /* U term for B         */ \
  "mul        v5.8h, v1.8h, v29.8h           \n" /* U term for G         */ \
  "mul        v6.8h, v2.8h, v30.8h           \n" /* V term for G         */ \
  "mul        v7.8h, v2.8h, v28.8h           \n" /* V term for R         */ \
  "sqadd      v6.8h, v6.8h, v5.8h            \n" /* combined G term      */ \
  "sqadd      " #vB ".8h, v24.8h, v0.8h      \n" /* B = bias + Y         */ \
  "sqadd      " #vG ".8h, v25.8h, v0.8h      \n" /* G = bias + Y         */ \
  "sqadd      " #vR ".8h, v26.8h, v0.8h      \n" /* R = bias + Y         */ \
  "sqadd      " #vB ".8h, " #vB ".8h, v3.8h  \n" /* B += U term          */ \
  "sqsub      " #vG ".8h, " #vG ".8h, v6.8h  \n" /* G -= UV term         */ \
  "sqadd      " #vR ".8h, " #vR ".8h, v7.8h  \n" /* R += V term          */ \
  "sqshrun    " #vB ".8b, " #vB ".8h, #6     \n" /* B -> u8              */ \
  "sqshrun    " #vG ".8b, " #vG ".8h, #6     \n" /* G -> u8              */ \
  "sqshrun    " #vR ".8b, " #vR ".8h, #6     \n" /* R -> u8              */
114 
I444ToARGBRow_NEON(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)115 void I444ToARGBRow_NEON(const uint8_t* src_y,
116                         const uint8_t* src_u,
117                         const uint8_t* src_v,
118                         uint8_t* dst_argb,
119                         const struct YuvConstants* yuvconstants,
120                         int width) {
121   asm volatile (
122     YUVTORGB_SETUP
123     "movi       v23.8b, #255                   \n" /* A */
124   "1:                                          \n"
125     READYUV444
126     YUVTORGB(v22, v21, v20)
127     "subs       %w4, %w4, #8                   \n"
128     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
129     "b.gt       1b                             \n"
130     : "+r"(src_y),     // %0
131       "+r"(src_u),     // %1
132       "+r"(src_v),     // %2
133       "+r"(dst_argb),  // %3
134       "+r"(width)      // %4
135     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
136       [kUVToG]"r"(&yuvconstants->kUVToG),
137       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
138       [kYToRgb]"r"(&yuvconstants->kYToRgb)
139     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
140       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
141   );
142 }
143 
I422ToARGBRow_NEON(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)144 void I422ToARGBRow_NEON(const uint8_t* src_y,
145                         const uint8_t* src_u,
146                         const uint8_t* src_v,
147                         uint8_t* dst_argb,
148                         const struct YuvConstants* yuvconstants,
149                         int width) {
150   asm volatile (
151     YUVTORGB_SETUP
152     "movi       v23.8b, #255                   \n" /* A */
153   "1:                                          \n"
154     READYUV422
155     YUVTORGB(v22, v21, v20)
156     "subs       %w4, %w4, #8                   \n"
157     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
158     "b.gt       1b                             \n"
159     : "+r"(src_y),     // %0
160       "+r"(src_u),     // %1
161       "+r"(src_v),     // %2
162       "+r"(dst_argb),  // %3
163       "+r"(width)      // %4
164     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
165       [kUVToG]"r"(&yuvconstants->kUVToG),
166       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
167       [kYToRgb]"r"(&yuvconstants->kYToRgb)
168     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
169       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
170   );
171 }
172 
I422AlphaToARGBRow_NEON(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,const uint8_t * src_a,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)173 void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
174                              const uint8_t* src_u,
175                              const uint8_t* src_v,
176                              const uint8_t* src_a,
177                              uint8_t* dst_argb,
178                              const struct YuvConstants* yuvconstants,
179                              int width) {
180   asm volatile (
181     YUVTORGB_SETUP
182   "1:                                          \n"
183     READYUV422
184     YUVTORGB(v22, v21, v20)
185     "ld1        {v23.8b}, [%3], #8             \n"
186     "subs       %w5, %w5, #8                   \n"
187     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32     \n"
188     "b.gt       1b                             \n"
189     : "+r"(src_y),     // %0
190       "+r"(src_u),     // %1
191       "+r"(src_v),     // %2
192       "+r"(src_a),     // %3
193       "+r"(dst_argb),  // %4
194       "+r"(width)      // %5
195     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
196       [kUVToG]"r"(&yuvconstants->kUVToG),
197       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
198       [kYToRgb]"r"(&yuvconstants->kYToRgb)
199     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
200       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
201   );
202 }
203 
I422ToRGBARow_NEON(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_rgba,const struct YuvConstants * yuvconstants,int width)204 void I422ToRGBARow_NEON(const uint8_t* src_y,
205                         const uint8_t* src_u,
206                         const uint8_t* src_v,
207                         uint8_t* dst_rgba,
208                         const struct YuvConstants* yuvconstants,
209                         int width) {
210   asm volatile (
211     YUVTORGB_SETUP
212     "movi       v20.8b, #255                   \n" /* A */
213   "1:                                          \n"
214     READYUV422
215     YUVTORGB(v23, v22, v21)
216     "subs       %w4, %w4, #8                   \n"
217     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
218     "b.gt       1b                             \n"
219     : "+r"(src_y),     // %0
220       "+r"(src_u),     // %1
221       "+r"(src_v),     // %2
222       "+r"(dst_rgba),  // %3
223       "+r"(width)      // %4
224     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
225       [kUVToG]"r"(&yuvconstants->kUVToG),
226       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
227       [kYToRgb]"r"(&yuvconstants->kYToRgb)
228     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
229       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
230   );
231 }
232 
I422ToRGB24Row_NEON(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_rgb24,const struct YuvConstants * yuvconstants,int width)233 void I422ToRGB24Row_NEON(const uint8_t* src_y,
234                          const uint8_t* src_u,
235                          const uint8_t* src_v,
236                          uint8_t* dst_rgb24,
237                          const struct YuvConstants* yuvconstants,
238                          int width) {
239   asm volatile (
240     YUVTORGB_SETUP
241   "1:                                          \n"
242     READYUV422
243     YUVTORGB(v22, v21, v20)
244     "subs       %w4, %w4, #8                   \n"
245     "st3        {v20.8b,v21.8b,v22.8b}, [%3], #24     \n"
246     "b.gt       1b                             \n"
247     : "+r"(src_y),     // %0
248       "+r"(src_u),     // %1
249       "+r"(src_v),     // %2
250       "+r"(dst_rgb24), // %3
251       "+r"(width)      // %4
252     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
253       [kUVToG]"r"(&yuvconstants->kUVToG),
254       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
255       [kYToRgb]"r"(&yuvconstants->kYToRgb)
256     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
257       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
258   );
259 }
260 
// ARGBTORGB565: pack 8 pixels into 16-bit RGB565.
// Inputs:  v20.8b = B, v21.8b = G, v22.8b = R.
// Output:  v0.8h, each lane laid out as RRRRRGGG GGGBBBBB (R in the top
//          5 bits, G in the next 6, B in the low 5).
// Each channel is first moved to the top byte of a 16-bit lane; sri (shift
// right and insert) then slots G and B in below the bits already placed.
// v20 and v21 are overwritten.
#define ARGBTORGB565                                                        \
  "shll       v0.8h,  v22.8b, #8             \n" /* R                    */ \
  "shll       v21.8h, v21.8b, #8             \n" /* G                    */ \
  "shll       v20.8h, v20.8b, #8             \n" /* B                    */ \
  "sri        v0.8h,  v21.8h, #5             \n" /* RG                   */ \
  "sri        v0.8h,  v20.8h, #11            \n" /* RGB                  */
267 
I422ToRGB565Row_NEON(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_rgb565,const struct YuvConstants * yuvconstants,int width)268 void I422ToRGB565Row_NEON(const uint8_t* src_y,
269                           const uint8_t* src_u,
270                           const uint8_t* src_v,
271                           uint8_t* dst_rgb565,
272                           const struct YuvConstants* yuvconstants,
273                           int width) {
274   asm volatile(
275       YUVTORGB_SETUP
276       "1:                                        \n" READYUV422 YUVTORGB(
277           v22, v21,
278           v20) "subs       %w4, %w4, #8                   \n" ARGBTORGB565
279                "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels
280                                                                // RGB565.
281                "b.gt       1b                             \n"
282       : "+r"(src_y),       // %0
283         "+r"(src_u),       // %1
284         "+r"(src_v),       // %2
285         "+r"(dst_rgb565),  // %3
286         "+r"(width)        // %4
287       : [kUVToRB] "r"(&yuvconstants->kUVToRB),
288         [kUVToG] "r"(&yuvconstants->kUVToG),
289         [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
290         [kYToRgb] "r"(&yuvconstants->kYToRgb)
291       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
292         "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
293 }
294 
// ARGBTOARGB1555: pack 8 pixels into 16-bit ARGB1555.
// Inputs:  v20.8b = B, v21.8b = G, v22.8b = R, v23.8b = A.
// Output:  v0.8h, each lane laid out as ARRRRRGG GGGBBBBB (top bit of A,
//          then the top 5 bits of R, G and B).
// Each channel is first moved to the top byte of a 16-bit lane; sri (shift
// right and insert) then slots R, G and B in below the bits already placed.
// v20, v21 and v22 are overwritten.
#define ARGBTOARGB1555                                                      \
  "shll       v0.8h,  v23.8b, #8             \n" /* A                    */ \
  "shll       v22.8h, v22.8b, #8             \n" /* R                    */ \
  "shll       v21.8h, v21.8b, #8             \n" /* G                    */ \
  "shll       v20.8h, v20.8b, #8             \n" /* B                    */ \
  "sri        v0.8h,  v22.8h, #1             \n" /* AR                   */ \
  "sri        v0.8h,  v21.8h, #6             \n" /* ARG                  */ \
  "sri        v0.8h,  v20.8h, #11            \n" /* ARGB                 */
303 
I422ToARGB1555Row_NEON(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_argb1555,const struct YuvConstants * yuvconstants,int width)304 void I422ToARGB1555Row_NEON(const uint8_t* src_y,
305                             const uint8_t* src_u,
306                             const uint8_t* src_v,
307                             uint8_t* dst_argb1555,
308                             const struct YuvConstants* yuvconstants,
309                             int width) {
310   asm volatile(
311       YUVTORGB_SETUP
312       "movi       v23.8b, #255                   \n"
313       "1:                                        \n" READYUV422 YUVTORGB(
314           v22, v21,
315           v20) "subs       %w4, %w4, #8                   \n" ARGBTOARGB1555
316                "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels
317                                                                // RGB565.
318                "b.gt       1b                             \n"
319       : "+r"(src_y),         // %0
320         "+r"(src_u),         // %1
321         "+r"(src_v),         // %2
322         "+r"(dst_argb1555),  // %3
323         "+r"(width)          // %4
324       : [kUVToRB] "r"(&yuvconstants->kUVToRB),
325         [kUVToG] "r"(&yuvconstants->kUVToG),
326         [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
327         [kYToRgb] "r"(&yuvconstants->kYToRgb)
328       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
329         "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
330 }
331 
// ARGBTOARGB4444: pack 8 pixels into 16-bit ARGB4444.
// Inputs:  v20.8b = B, v21.8b = G, v22.8b = R, v23.8b = A, v4.8b = 0x0f.
// Output:  v0.16b, 8 pixels of two bytes each: first byte = G high nibble
//          with B in the low nibble, second byte = A high nibble with R in
//          the low nibble.
// B and R are shifted down to their top 4 bits; G and A keep their top 4
// bits in place (bic clears the low nibble using the 0x0f mask in v4).
// v1, v20, v21, v22 and v23 are overwritten.
#define ARGBTOARGB4444                                                       \
  "ushr       v20.8b, v20.8b, #4             \n" /* B                    */  \
  "bic        v21.8b, v21.8b, v4.8b          \n" /* G                    */  \
  "ushr       v22.8b, v22.8b, #4             \n" /* R                    */  \
  "bic        v23.8b, v23.8b, v4.8b          \n" /* A                    */  \
  "orr        v0.8b,  v20.8b, v21.8b         \n" /* BG                   */  \
  "orr        v1.8b,  v22.8b, v23.8b         \n" /* RA                   */  \
  "zip1       v0.16b, v0.16b, v1.16b         \n" /* BGRA                 */
341 
I422ToARGB4444Row_NEON(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_argb4444,const struct YuvConstants * yuvconstants,int width)342 void I422ToARGB4444Row_NEON(const uint8_t* src_y,
343                             const uint8_t* src_u,
344                             const uint8_t* src_v,
345                             uint8_t* dst_argb4444,
346                             const struct YuvConstants* yuvconstants,
347                             int width) {
348   asm volatile (
349     YUVTORGB_SETUP
350     "movi       v4.16b, #0x0f                  \n"  // bits to clear with vbic.
351   "1:                                          \n"
352     READYUV422
353     YUVTORGB(v22, v21, v20)
354     "subs       %w4, %w4, #8                   \n"
355     "movi       v23.8b, #255                   \n"
356     ARGBTOARGB4444
357     "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels ARGB4444.
358     "b.gt       1b                             \n"
359     : "+r"(src_y),    // %0
360       "+r"(src_u),    // %1
361       "+r"(src_v),    // %2
362       "+r"(dst_argb4444),  // %3
363       "+r"(width)     // %4
364     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
365       [kUVToG]"r"(&yuvconstants->kUVToG),
366       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
367       [kYToRgb]"r"(&yuvconstants->kYToRgb)
368     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
369       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
370   );
371 }
372 
I400ToARGBRow_NEON(const uint8_t * src_y,uint8_t * dst_argb,int width)373 void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
374   asm volatile (
375     YUVTORGB_SETUP
376     "movi       v23.8b, #255                   \n"
377   "1:                                          \n"
378     READYUV400
379     YUVTORGB(v22, v21, v20)
380     "subs       %w2, %w2, #8                   \n"
381     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32     \n"
382     "b.gt       1b                             \n"
383     : "+r"(src_y),     // %0
384       "+r"(dst_argb),  // %1
385       "+r"(width)      // %2
386     : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB),
387       [kUVToG]"r"(&kYuvI601Constants.kUVToG),
388       [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR),
389       [kYToRgb]"r"(&kYuvI601Constants.kYToRgb)
390     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
391       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
392   );
393 }
394 
J400ToARGBRow_NEON(const uint8_t * src_y,uint8_t * dst_argb,int width)395 void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
396   asm volatile(
397       "movi       v23.8b, #255                   \n"
398       "1:                                        \n"
399       "ld1        {v20.8b}, [%0], #8             \n"
400       "orr        v21.8b, v20.8b, v20.8b         \n"
401       "orr        v22.8b, v20.8b, v20.8b         \n"
402       "subs       %w2, %w2, #8                   \n"
403       "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32     \n"
404       "b.gt       1b                             \n"
405       : "+r"(src_y),     // %0
406         "+r"(dst_argb),  // %1
407         "+r"(width)      // %2
408       :
409       : "cc", "memory", "v20", "v21", "v22", "v23");
410 }
411 
NV12ToARGBRow_NEON(const uint8_t * src_y,const uint8_t * src_uv,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)412 void NV12ToARGBRow_NEON(const uint8_t* src_y,
413                         const uint8_t* src_uv,
414                         uint8_t* dst_argb,
415                         const struct YuvConstants* yuvconstants,
416                         int width) {
417   asm volatile (
418     YUVTORGB_SETUP
419     "movi       v23.8b, #255                   \n"
420   "1:                                          \n"
421     READNV12
422     YUVTORGB(v22, v21, v20)
423     "subs       %w3, %w3, #8                   \n"
424     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32     \n"
425     "b.gt       1b                             \n"
426     : "+r"(src_y),     // %0
427       "+r"(src_uv),    // %1
428       "+r"(dst_argb),  // %2
429       "+r"(width)      // %3
430     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
431       [kUVToG]"r"(&yuvconstants->kUVToG),
432       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
433       [kYToRgb]"r"(&yuvconstants->kYToRgb)
434     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
435       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
436   );
437 }
438 
NV21ToARGBRow_NEON(const uint8_t * src_y,const uint8_t * src_vu,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)439 void NV21ToARGBRow_NEON(const uint8_t* src_y,
440                         const uint8_t* src_vu,
441                         uint8_t* dst_argb,
442                         const struct YuvConstants* yuvconstants,
443                         int width) {
444   asm volatile (
445     YUVTORGB_SETUP
446     "movi       v23.8b, #255                   \n"
447   "1:                                          \n"
448     READNV21
449     YUVTORGB(v22, v21, v20)
450     "subs       %w3, %w3, #8                   \n"
451     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32     \n"
452     "b.gt       1b                             \n"
453     : "+r"(src_y),     // %0
454       "+r"(src_vu),    // %1
455       "+r"(dst_argb),  // %2
456       "+r"(width)      // %3
457     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
458       [kUVToG]"r"(&yuvconstants->kUVToG),
459       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
460       [kYToRgb]"r"(&yuvconstants->kYToRgb)
461     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
462       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
463   );
464 }
465 
NV12ToRGB24Row_NEON(const uint8_t * src_y,const uint8_t * src_uv,uint8_t * dst_rgb24,const struct YuvConstants * yuvconstants,int width)466 void NV12ToRGB24Row_NEON(const uint8_t* src_y,
467                          const uint8_t* src_uv,
468                          uint8_t* dst_rgb24,
469                          const struct YuvConstants* yuvconstants,
470                          int width) {
471   asm volatile (
472     YUVTORGB_SETUP
473   "1:                                          \n"
474     READNV12
475     YUVTORGB(v22, v21, v20)
476     "subs       %w3, %w3, #8                   \n"
477     "st3        {v20.8b,v21.8b,v22.8b}, [%2], #24     \n"
478     "b.gt       1b                             \n"
479     : "+r"(src_y),     // %0
480       "+r"(src_uv),    // %1
481       "+r"(dst_rgb24),  // %2
482       "+r"(width)      // %3
483     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
484       [kUVToG]"r"(&yuvconstants->kUVToG),
485       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
486       [kYToRgb]"r"(&yuvconstants->kYToRgb)
487     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
488       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
489   );
490 }
491 
NV21ToRGB24Row_NEON(const uint8_t * src_y,const uint8_t * src_vu,uint8_t * dst_rgb24,const struct YuvConstants * yuvconstants,int width)492 void NV21ToRGB24Row_NEON(const uint8_t* src_y,
493                          const uint8_t* src_vu,
494                          uint8_t* dst_rgb24,
495                          const struct YuvConstants* yuvconstants,
496                          int width) {
497   asm volatile (
498     YUVTORGB_SETUP
499   "1:                                          \n"
500     READNV21
501     YUVTORGB(v22, v21, v20)
502     "subs       %w3, %w3, #8                   \n"
503     "st3        {v20.8b,v21.8b,v22.8b}, [%2], #24     \n"
504     "b.gt       1b                             \n"
505     : "+r"(src_y),     // %0
506       "+r"(src_vu),    // %1
507       "+r"(dst_rgb24),  // %2
508       "+r"(width)      // %3
509     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
510       [kUVToG]"r"(&yuvconstants->kUVToG),
511       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
512       [kYToRgb]"r"(&yuvconstants->kYToRgb)
513     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
514       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
515   );
516 }
517 
NV12ToRGB565Row_NEON(const uint8_t * src_y,const uint8_t * src_uv,uint8_t * dst_rgb565,const struct YuvConstants * yuvconstants,int width)518 void NV12ToRGB565Row_NEON(const uint8_t* src_y,
519                           const uint8_t* src_uv,
520                           uint8_t* dst_rgb565,
521                           const struct YuvConstants* yuvconstants,
522                           int width) {
523   asm volatile(
524       YUVTORGB_SETUP
525       "1:                                        \n" READNV12 YUVTORGB(
526           v22, v21,
527           v20) "subs       %w3, %w3, #8                   \n" ARGBTORGB565
528                "st1        {v0.8h}, [%2], 16              \n"  // store 8 pixels
529                                                                // RGB565.
530                "b.gt       1b                             \n"
531       : "+r"(src_y),       // %0
532         "+r"(src_uv),      // %1
533         "+r"(dst_rgb565),  // %2
534         "+r"(width)        // %3
535       : [kUVToRB] "r"(&yuvconstants->kUVToRB),
536         [kUVToG] "r"(&yuvconstants->kUVToG),
537         [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
538         [kYToRgb] "r"(&yuvconstants->kYToRgb)
539       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
540         "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
541 }
542 
YUY2ToARGBRow_NEON(const uint8_t * src_yuy2,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)543 void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
544                         uint8_t* dst_argb,
545                         const struct YuvConstants* yuvconstants,
546                         int width) {
547   asm volatile (
548     YUVTORGB_SETUP
549     "movi       v23.8b, #255                   \n"
550   "1:                                          \n"
551     READYUY2
552     YUVTORGB(v22, v21, v20)
553     "subs       %w2, %w2, #8                   \n"
554     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32      \n"
555     "b.gt       1b                             \n"
556     : "+r"(src_yuy2),  // %0
557       "+r"(dst_argb),  // %1
558       "+r"(width)      // %2
559     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
560       [kUVToG]"r"(&yuvconstants->kUVToG),
561       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
562       [kYToRgb]"r"(&yuvconstants->kYToRgb)
563     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
564       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
565   );
566 }
567 
UYVYToARGBRow_NEON(const uint8_t * src_uyvy,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)568 void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
569                         uint8_t* dst_argb,
570                         const struct YuvConstants* yuvconstants,
571                         int width) {
572   asm volatile (
573     YUVTORGB_SETUP
574     "movi       v23.8b, #255                   \n"
575   "1:                                          \n"
576     READUYVY
577     YUVTORGB(v22, v21, v20)
578     "subs       %w2, %w2, #8                   \n"
579     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32      \n"
580     "b.gt       1b                             \n"
581     : "+r"(src_uyvy),  // %0
582       "+r"(dst_argb),  // %1
583       "+r"(width)      // %2
584     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
585       [kUVToG]"r"(&yuvconstants->kUVToG),
586       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
587       [kYToRgb]"r"(&yuvconstants->kYToRgb)
588     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
589       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
590   );
591 }
592 
593 // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
SplitUVRow_NEON(const uint8_t * src_uv,uint8_t * dst_u,uint8_t * dst_v,int width)594 void SplitUVRow_NEON(const uint8_t* src_uv,
595                      uint8_t* dst_u,
596                      uint8_t* dst_v,
597                      int width) {
598   asm volatile(
599       "1:                                        \n"
600       "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pairs of UV
601       "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
602       "st1        {v0.16b}, [%1], #16            \n"  // store U
603       "st1        {v1.16b}, [%2], #16            \n"  // store V
604       "b.gt       1b                             \n"
605       : "+r"(src_uv),               // %0
606         "+r"(dst_u),                // %1
607         "+r"(dst_v),                // %2
608         "+r"(width)                 // %3  // Output registers
609       :                             // Input registers
610       : "cc", "memory", "v0", "v1"  // Clobber List
611       );
612 }
613 
614 // Reads 16 U's and V's and writes out 16 pairs of UV.
MergeUVRow_NEON(const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_uv,int width)615 void MergeUVRow_NEON(const uint8_t* src_u,
616                      const uint8_t* src_v,
617                      uint8_t* dst_uv,
618                      int width) {
619   asm volatile(
620       "1:                                        \n"
621       "ld1        {v0.16b}, [%0], #16            \n"  // load U
622       "ld1        {v1.16b}, [%1], #16            \n"  // load V
623       "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
624       "st2        {v0.16b,v1.16b}, [%2], #32     \n"  // store 16 pairs of UV
625       "b.gt       1b                             \n"
626       : "+r"(src_u),                // %0
627         "+r"(src_v),                // %1
628         "+r"(dst_uv),               // %2
629         "+r"(width)                 // %3  // Output registers
630       :                             // Input registers
631       : "cc", "memory", "v0", "v1"  // Clobber List
632       );
633 }
634 
635 // Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b.
SplitRGBRow_NEON(const uint8_t * src_rgb,uint8_t * dst_r,uint8_t * dst_g,uint8_t * dst_b,int width)636 void SplitRGBRow_NEON(const uint8_t* src_rgb,
637                       uint8_t* dst_r,
638                       uint8_t* dst_g,
639                       uint8_t* dst_b,
640                       int width) {
641   asm volatile(
642       "1:                                        \n"
643       "ld3        {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 RGB
644       "subs       %w4, %w4, #16                  \n"  // 16 processed per loop
645       "st1        {v0.16b}, [%1], #16            \n"  // store R
646       "st1        {v1.16b}, [%2], #16            \n"  // store G
647       "st1        {v2.16b}, [%3], #16            \n"  // store B
648       "b.gt       1b                             \n"
649       : "+r"(src_rgb),                    // %0
650         "+r"(dst_r),                      // %1
651         "+r"(dst_g),                      // %2
652         "+r"(dst_b),                      // %3
653         "+r"(width)                       // %4
654       :                                   // Input registers
655       : "cc", "memory", "v0", "v1", "v2"  // Clobber List
656       );
657 }
658 
659 // Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time
MergeRGBRow_NEON(const uint8_t * src_r,const uint8_t * src_g,const uint8_t * src_b,uint8_t * dst_rgb,int width)660 void MergeRGBRow_NEON(const uint8_t* src_r,
661                       const uint8_t* src_g,
662                       const uint8_t* src_b,
663                       uint8_t* dst_rgb,
664                       int width) {
665   asm volatile(
666       "1:                                        \n"
667       "ld1        {v0.16b}, [%0], #16            \n"  // load R
668       "ld1        {v1.16b}, [%1], #16            \n"  // load G
669       "ld1        {v2.16b}, [%2], #16            \n"  // load B
670       "subs       %w4, %w4, #16                  \n"  // 16 processed per loop
671       "st3        {v0.16b,v1.16b,v2.16b}, [%3], #48 \n"  // store 16 RGB
672       "b.gt       1b                             \n"
673       : "+r"(src_r),                      // %0
674         "+r"(src_g),                      // %1
675         "+r"(src_b),                      // %2
676         "+r"(dst_rgb),                    // %3
677         "+r"(width)                       // %4
678       :                                   // Input registers
679       : "cc", "memory", "v0", "v1", "v2"  // Clobber List
680       );
681 }
682 
683 // Copy multiple of 32.
CopyRow_NEON(const uint8_t * src,uint8_t * dst,int width)684 void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
685   asm volatile(
686       "1:                                        \n"
687       "ldp        q0, q1, [%0], #32              \n"
688       "subs       %w2, %w2, #32                  \n"  // 32 processed per loop
689       "stp        q0, q1, [%1], #32              \n"
690       "b.gt       1b                             \n"
691       : "+r"(src),                  // %0
692         "+r"(dst),                  // %1
693         "+r"(width)                 // %2  // Output registers
694       :                             // Input registers
695       : "cc", "memory", "v0", "v1"  // Clobber List
696       );
697 }
698 
699 // SetRow writes 'width' bytes using an 8 bit value repeated.
SetRow_NEON(uint8_t * dst,uint8_t v8,int width)700 void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
701   asm volatile(
702       "dup        v0.16b, %w2                    \n"  // duplicate 16 bytes
703       "1:                                        \n"
704       "subs       %w1, %w1, #16                  \n"  // 16 bytes per loop
705       "st1        {v0.16b}, [%0], #16            \n"  // store
706       "b.gt       1b                             \n"
707       : "+r"(dst),   // %0
708         "+r"(width)  // %1
709       : "r"(v8)      // %2
710       : "cc", "memory", "v0");
711 }
712 
ARGBSetRow_NEON(uint8_t * dst,uint32_t v32,int width)713 void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
714   asm volatile(
715       "dup        v0.4s, %w2                     \n"  // duplicate 4 ints
716       "1:                                        \n"
717       "subs       %w1, %w1, #4                   \n"  // 4 ints per loop
718       "st1        {v0.16b}, [%0], #16            \n"  // store
719       "b.gt       1b                             \n"
720       : "+r"(dst),   // %0
721         "+r"(width)  // %1
722       : "r"(v32)     // %2
723       : "cc", "memory", "v0");
724 }
725 
MirrorRow_NEON(const uint8_t * src,uint8_t * dst,int width)726 void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
727   asm volatile(
728       // Start at end of source row.
729       "add        %0, %0, %w2, sxtw              \n"
730       "sub        %0, %0, #16                    \n"
731       "1:                                        \n"
732       "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
733       "subs       %w2, %w2, #16                  \n"  // 16 pixels per loop.
734       "rev64      v0.16b, v0.16b                 \n"
735       "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
736       "st1        {v0.D}[0], [%1], #8            \n"
737       "b.gt       1b                             \n"
738       : "+r"(src),           // %0
739         "+r"(dst),           // %1
740         "+r"(width)          // %2
741       : "r"((ptrdiff_t)-16)  // %3
742       : "cc", "memory", "v0");
743 }
744 
MirrorUVRow_NEON(const uint8_t * src_uv,uint8_t * dst_u,uint8_t * dst_v,int width)745 void MirrorUVRow_NEON(const uint8_t* src_uv,
746                       uint8_t* dst_u,
747                       uint8_t* dst_v,
748                       int width) {
749   asm volatile(
750       // Start at end of source row.
751       "add        %0, %0, %w3, sxtw #1           \n"
752       "sub        %0, %0, #16                    \n"
753       "1:                                        \n"
754       "ld2        {v0.8b, v1.8b}, [%0], %4       \n"  // src -= 16
755       "subs       %w3, %w3, #8                   \n"  // 8 pixels per loop.
756       "rev64      v0.8b, v0.8b                   \n"
757       "rev64      v1.8b, v1.8b                   \n"
758       "st1        {v0.8b}, [%1], #8              \n"  // dst += 8
759       "st1        {v1.8b}, [%2], #8              \n"
760       "b.gt       1b                             \n"
761       : "+r"(src_uv),        // %0
762         "+r"(dst_u),         // %1
763         "+r"(dst_v),         // %2
764         "+r"(width)          // %3
765       : "r"((ptrdiff_t)-16)  // %4
766       : "cc", "memory", "v0", "v1");
767 }
768 
ARGBMirrorRow_NEON(const uint8_t * src,uint8_t * dst,int width)769 void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
770   asm volatile(
771       // Start at end of source row.
772       "add        %0, %0, %w2, sxtw #2           \n"
773       "sub        %0, %0, #16                    \n"
774       "1:                                        \n"
775       "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
776       "subs       %w2, %w2, #4                   \n"  // 4 pixels per loop.
777       "rev64      v0.4s, v0.4s                   \n"
778       "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
779       "st1        {v0.D}[0], [%1], #8            \n"
780       "b.gt       1b                             \n"
781       : "+r"(src),           // %0
782         "+r"(dst),           // %1
783         "+r"(width)          // %2
784       : "r"((ptrdiff_t)-16)  // %3
785       : "cc", "memory", "v0");
786 }
787 
RGB24ToARGBRow_NEON(const uint8_t * src_rgb24,uint8_t * dst_argb,int width)788 void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
789                          uint8_t* dst_argb,
790                          int width) {
791   asm volatile(
792       "movi       v4.8b, #255                    \n"  // Alpha
793       "1:                                        \n"
794       "ld3        {v1.8b,v2.8b,v3.8b}, [%0], #24 \n"  // load 8 pixels of RGB24.
795       "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
796       "st4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n"  // store 8 ARGB
797       "b.gt       1b                             \n"
798       : "+r"(src_rgb24),  // %0
799         "+r"(dst_argb),   // %1
800         "+r"(width)       // %2
801       :
802       : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
803       );
804 }
805 
RAWToARGBRow_NEON(const uint8_t * src_raw,uint8_t * dst_argb,int width)806 void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
807   asm volatile(
808       "movi       v5.8b, #255                    \n"  // Alpha
809       "1:                                        \n"
810       "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
811       "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
812       "orr        v3.8b, v1.8b, v1.8b            \n"  // move g
813       "orr        v4.8b, v0.8b, v0.8b            \n"  // move r
814       "st4        {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n"  // store b g r a
815       "b.gt       1b                             \n"
816       : "+r"(src_raw),   // %0
817         "+r"(dst_argb),  // %1
818         "+r"(width)      // %2
819       :
820       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"  // Clobber List
821       );
822 }
823 
RAWToRGB24Row_NEON(const uint8_t * src_raw,uint8_t * dst_rgb24,int width)824 void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
825   asm volatile(
826       "1:                                        \n"
827       "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
828       "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
829       "orr        v3.8b, v1.8b, v1.8b            \n"  // move g
830       "orr        v4.8b, v0.8b, v0.8b            \n"  // move r
831       "st3        {v2.8b,v3.8b,v4.8b}, [%1], #24 \n"  // store b g r
832       "b.gt       1b                             \n"
833       : "+r"(src_raw),    // %0
834         "+r"(dst_rgb24),  // %1
835         "+r"(width)       // %2
836       :
837       : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
838       );
839 }
840 
// Expands the 8 RGB565 pixels in v0.8h to 8-bit channels by replicating the
// top bits of each field into the vacated low bits.
// Result (low 64 bits of each): v0 = B, v1 = G, v2 = R. Scratch: v4, v6.
#define RGB565TOARGB                                                        \
  "shrn       v6.8b, v0.8h, #5               \n" /* G: xxGGGGGG          */ \
  "shl        v6.8b, v6.8b, #2               \n" /* G: GGGGGG00          */ \
  "ushr       v4.8b, v6.8b, #6               \n" /* G: 000000GG          */ \
  "orr        v1.8b, v4.8b, v6.8b            \n" /* v1 = 8-bit G         */ \
  "xtn        v2.8b, v0.8h                   \n" /* B: xxxBBBBB (low)    */ \
  "ushr       v0.8h, v0.8h, #11              \n" /* R: 000RRRRR          */ \
  "xtn2       v2.16b, v0.8h                  \n" /* R into high half     */ \
  "shl        v2.16b, v2.16b, #3             \n" /* R,B: xxxxx000        */ \
  "ushr       v0.16b, v2.16b, #5             \n" /* R,B: 00000xxx        */ \
  "orr        v0.16b, v0.16b, v2.16b         \n" /* v0 = B low, R high   */ \
  "dup        v2.2d, v0.d[1]                 \n" /* v2 = 8-bit R         */
853 
RGB565ToARGBRow_NEON(const uint8_t * src_rgb565,uint8_t * dst_argb,int width)854 void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
855                           uint8_t* dst_argb,
856                           int width) {
857   asm volatile(
858       "movi       v3.8b, #255                    \n"  // Alpha
859       "1:                                        \n"
860       "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
861       "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
862       RGB565TOARGB
863       "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB
864       "b.gt       1b                             \n"
865       : "+r"(src_rgb565),  // %0
866         "+r"(dst_argb),    // %1
867         "+r"(width)        // %2
868       :
869       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6"  // Clobber List
870       );
871 }
872 
// Expands the 8 ARGB1555 pixels in v0.8h to 8-bit channels. The 5-bit colour
// fields get their top 3 bits replicated into the low bits; the 1-bit alpha is
// sign-extended to 0x00 or 0xff.
// Result (low 64 bits of each): v0 = B, v1 = G, v2 = R, v3 = A.
#define ARGB1555TOARGB                                                      \
  "ushr       v2.8h, v0.8h, #10              \n" /* R: xxxRRRRR          */ \
  "shl        v2.8h, v2.8h, #3               \n" /* R: RRRRR000          */ \
  "xtn        v3.8b, v2.8h                   \n" /* v3 low = RRRRR000    */ \
                                                                            \
  "sshr       v2.8h, v0.8h, #15              \n" /* A: AAAAAAAA          */ \
  "xtn2       v3.16b, v2.8h                  \n" /* v3 high = A          */ \
                                                                            \
  "xtn        v2.8b, v0.8h                   \n" /* B: xxxBBBBB (low)    */ \
  "shrn2      v2.16b, v0.8h, #5              \n" /* G: xxxGGGGG (high)   */ \
                                                                            \
  "ushr       v1.16b, v3.16b, #5             \n" /* R,A: low 3 bits      */ \
  "shl        v0.16b, v2.16b, #3             \n" /* B,G: xxxxx000        */ \
  "ushr       v2.16b, v0.16b, #5             \n" /* B,G: 00000xxx        */ \
                                                                            \
  "orr        v0.16b, v0.16b, v2.16b         \n" /* v0 = B low, G high   */ \
  "orr        v2.16b, v1.16b, v3.16b         \n" /* v2 = R low, A high   */ \
  "dup        v1.2d, v0.d[1]                 \n" /* v1 = G               */ \
  "dup        v3.2d, v2.d[1]                 \n" /* v3 = A               */
892 
// Same colour expansion as ARGB1555TOARGB, but the alpha bit is ignored and
// v3 is used only as scratch for the red channel.
// Result (low 64 bits of each): v0 = B, v1 = G, v2 = R.
#define RGB555TOARGB                                                        \
  "ushr       v2.8h, v0.8h, #10              \n" /* R: xxxRRRRR          */ \
  "shl        v2.8h, v2.8h, #3               \n" /* R: RRRRR000          */ \
  "xtn        v3.8b, v2.8h                   \n" /* v3 low = RRRRR000    */ \
                                                                            \
  "xtn        v2.8b, v0.8h                   \n" /* B: xxxBBBBB (low)    */ \
  "shrn2      v2.16b, v0.8h, #5              \n" /* G: xxxGGGGG (high)   */ \
                                                                            \
  "ushr       v1.16b, v3.16b, #5             \n" /* R: 00000RRR          */ \
  "shl        v0.16b, v2.16b, #3             \n" /* B,G: xxxxx000        */ \
  "ushr       v2.16b, v0.16b, #5             \n" /* B,G: 00000xxx        */ \
                                                                            \
  "orr        v0.16b, v0.16b, v2.16b         \n" /* v0 = B low, G high   */ \
  "orr        v2.16b, v1.16b, v3.16b         \n" /* v2 = R               */ \
  "dup        v1.2d, v0.d[1]                 \n" /* v1 = G               */
909 
ARGB1555ToARGBRow_NEON(const uint8_t * src_argb1555,uint8_t * dst_argb,int width)910 void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
911                             uint8_t* dst_argb,
912                             int width) {
913   asm volatile(
914       "movi       v3.8b, #255                    \n"  // Alpha
915       "1:                                        \n"
916       "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
917       "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
918       ARGB1555TOARGB
919       "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB
920                                                             // pixels
921       "b.gt       1b                             \n"
922       : "+r"(src_argb1555),  // %0
923         "+r"(dst_argb),      // %1
924         "+r"(width)          // %2
925       :
926       : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
927       );
928 }
929 
// Expands the 8 ARGB4444 pixels in v0.8h to 8-bit channels by duplicating
// each nibble into both halves of its byte.
// Result (low 64 bits of each): v0 = B, v1 = G, v2 = R, v3 = A.
#define ARGB4444TOARGB                                                      \
  "shrn       v1.8b, v0.8h, #8               \n" /* v1 low  = AR bytes   */ \
  "xtn2       v1.16b, v0.8h                  \n" /* v1 high = GB bytes   */ \
  "shl        v2.16b, v1.16b, #4             \n" /* R,B: xxxx0000        */ \
  "ushr       v3.16b, v1.16b, #4             \n" /* A,G: 0000xxxx        */ \
  "ushr       v0.16b, v2.16b, #4             \n" /* R,B: 0000xxxx        */ \
  "shl        v1.16b, v3.16b, #4             \n" /* A,G: xxxx0000        */ \
  "orr        v2.16b, v0.16b, v2.16b         \n" /* v2 = R low, B high   */ \
  "orr        v3.16b, v1.16b, v3.16b         \n" /* v3 = A low, G high   */ \
  "dup        v0.2d, v2.d[1]                 \n" /* v0 = B               */ \
  "dup        v1.2d, v3.d[1]                 \n" /* v1 = G               */
941 
ARGB4444ToARGBRow_NEON(const uint8_t * src_argb4444,uint8_t * dst_argb,int width)942 void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
943                             uint8_t* dst_argb,
944                             int width) {
945   asm volatile(
946       "1:                                        \n"
947       "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
948       "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
949       ARGB4444TOARGB
950       "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB
951                                                             // pixels
952       "b.gt       1b                             \n"
953       : "+r"(src_argb4444),  // %0
954         "+r"(dst_argb),      // %1
955         "+r"(width)          // %2
956       :
957       : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
958       );
959 }
960 
ARGBToRGB24Row_NEON(const uint8_t * src_argb,uint8_t * dst_rgb24,int width)961 void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
962                          uint8_t* dst_rgb24,
963                          int width) {
964   asm volatile(
965       "1:                                        \n"
966       "ld4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load 8 ARGB
967       "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
968       "st3        {v1.8b,v2.8b,v3.8b}, [%1], #24 \n"  // store 8 pixels of
969                                                       // RGB24.
970       "b.gt       1b                             \n"
971       : "+r"(src_argb),   // %0
972         "+r"(dst_rgb24),  // %1
973         "+r"(width)       // %2
974       :
975       : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
976       );
977 }
978 
ARGBToRAWRow_NEON(const uint8_t * src_argb,uint8_t * dst_raw,int width)979 void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
980   asm volatile(
981       "1:                                        \n"
982       "ld4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load b g r a
983       "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
984       "orr        v4.8b, v2.8b, v2.8b            \n"  // mov g
985       "orr        v5.8b, v1.8b, v1.8b            \n"  // mov b
986       "st3        {v3.8b,v4.8b,v5.8b}, [%1], #24 \n"  // store r g b
987       "b.gt       1b                             \n"
988       : "+r"(src_argb),  // %0
989         "+r"(dst_raw),   // %1
990         "+r"(width)      // %2
991       :
992       : "cc", "memory", "v1", "v2", "v3", "v4", "v5"  // Clobber List
993       );
994 }
995 
YUY2ToYRow_NEON(const uint8_t * src_yuy2,uint8_t * dst_y,int width)996 void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
997   asm volatile(
998       "1:                                        \n"
999       "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pixels of YUY2.
1000       "subs       %w2, %w2, #16                  \n"  // 16 processed per loop.
1001       "st1        {v0.16b}, [%1], #16            \n"  // store 16 pixels of Y.
1002       "b.gt       1b                             \n"
1003       : "+r"(src_yuy2),  // %0
1004         "+r"(dst_y),     // %1
1005         "+r"(width)      // %2
1006       :
1007       : "cc", "memory", "v0", "v1"  // Clobber List
1008       );
1009 }
1010 
UYVYToYRow_NEON(const uint8_t * src_uyvy,uint8_t * dst_y,int width)1011 void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
1012   asm volatile(
1013       "1:                                        \n"
1014       "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pixels of UYVY.
1015       "subs       %w2, %w2, #16                  \n"  // 16 processed per loop.
1016       "st1        {v1.16b}, [%1], #16            \n"  // store 16 pixels of Y.
1017       "b.gt       1b                             \n"
1018       : "+r"(src_uyvy),  // %0
1019         "+r"(dst_y),     // %1
1020         "+r"(width)      // %2
1021       :
1022       : "cc", "memory", "v0", "v1"  // Clobber List
1023       );
1024 }
1025 
YUY2ToUV422Row_NEON(const uint8_t * src_yuy2,uint8_t * dst_u,uint8_t * dst_v,int width)1026 void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
1027                          uint8_t* dst_u,
1028                          uint8_t* dst_v,
1029                          int width) {
1030   asm volatile(
1031       "1:                                        \n"
1032       "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 YUY2
1033       "subs       %w3, %w3, #16                  \n"  // 16 pixels = 8 UVs.
1034       "st1        {v1.8b}, [%1], #8              \n"  // store 8 U.
1035       "st1        {v3.8b}, [%2], #8              \n"  // store 8 V.
1036       "b.gt       1b                             \n"
1037       : "+r"(src_yuy2),  // %0
1038         "+r"(dst_u),     // %1
1039         "+r"(dst_v),     // %2
1040         "+r"(width)      // %3
1041       :
1042       : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
1043       );
1044 }
1045 
UYVYToUV422Row_NEON(const uint8_t * src_uyvy,uint8_t * dst_u,uint8_t * dst_v,int width)1046 void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
1047                          uint8_t* dst_u,
1048                          uint8_t* dst_v,
1049                          int width) {
1050   asm volatile(
1051       "1:                                        \n"
1052       "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 UYVY
1053       "subs       %w3, %w3, #16                  \n"  // 16 pixels = 8 UVs.
1054       "st1        {v0.8b}, [%1], #8              \n"  // store 8 U.
1055       "st1        {v2.8b}, [%2], #8              \n"  // store 8 V.
1056       "b.gt       1b                             \n"
1057       : "+r"(src_uyvy),  // %0
1058         "+r"(dst_u),     // %1
1059         "+r"(dst_v),     // %2
1060         "+r"(width)      // %3
1061       :
1062       : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
1063       );
1064 }
1065 
YUY2ToUVRow_NEON(const uint8_t * src_yuy2,int stride_yuy2,uint8_t * dst_u,uint8_t * dst_v,int width)1066 void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
1067                       int stride_yuy2,
1068                       uint8_t* dst_u,
1069                       uint8_t* dst_v,
1070                       int width) {
1071   const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2;
1072   asm volatile(
1073       "1:                                        \n"
1074       "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
1075       "subs       %w4, %w4, #16                  \n"  // 16 pixels = 8 UVs.
1076       "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
1077       "urhadd     v1.8b, v1.8b, v5.8b            \n"        // average rows of U
1078       "urhadd     v3.8b, v3.8b, v7.8b            \n"        // average rows of V
1079       "st1        {v1.8b}, [%2], #8              \n"        // store 8 U.
1080       "st1        {v3.8b}, [%3], #8              \n"        // store 8 V.
1081       "b.gt       1b                             \n"
1082       : "+r"(src_yuy2),   // %0
1083         "+r"(src_yuy2b),  // %1
1084         "+r"(dst_u),      // %2
1085         "+r"(dst_v),      // %3
1086         "+r"(width)       // %4
1087       :
1088       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
1089         "v7"  // Clobber List
1090       );
1091 }
1092 
UYVYToUVRow_NEON(const uint8_t * src_uyvy,int stride_uyvy,uint8_t * dst_u,uint8_t * dst_v,int width)1093 void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
1094                       int stride_uyvy,
1095                       uint8_t* dst_u,
1096                       uint8_t* dst_v,
1097                       int width) {
1098   const uint8_t* src_uyvyb = src_uyvy + stride_uyvy;
1099   asm volatile(
1100       "1:                                        \n"
1101       "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
1102       "subs       %w4, %w4, #16                  \n"  // 16 pixels = 8 UVs.
1103       "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
1104       "urhadd     v0.8b, v0.8b, v4.8b            \n"        // average rows of U
1105       "urhadd     v2.8b, v2.8b, v6.8b            \n"        // average rows of V
1106       "st1        {v0.8b}, [%2], #8              \n"        // store 8 U.
1107       "st1        {v2.8b}, [%3], #8              \n"        // store 8 V.
1108       "b.gt       1b                             \n"
1109       : "+r"(src_uyvy),   // %0
1110         "+r"(src_uyvyb),  // %1
1111         "+r"(dst_u),      // %2
1112         "+r"(dst_v),      // %3
1113         "+r"(width)       // %4
1114       :
1115       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
1116         "v7"  // Clobber List
1117       );
1118 }
1119 
1120 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
ARGBShuffleRow_NEON(const uint8_t * src_argb,uint8_t * dst_argb,const uint8_t * shuffler,int width)1121 void ARGBShuffleRow_NEON(const uint8_t* src_argb,
1122                          uint8_t* dst_argb,
1123                          const uint8_t* shuffler,
1124                          int width) {
1125   asm volatile(
1126       "ld1        {v2.16b}, [%3]                 \n"  // shuffler
1127       "1:                                        \n"
1128       "ld1        {v0.16b}, [%0], #16            \n"  // load 4 pixels.
1129       "subs       %w2, %w2, #4                   \n"  // 4 processed per loop
1130       "tbl        v1.16b, {v0.16b}, v2.16b       \n"  // look up 4 pixels
1131       "st1        {v1.16b}, [%1], #16            \n"  // store 4.
1132       "b.gt       1b                             \n"
1133       : "+r"(src_argb),                   // %0
1134         "+r"(dst_argb),                   // %1
1135         "+r"(width)                       // %2
1136       : "r"(shuffler)                     // %3
1137       : "cc", "memory", "v0", "v1", "v2"  // Clobber List
1138       );
1139 }
1140 
I422ToYUY2Row_NEON(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_yuy2,int width)1141 void I422ToYUY2Row_NEON(const uint8_t* src_y,
1142                         const uint8_t* src_u,
1143                         const uint8_t* src_v,
1144                         uint8_t* dst_yuy2,
1145                         int width) {
1146   asm volatile(
1147       "1:                                        \n"
1148       "ld2        {v0.8b, v1.8b}, [%0], #16      \n"  // load 16 Ys
1149       "orr        v2.8b, v1.8b, v1.8b            \n"
1150       "ld1        {v1.8b}, [%1], #8              \n"        // load 8 Us
1151       "ld1        {v3.8b}, [%2], #8              \n"        // load 8 Vs
1152       "subs       %w4, %w4, #16                  \n"        // 16 pixels
1153       "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
1154       "b.gt       1b                             \n"
1155       : "+r"(src_y),     // %0
1156         "+r"(src_u),     // %1
1157         "+r"(src_v),     // %2
1158         "+r"(dst_yuy2),  // %3
1159         "+r"(width)      // %4
1160       :
1161       : "cc", "memory", "v0", "v1", "v2", "v3");
1162 }
1163 
I422ToUYVYRow_NEON(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_uyvy,int width)1164 void I422ToUYVYRow_NEON(const uint8_t* src_y,
1165                         const uint8_t* src_u,
1166                         const uint8_t* src_v,
1167                         uint8_t* dst_uyvy,
1168                         int width) {
1169   asm volatile(
1170       "1:                                        \n"
1171       "ld2        {v1.8b,v2.8b}, [%0], #16       \n"  // load 16 Ys
1172       "orr        v3.8b, v2.8b, v2.8b            \n"
1173       "ld1        {v0.8b}, [%1], #8              \n"        // load 8 Us
1174       "ld1        {v2.8b}, [%2], #8              \n"        // load 8 Vs
1175       "subs       %w4, %w4, #16                  \n"        // 16 pixels
1176       "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
1177       "b.gt       1b                             \n"
1178       : "+r"(src_y),     // %0
1179         "+r"(src_u),     // %1
1180         "+r"(src_v),     // %2
1181         "+r"(dst_uyvy),  // %3
1182         "+r"(width)      // %4
1183       :
1184       : "cc", "memory", "v0", "v1", "v2", "v3");
1185 }
1186 
ARGBToRGB565Row_NEON(const uint8_t * src_argb,uint8_t * dst_rgb565,int width)1187 void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
1188                           uint8_t* dst_rgb565,
1189                           int width) {
1190   asm volatile(
1191       "1:                                        \n"
1192       "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
1193       "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1194       ARGBTORGB565
1195       "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels RGB565.
1196       "b.gt       1b                             \n"
1197       : "+r"(src_argb),    // %0
1198         "+r"(dst_rgb565),  // %1
1199         "+r"(width)        // %2
1200       :
1201       : "cc", "memory", "v0", "v20", "v21", "v22", "v23");
1202 }
1203 
ARGBToRGB565DitherRow_NEON(const uint8_t * src_argb,uint8_t * dst_rgb,const uint32_t dither4,int width)1204 void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
1205                                 uint8_t* dst_rgb,
1206                                 const uint32_t dither4,
1207                                 int width) {
1208   asm volatile(
1209       "dup        v1.4s, %w2                     \n"  // dither4
1210       "1:                                        \n"
1211       "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"  // load 8 pixels
1212       "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
1213       "uqadd      v20.8b, v20.8b, v1.8b          \n"
1214       "uqadd      v21.8b, v21.8b, v1.8b          \n"
1215       "uqadd      v22.8b, v22.8b, v1.8b          \n" ARGBTORGB565
1216       "st1        {v0.16b}, [%0], #16            \n"  // store 8 pixels RGB565.
1217       "b.gt       1b                             \n"
1218       : "+r"(dst_rgb)   // %0
1219       : "r"(src_argb),  // %1
1220         "r"(dither4),   // %2
1221         "r"(width)      // %3
1222       : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23");
1223 }
1224 
// Converts a row of ARGB pixels to ARGB1555, 8 pixels per loop iteration.
// Each pass de-interleaves 32 source bytes into v20..v23, packs them with the
// ARGBTOARGB1555 macro (defined outside this section) and stores the 16 bytes
// held in v0. The body runs at least once and width is stepped down by 8, so a
// width that is not a multiple of 8 is effectively rounded up.
void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
                            uint8_t* dst_argb1555,
                            int width) {
  asm volatile(
      "1:                                        \n"
      "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
      ARGBTOARGB1555
      "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels
                                                      // ARGB1555.
      "b.gt       1b                             \n"
      : "+r"(src_argb),      // %0
        "+r"(dst_argb1555),  // %1
        "+r"(width)          // %2
      :
      : "cc", "memory", "v0", "v20", "v21", "v22", "v23");
}
1242 
// Converts a row of ARGB pixels to ARGB4444, 8 pixels per loop iteration.
// v4 is preloaded with 0x0f in every byte as a mask for the ARGBTOARGB4444
// macro (defined outside this section; presumably it also uses v1, which is
// listed as clobbered). Each pass reads 32 bytes and stores the 16 bytes held
// in v0. The body runs at least once and width is stepped down by 8, so a
// width that is not a multiple of 8 is effectively rounded up.
void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
                            uint8_t* dst_argb4444,
                            int width) {
  asm volatile(
      "movi       v4.16b, #0x0f                  \n"  // bits to clear with
                                                      // vbic.
      "1:                                        \n"
      "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
      ARGBTOARGB4444
      "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels
                                                      // ARGB4444.
      "b.gt       1b                             \n"
      : "+r"(src_argb),      // %0
        "+r"(dst_argb4444),  // %1
        "+r"(width)          // %2
      :
      : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23");
}
1262 
// Computes 8-bit luma for a row of ARGB pixels, 8 pixels per loop iteration:
//   Y = sat8(((13 * B + 65 * G + 33 * R + 64) >> 7) + 16)
// ld4 de-interleaves the 32 source bytes so that v0/v1/v2 hold the B/G/R
// bytes. The fourth channel lands in v3 and is immediately overwritten, as v3
// doubles as the 16-bit accumulator. The body runs at least once and width is
// stepped down by 8, so a width that is not a multiple of 8 is effectively
// rounded up.
void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
      "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
      "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
      "movi       v7.8b, #16                     \n"  // Add 16 constant
      "1:                                        \n"
      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
      "umull      v3.8h, v0.8b, v4.8b            \n"  // B
      "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
      "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
      "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
      "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating.
      "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
      "b.gt       1b                             \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
1285 
// Copies the fourth byte (alpha) of every 4-byte ARGB pixel to dst_a.
// Each iteration de-interleaves 64 source bytes (16 pixels) with ld4 and
// stores only v3, the vector holding the fourth byte of each pixel. The body
// runs at least once and width is stepped down by 16, so a width that is not a
// multiple of 16 is effectively rounded up.
void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
                              uint8_t* dst_a,
                              int width) {
  asm volatile(
      "1:                                        \n"
      "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load row 16
                                                                // pixels
      "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
      "st1        {v3.16b}, [%1], #16            \n"  // store 16 A's.
      "b.gt       1b                             \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_a),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
      );
}
1303 
// Computes 8-bit luma for a row of ARGB pixels using the "J" coefficients,
// 8 pixels per loop iteration:
//   Y = (15 * B + 75 * G + 38 * R + 64) >> 7
// Unlike ARGBToYRow_NEON no +16 offset is added. The weights sum to 128, so
// the weighted sum is at most 255 * 128 and fits in 15 bits. v3 (fourth
// channel after the load) is reused as the 16-bit accumulator. The body runs
// at least once and width is stepped down by 8, so a width that is not a
// multiple of 8 is effectively rounded up.
void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "movi       v4.8b, #15                     \n"  // B * 0.11400 coefficient
      "movi       v5.8b, #75                     \n"  // G * 0.58700 coefficient
      "movi       v6.8b, #38                     \n"  // R * 0.29900 coefficient
      "1:                                        \n"
      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
      "umull      v3.8h, v0.8b, v4.8b            \n"  // B
      "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
      "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
      "sqrshrun   v0.8b, v3.8h, #7               \n"  // 15 bit to 8 bit Y
      "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
      "b.gt       1b                             \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
}
1324 
// 8x1 pixels.
// Computes full-resolution chroma (one U and one V per pixel) for a row of
// ARGB, 8 pixels per loop iteration:
//   U = (112 * B - 74 * G - 38 * R + 0x8080) >> 8
//   V = (112 * R - 94 * G - 18 * B + 0x8080) >> 8
// The 16-bit lanes wrap modulo 2^16 while the negative terms are subtracted;
// adding 0x8080 (v29) rebases the result into unsigned range before the
// narrowing shift. The body runs at least once and width is stepped down by 8,
// so a width that is not a multiple of 8 is effectively rounded up.
void ARGBToUV444Row_NEON(const uint8_t* src_argb,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      "movi       v24.8b, #112                   \n"  // UB / VR 0.875
                                                      // coefficient
      "movi       v25.8b, #74                    \n"  // UG -0.5781 coefficient
      "movi       v26.8b, #38                    \n"  // UR -0.2969 coefficient
      "movi       v27.8b, #18                    \n"  // VB -0.1406 coefficient
      "movi       v28.8b, #94                    \n"  // VG -0.7344 coefficient
      "movi       v29.16b,#0x80                  \n"  // 128.5
      "1:                                        \n"
      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
                                                            // pixels.
      "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
      "umull      v4.8h, v0.8b, v24.8b           \n"  // B
      "umlsl      v4.8h, v1.8b, v25.8b           \n"  // G
      "umlsl      v4.8h, v2.8b, v26.8b           \n"  // R
      "add        v4.8h, v4.8h, v29.8h           \n"  // +128 -> unsigned

      "umull      v3.8h, v2.8b, v24.8b           \n"  // R
      "umlsl      v3.8h, v1.8b, v28.8b           \n"  // G
      "umlsl      v3.8h, v0.8b, v27.8b           \n"  // B
      "add        v3.8h, v3.8h, v29.8h           \n"  // +128 -> unsigned

      "uqshrn     v0.8b, v4.8h, #8               \n"  // 16 bit to 8 bit U
      "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V

      "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U.
      "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V.
      "b.gt       1b                             \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26",
        "v27", "v28", "v29");
}
1366 
// Asm fragment that loads the constants read by RGBTOUV into v20..v25
// (16-bit lanes). The multipliers are half of the 112/74/38/18/94 set used by
// ARGBToUV444Row_NEON, because the callers feed RGBTOUV channel values that are
// twice the 2x2 block average (sum of four samples, then urshr #1). Any asm
// statement using this fragment must list v20..v25 as clobbered.
#define RGBTOUV_SETUP_REG                                                  \
  "movi       v20.8h, #56, lsl #0  \n" /* UB/VR coefficient (0.875) / 2 */ \
  "movi       v21.8h, #37, lsl #0  \n" /* UG coefficient (-0.5781) / 2  */ \
  "movi       v22.8h, #19, lsl #0  \n" /* UR coefficient (-0.2969) / 2  */ \
  "movi       v23.8h, #9,  lsl #0  \n" /* VB coefficient (-0.1406) / 2  */ \
  "movi       v24.8h, #47, lsl #0  \n" /* VG coefficient (-0.7344) / 2  */ \
  "movi       v25.16b, #0x80       \n" /* 128.5 (0x8080 in 16-bit)      */
1374 
// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
//
// Asm fragment that turns three 8x16-bit channel vectors into 8 U and 8 V
// bytes. QB, QG and QR name the registers (e.g. v0.8h) holding blue, green and
// red. It reads the constants in v20..v25, uses v3 and v4 as scratch
// accumulators, and leaves U in v0.8b and V in v1.8b:
//   U = (QB * v20 - QG * v21 - QR * v22 + v25) >> 8
//   V = (QR * v20 - QG * v24 - QB * v23 + v25) >> 8
// v3 and v4 are written before all inputs have been read, so QB/QG/QR must not
// be v3 or v4. v0..v2 are safe as inputs because v0 and v1 are only written by
// the final two instructions.
// clang-format off
#define RGBTOUV(QB, QG, QR)                                                 \
  "mul        v3.8h, " #QB ",v20.8h          \n" /* B                    */ \
  "mul        v4.8h, " #QR ",v20.8h          \n" /* R                    */ \
  "mls        v3.8h, " #QG ",v21.8h          \n" /* G                    */ \
  "mls        v4.8h, " #QG ",v24.8h          \n" /* G                    */ \
  "mls        v3.8h, " #QR ",v22.8h          \n" /* R                    */ \
  "mls        v4.8h, " #QB ",v23.8h          \n" /* B                    */ \
  "add        v3.8h, v3.8h, v25.8h           \n" /* +128 -> unsigned     */ \
  "add        v4.8h, v4.8h, v25.8h           \n" /* +128 -> unsigned     */ \
  "uqshrn     v0.8b, v3.8h, #8               \n" /* 16 bit to 8 bit U    */ \
  "uqshrn     v1.8b, v4.8h, #8               \n" /* 16 bit to 8 bit V    */
// clang-format on
1389 
1390 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
1391 // TODO(fbarchard): consider ptrdiff_t for all strides.
1392 
// Computes one row of 2x2-subsampled U and V from two rows of ARGB.
// src_argb is the first row and src_argb + src_stride_argb the second. Each
// iteration reads 16 pixels from both rows, sums every 2x2 block per channel
// (uaddlp on the first row, uadalp to accumulate the second), halves the sum
// with rounding (urshr #1, i.e. twice the block average) and lets RGBTOUV
// produce 8 U and 8 V bytes. width counts source pixels per row and is stepped
// down by 16; the body runs at least once, so a width that is not a multiple
// of 16 is effectively rounded up.
void ARGBToUVRow_NEON(const uint8_t* src_argb,
                      int src_stride_argb,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  const uint8_t* src_argb_1 = src_argb + src_stride_argb;
  asm volatile (
    RGBTOUV_SETUP_REG
  "1:                                          \n"
    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.

    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
    "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
    "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
    "uadalp     v2.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.

    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
    "urshr      v1.8h, v1.8h, #1               \n"
    "urshr      v2.8h, v2.8h, #1               \n"

    "subs       %w4, %w4, #16                  \n"  // 16 wide x 2 rows per loop.
    RGBTOUV(v0.8h, v1.8h, v2.8h)
    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
    "b.gt       1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(src_argb_1),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
    "+r"(width)        // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
}
1431 
// TODO(fbarchard): Subsample match C code.
// Computes one row of 2x2-subsampled U and V from two rows of ARGB using the
// "J" coefficient set (63/42/21/10/53) instead of RGBTOUV_SETUP_REG. The
// constants are placed in the same v20..v25 registers so RGBTOUV can be
// reused unchanged. src_argb is the first row and src_argb + src_stride_argb
// the second. Each iteration reads 16 pixels from both rows and writes 8 U and
// 8 V bytes. width counts source pixels per row and is stepped down by 16; the
// body runs at least once, so a width that is not a multiple of 16 is
// effectively rounded up.
void ARGBToUVJRow_NEON(const uint8_t* src_argb,
                       int src_stride_argb,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  const uint8_t* src_argb_1 = src_argb + src_stride_argb;
  asm volatile (
    "movi       v20.8h, #63, lsl #0            \n"  // UB/VR coeff (0.500) / 2
    "movi       v21.8h, #42, lsl #0            \n"  // UG coeff (-0.33126) / 2
    "movi       v22.8h, #21, lsl #0            \n"  // UR coeff (-0.16874) / 2
    "movi       v23.8h, #10, lsl #0            \n"  // VB coeff (-0.08131) / 2
    "movi       v24.8h, #53, lsl #0            \n"  // VG coeff (-0.41869) / 2
    "movi       v25.16b, #0x80                 \n"  // 128.5 (0x8080 in 16-bit)
  "1:                                          \n"
    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64  \n"  // load next 16
    "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
    "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
    "uadalp     v2.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.

    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
    "urshr      v1.8h, v1.8h, #1               \n"
    "urshr      v2.8h, v2.8h, #1               \n"

    "subs       %w4, %w4, #16                  \n"  // 16 wide x 2 rows per loop.
    RGBTOUV(v0.8h, v1.8h, v2.8h)
    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
    "b.gt       1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(src_argb_1),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
    "+r"(width)        // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
}
1475 
// Computes one row of 2x2-subsampled U and V from two rows of BGRA.
// Same scheme as ARGBToUVRow_NEON, but the channels are taken from different
// de-interleaved vectors: blue from the 4th byte (v3/v7), green from the 3rd
// (v2/v6) and red from the 2nd (v1/v5). The green sum is built in v3 and moved
// to v1 by the urshr so RGBTOUV gets B, G, R in v0, v1, v2. width counts
// source pixels per row and is stepped down by 16; the body runs at least
// once, so a width that is not a multiple of 16 is effectively rounded up.
void BGRAToUVRow_NEON(const uint8_t* src_bgra,
                      int src_stride_bgra,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  const uint8_t* src_bgra_1 = src_bgra + src_stride_bgra;
  asm volatile (
    RGBTOUV_SETUP_REG
  "1:                                          \n"
    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
    "uaddlp     v0.8h, v3.16b                  \n"  // B 16 bytes -> 8 shorts.
    "uaddlp     v3.8h, v2.16b                  \n"  // G 16 bytes -> 8 shorts.
    "uaddlp     v2.8h, v1.16b                  \n"  // R 16 bytes -> 8 shorts.
    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more
    "uadalp     v0.8h, v7.16b                  \n"  // B 16 bytes -> 8 shorts.
    "uadalp     v3.8h, v6.16b                  \n"  // G 16 bytes -> 8 shorts.
    "uadalp     v2.8h, v5.16b                  \n"  // R 16 bytes -> 8 shorts.

    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
    "urshr      v1.8h, v3.8h, #1               \n"
    "urshr      v2.8h, v2.8h, #1               \n"

    "subs       %w4, %w4, #16                  \n"  // 16 wide x 2 rows per loop.
    RGBTOUV(v0.8h, v1.8h, v2.8h)
    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
    "b.gt       1b                             \n"
  : "+r"(src_bgra),  // %0
    "+r"(src_bgra_1),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
    "+r"(width)        // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
}
1513 
// Computes one row of 2x2-subsampled U and V from two rows of ABGR.
// Same scheme as ARGBToUVRow_NEON, but red comes from the 1st byte (v0/v4),
// green from the 2nd (v1/v5) and blue from the 3rd (v2/v6). The sums are built
// in v3 (B), v2 (G) and v1 (R); the urshr step moves blue into v0, so RGBTOUV
// is invoked as (B=v0, G=v2, R=v1). width counts source pixels per row and is
// stepped down by 16; the body runs at least once, so a width that is not a
// multiple of 16 is effectively rounded up.
void ABGRToUVRow_NEON(const uint8_t* src_abgr,
                      int src_stride_abgr,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr;
  asm volatile (
    RGBTOUV_SETUP_REG
  "1:                                          \n"
    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
    "uaddlp     v3.8h, v2.16b                  \n"  // B 16 bytes -> 8 shorts.
    "uaddlp     v2.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
    "uaddlp     v1.8h, v0.16b                  \n"  // R 16 bytes -> 8 shorts.
    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
    "uadalp     v3.8h, v6.16b                  \n"  // B 16 bytes -> 8 shorts.
    "uadalp     v2.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
    "uadalp     v1.8h, v4.16b                  \n"  // R 16 bytes -> 8 shorts.

    "urshr      v0.8h, v3.8h, #1               \n"  // 2x average
    "urshr      v2.8h, v2.8h, #1               \n"
    "urshr      v1.8h, v1.8h, #1               \n"

    "subs       %w4, %w4, #16                  \n"  // 16 wide x 2 rows per loop.
    RGBTOUV(v0.8h, v2.8h, v1.8h)
    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
    "b.gt       1b                             \n"
  : "+r"(src_abgr),  // %0
    "+r"(src_abgr_1),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
    "+r"(width)        // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
}
1551 
// Computes one row of 2x2-subsampled U and V from two rows of RGBA.
// Same scheme as ARGBToUVRow_NEON, but blue comes from the 2nd byte (v1/v5),
// green from the 3rd (v2/v6) and red from the 4th (v3/v7). The sums are written
// to v0, v1, v2 in an order that reads each source vector before it is
// overwritten. width counts source pixels per row and is stepped down by 16;
// the body runs at least once, so a width that is not a multiple of 16 is
// effectively rounded up.
void RGBAToUVRow_NEON(const uint8_t* src_rgba,
                      int src_stride_rgba,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  const uint8_t* src_rgba_1 = src_rgba + src_stride_rgba;
  asm volatile (
    RGBTOUV_SETUP_REG
  "1:                                          \n"
    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
    "uaddlp     v0.8h, v1.16b                  \n"  // B 16 bytes -> 8 shorts.
    "uaddlp     v1.8h, v2.16b                  \n"  // G 16 bytes -> 8 shorts.
    "uaddlp     v2.8h, v3.16b                  \n"  // R 16 bytes -> 8 shorts.
    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
    "uadalp     v0.8h, v5.16b                  \n"  // B 16 bytes -> 8 shorts.
    "uadalp     v1.8h, v6.16b                  \n"  // G 16 bytes -> 8 shorts.
    "uadalp     v2.8h, v7.16b                  \n"  // R 16 bytes -> 8 shorts.

    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
    "urshr      v1.8h, v1.8h, #1               \n"
    "urshr      v2.8h, v2.8h, #1               \n"

    "subs       %w4, %w4, #16                  \n"  // 16 wide x 2 rows per loop.
    RGBTOUV(v0.8h, v1.8h, v2.8h)
    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
    "b.gt       1b                             \n"
  : "+r"(src_rgba),  // %0
    "+r"(src_rgba_1),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
    "+r"(width)        // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
}
1589 
// Computes one row of 2x2-subsampled U and V from two rows of 3-byte RGB24.
// Same scheme as ARGBToUVRow_NEON, but pixels are loaded with ld3 (48 bytes =
// 16 pixels per row per iteration); the three de-interleaved vectors are used
// as B, G, R in that order. width counts source pixels per row and is stepped
// down by 16; the body runs at least once, so a width that is not a multiple
// of 16 is effectively rounded up.
void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
                       int src_stride_rgb24,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
  asm volatile (
    RGBTOUV_SETUP_REG
  "1:                                          \n"
    "ld3        {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 pixels.
    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
    "ld3        {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 16 more.
    "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
    "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
    "uadalp     v2.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.

    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
    "urshr      v1.8h, v1.8h, #1               \n"
    "urshr      v2.8h, v2.8h, #1               \n"

    "subs       %w4, %w4, #16                  \n"  // 16 wide x 2 rows per loop.
    RGBTOUV(v0.8h, v1.8h, v2.8h)
    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
    "b.gt       1b                             \n"
  : "+r"(src_rgb24),  // %0
    "+r"(src_rgb24_1),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
    "+r"(width)        // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
}
1627 
// Computes one row of 2x2-subsampled U and V from two rows of 3-byte RAW.
// Same scheme as RGB24ToUVRow_NEON with the channel order reversed: the three
// de-interleaved vectors are used as R, G, B, so RGBTOUV is invoked as
// (B=v2, G=v1, R=v0). Each ld3 reads 48 bytes, i.e. 16 pixels per row per
// iteration. width counts source pixels per row and is stepped down by 16; the
// body runs at least once, so a width that is not a multiple of 16 is
// effectively rounded up.
void RAWToUVRow_NEON(const uint8_t* src_raw,
                     int src_stride_raw,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  const uint8_t* src_raw_1 = src_raw + src_stride_raw;
  asm volatile (
    RGBTOUV_SETUP_REG
  "1:                                          \n"
    "ld3        {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 RAW pixels.
    "uaddlp     v2.8h, v2.16b                  \n"  // B 16 bytes -> 8 shorts.
    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
    "uaddlp     v0.8h, v0.16b                  \n"  // R 16 bytes -> 8 shorts.
    "ld3        {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 16 more RAW.
    "uadalp     v2.8h, v6.16b                  \n"  // B 16 bytes -> 8 shorts.
    "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
    "uadalp     v0.8h, v4.16b                  \n"  // R 16 bytes -> 8 shorts.

    "urshr      v2.8h, v2.8h, #1               \n"  // 2x average
    "urshr      v1.8h, v1.8h, #1               \n"
    "urshr      v0.8h, v0.8h, #1               \n"

    "subs       %w4, %w4, #16                  \n"  // 16 wide x 2 rows per loop.
    RGBTOUV(v2.8h, v1.8h, v0.8h)
    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
    "b.gt       1b                             \n"
  : "+r"(src_raw),  // %0
    "+r"(src_raw_1),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
    "+r"(width)        // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
}
1665 
// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
// Computes one row of 2x2-subsampled U and V from two rows of RGB565.
// src_rgb565 is the first row and src_rgb565 + src_stride_rgb565 the second.
// Each row is consumed as two 16-byte loads (8 pixels each) per iteration.
// Every load is expanded by the RGB565TOARGB macro (defined outside this
// section; per the comments below it leaves B, G, R bytes in v0, v1, v2).
// Pair sums of the first 8 pixels go to v16/v18/v20 and of the next 8 to
// v17/v19/v21. The second row is accumulated on top and the halves are merged
// with ins before the rounding halve (urshr #1).
// The U/V arithmetic is the same as RGBTOUV but written inline with its own
// constant registers (v22..v27), since v20/v21 are used as accumulators here.
// width is stepped down by 16 and the body runs at least once, so a width that
// is not a multiple of 16 is effectively rounded up.
void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
                        int src_stride_rgb565,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int width) {
  const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
  asm volatile(
      "movi       v22.8h, #56, lsl #0            \n"  // UB / VR coeff (0.875) /
                                                      // 2
      "movi       v23.8h, #37, lsl #0            \n"  // UG coeff (-0.5781) / 2
      "movi       v24.8h, #19, lsl #0            \n"  // UR coeff (-0.2969) / 2
      "movi       v25.8h, #9 , lsl #0            \n"  // VB coeff (-0.1406) / 2
      "movi       v26.8h, #47, lsl #0            \n"  // VG coeff (-0.7344) / 2
      "movi       v27.16b, #0x80                 \n"  // 128.5 0x8080 in 16bit
      "1:                                        \n"
      "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
      RGB565TOARGB
      "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
      "uaddlp     v18.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
      "uaddlp     v20.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
      "ld1        {v0.16b}, [%0], #16            \n"  // next 8 RGB565 pixels.
      RGB565TOARGB
      "uaddlp     v17.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
      "uaddlp     v19.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
      "uaddlp     v21.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.

      "ld1        {v0.16b}, [%1], #16            \n"  // load 8 RGB565 pixels.
      RGB565TOARGB
      "uadalp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
      "uadalp     v18.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
      "uadalp     v20.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
      "ld1        {v0.16b}, [%1], #16            \n"  // next 8 RGB565 pixels.
      RGB565TOARGB
      "uadalp     v17.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
      "uadalp     v19.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
      "uadalp     v21.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.

      "ins        v16.D[1], v17.D[0]             \n"  // merge B halves.
      "ins        v18.D[1], v19.D[0]             \n"  // merge G halves.
      "ins        v20.D[1], v21.D[0]             \n"  // merge R halves.

      "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
      "urshr      v5.8h, v18.8h, #1              \n"
      "urshr      v6.8h, v20.8h, #1              \n"

      "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
      "mul        v16.8h, v4.8h, v22.8h          \n"  // B
      "mls        v16.8h, v5.8h, v23.8h          \n"  // G
      "mls        v16.8h, v6.8h, v24.8h          \n"  // R
      "add        v16.8h, v16.8h, v27.8h         \n"  // +128 -> unsigned
      "mul        v17.8h, v6.8h, v22.8h          \n"  // R
      "mls        v17.8h, v5.8h, v26.8h          \n"  // G
      "mls        v17.8h, v4.8h, v25.8h          \n"  // B
      "add        v17.8h, v17.8h, v27.8h         \n"  // +128 -> unsigned
      "uqshrn     v0.8b, v16.8h, #8              \n"  // 16 bit to 8 bit U
      "uqshrn     v1.8b, v17.8h, #8              \n"  // 16 bit to 8 bit V
      "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
      "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
      "b.gt       1b                             \n"
      : "+r"(src_rgb565),    // %0
        "+r"(src_rgb565_1),  // %1
        "+r"(dst_u),         // %2
        "+r"(dst_v),         // %3
        "+r"(width)          // %4
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
        "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
        "v27");
}
1736 
// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
// Computes one row of 2x2-subsampled U and V from two rows of ARGB1555.
// src_argb1555 is the first row and src_argb1555 + src_stride_argb1555 the
// second. Each row is consumed as two 16-byte loads (8 pixels each) per
// iteration. Every load is expanded by the RGB555TOARGB macro (defined outside
// this section; per the comments below it leaves B, G, R bytes in v0, v1, v2).
// Pair sums of the first 8 pixels go to v16/v17/v18 and of the next 8 to
// v26/v27/v28. The second row is accumulated on top and the halves are merged
// with ins before the rounding halve (urshr #1).
// Constants come from RGBTOUV_SETUP_REG (v20..v25); the U/V arithmetic is
// written inline, using v2/v3 as accumulators and reading B, G, R from
// v4, v5, v6. width is stepped down by 16 and the body runs at least once, so
// a width that is not a multiple of 16 is effectively rounded up.
void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
                          int src_stride_argb1555,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width) {
  const uint8_t* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
  asm volatile(
      RGBTOUV_SETUP_REG
      "1:                                        \n"
      "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
      RGB555TOARGB
      "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
      "uaddlp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
      "uaddlp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
      "ld1        {v0.16b}, [%0], #16            \n"  // next 8 ARGB1555 pixels.
      RGB555TOARGB
      "uaddlp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
      "uaddlp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
      "uaddlp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.

      "ld1        {v0.16b}, [%1], #16            \n"  // load 8 ARGB1555 pixels.
      RGB555TOARGB
      "uadalp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
      "uadalp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
      "uadalp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
      "ld1        {v0.16b}, [%1], #16            \n"  // next 8 ARGB1555 pixels.
      RGB555TOARGB
      "uadalp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
      "uadalp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
      "uadalp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.

      "ins        v16.D[1], v26.D[0]             \n"  // merge B halves.
      "ins        v17.D[1], v27.D[0]             \n"  // merge G halves.
      "ins        v18.D[1], v28.D[0]             \n"  // merge R halves.

      "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
      "urshr      v5.8h, v17.8h, #1              \n"
      "urshr      v6.8h, v18.8h, #1              \n"

      "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
      "mul        v2.8h, v4.8h, v20.8h           \n"  // B
      "mls        v2.8h, v5.8h, v21.8h           \n"  // G
      "mls        v2.8h, v6.8h, v22.8h           \n"  // R
      "add        v2.8h, v2.8h, v25.8h           \n"  // +128 -> unsigned
      "mul        v3.8h, v6.8h, v20.8h           \n"  // R
      "mls        v3.8h, v5.8h, v24.8h           \n"  // G
      "mls        v3.8h, v4.8h, v23.8h           \n"  // B
      "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
      "uqshrn     v0.8b, v2.8h, #8               \n"  // 16 bit to 8 bit U
      "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
      "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
      "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
      "b.gt       1b                             \n"
      : "+r"(src_argb1555),    // %0
        "+r"(src_argb1555_1),  // %1
        "+r"(dst_u),           // %2
        "+r"(dst_v),           // %3
        "+r"(width)            // %4
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
        "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
        "v28");
}
1801 
// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
//
// Subsamples two rows of ARGB4444 pixels into one row of U and one row of V.
//   src_argb4444         first source row.
//   src_stride_argb4444  byte offset from the first source row to the second.
//   dst_u, dst_v         destination rows; 8 bytes are written to each per
//                        loop iteration.
//   width                pixel count; reduced by 16 per loop iteration.
// The loop body always executes at least once and repeats while the remaining
// width is greater than zero, so a width that is not a multiple of 16 is
// processed as if it were rounded up to the next multiple of 16.
// RGBTOUV_SETUP_REG and ARGB4444TOARGB are macros defined elsewhere in this
// file.
void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
                          int src_stride_argb4444,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width) {
  // Second source row, read through operand %1.
  const uint8_t* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
  asm volatile(
      // Presumably loads the coefficient/bias registers v20-v25 used by the
      // mul/mls/add sequence below - confirm against the macro definition.
      RGBTOUV_SETUP_REG
      "1:                                        \n"
      // First row: pairwise-add horizontally adjacent channel bytes.
      "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
      ARGB4444TOARGB
      "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
      "uaddlp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
      "uaddlp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
      "ld1        {v0.16b}, [%0], #16            \n"  // next 8 ARGB4444 pixels.
      ARGB4444TOARGB
      "uaddlp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
      "uaddlp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
      "uaddlp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.

      // Second row: accumulate its pairwise sums onto the first row's sums,
      // giving one 2x2 sum per 16-bit lane.
      "ld1        {v0.16b}, [%1], #16            \n"  // load 8 ARGB4444 pixels.
      ARGB4444TOARGB
      "uadalp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
      "uadalp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
      "uadalp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
      "ld1        {v0.16b}, [%1], #16            \n"  // next 8 ARGB4444 pixels.
      ARGB4444TOARGB
      "uadalp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
      "uadalp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
      "uadalp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.

      // Merge the two groups of 4 sums into 8 lanes per channel.
      "ins        v16.D[1], v26.D[0]             \n"
      "ins        v17.D[1], v27.D[0]             \n"
      "ins        v18.D[1], v28.D[0]             \n"

      // Rounding shift of the 4-pixel sum by 1 leaves twice the average.
      "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
      "urshr      v5.8h, v17.8h, #1              \n"
      "urshr      v6.8h, v18.8h, #1              \n"

      "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
      // U = (B * v20 - G * v21 - R * v22 + v25) >> 8
      "mul        v2.8h, v4.8h, v20.8h           \n"  // B
      "mls        v2.8h, v5.8h, v21.8h           \n"  // G
      "mls        v2.8h, v6.8h, v22.8h           \n"  // R
      "add        v2.8h, v2.8h, v25.8h           \n"  // +128 -> unsigned
      // V = (R * v20 - G * v24 - B * v23 + v25) >> 8
      "mul        v3.8h, v6.8h, v20.8h           \n"  // R
      "mls        v3.8h, v5.8h, v24.8h           \n"  // G
      "mls        v3.8h, v4.8h, v23.8h           \n"  // B
      "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
      "uqshrn     v0.8b, v2.8h, #8               \n"  // 16 bit to 8 bit U
      "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
      "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
      "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
      "b.gt       1b                             \n"  // flags from subs above.
      : "+r"(src_argb4444),    // %0
        "+r"(src_argb4444_1),  // %1
        "+r"(dst_u),           // %2
        "+r"(dst_v),           // %3
        "+r"(width)            // %4
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
        "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
        "v28"

      );
}
1868 
// Converts a row of RGB565 pixels to 8-bit luma.
//   src_rgb565  source row; 16 bytes (8 pixels) are read per loop iteration.
//   dst_y       destination row; 8 bytes are written per loop iteration.
//   width       pixel count; reduced by 8 per loop iteration.
// Per pixel: Y = sat8((13 * B + 65 * G + 33 * R + 64) >> 7) + 16, where the
// final add saturates at 255.
// The loop body always executes at least once and repeats while the remaining
// width is greater than zero.
void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
  asm volatile(
      "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient
      "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient
      "movi       v26.8b, #33                    \n"  // R * 0.2578 coefficient
      "movi       v27.8b, #16                    \n"  // Add 16 constant
      "1:                                        \n"
      "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
      // Macro defined elsewhere; the code below relies on it leaving B, G and
      // R bytes in v0, v1 and v2.  Presumably it also uses v4 and v6 as
      // scratch (both are in the clobber list) - confirm against the macro.
      RGB565TOARGB
      "umull      v3.8h, v0.8b, v24.8b           \n"  // B
      "umlal      v3.8h, v1.8b, v25.8b           \n"  // G
      "umlal      v3.8h, v2.8b, v26.8b           \n"  // R
      // Rounding shift right by 7 with saturation to unsigned 8 bits.
      "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
      "uqadd      v0.8b, v0.8b, v27.8b           \n"  // + 16, saturating.
      "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
      "b.gt       1b                             \n"  // flags from subs above.
      : "+r"(src_rgb565),  // %0
        "+r"(dst_y),       // %1
        "+r"(width)        // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", "v24", "v25", "v26",
        "v27");
}
1893 
// Converts a row of ARGB1555 pixels to 8-bit luma.
//   src_argb1555  source row; 16 bytes (8 pixels) are read per loop iteration.
//   dst_y         destination row; 8 bytes are written per loop iteration.
//   width         pixel count; reduced by 8 per loop iteration.
// Per pixel: Y = sat8((13 * B + 65 * G + 33 * R + 64) >> 7) + 16, where the
// final add saturates at 255.
// The loop body always executes at least once and repeats while the remaining
// width is greater than zero.
void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
                         uint8_t* dst_y,
                         int width) {
  asm volatile(
      "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
      "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
      "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
      "movi       v7.8b, #16                     \n"  // Add 16 constant
      "1:                                        \n"
      "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
      // Macro defined elsewhere; the code below relies on it leaving B, G and
      // R bytes in v0, v1 and v2.
      // NOTE(review): the constants in v4-v7 are set once before the loop, so
      // the macro must not overwrite them - confirm against its definition.
      ARGB1555TOARGB
      "umull      v3.8h, v0.8b, v4.8b            \n"  // B
      "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
      "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
      // Rounding shift right by 7 with saturation to unsigned 8 bits.
      "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
      "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating.
      "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
      "b.gt       1b                             \n"  // flags from subs above.
      : "+r"(src_argb1555),  // %0
        "+r"(dst_y),         // %1
        "+r"(width)          // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
1919 
// Converts a row of ARGB4444 pixels to 8-bit luma.
//   src_argb4444  source row; 16 bytes (8 pixels) are read per loop iteration.
//   dst_y         destination row; 8 bytes are written per loop iteration.
//   width         pixel count; reduced by 8 per loop iteration.
// Per pixel: Y = sat8((13 * B + 65 * G + 33 * R + 64) >> 7) + 16, where the
// final add saturates at 255.
// The loop body always executes at least once and repeats while the remaining
// width is greater than zero.
void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
                         uint8_t* dst_y,
                         int width) {
  asm volatile(
      "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient
      "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient
      "movi       v26.8b, #33                    \n"  // R * 0.2578 coefficient
      "movi       v27.8b, #16                    \n"  // Add 16 constant
      "1:                                        \n"
      "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
      // Macro defined elsewhere; the code below relies on it leaving B, G and
      // R bytes in v0, v1 and v2.
      ARGB4444TOARGB
      "umull      v3.8h, v0.8b, v24.8b           \n"  // B
      "umlal      v3.8h, v1.8b, v25.8b           \n"  // G
      "umlal      v3.8h, v2.8b, v26.8b           \n"  // R
      // Rounding shift right by 7 with saturation to unsigned 8 bits.
      "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
      "uqadd      v0.8b, v0.8b, v27.8b           \n"  // + 16, saturating.
      "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
      "b.gt       1b                             \n"  // flags from subs above.
      : "+r"(src_argb4444),  // %0
        "+r"(dst_y),         // %1
        "+r"(width)          // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27");
}
1945 
// Converts a row of 4-byte BGRA pixels to 8-bit luma.
//   src_bgra  source row; 32 bytes (8 pixels) are read per loop iteration.
//   dst_y     destination row; 8 bytes are written per loop iteration.
//   width     pixel count; reduced by 8 per loop iteration.
// ld4 de-interleaves the four bytes of each pixel into v0..v3; bytes 1, 2 and
// 3 are treated as R, G and B, and byte 0 (v0) is ignored.
// Per pixel: Y = sat8((33 * R + 65 * G + 13 * B + 64) >> 7) + 16, where the
// final add saturates at 255.
// The loop body always executes at least once and repeats while the remaining
// width is greater than zero.
void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
  asm volatile(
      "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
      "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
      "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
      "movi       v7.8b, #16                     \n"  // Add 16 constant
      "1:                                        \n"
      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
      "umull      v16.8h, v1.8b, v4.8b           \n"  // R
      "umlal      v16.8h, v2.8b, v5.8b           \n"  // G
      "umlal      v16.8h, v3.8b, v6.8b           \n"  // B
      // Rounding shift right by 7 with saturation to unsigned 8 bits.
      "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
      "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating.
      "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
      "b.gt       1b                             \n"  // flags from subs above.
      : "+r"(src_bgra),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
}
1968 
// Converts a row of 4-byte ABGR pixels to 8-bit luma.
//   src_abgr  source row; 32 bytes (8 pixels) are read per loop iteration.
//   dst_y     destination row; 8 bytes are written per loop iteration.
//   width     pixel count; reduced by 8 per loop iteration.
// ld4 de-interleaves the four bytes of each pixel into v0..v3; bytes 0, 1 and
// 2 are treated as R, G and B, and byte 3 (v3) is ignored.
// Per pixel: Y = sat8((33 * R + 65 * G + 13 * B + 64) >> 7) + 16, where the
// final add saturates at 255.
// The loop body always executes at least once and repeats while the remaining
// width is greater than zero.
void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
  asm volatile(
      "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
      "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
      "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
      "movi       v7.8b, #16                     \n"  // Add 16 constant
      "1:                                        \n"
      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
      "umull      v16.8h, v0.8b, v4.8b           \n"  // R
      "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
      "umlal      v16.8h, v2.8b, v6.8b           \n"  // B
      // Rounding shift right by 7 with saturation to unsigned 8 bits.
      "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
      "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating.
      "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
      "b.gt       1b                             \n"  // flags from subs above.
      : "+r"(src_abgr),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
}
1991 
// Converts a row of 4-byte RGBA pixels to 8-bit luma.
//   src_rgba  source row; 32 bytes (8 pixels) are read per loop iteration.
//   dst_y     destination row; 8 bytes are written per loop iteration.
//   width     pixel count; reduced by 8 per loop iteration.
// ld4 de-interleaves the four bytes of each pixel into v0..v3; bytes 1, 2 and
// 3 are treated as B, G and R, and byte 0 (v0) is ignored.
// Per pixel: Y = sat8((13 * B + 65 * G + 33 * R + 64) >> 7) + 16, where the
// final add saturates at 255.
// The loop body always executes at least once and repeats while the remaining
// width is greater than zero.
void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  asm volatile(
      "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
      "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
      "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
      "movi       v7.8b, #16                     \n"  // Add 16 constant
      "1:                                        \n"
      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
      "umull      v16.8h, v1.8b, v4.8b           \n"  // B
      "umlal      v16.8h, v2.8b, v5.8b           \n"  // G
      "umlal      v16.8h, v3.8b, v6.8b           \n"  // R
      // Rounding shift right by 7 with saturation to unsigned 8 bits.
      "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
      "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating.
      "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
      "b.gt       1b                             \n"  // flags from subs above.
      : "+r"(src_rgba),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
}
2014 
// Converts a row of 3-byte RGB24 pixels to 8-bit luma.
//   src_rgb24  source row; 24 bytes (8 pixels) are read per loop iteration.
//   dst_y      destination row; 8 bytes are written per loop iteration.
//   width      pixel count; reduced by 8 per loop iteration.
// ld3 de-interleaves the three bytes of each pixel into v0..v2, which are
// treated as B, G and R respectively.
// Per pixel: Y = sat8((13 * B + 65 * G + 33 * R + 64) >> 7) + 16, where the
// final add saturates at 255.
// The loop body always executes at least once and repeats while the remaining
// width is greater than zero.
void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
  asm volatile(
      "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
      "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
      "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
      "movi       v7.8b, #16                     \n"  // Add 16 constant
      "1:                                        \n"
      "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
      "umull      v16.8h, v0.8b, v4.8b           \n"  // B
      "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
      "umlal      v16.8h, v2.8b, v6.8b           \n"  // R
      // Rounding shift right by 7 with saturation to unsigned 8 bits.
      "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
      "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating.
      "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
      "b.gt       1b                             \n"  // flags from subs above.
      : "+r"(src_rgb24),  // %0
        "+r"(dst_y),      // %1
        "+r"(width)       // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
}
2037 
// Converts a row of 3-byte RAW pixels to 8-bit luma.
//   src_raw  source row; 24 bytes (8 pixels) are read per loop iteration.
//   dst_y    destination row; 8 bytes are written per loop iteration.
//   width    pixel count; reduced by 8 per loop iteration.
// ld3 de-interleaves the three bytes of each pixel into v0..v2.  They are
// multiplied by the R, G and B coefficients respectively, i.e. the bytes are
// treated as R, G, B in memory order.
// Per pixel: Y = sat8((33 * R + 65 * G + 13 * B + 64) >> 7) + 16, where the
// final add saturates at 255.
// The loop body always executes at least once and repeats while the remaining
// width is greater than zero.
void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
  asm volatile(
      "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
      "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
      "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
      "movi       v7.8b, #16                     \n"  // Add 16 constant
      "1:                                        \n"
      "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
      "umull      v16.8h, v0.8b, v4.8b           \n"  // R
      "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
      "umlal      v16.8h, v2.8b, v6.8b           \n"  // B
      // Rounding shift right by 7 with saturation to unsigned 8 bits.
      "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
      "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating.
      "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
      "b.gt       1b                             \n"  // flags from subs above.
      : "+r"(src_raw),  // %0
        "+r"(dst_y),    // %1
        "+r"(width)     // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
}
2060 
2061 // Bilinear filter 16x2 -> 16x1
InterpolateRow_NEON(uint8_t * dst_ptr,const uint8_t * src_ptr,ptrdiff_t src_stride,int dst_width,int source_y_fraction)2062 void InterpolateRow_NEON(uint8_t* dst_ptr,
2063                          const uint8_t* src_ptr,
2064                          ptrdiff_t src_stride,
2065                          int dst_width,
2066                          int source_y_fraction) {
2067   int y1_fraction = source_y_fraction;
2068   int y0_fraction = 256 - y1_fraction;
2069   const uint8_t* src_ptr1 = src_ptr + src_stride;
2070   asm volatile(
2071       "cmp        %w4, #0                        \n"
2072       "b.eq       100f                           \n"
2073       "cmp        %w4, #128                      \n"
2074       "b.eq       50f                            \n"
2075 
2076       "dup        v5.16b, %w4                    \n"
2077       "dup        v4.16b, %w5                    \n"
2078       // General purpose row blend.
2079       "1:                                        \n"
2080       "ld1        {v0.16b}, [%1], #16            \n"
2081       "ld1        {v1.16b}, [%2], #16            \n"
2082       "subs       %w3, %w3, #16                  \n"
2083       "umull      v2.8h, v0.8b,  v4.8b           \n"
2084       "umull2     v3.8h, v0.16b, v4.16b          \n"
2085       "umlal      v2.8h, v1.8b,  v5.8b           \n"
2086       "umlal2     v3.8h, v1.16b, v5.16b          \n"
2087       "rshrn      v0.8b,  v2.8h, #8              \n"
2088       "rshrn2     v0.16b, v3.8h, #8              \n"
2089       "st1        {v0.16b}, [%0], #16            \n"
2090       "b.gt       1b                             \n"
2091       "b          99f                            \n"
2092 
2093       // Blend 50 / 50.
2094       "50:                                       \n"
2095       "ld1        {v0.16b}, [%1], #16            \n"
2096       "ld1        {v1.16b}, [%2], #16            \n"
2097       "subs       %w3, %w3, #16                  \n"
2098       "urhadd     v0.16b, v0.16b, v1.16b         \n"
2099       "st1        {v0.16b}, [%0], #16            \n"
2100       "b.gt       50b                            \n"
2101       "b          99f                            \n"
2102 
2103       // Blend 100 / 0 - Copy row unchanged.
2104       "100:                                      \n"
2105       "ld1        {v0.16b}, [%1], #16            \n"
2106       "subs       %w3, %w3, #16                  \n"
2107       "st1        {v0.16b}, [%0], #16            \n"
2108       "b.gt       100b                           \n"
2109 
2110       "99:                                       \n"
2111       : "+r"(dst_ptr),      // %0
2112         "+r"(src_ptr),      // %1
2113         "+r"(src_ptr1),     // %2
2114         "+r"(dst_width),    // %3
2115         "+r"(y1_fraction),  // %4
2116         "+r"(y0_fraction)   // %5
2117       :
2118       : "cc", "memory", "v0", "v1", "v3", "v4", "v5");
2119 }
2120 
// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
//
// Blends a row of src_argb0 pixels over a row of src_argb1 pixels.
//   src_argb0  row whose colour (sb, sg, sr) is added and whose alpha (sa)
//              scales the other row.
//   src_argb1  row whose colour (db, dg, dr) is scaled by (256 - sa) / 256.
//   dst_argb   destination row; the stored alpha byte is always 255.
//   width      pixel count.
// Groups of 8 pixels are processed while at least 8 remain; any remaining
// pixels are then processed one at a time.  A width of zero or less writes
// nothing.
void ARGBBlendRow_NEON(const uint8_t* src_argb0,
                       const uint8_t* src_argb1,
                       uint8_t* dst_argb,
                       int width) {
  asm volatile(
      // width is biased by -8 so the sign flag tells whether a full group of
      // 8 pixels is still available.
      "subs       %w3, %w3, #8                   \n"
      "b.lt       89f                            \n"
      // Blend 8 pixels.
      "8:                                        \n"
      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB0
                                                            // pixels
      "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 ARGB1
                                                            // pixels
      "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
      "umull      v16.8h, v4.8b, v3.8b           \n"  // db * a
      "umull      v17.8h, v5.8b, v3.8b           \n"  // dg * a
      "umull      v18.8h, v6.8b, v3.8b           \n"  // dr * a
      "uqrshrn    v16.8b, v16.8h, #8             \n"  // db >>= 8
      "uqrshrn    v17.8b, v17.8h, #8             \n"  // dg >>= 8
      "uqrshrn    v18.8b, v18.8h, #8             \n"  // dr >>= 8
      "uqsub      v4.8b, v4.8b, v16.8b           \n"  // db - (db * a / 256)
      "uqsub      v5.8b, v5.8b, v17.8b           \n"  // dg - (dg * a / 256)
      "uqsub      v6.8b, v6.8b, v18.8b           \n"  // dr - (dr * a / 256)
      "uqadd      v0.8b, v0.8b, v4.8b            \n"  // + sb
      "uqadd      v1.8b, v1.8b, v5.8b            \n"  // + sg
      "uqadd      v2.8b, v2.8b, v6.8b            \n"  // + sr
      "movi       v3.8b, #255                    \n"  // a = 255
      "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
                                                            // pixels
      "b.ge       8b                             \n"

      // Re-bias the counter to (remaining - 1) for the single-pixel loop.
      "89:                                       \n"
      "adds       %w3, %w3, #8-1                 \n"
      "b.lt       99f                            \n"

      // Blend 1 pixels.
      // Only lane 0 is loaded and stored; the arithmetic still runs on all 8
      // lanes but the other lanes' results are discarded.
      "1:                                        \n"
      "ld4        {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n"  // load 1 pixel ARGB0.
      "ld4        {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n"  // load 1 pixel ARGB1.
      "subs       %w3, %w3, #1                   \n"  // 1 processed per loop.
      "umull      v16.8h, v4.8b, v3.8b           \n"  // db * a
      "umull      v17.8h, v5.8b, v3.8b           \n"  // dg * a
      "umull      v18.8h, v6.8b, v3.8b           \n"  // dr * a
      "uqrshrn    v16.8b, v16.8h, #8             \n"  // db >>= 8
      "uqrshrn    v17.8b, v17.8h, #8             \n"  // dg >>= 8
      "uqrshrn    v18.8b, v18.8h, #8             \n"  // dr >>= 8
      "uqsub      v4.8b, v4.8b, v16.8b           \n"  // db - (db * a / 256)
      "uqsub      v5.8b, v5.8b, v17.8b           \n"  // dg - (dg * a / 256)
      "uqsub      v6.8b, v6.8b, v18.8b           \n"  // dr - (dr * a / 256)
      "uqadd      v0.8b, v0.8b, v4.8b            \n"  // + sb
      "uqadd      v1.8b, v1.8b, v5.8b            \n"  // + sg
      "uqadd      v2.8b, v2.8b, v6.8b            \n"  // + sr
      "movi       v3.8b, #255                    \n"  // a = 255
      "st4        {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n"  // store 1 pixel.
      "b.ge       1b                             \n"

      "99:                                       \n"

      : "+r"(src_argb0),  // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
        "v17", "v18");
}
2188 
// Attenuate 8 pixels at a time.
//
// Multiplies the B, G and R bytes of each ARGB pixel by the pixel's own alpha
// byte, then narrows with a rounding, saturating shift right by 8.  The alpha
// byte is stored unchanged.
//   src_argb  source row; 32 bytes (8 pixels) are read per loop iteration.
//   dst_argb  destination row; 32 bytes are written per loop iteration.
//   width     pixel count; reduced by 8 per loop iteration.
// The loop body always executes at least once and repeats while the remaining
// width is greater than zero.
void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
                           uint8_t* dst_argb,
                           int width) {
  asm volatile(
      // Attenuate 8 pixels.
      "1:                                        \n"
      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
      "umull      v4.8h, v0.8b, v3.8b            \n"  // b * a
      "umull      v5.8h, v1.8b, v3.8b            \n"  // g * a
      "umull      v6.8h, v2.8b, v3.8b            \n"  // r * a
      "uqrshrn    v0.8b, v4.8h, #8               \n"  // b >>= 8
      "uqrshrn    v1.8b, v5.8h, #8               \n"  // g >>= 8
      "uqrshrn    v2.8b, v6.8h, #8               \n"  // r >>= 8
      "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB
                                                            // pixels
      "b.gt       1b                             \n"  // flags from subs above.
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
}
2213 
// Quantize 8 ARGB pixels (32 bytes).
// dst = (dst * scale >> 16) * interval_size + interval_offset;
//
// Operates in place on dst_argb: each group of 8 pixels is loaded, its B, G
// and R bytes are quantized, and the group is stored back to the same
// address.  The alpha byte is stored back unchanged.
//   dst_argb         row to modify; 32 bytes are processed per iteration.
//   scale            multiplier; its low 16 bits are replicated per lane.
//   interval_size    multiplier applied after the scale step (low 16 bits).
//   interval_offset  value added last (low 16 bits).
//   width            pixel count; reduced by 8 per loop iteration.
// The loop body always executes at least once and repeats while the remaining
// width is greater than zero.
void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
                          int scale,
                          int interval_size,
                          int interval_offset,
                          int width) {
  asm volatile(
      "dup        v4.8h, %w2                     \n"
      // sqdmulh doubles the product before taking the high half, so scale is
      // halved here to obtain (value * scale) >> 16.
      "ushr       v4.8h, v4.8h, #1               \n"  // scale >>= 1
      "dup        v5.8h, %w3                     \n"  // interval multiply.
      "dup        v6.8h, %w4                     \n"  // interval add

      // 8 pixel loop.
      "1:                                        \n"
      // No post-increment on the load: the store below writes to the same
      // address and then advances the pointer.
      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0]  \n"  // load 8  ARGB.
      "subs       %w1, %w1, #8                   \n"    // 8 processed per loop.
      "uxtl       v0.8h, v0.8b                   \n"    // b (0 .. 255)
      "uxtl       v1.8h, v1.8b                   \n"
      "uxtl       v2.8h, v2.8b                   \n"
      "sqdmulh    v0.8h, v0.8h, v4.8h            \n"  // b * scale
      "sqdmulh    v1.8h, v1.8h, v4.8h            \n"  // g
      "sqdmulh    v2.8h, v2.8h, v4.8h            \n"  // r
      "mul        v0.8h, v0.8h, v5.8h            \n"  // b * interval_size
      "mul        v1.8h, v1.8h, v5.8h            \n"  // g
      "mul        v2.8h, v2.8h, v5.8h            \n"  // r
      "add        v0.8h, v0.8h, v6.8h            \n"  // b + interval_offset
      "add        v1.8h, v1.8h, v6.8h            \n"  // g
      "add        v2.8h, v2.8h, v6.8h            \n"  // r
      // Narrow back to bytes with unsigned saturation.
      "uqxtn      v0.8b, v0.8h                   \n"
      "uqxtn      v1.8b, v1.8h                   \n"
      "uqxtn      v2.8b, v2.8h                   \n"
      "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 ARGB
      "b.gt       1b                             \n"
      : "+r"(dst_argb),       // %0
        "+r"(width)           // %1
      : "r"(scale),           // %2
        "r"(interval_size),   // %3
        "r"(interval_offset)  // %4
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
}
2255 
// Shade 8 pixels at a time by specified value.
// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8.
// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
// (The two notes above use 32-bit ARM NEON syntax; the AArch64 code below uses
// sqrdmulh with the by-element form v0.h[n].)
//
// Scales each channel of every ARGB pixel by the matching byte of value.
//   src_argb  source row; 32 bytes (8 pixels) are read per loop iteration.
//   dst_argb  destination row; 32 bytes are written per loop iteration.
//   width     pixel count; reduced by 8 per loop iteration.
//   value     four per-channel scale bytes; the lowest byte scales the first
//             byte of each pixel (b) and the highest byte scales the last (a).
// The loop body always executes at least once and repeats while the remaining
// width is greater than zero.
void ARGBShadeRow_NEON(const uint8_t* src_argb,
                       uint8_t* dst_argb,
                       int width,
                       uint32_t value) {
  asm volatile(
      "dup        v0.4s, %w3                     \n"  // duplicate scale value.
      // Interleaving v0 with itself doubles every byte, so each 16-bit lane
      // holds one scale byte replicated into both halves.
      "zip1       v0.8b, v0.8b, v0.8b            \n"  // v0.8b aarrggbb.
      "ushr       v0.8h, v0.8h, #1               \n"  // scale / 2.

      // 8 pixel loop.
      "1:                                        \n"
      "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n"  // load 8 ARGB
      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
      "uxtl       v4.8h, v4.8b                   \n"  // b (0 .. 255)
      "uxtl       v5.8h, v5.8b                   \n"
      "uxtl       v6.8h, v6.8b                   \n"
      "uxtl       v7.8h, v7.8b                   \n"
      "sqrdmulh   v4.8h, v4.8h, v0.h[0]          \n"  // b * scale * 2
      "sqrdmulh   v5.8h, v5.8h, v0.h[1]          \n"  // g
      "sqrdmulh   v6.8h, v6.8h, v0.h[2]          \n"  // r
      "sqrdmulh   v7.8h, v7.8h, v0.h[3]          \n"  // a
      // Narrow back to bytes with unsigned saturation.
      "uqxtn      v4.8b, v4.8h                   \n"
      "uqxtn      v5.8b, v5.8h                   \n"
      "uqxtn      v6.8b, v6.8h                   \n"
      "uqxtn      v7.8b, v7.8h                   \n"
      "st4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // store 8 ARGB
      "b.gt       1b                             \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(value)       // %3
      : "cc", "memory", "v0", "v4", "v5", "v6", "v7");
}
2292 
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
// Similar to ARGBToYJ but stores ARGB.
// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
//
// The gray value is written to the B, G and R bytes of each output pixel; the
// alpha byte is copied from the source unchanged.
//   src_argb  source row; 32 bytes (8 pixels) are read per loop iteration.
//   dst_argb  destination row; 32 bytes are written per loop iteration.
//   width     pixel count; reduced by 8 per loop iteration.
// The loop body always executes at least once and repeats while the remaining
// width is greater than zero.
void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  asm volatile(
      "movi       v24.8b, #15                    \n"  // B * 0.11400 coefficient
      "movi       v25.8b, #75                    \n"  // G * 0.58700 coefficient
      "movi       v26.8b, #38                    \n"  // R * 0.29900 coefficient
      "1:                                        \n"
      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
      "umull      v4.8h, v0.8b, v24.8b           \n"  // B
      "umlal      v4.8h, v1.8b, v25.8b           \n"  // G
      "umlal      v4.8h, v2.8b, v26.8b           \n"  // R
      // Rounding shift right by 7 with saturation to unsigned 8 bits.
      "sqrshrun   v0.8b, v4.8h, #7               \n"  // 15 bit to 8 bit B
      // orr of a register with itself is a plain copy of the gray value.
      "orr        v1.8b, v0.8b, v0.8b            \n"  // G
      "orr        v2.8b, v0.8b, v0.8b            \n"  // R
      "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 pixels.
      "b.gt       1b                             \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26");
}
2318 
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
//    b = (r * 35 + g * 68 + b * 17) >> 7
//    g = (r * 45 + g * 88 + b * 22) >> 7
//    r = (r * 50 + g * 98 + b * 24) >> 7
//
// Operates in place on dst_argb: each group of 8 pixels is loaded, converted
// and stored back to the same address.  The shift by 7 truncates (no
// rounding) and saturates to 255.  The alpha byte is stored back unchanged.
//   dst_argb  row to modify; 32 bytes are processed per loop iteration.
//   width     pixel count; reduced by 8 per loop iteration.
// The loop body always executes at least once and repeats while the remaining
// width is greater than zero.

void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
  asm volatile(
      // Coefficient names are <output channel><input channel>.
      "movi       v20.8b, #17                    \n"  // BB coefficient
      "movi       v21.8b, #68                    \n"  // BG coefficient
      "movi       v22.8b, #35                    \n"  // BR coefficient
      "movi       v24.8b, #22                    \n"  // GB coefficient
      "movi       v25.8b, #88                    \n"  // GG coefficient
      "movi       v26.8b, #45                    \n"  // GR coefficient
      "movi       v28.8b, #24                    \n"  // RB coefficient
      "movi       v29.8b, #98                    \n"  // RG coefficient
      "movi       v30.8b, #50                    \n"  // RR coefficient
      "1:                                        \n"
      // No post-increment on the load: the store below writes to the same
      // address and then advances the pointer.
      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n"  // load 8 ARGB pixels.
      "subs       %w1, %w1, #8                   \n"   // 8 processed per loop.
      "umull      v4.8h, v0.8b, v20.8b           \n"   // B to Sepia B
      "umlal      v4.8h, v1.8b, v21.8b           \n"   // G
      "umlal      v4.8h, v2.8b, v22.8b           \n"   // R
      "umull      v5.8h, v0.8b, v24.8b           \n"   // B to Sepia G
      "umlal      v5.8h, v1.8b, v25.8b           \n"   // G
      "umlal      v5.8h, v2.8b, v26.8b           \n"   // R
      "umull      v6.8h, v0.8b, v28.8b           \n"   // B to Sepia R
      "umlal      v6.8h, v1.8b, v29.8b           \n"   // G
      "umlal      v6.8h, v2.8b, v30.8b           \n"   // R
      "uqshrn     v0.8b, v4.8h, #7               \n"   // 16 bit to 8 bit B
      "uqshrn     v1.8b, v5.8h, #7               \n"   // 16 bit to 8 bit G
      "uqshrn     v2.8b, v6.8h, #7               \n"   // 16 bit to 8 bit R
      "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 pixels.
      "b.gt       1b                             \n"
      : "+r"(dst_argb),  // %0
        "+r"(width)      // %1
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
        "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30");
}
2358 
2359 // Tranform 8 ARGB pixels (32 bytes) with color matrix.
2360 // TODO(fbarchard): Was same as Sepia except matrix is provided.  This function
2361 // needs to saturate.  Consider doing a non-saturating version.
ARGBColorMatrixRow_NEON(const uint8_t * src_argb,uint8_t * dst_argb,const int8_t * matrix_argb,int width)2362 void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
2363                              uint8_t* dst_argb,
2364                              const int8_t* matrix_argb,
2365                              int width) {
2366   asm volatile(
2367       "ld1        {v2.16b}, [%3]                 \n"  // load 3 ARGB vectors.
2368       "sxtl       v0.8h, v2.8b                   \n"  // B,G coefficients s16.
2369       "sxtl2      v1.8h, v2.16b                  \n"  // R,A coefficients s16.
2370 
2371       "1:                                        \n"
2372       "ld4        {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n"  // load 8 ARGB
2373       "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2374       "uxtl       v16.8h, v16.8b                 \n"  // b (0 .. 255) 16 bit
2375       "uxtl       v17.8h, v17.8b                 \n"  // g
2376       "uxtl       v18.8h, v18.8b                 \n"  // r
2377       "uxtl       v19.8h, v19.8b                 \n"  // a
2378       "mul        v22.8h, v16.8h, v0.h[0]        \n"  // B = B * Matrix B
2379       "mul        v23.8h, v16.8h, v0.h[4]        \n"  // G = B * Matrix G
2380       "mul        v24.8h, v16.8h, v1.h[0]        \n"  // R = B * Matrix R
2381       "mul        v25.8h, v16.8h, v1.h[4]        \n"  // A = B * Matrix A
2382       "mul        v4.8h, v17.8h, v0.h[1]         \n"  // B += G * Matrix B
2383       "mul        v5.8h, v17.8h, v0.h[5]         \n"  // G += G * Matrix G
2384       "mul        v6.8h, v17.8h, v1.h[1]         \n"  // R += G * Matrix R
2385       "mul        v7.8h, v17.8h, v1.h[5]         \n"  // A += G * Matrix A
2386       "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
2387       "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
2388       "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
2389       "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
2390       "mul        v4.8h, v18.8h, v0.h[2]         \n"  // B += R * Matrix B
2391       "mul        v5.8h, v18.8h, v0.h[6]         \n"  // G += R * Matrix G
2392       "mul        v6.8h, v18.8h, v1.h[2]         \n"  // R += R * Matrix R
2393       "mul        v7.8h, v18.8h, v1.h[6]         \n"  // A += R * Matrix A
2394       "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
2395       "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
2396       "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
2397       "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
2398       "mul        v4.8h, v19.8h, v0.h[3]         \n"  // B += A * Matrix B
2399       "mul        v5.8h, v19.8h, v0.h[7]         \n"  // G += A * Matrix G
2400       "mul        v6.8h, v19.8h, v1.h[3]         \n"  // R += A * Matrix R
2401       "mul        v7.8h, v19.8h, v1.h[7]         \n"  // A += A * Matrix A
2402       "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
2403       "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
2404       "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
2405       "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
2406       "sqshrun    v16.8b, v22.8h, #6             \n"  // 16 bit to 8 bit B
2407       "sqshrun    v17.8b, v23.8h, #6             \n"  // 16 bit to 8 bit G
2408       "sqshrun    v18.8b, v24.8h, #6             \n"  // 16 bit to 8 bit R
2409       "sqshrun    v19.8b, v25.8h, #6             \n"  // 16 bit to 8 bit A
2410       "st4        {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n"  // store 8 ARGB
2411       "b.gt       1b                             \n"
2412       : "+r"(src_argb),   // %0
2413         "+r"(dst_argb),   // %1
2414         "+r"(width)       // %2
2415       : "r"(matrix_argb)  // %3
2416       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
2417         "v17", "v18", "v19", "v22", "v23", "v24", "v25");
2418 }
2419 
2420 // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
2421 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
ARGBMultiplyRow_NEON(const uint8_t * src_argb0,const uint8_t * src_argb1,uint8_t * dst_argb,int width)2422 void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
2423                           const uint8_t* src_argb1,
2424                           uint8_t* dst_argb,
2425                           int width) {
2426   asm volatile(
2427       // 8 pixel loop.
2428       "1:                                        \n"
2429       "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
2430       "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more
2431       "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
2432       "umull      v0.8h, v0.8b, v4.8b            \n"  // multiply B
2433       "umull      v1.8h, v1.8b, v5.8b            \n"  // multiply G
2434       "umull      v2.8h, v2.8b, v6.8b            \n"  // multiply R
2435       "umull      v3.8h, v3.8b, v7.8b            \n"  // multiply A
2436       "rshrn      v0.8b, v0.8h, #8               \n"  // 16 bit to 8 bit B
2437       "rshrn      v1.8b, v1.8h, #8               \n"  // 16 bit to 8 bit G
2438       "rshrn      v2.8b, v2.8h, #8               \n"  // 16 bit to 8 bit R
2439       "rshrn      v3.8b, v3.8h, #8               \n"  // 16 bit to 8 bit A
2440       "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
2441       "b.gt       1b                             \n"
2442       : "+r"(src_argb0),  // %0
2443         "+r"(src_argb1),  // %1
2444         "+r"(dst_argb),   // %2
2445         "+r"(width)       // %3
2446       :
2447       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
2448 }
2449 
2450 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
ARGBAddRow_NEON(const uint8_t * src_argb0,const uint8_t * src_argb1,uint8_t * dst_argb,int width)2451 void ARGBAddRow_NEON(const uint8_t* src_argb0,
2452                      const uint8_t* src_argb1,
2453                      uint8_t* dst_argb,
2454                      int width) {
2455   asm volatile(
2456       // 8 pixel loop.
2457       "1:                                        \n"
2458       "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
2459       "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more
2460       "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
2461       "uqadd      v0.8b, v0.8b, v4.8b            \n"
2462       "uqadd      v1.8b, v1.8b, v5.8b            \n"
2463       "uqadd      v2.8b, v2.8b, v6.8b            \n"
2464       "uqadd      v3.8b, v3.8b, v7.8b            \n"
2465       "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
2466       "b.gt       1b                             \n"
2467       : "+r"(src_argb0),  // %0
2468         "+r"(src_argb1),  // %1
2469         "+r"(dst_argb),   // %2
2470         "+r"(width)       // %3
2471       :
2472       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
2473 }
2474 
2475 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
ARGBSubtractRow_NEON(const uint8_t * src_argb0,const uint8_t * src_argb1,uint8_t * dst_argb,int width)2476 void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
2477                           const uint8_t* src_argb1,
2478                           uint8_t* dst_argb,
2479                           int width) {
2480   asm volatile(
2481       // 8 pixel loop.
2482       "1:                                        \n"
2483       "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
2484       "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more
2485       "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
2486       "uqsub      v0.8b, v0.8b, v4.8b            \n"
2487       "uqsub      v1.8b, v1.8b, v5.8b            \n"
2488       "uqsub      v2.8b, v2.8b, v6.8b            \n"
2489       "uqsub      v3.8b, v3.8b, v7.8b            \n"
2490       "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
2491       "b.gt       1b                             \n"
2492       : "+r"(src_argb0),  // %0
2493         "+r"(src_argb1),  // %1
2494         "+r"(dst_argb),   // %2
2495         "+r"(width)       // %3
2496       :
2497       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
2498 }
2499 
2500 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
2501 // A = 255
2502 // R = Sobel
2503 // G = Sobel
2504 // B = Sobel
SobelRow_NEON(const uint8_t * src_sobelx,const uint8_t * src_sobely,uint8_t * dst_argb,int width)2505 void SobelRow_NEON(const uint8_t* src_sobelx,
2506                    const uint8_t* src_sobely,
2507                    uint8_t* dst_argb,
2508                    int width) {
2509   asm volatile(
2510       "movi       v3.8b, #255                    \n"  // alpha
2511       // 8 pixel loop.
2512       "1:                                        \n"
2513       "ld1        {v0.8b}, [%0], #8              \n"  // load 8 sobelx.
2514       "ld1        {v1.8b}, [%1], #8              \n"  // load 8 sobely.
2515       "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
2516       "uqadd      v0.8b, v0.8b, v1.8b            \n"  // add
2517       "orr        v1.8b, v0.8b, v0.8b            \n"
2518       "orr        v2.8b, v0.8b, v0.8b            \n"
2519       "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
2520       "b.gt       1b                             \n"
2521       : "+r"(src_sobelx),  // %0
2522         "+r"(src_sobely),  // %1
2523         "+r"(dst_argb),    // %2
2524         "+r"(width)        // %3
2525       :
2526       : "cc", "memory", "v0", "v1", "v2", "v3");
2527 }
2528 
2529 // Adds Sobel X and Sobel Y and stores Sobel into plane.
SobelToPlaneRow_NEON(const uint8_t * src_sobelx,const uint8_t * src_sobely,uint8_t * dst_y,int width)2530 void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
2531                           const uint8_t* src_sobely,
2532                           uint8_t* dst_y,
2533                           int width) {
2534   asm volatile(
2535       // 16 pixel loop.
2536       "1:                                        \n"
2537       "ld1        {v0.16b}, [%0], #16            \n"  // load 16 sobelx.
2538       "ld1        {v1.16b}, [%1], #16            \n"  // load 16 sobely.
2539       "subs       %w3, %w3, #16                  \n"  // 16 processed per loop.
2540       "uqadd      v0.16b, v0.16b, v1.16b         \n"  // add
2541       "st1        {v0.16b}, [%2], #16            \n"  // store 16 pixels.
2542       "b.gt       1b                             \n"
2543       : "+r"(src_sobelx),  // %0
2544         "+r"(src_sobely),  // %1
2545         "+r"(dst_y),       // %2
2546         "+r"(width)        // %3
2547       :
2548       : "cc", "memory", "v0", "v1");
2549 }
2550 
2551 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
2552 // A = 255
2553 // R = Sobel X
2554 // G = Sobel
2555 // B = Sobel Y
SobelXYRow_NEON(const uint8_t * src_sobelx,const uint8_t * src_sobely,uint8_t * dst_argb,int width)2556 void SobelXYRow_NEON(const uint8_t* src_sobelx,
2557                      const uint8_t* src_sobely,
2558                      uint8_t* dst_argb,
2559                      int width) {
2560   asm volatile(
2561       "movi       v3.8b, #255                    \n"  // alpha
2562       // 8 pixel loop.
2563       "1:                                        \n"
2564       "ld1        {v2.8b}, [%0], #8              \n"  // load 8 sobelx.
2565       "ld1        {v0.8b}, [%1], #8              \n"  // load 8 sobely.
2566       "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
2567       "uqadd      v1.8b, v0.8b, v2.8b            \n"  // add
2568       "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
2569       "b.gt       1b                             \n"
2570       : "+r"(src_sobelx),  // %0
2571         "+r"(src_sobely),  // %1
2572         "+r"(dst_argb),    // %2
2573         "+r"(width)        // %3
2574       :
2575       : "cc", "memory", "v0", "v1", "v2", "v3");
2576 }
2577 
2578 // SobelX as a matrix is
2579 // -1  0  1
2580 // -2  0  2
2581 // -1  0  1
SobelXRow_NEON(const uint8_t * src_y0,const uint8_t * src_y1,const uint8_t * src_y2,uint8_t * dst_sobelx,int width)2582 void SobelXRow_NEON(const uint8_t* src_y0,
2583                     const uint8_t* src_y1,
2584                     const uint8_t* src_y2,
2585                     uint8_t* dst_sobelx,
2586                     int width) {
2587   asm volatile(
2588       "1:                                        \n"
2589       "ld1        {v0.8b}, [%0],%5               \n"  // top
2590       "ld1        {v1.8b}, [%0],%6               \n"
2591       "usubl      v0.8h, v0.8b, v1.8b            \n"
2592       "ld1        {v2.8b}, [%1],%5               \n"  // center * 2
2593       "ld1        {v3.8b}, [%1],%6               \n"
2594       "usubl      v1.8h, v2.8b, v3.8b            \n"
2595       "add        v0.8h, v0.8h, v1.8h            \n"
2596       "add        v0.8h, v0.8h, v1.8h            \n"
2597       "ld1        {v2.8b}, [%2],%5               \n"  // bottom
2598       "ld1        {v3.8b}, [%2],%6               \n"
2599       "subs       %w4, %w4, #8                   \n"  // 8 pixels
2600       "usubl      v1.8h, v2.8b, v3.8b            \n"
2601       "add        v0.8h, v0.8h, v1.8h            \n"
2602       "abs        v0.8h, v0.8h                   \n"
2603       "uqxtn      v0.8b, v0.8h                   \n"
2604       "st1        {v0.8b}, [%3], #8              \n"  // store 8 sobelx
2605       "b.gt       1b                             \n"
2606       : "+r"(src_y0),                           // %0
2607         "+r"(src_y1),                           // %1
2608         "+r"(src_y2),                           // %2
2609         "+r"(dst_sobelx),                       // %3
2610         "+r"(width)                             // %4
2611       : "r"(2LL),                               // %5
2612         "r"(6LL)                                // %6
2613       : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
2614       );
2615 }
2616 
2617 // SobelY as a matrix is
2618 // -1 -2 -1
2619 //  0  0  0
2620 //  1  2  1
SobelYRow_NEON(const uint8_t * src_y0,const uint8_t * src_y1,uint8_t * dst_sobely,int width)2621 void SobelYRow_NEON(const uint8_t* src_y0,
2622                     const uint8_t* src_y1,
2623                     uint8_t* dst_sobely,
2624                     int width) {
2625   asm volatile(
2626       "1:                                        \n"
2627       "ld1        {v0.8b}, [%0],%4               \n"  // left
2628       "ld1        {v1.8b}, [%1],%4               \n"
2629       "usubl      v0.8h, v0.8b, v1.8b            \n"
2630       "ld1        {v2.8b}, [%0],%4               \n"  // center * 2
2631       "ld1        {v3.8b}, [%1],%4               \n"
2632       "usubl      v1.8h, v2.8b, v3.8b            \n"
2633       "add        v0.8h, v0.8h, v1.8h            \n"
2634       "add        v0.8h, v0.8h, v1.8h            \n"
2635       "ld1        {v2.8b}, [%0],%5               \n"  // right
2636       "ld1        {v3.8b}, [%1],%5               \n"
2637       "subs       %w3, %w3, #8                   \n"  // 8 pixels
2638       "usubl      v1.8h, v2.8b, v3.8b            \n"
2639       "add        v0.8h, v0.8h, v1.8h            \n"
2640       "abs        v0.8h, v0.8h                   \n"
2641       "uqxtn      v0.8b, v0.8h                   \n"
2642       "st1        {v0.8b}, [%2], #8              \n"  // store 8 sobely
2643       "b.gt       1b                             \n"
2644       : "+r"(src_y0),                           // %0
2645         "+r"(src_y1),                           // %1
2646         "+r"(dst_sobely),                       // %2
2647         "+r"(width)                             // %3
2648       : "r"(1LL),                               // %4
2649         "r"(6LL)                                // %5
2650       : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
2651       );
2652 }
2653 
2654 // Caveat - rounds float to half float whereas scaling version truncates.
HalfFloat1Row_NEON(const uint16_t * src,uint16_t * dst,float,int width)2655 void HalfFloat1Row_NEON(const uint16_t* src,
2656                         uint16_t* dst,
2657                         float /*unused*/,
2658                         int width) {
2659   asm volatile(
2660       "1:                                        \n"
2661       "ld1        {v1.16b}, [%0], #16            \n"  // load 8 shorts
2662       "subs       %w2, %w2, #8                   \n"  // 8 pixels per loop
2663       "uxtl       v2.4s, v1.4h                   \n"  // 8 int's
2664       "uxtl2      v3.4s, v1.8h                   \n"
2665       "scvtf      v2.4s, v2.4s                   \n"  // 8 floats
2666       "scvtf      v3.4s, v3.4s                   \n"
2667       "fcvtn      v1.4h, v2.4s                   \n"  // 8 half floats
2668       "fcvtn2     v1.8h, v3.4s                   \n"
2669       "st1        {v1.16b}, [%1], #16            \n"  // store 8 shorts
2670       "b.gt       1b                             \n"
2671       : "+r"(src),   // %0
2672         "+r"(dst),   // %1
2673         "+r"(width)  // %2
2674       :
2675       : "cc", "memory", "v1", "v2", "v3");
2676 }
2677 
HalfFloatRow_NEON(const uint16_t * src,uint16_t * dst,float scale,int width)2678 void HalfFloatRow_NEON(const uint16_t* src,
2679                        uint16_t* dst,
2680                        float scale,
2681                        int width) {
2682   asm volatile(
2683       "1:                                        \n"
2684       "ld1        {v1.16b}, [%0], #16            \n"  // load 8 shorts
2685       "subs       %w2, %w2, #8                   \n"  // 8 pixels per loop
2686       "uxtl       v2.4s, v1.4h                   \n"  // 8 int's
2687       "uxtl2      v3.4s, v1.8h                   \n"
2688       "scvtf      v2.4s, v2.4s                   \n"  // 8 floats
2689       "scvtf      v3.4s, v3.4s                   \n"
2690       "fmul       v2.4s, v2.4s, %3.s[0]          \n"  // adjust exponent
2691       "fmul       v3.4s, v3.4s, %3.s[0]          \n"
2692       "uqshrn     v1.4h, v2.4s, #13              \n"  // isolate halffloat
2693       "uqshrn2    v1.8h, v3.4s, #13              \n"
2694       "st1        {v1.16b}, [%1], #16            \n"  // store 8 shorts
2695       "b.gt       1b                             \n"
2696       : "+r"(src),                      // %0
2697         "+r"(dst),                      // %1
2698         "+r"(width)                     // %2
2699       : "w"(scale * 1.9259299444e-34f)  // %3
2700       : "cc", "memory", "v1", "v2", "v3");
2701 }
2702 
ByteToFloatRow_NEON(const uint8_t * src,float * dst,float scale,int width)2703 void ByteToFloatRow_NEON(const uint8_t* src,
2704                          float* dst,
2705                          float scale,
2706                          int width) {
2707   asm volatile(
2708       "1:                                        \n"
2709       "ld1        {v1.8b}, [%0], #8              \n"  // load 8 bytes
2710       "subs       %w2, %w2, #8                   \n"  // 8 pixels per loop
2711       "uxtl       v1.8h, v1.8b                   \n"  // 8 shorts
2712       "uxtl       v2.4s, v1.4h                   \n"  // 8 ints
2713       "uxtl2      v3.4s, v1.8h                   \n"
2714       "scvtf      v2.4s, v2.4s                   \n"  // 8 floats
2715       "scvtf      v3.4s, v3.4s                   \n"
2716       "fmul       v2.4s, v2.4s, %3.s[0]          \n"  // scale
2717       "fmul       v3.4s, v3.4s, %3.s[0]          \n"
2718       "st1        {v2.16b, v3.16b}, [%1], #32    \n"  // store 8 floats
2719       "b.gt       1b                             \n"
2720       : "+r"(src),   // %0
2721         "+r"(dst),   // %1
2722         "+r"(width)  // %2
2723       : "w"(scale)   // %3
2724       : "cc", "memory", "v1", "v2", "v3");
2725 }
2726 
ScaleMaxSamples_NEON(const float * src,float * dst,float scale,int width)2727 float ScaleMaxSamples_NEON(const float* src,
2728                            float* dst,
2729                            float scale,
2730                            int width) {
2731   float fmax;
2732   asm volatile(
2733       "movi       v5.4s, #0                      \n"  // max
2734       "movi       v6.4s, #0                      \n"
2735 
2736       "1:                                        \n"
2737       "ld1        {v1.4s, v2.4s}, [%0], #32      \n"  // load 8 samples
2738       "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
2739       "fmul       v3.4s, v1.4s, %4.s[0]          \n"  // scale
2740       "fmul       v4.4s, v2.4s, %4.s[0]          \n"  // scale
2741       "fmax       v5.4s, v5.4s, v1.4s            \n"  // max
2742       "fmax       v6.4s, v6.4s, v2.4s            \n"
2743       "st1        {v3.4s, v4.4s}, [%1], #32      \n"  // store 8 samples
2744       "b.gt       1b                             \n"
2745       "fmax       v5.4s, v5.4s, v6.4s            \n"  // max
2746       "fmaxv      %s3, v5.4s                     \n"  // signed max acculator
2747       : "+r"(src),                                    // %0
2748         "+r"(dst),                                    // %1
2749         "+r"(width),                                  // %2
2750         "=w"(fmax)                                    // %3
2751       : "w"(scale)                                    // %4
2752       : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
2753   return fmax;
2754 }
2755 
ScaleSumSamples_NEON(const float * src,float * dst,float scale,int width)2756 float ScaleSumSamples_NEON(const float* src,
2757                            float* dst,
2758                            float scale,
2759                            int width) {
2760   float fsum;
2761   asm volatile(
2762       "movi       v5.4s, #0                      \n"  // max
2763       "movi       v6.4s, #0                      \n"  // max
2764 
2765       "1:                                        \n"
2766       "ld1        {v1.4s, v2.4s}, [%0], #32      \n"  // load 8 samples
2767       "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
2768       "fmul       v3.4s, v1.4s, %4.s[0]          \n"  // scale
2769       "fmul       v4.4s, v2.4s, %4.s[0]          \n"
2770       "fmla       v5.4s, v1.4s, v1.4s            \n"  // sum of squares
2771       "fmla       v6.4s, v2.4s, v2.4s            \n"
2772       "st1        {v3.4s, v4.4s}, [%1], #32      \n"  // store 8 samples
2773       "b.gt       1b                             \n"
2774       "faddp      v5.4s, v5.4s, v6.4s            \n"
2775       "faddp      v5.4s, v5.4s, v5.4s            \n"
2776       "faddp      %3.4s, v5.4s, v5.4s            \n"  // sum
2777       : "+r"(src),                                    // %0
2778         "+r"(dst),                                    // %1
2779         "+r"(width),                                  // %2
2780         "=w"(fsum)                                    // %3
2781       : "w"(scale)                                    // %4
2782       : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
2783   return fsum;
2784 }
2785 
ScaleSamples_NEON(const float * src,float * dst,float scale,int width)2786 void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
2787   asm volatile(
2788       "1:                                        \n"
2789       "ld1        {v1.4s, v2.4s}, [%0], #32      \n"  // load 8 samples
2790       "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
2791       "fmul       v1.4s, v1.4s, %3.s[0]          \n"  // scale
2792       "fmul       v2.4s, v2.4s, %3.s[0]          \n"  // scale
2793       "st1        {v1.4s, v2.4s}, [%1], #32      \n"  // store 8 samples
2794       "b.gt       1b                             \n"
2795       : "+r"(src),   // %0
2796         "+r"(dst),   // %1
2797         "+r"(width)  // %2
2798       : "w"(scale)   // %3
2799       : "cc", "memory", "v1", "v2");
2800 }
2801 
2802 // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
GaussCol_NEON(const uint16_t * src0,const uint16_t * src1,const uint16_t * src2,const uint16_t * src3,const uint16_t * src4,uint32_t * dst,int width)2803 void GaussCol_NEON(const uint16_t* src0,
2804                    const uint16_t* src1,
2805                    const uint16_t* src2,
2806                    const uint16_t* src3,
2807                    const uint16_t* src4,
2808                    uint32_t* dst,
2809                    int width) {
2810   asm volatile(
2811       "movi       v6.8h, #4                      \n"  // constant 4
2812       "movi       v7.8h, #6                      \n"  // constant 6
2813 
2814       "1:                                        \n"
2815       "ld1        {v1.8h}, [%0], #16             \n"  // load 8 samples, 5 rows
2816       "ld1        {v2.8h}, [%4], #16             \n"
2817       "uaddl      v0.4s, v1.4h, v2.4h            \n"  // * 1
2818       "uaddl2     v1.4s, v1.8h, v2.8h            \n"  // * 1
2819       "ld1        {v2.8h}, [%1], #16             \n"
2820       "umlal      v0.4s, v2.4h, v6.4h            \n"  // * 4
2821       "umlal2     v1.4s, v2.8h, v6.8h            \n"  // * 4
2822       "ld1        {v2.8h}, [%2], #16             \n"
2823       "umlal      v0.4s, v2.4h, v7.4h            \n"  // * 6
2824       "umlal2     v1.4s, v2.8h, v7.8h            \n"  // * 6
2825       "ld1        {v2.8h}, [%3], #16             \n"
2826       "umlal      v0.4s, v2.4h, v6.4h            \n"  // * 4
2827       "umlal2     v1.4s, v2.8h, v6.8h            \n"  // * 4
2828       "subs       %w6, %w6, #8                   \n"  // 8 processed per loop
2829       "st1        {v0.4s,v1.4s}, [%5], #32       \n"  // store 8 samples
2830       "b.gt       1b                             \n"
2831       : "+r"(src0),  // %0
2832         "+r"(src1),  // %1
2833         "+r"(src2),  // %2
2834         "+r"(src3),  // %3
2835         "+r"(src4),  // %4
2836         "+r"(dst),   // %5
2837         "+r"(width)  // %6
2838       :
2839       : "cc", "memory", "v0", "v1", "v2", "v6", "v7");
2840 }
2841 
2842 // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
GaussRow_NEON(const uint32_t * src,uint16_t * dst,int width)2843 void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
2844   const uint32_t* src1 = src + 1;
2845   const uint32_t* src2 = src + 2;
2846   const uint32_t* src3 = src + 3;
2847   asm volatile(
2848       "movi       v6.4s, #4                      \n"  // constant 4
2849       "movi       v7.4s, #6                      \n"  // constant 6
2850 
2851       "1:                                        \n"
2852       "ld1        {v0.4s,v1.4s,v2.4s}, [%0], %6  \n"  // load 12 source samples
2853       "add        v0.4s, v0.4s, v1.4s            \n"  // * 1
2854       "add        v1.4s, v1.4s, v2.4s            \n"  // * 1
2855       "ld1        {v2.4s,v3.4s}, [%2], #32       \n"
2856       "mla        v0.4s, v2.4s, v7.4s            \n"  // * 6
2857       "mla        v1.4s, v3.4s, v7.4s            \n"  // * 6
2858       "ld1        {v2.4s,v3.4s}, [%1], #32       \n"
2859       "ld1        {v4.4s,v5.4s}, [%3], #32       \n"
2860       "add        v2.4s, v2.4s, v4.4s            \n"  // add rows for * 4
2861       "add        v3.4s, v3.4s, v5.4s            \n"
2862       "mla        v0.4s, v2.4s, v6.4s            \n"  // * 4
2863       "mla        v1.4s, v3.4s, v6.4s            \n"  // * 4
2864       "subs       %w5, %w5, #8                   \n"  // 8 processed per loop
2865       "uqrshrn    v0.4h, v0.4s, #8               \n"  // round and pack
2866       "uqrshrn2   v0.8h, v1.4s, #8               \n"
2867       "st1        {v0.8h}, [%4], #16             \n"  // store 8 samples
2868       "b.gt       1b                             \n"
2869       : "+r"(src),   // %0
2870         "+r"(src1),  // %1
2871         "+r"(src2),  // %2
2872         "+r"(src3),  // %3
2873         "+r"(dst),   // %4
2874         "+r"(width)  // %5
2875       : "r"(32LL)    // %6
2876       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
2877 }
2878 
2879 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
2880 
2881 #ifdef __cplusplus
2882 }  // extern "C"
2883 }  // namespace libyuv
2884 #endif
2885