1 /*
2  *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS. All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "libyuv/row.h"
12 
13 #ifdef __cplusplus
14 namespace libyuv {
15 extern "C" {
16 #endif
17 
18 // This module is for GCC Neon armv8 64 bit.
19 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
20 
// I422 loader: fetches 8 Y, 4 U and 4 V bytes for one 8-pixel pass.
// Uses operands %0 (Y), %1 (U) and %2 (V) of the enclosing asm statement and
// post-increments each pointer by the number of bytes read.
// Result: v0.8b = Y0..Y7, v1.8b = U0..U3 followed by V0..V3.
#define READYUV422                                                          \
  "ld1        {v0.8b}, [%0], #8              \n" /* 8 Y -> v0            */ \
  "ld1        {v1.s}[0], [%1], #4            \n" /* 4 U -> v1 bytes 0-3  */ \
  "ld1        {v1.s}[1], [%2], #4            \n" /* 4 V -> v1 bytes 4-7  */
26 
// I444 loader: fetches 8 Y, 8 U and 8 V bytes for one 8-pixel pass and
// reduces the chroma to the 4 U + 4 V layout the other loaders produce.
// Uses operands %0 (Y), %1 (U) and %2 (V) of the enclosing asm statement.
// Result: v0.8b = Y0..Y7, v1.8b = 4 averaged U followed by 4 averaged V.
#define READYUV444                                                            \
  "ld1        {v0.8b}, [%0], #8              \n" /* 8 Y -> v0              */ \
  "ld1        {v1.d}[0], [%1], #8            \n" /* 8 U -> v1 low 64 bits  */ \
  "ld1        {v1.d}[1], [%2], #8            \n" /* 8 V -> v1 high 64 bits */ \
  "uaddlp     v1.8h, v1.16b                  \n" /* sum adjacent byte pairs */ \
  "rshrn      v1.8b, v1.8h, #1               \n" /* rounded halve, narrow  */
34 
// I400 (luma only) loader: fetches 8 Y bytes through operand %0 and fills
// the chroma register with the constant 128 in every byte, so no U or V
// plane is read.
// Result: v0.8b = Y0..Y7, v1.8b = 128 in all 8 bytes.
#define READYUV400                                                          \
  "ld1        {v0.8b}, [%0], #8              \n" /* 8 Y -> v0            */ \
  "movi       v1.8b , #128                   \n" /* U = V = 128          */
39 
// NV12 loader: fetches 8 Y bytes through %0 and 4 interleaved UV pairs
// (8 bytes) through %1, then separates the chroma.
// Result: v0.8b = Y0..Y7, v1.8b = U0..U3 followed by V0..V3.
// Scratch: v2, v3.
#define READNV12                                                            \
  "ld1        {v0.8b}, [%0], #8              \n" /* 8 Y -> v0            */ \
  "ld1        {v2.8b}, [%1], #8              \n" /* U0 V0 U1 V1 ... -> v2 */ \
  "uzp1       v1.8b, v2.8b, v2.8b            \n" /* even bytes (U) -> v1 */ \
  "uzp2       v3.8b, v2.8b, v2.8b            \n" /* odd bytes (V) -> v3  */ \
  "ins        v1.s[1], v3.s[0]               \n" /* V into v1 bytes 4-7  */
47 
// NV21 loader: fetches 8 Y bytes through %0 and 4 interleaved VU pairs
// (8 bytes) through %1. Same as the NV12 loader except that the even bytes
// are taken as V and the odd bytes as U.
// Result: v0.8b = Y0..Y7, v1.8b = U0..U3 followed by V0..V3.
// Scratch: v2, v3.
#define READNV21                                                            \
  "ld1        {v0.8b}, [%0], #8              \n" /* 8 Y -> v0            */ \
  "ld1        {v2.8b}, [%1], #8              \n" /* V0 U0 V1 U1 ... -> v2 */ \
  "uzp1       v3.8b, v2.8b, v2.8b            \n" /* even bytes (V) -> v3 */ \
  "uzp2       v1.8b, v2.8b, v2.8b            \n" /* odd bytes (U) -> v1  */ \
  "ins        v1.s[1], v3.s[0]               \n" /* V into v1 bytes 4-7  */
55 
// YUY2 loader: fetches 16 packed bytes (8 pixels) through operand %0.
// ld2 de-interleaves them so the even bytes (Y) land in v0 and the odd
// bytes (alternating U, V) land in v1; the chroma is then split.
// Result: v0.8b = Y0..Y7, v1.8b = U0..U3 followed by V0..V3.
// Scratch: v3.
#define READYUY2                                                            \
  "ld2        {v0.8b, v1.8b}, [%0], #16      \n" /* v0 = Y, v1 = U/V mix */ \
  "uzp2       v3.8b, v1.8b, v1.8b            \n" /* odd chroma (V) -> v3 */ \
  "uzp1       v1.8b, v1.8b, v1.8b            \n" /* even chroma (U) -> v1 */ \
  "ins        v1.s[1], v3.s[0]               \n" /* V into v1 bytes 4-7  */
62 
// UYVY loader: fetches 16 packed bytes (8 pixels) through operand %0.
// ld2 de-interleaves them so the even bytes (alternating U, V) land in v2
// and the odd bytes (Y) land in v3; Y is copied to v0 and the chroma split.
// Result: v0.8b = Y0..Y7, v1.8b = U0..U3 followed by V0..V3.
// Scratch: v2, v3.
#define READUYVY                                                            \
  "ld2        {v2.8b, v3.8b}, [%0], #16      \n" /* v2 = U/V mix, v3 = Y */ \
  "orr        v0.8b, v3.8b, v3.8b            \n" /* copy Y into v0       */ \
  "uzp1       v1.8b, v2.8b, v2.8b            \n" /* even chroma (U) -> v1 */ \
  "uzp2       v3.8b, v2.8b, v2.8b            \n" /* odd chroma (V) -> v3 */ \
  "ins        v1.s[1], v3.s[0]               \n" /* V into v1 bytes 4-7  */
70 
// Loads the conversion constants that YUVTORGB consumes. The enclosing asm
// statement must provide the named input operands kUVBiasBGR, kYToRgb,
// kUVToRB and kUVToG (addresses of the corresponding YuvConstants fields).
// Registers written: v24-v31, so every user must list all eight as
// clobbered.
//   v24/v25/v26 : three consecutive 16-bit biases, each broadcast to all
//                 lanes (added to B/G/R respectively in YUVTORGB)
//   v31         : one 32-bit Y scale factor broadcast to all lanes
//   v27/v28     : even/odd 16-bit elements of kUVToRB (U->B, V->R factors)
//   v29/v30     : even/odd 16-bit elements of kUVToG  (U->G, V->G factors)
#define YUVTORGB_SETUP                                                       \
  "ld3r       {v24.8h, v25.8h, v26.8h}, [%[kUVBiasBGR]] \n" /* B,G,R bias */ \
  "ld1r       {v31.4s}, [%[kYToRgb]]                    \n" /* Y scale    */ \
  "ld2        {v27.8h, v28.8h}, [%[kUVToRB]]            \n" /* UB, VR     */ \
  "ld2        {v29.8h, v30.8h}, [%[kUVToG]]             \n" /* UG, VG     */
76 
77 // clang-format off
78 
// Converts 8 pixels from YUV to 8-bit R, G and B.
//   In : v0.8b = 8 Y; v1.8b = 4 U (bytes 0-3) then 4 V (bytes 4-7);
//        v24-v31 as loaded by YUVTORGB_SETUP.
//   Out: vR.8b, vG.8b, vB.8b (register names passed as macro arguments).
//   Scratch: v0, v1, v2, v3, v5, v6, v7.
// Each chroma sample is duplicated so that it applies to two adjacent
// pixels. All accumulation uses saturating 16-bit arithmetic and the final
// narrowing shifts right by 6 with unsigned saturation.
#define YUVTORGB(vR, vG, vB)                                                 \
  /* Y path: widen to 32 bits, scale by v31, keep the high 16 bits. */       \
  "uxtl       v0.8h, v0.8b                   \n" /* Y -> 16 bit          */  \
  "shll       v2.8h, v1.8b, #8               \n" /* UV << 8              */  \
  "ushll2     v3.4s, v0.8h, #0               \n" /* Y4..7 -> 32 bit      */  \
  "ushll      v0.4s, v0.4h, #0               \n" /* Y0..3 -> 32 bit      */  \
  "mul        v3.4s, v3.4s, v31.4s           \n" /* Y4..7 * scale        */  \
  "mul        v0.4s, v0.4s, v31.4s           \n" /* Y0..3 * scale        */  \
  "sqshrun    v0.4h, v0.4s, #16              \n" /* >> 16, low half      */  \
  "sqshrun2   v0.8h, v3.4s, #16              \n" /* >> 16, high half     */  \
  /* Chroma path: (UV << 8) + UV doubles every byte, giving 8 U + 8 V. */    \
  "uaddw      v1.8h, v2.8h, v1.8b            \n" /* duplicate each U, V  */  \
  "mov        v2.d[0], v1.d[1]               \n" /* 8 V bytes -> v2      */  \
  "uxtl       v2.8h, v2.8b                   \n" /* V -> 16 bit          */  \
  "uxtl       v1.8h, v1.8b                   \n" /* U -> 16 bit          */  \
  "mul        v3.8h, v27.8h, v1.8h           \n" /* U term for B         */  \
  "mul        v5.8h, v29.8h, v1.8h           \n" /* U term for G         */  \
  "mul        v6.8h, v30.8h, v2.8h           \n" /* V term for G         */  \
  "mul        v7.8h, v28.8h, v2.8h           \n" /* V term for R         */  \
  "sqadd      v6.8h, v6.8h, v5.8h            \n" /* combined G term      */  \
  /* Bias + scaled Y, then apply the chroma terms per channel. */            \
  "sqadd      " #vB ".8h, v24.8h, v0.8h      \n" /* B = bias + Y         */  \
  "sqadd      " #vG ".8h, v25.8h, v0.8h      \n" /* G = bias + Y         */  \
  "sqadd      " #vR ".8h, v26.8h, v0.8h      \n" /* R = bias + Y         */  \
  "sqadd      " #vB ".8h, " #vB ".8h, v3.8h  \n" /* B += U term          */  \
  "sqsub      " #vG ".8h, " #vG ".8h, v6.8h  \n" /* G -= U and V terms   */  \
  "sqadd      " #vR ".8h, " #vR ".8h, v7.8h  \n" /* R += V term          */  \
  /* Drop the 6 fractional bits and clamp to 0..255. */                      \
  "sqshrun    " #vB ".8b, " #vB ".8h, #6     \n" /* B -> 8 bit           */  \
  "sqshrun    " #vG ".8b, " #vG ".8h, #6     \n" /* G -> 8 bit           */  \
  "sqshrun    " #vR ".8b, " #vR ".8h, #6     \n" /* R -> 8 bit           */
106 
107 // clang-format on
108 
I444ToARGBRow_NEON(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)109 void I444ToARGBRow_NEON(const uint8_t* src_y,
110                         const uint8_t* src_u,
111                         const uint8_t* src_v,
112                         uint8_t* dst_argb,
113                         const struct YuvConstants* yuvconstants,
114                         int width) {
115   asm volatile (
116     YUVTORGB_SETUP
117       "movi        v23.8b, #255                  \n" /* A */
118       "1:                                        \n"
119     READYUV444
120       "prfm        pldl1keep, [%0, 448]          \n"
121     YUVTORGB(v22, v21, v20)
122       "prfm        pldl1keep, [%1, 448]          \n"
123       "prfm        pldl1keep, [%2, 448]          \n"
124       "subs        %w4, %w4, #8                  \n"
125       "st4         {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
126       "b.gt        1b                            \n"
127     : "+r"(src_y),     // %0
128       "+r"(src_u),     // %1
129       "+r"(src_v),     // %2
130       "+r"(dst_argb),  // %3
131       "+r"(width)      // %4
132     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
133       [kUVToG]"r"(&yuvconstants->kUVToG),
134       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
135       [kYToRgb]"r"(&yuvconstants->kYToRgb)
136     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
137       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
138   );
139 }
140 
I422ToARGBRow_NEON(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)141 void I422ToARGBRow_NEON(const uint8_t* src_y,
142                         const uint8_t* src_u,
143                         const uint8_t* src_v,
144                         uint8_t* dst_argb,
145                         const struct YuvConstants* yuvconstants,
146                         int width) {
147   asm volatile (
148     YUVTORGB_SETUP
149       "movi        v23.8b, #255                  \n" /* A */
150 
151       "1:                                        \n"
152     READYUV422
153       "prfm        pldl1keep, [%0, 448]          \n"
154     YUVTORGB(v22, v21, v20)
155       "prfm        pldl1keep, [%1, 128]          \n"
156       "prfm        pldl1keep, [%2, 128]          \n"
157       "subs        %w4, %w4, #8                  \n"
158       "st4         {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
159       "b.gt        1b                            \n"
160     : "+r"(src_y),     // %0
161       "+r"(src_u),     // %1
162       "+r"(src_v),     // %2
163       "+r"(dst_argb),  // %3
164       "+r"(width)      // %4
165     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
166       [kUVToG]"r"(&yuvconstants->kUVToG),
167       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
168       [kYToRgb]"r"(&yuvconstants->kYToRgb)
169     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
170       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
171   );
172 }
173 
I422AlphaToARGBRow_NEON(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,const uint8_t * src_a,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)174 void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
175                              const uint8_t* src_u,
176                              const uint8_t* src_v,
177                              const uint8_t* src_a,
178                              uint8_t* dst_argb,
179                              const struct YuvConstants* yuvconstants,
180                              int width) {
181   asm volatile (
182     YUVTORGB_SETUP
183       "1:                                        \n"
184     READYUV422
185       "prfm        pldl1keep, [%0, 448]          \n"
186     YUVTORGB(v22, v21, v20)
187       "ld1         {v23.8b}, [%3], #8            \n"
188       "prfm        pldl1keep, [%1, 128]          \n"
189       "prfm        pldl1keep, [%2, 128]          \n"
190       "prfm        pldl1keep, [%3, 448]          \n"
191       "subs        %w5, %w5, #8                  \n"
192       "st4         {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n"
193       "b.gt        1b                            \n"
194     : "+r"(src_y),     // %0
195       "+r"(src_u),     // %1
196       "+r"(src_v),     // %2
197       "+r"(src_a),     // %3
198       "+r"(dst_argb),  // %4
199       "+r"(width)      // %5
200     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
201       [kUVToG]"r"(&yuvconstants->kUVToG),
202       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
203       [kYToRgb]"r"(&yuvconstants->kYToRgb)
204     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
205       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
206   );
207 }
208 
I422ToRGBARow_NEON(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_rgba,const struct YuvConstants * yuvconstants,int width)209 void I422ToRGBARow_NEON(const uint8_t* src_y,
210                         const uint8_t* src_u,
211                         const uint8_t* src_v,
212                         uint8_t* dst_rgba,
213                         const struct YuvConstants* yuvconstants,
214                         int width) {
215   asm volatile (
216     YUVTORGB_SETUP
217       "movi        v20.8b, #255                  \n" /* A */
218       "1:                                        \n"
219     READYUV422
220       "prfm        pldl1keep, [%0, 448]          \n"
221     YUVTORGB(v23, v22, v21)
222       "prfm        pldl1keep, [%1, 128]          \n"
223       "prfm        pldl1keep, [%2, 128]          \n"
224       "subs        %w4, %w4, #8                  \n"
225       "st4         {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
226       "b.gt        1b                            \n"
227     : "+r"(src_y),     // %0
228       "+r"(src_u),     // %1
229       "+r"(src_v),     // %2
230       "+r"(dst_rgba),  // %3
231       "+r"(width)      // %4
232     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
233       [kUVToG]"r"(&yuvconstants->kUVToG),
234       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
235       [kYToRgb]"r"(&yuvconstants->kYToRgb)
236     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
237       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
238   );
239 }
240 
I422ToRGB24Row_NEON(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_rgb24,const struct YuvConstants * yuvconstants,int width)241 void I422ToRGB24Row_NEON(const uint8_t* src_y,
242                          const uint8_t* src_u,
243                          const uint8_t* src_v,
244                          uint8_t* dst_rgb24,
245                          const struct YuvConstants* yuvconstants,
246                          int width) {
247   asm volatile (
248     YUVTORGB_SETUP
249       "1:                                        \n"
250     READYUV422
251       "prfm        pldl1keep, [%0, 448]          \n"
252     YUVTORGB(v22, v21, v20)
253       "prfm        pldl1keep, [%1, 128]          \n"
254       "prfm        pldl1keep, [%2, 128]          \n"
255       "subs        %w4, %w4, #8                  \n"
256       "st3         {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
257       "b.gt        1b                            \n"
258     : "+r"(src_y),     // %0
259       "+r"(src_u),     // %1
260       "+r"(src_v),     // %2
261       "+r"(dst_rgb24), // %3
262       "+r"(width)      // %4
263     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
264       [kUVToG]"r"(&yuvconstants->kUVToG),
265       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
266       [kYToRgb]"r"(&yuvconstants->kYToRgb)
267     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
268       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
269   );
270 }
271 
// Packs 8 pixels into RGB565 (16 bits per pixel).
//   In : v20.8b = B, v21.8b = G, v22.8b = R.
//   Out: v0.8h, with R in bits 15-11, G in bits 10-5 and B in bits 4-0.
//   Overwrites v20 and v21 (widened to 16 bits) as a side effect.
// Each channel is moved to the top byte of a 16-bit lane; "sri" then shifts
// the next channel right and inserts it below the bits already in place.
#define ARGBTORGB565                                                        \
  "shll       v0.8h,  v22.8b, #8             \n" /* R << 8               */ \
  "shll       v21.8h, v21.8b, #8             \n" /* G << 8               */ \
  "shll       v20.8h, v20.8b, #8             \n" /* B << 8               */ \
  "sri        v0.8h,  v21.8h, #5             \n" /* keep 5 R bits, add G */ \
  "sri        v0.8h,  v20.8h, #11            \n" /* keep 11 bits, add B  */
278 
279 // clang-format off
280 
I422ToRGB565Row_NEON(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_rgb565,const struct YuvConstants * yuvconstants,int width)281 void I422ToRGB565Row_NEON(const uint8_t* src_y,
282                           const uint8_t* src_u,
283                           const uint8_t* src_v,
284                           uint8_t* dst_rgb565,
285                           const struct YuvConstants* yuvconstants,
286                           int width) {
287   asm volatile(
288     YUVTORGB_SETUP
289       "1:                                        \n"
290     READYUV422
291     YUVTORGB(v22, v21, v20)
292       "prfm        pldl1keep, [%0, 448]          \n"
293       "subs        %w4, %w4, #8                  \n"
294     ARGBTORGB565
295       "prfm        pldl1keep, [%1, 128]          \n"
296       "prfm        pldl1keep, [%2, 128]          \n"
297       "st1         {v0.8h}, [%3], #16            \n"  // store 8 pixels RGB565.
298       "b.gt        1b                            \n"
299       : "+r"(src_y),       // %0
300         "+r"(src_u),       // %1
301         "+r"(src_v),       // %2
302         "+r"(dst_rgb565),  // %3
303         "+r"(width)        // %4
304       : [kUVToRB] "r"(&yuvconstants->kUVToRB),
305         [kUVToG] "r"(&yuvconstants->kUVToG),
306         [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
307         [kYToRgb] "r"(&yuvconstants->kYToRgb)
308       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
309         "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
310 }
311 
// Packs 8 pixels into ARGB1555 (16 bits per pixel).
//   In : v20.8b = B, v21.8b = G, v22.8b = R, v23.8b = A.
//   Out: v0.8h, with A in bit 15, R in bits 14-10, G in bits 9-5 and B in
//        bits 4-0.
//   Overwrites v20, v21 and v22 (widened to 16 bits); v23 is only read.
// Each channel is moved to the top byte of a 16-bit lane; "sri" then shifts
// the next channel right and inserts it below the bits already in place.
#define ARGBTOARGB1555                                                      \
  "shll       v0.8h,  v23.8b, #8             \n" /* A << 8               */ \
  "shll       v22.8h, v22.8b, #8             \n" /* R << 8               */ \
  "shll       v21.8h, v21.8b, #8             \n" /* G << 8               */ \
  "shll       v20.8h, v20.8b, #8             \n" /* B << 8               */ \
  "sri        v0.8h,  v22.8h, #1             \n" /* keep 1 A bit, add R  */ \
  "sri        v0.8h,  v21.8h, #6             \n" /* keep 6 bits, add G   */ \
  "sri        v0.8h,  v20.8h, #11            \n" /* keep 11 bits, add B  */
320 
I422ToARGB1555Row_NEON(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_argb1555,const struct YuvConstants * yuvconstants,int width)321 void I422ToARGB1555Row_NEON(const uint8_t* src_y,
322                             const uint8_t* src_u,
323                             const uint8_t* src_v,
324                             uint8_t* dst_argb1555,
325                             const struct YuvConstants* yuvconstants,
326                             int width) {
327   asm volatile(
328     YUVTORGB_SETUP
329       "movi        v23.8b, #255                  \n"
330       "1:                                        \n"
331     READYUV422
332     YUVTORGB(v22, v21, v20)
333       "prfm        pldl1keep, [%0, 448]          \n"
334       "subs        %w4, %w4, #8                  \n"
335     ARGBTOARGB1555
336       "prfm        pldl1keep, [%1, 128]          \n"
337       "prfm        pldl1keep, [%2, 128]          \n"
338       "st1         {v0.8h}, [%3], #16            \n"  // store 8 pixels RGB565.
339       "b.gt        1b                            \n"
340       : "+r"(src_y),         // %0
341         "+r"(src_u),         // %1
342         "+r"(src_v),         // %2
343         "+r"(dst_argb1555),  // %3
344         "+r"(width)          // %4
345       : [kUVToRB] "r"(&yuvconstants->kUVToRB),
346         [kUVToG] "r"(&yuvconstants->kUVToG),
347         [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
348         [kYToRgb] "r"(&yuvconstants->kYToRgb)
349       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
350         "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
351 }
352 // clang-format on
353 
// Packs 8 pixels into ARGB4444 (16 bits per pixel).
//   In : v20.8b = B, v21.8b = G, v22.8b = R, v23.8b = A,
//        v4.8b = 0x0f in every byte (mask of the bits to clear).
//   Out: v0.16b = 8 pixels; per pixel the first byte holds G in the high
//        nibble and B in the low nibble, the second byte A high / R low.
//   Overwrites v1 and v20-v23, so the caller must reload alpha every pass.
#define ARGBTOARGB4444                                                       \
  "ushr       v20.8b, v20.8b, #4             \n" /* B -> low nibble      */  \
  "bic        v21.8b, v21.8b, v4.8b          \n" /* G keeps high nibble  */  \
  "ushr       v22.8b, v22.8b, #4             \n" /* R -> low nibble      */  \
  "bic        v23.8b, v23.8b, v4.8b          \n" /* A keeps high nibble  */  \
  "orr        v0.8b,  v20.8b, v21.8b         \n" /* GB byte              */  \
  "orr        v1.8b,  v22.8b, v23.8b         \n" /* AR byte              */  \
  "zip1       v0.16b, v0.16b, v1.16b         \n" /* interleave GB, AR    */
363 
I422ToARGB4444Row_NEON(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_argb4444,const struct YuvConstants * yuvconstants,int width)364 void I422ToARGB4444Row_NEON(const uint8_t* src_y,
365                             const uint8_t* src_u,
366                             const uint8_t* src_v,
367                             uint8_t* dst_argb4444,
368                             const struct YuvConstants* yuvconstants,
369                             int width) {
370   asm volatile (
371     YUVTORGB_SETUP
372       "movi        v4.16b, #0x0f                 \n"  // bits to clear with vbic.
373       "1:                                        \n"
374     READYUV422
375     YUVTORGB(v22, v21, v20)
376       "prfm        pldl1keep, [%0, 448]          \n"
377       "subs        %w4, %w4, #8                  \n"
378       "movi        v23.8b, #255                  \n"
379     ARGBTOARGB4444
380       "prfm        pldl1keep, [%1, 128]          \n"
381       "prfm        pldl1keep, [%2, 128]          \n"
382       "st1         {v0.8h}, [%3], #16            \n"  // store 8 pixels ARGB4444.
383       "b.gt        1b                            \n"
384     : "+r"(src_y),    // %0
385       "+r"(src_u),    // %1
386       "+r"(src_v),    // %2
387       "+r"(dst_argb4444),  // %3
388       "+r"(width)     // %4
389     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
390       [kUVToG]"r"(&yuvconstants->kUVToG),
391       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
392       [kYToRgb]"r"(&yuvconstants->kYToRgb)
393     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
394       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
395   );
396 }
397 
I400ToARGBRow_NEON(const uint8_t * src_y,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)398 void I400ToARGBRow_NEON(const uint8_t* src_y,
399                         uint8_t* dst_argb,
400                         const struct YuvConstants* yuvconstants,
401                         int width) {
402   asm volatile (
403     YUVTORGB_SETUP
404       "movi        v23.8b, #255                  \n"
405       "1:                                        \n"
406     READYUV400
407     YUVTORGB(v22, v21, v20)
408       "prfm        pldl1keep, [%0, 448]          \n"
409       "subs        %w2, %w2, #8                  \n"
410       "st4         {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
411       "b.gt        1b                            \n"
412     : "+r"(src_y),     // %0
413       "+r"(dst_argb),  // %1
414       "+r"(width)      // %2
415     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
416       [kUVToG]"r"(&yuvconstants->kUVToG),
417       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
418       [kYToRgb]"r"(&yuvconstants->kYToRgb)
419     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
420       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
421   );
422 }
423 
J400ToARGBRow_NEON(const uint8_t * src_y,uint8_t * dst_argb,int width)424 void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
425   asm volatile(
426       "movi        v23.8b, #255                  \n"
427       "1:                                        \n"
428       "ld1         {v20.8b}, [%0], #8            \n"
429       "prfm        pldl1keep, [%0, 448]          \n"
430       "orr         v21.8b, v20.8b, v20.8b        \n"
431       "orr         v22.8b, v20.8b, v20.8b        \n"
432       "subs        %w2, %w2, #8                  \n"
433       "st4         {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
434       "b.gt        1b                            \n"
435       : "+r"(src_y),     // %0
436         "+r"(dst_argb),  // %1
437         "+r"(width)      // %2
438       :
439       : "cc", "memory", "v20", "v21", "v22", "v23");
440 }
441 
NV12ToARGBRow_NEON(const uint8_t * src_y,const uint8_t * src_uv,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)442 void NV12ToARGBRow_NEON(const uint8_t* src_y,
443                         const uint8_t* src_uv,
444                         uint8_t* dst_argb,
445                         const struct YuvConstants* yuvconstants,
446                         int width) {
447   asm volatile (
448     YUVTORGB_SETUP
449       "movi        v23.8b, #255                  \n"
450       "1:                                        \n"
451     READNV12
452       "prfm        pldl1keep, [%0, 448]          \n"
453     YUVTORGB(v22, v21, v20)
454       "prfm        pldl1keep, [%1, 256]          \n"
455       "subs        %w3, %w3, #8                  \n"
456       "st4         {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
457       "b.gt        1b                            \n"
458     : "+r"(src_y),     // %0
459       "+r"(src_uv),    // %1
460       "+r"(dst_argb),  // %2
461       "+r"(width)      // %3
462     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
463       [kUVToG]"r"(&yuvconstants->kUVToG),
464       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
465       [kYToRgb]"r"(&yuvconstants->kYToRgb)
466     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
467       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
468   );
469 }
470 
NV21ToARGBRow_NEON(const uint8_t * src_y,const uint8_t * src_vu,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)471 void NV21ToARGBRow_NEON(const uint8_t* src_y,
472                         const uint8_t* src_vu,
473                         uint8_t* dst_argb,
474                         const struct YuvConstants* yuvconstants,
475                         int width) {
476   asm volatile (
477     YUVTORGB_SETUP
478       "movi        v23.8b, #255                  \n"
479       "1:                                        \n"
480     READNV21
481       "prfm        pldl1keep, [%0, 448]          \n"
482     YUVTORGB(v22, v21, v20)
483       "prfm        pldl1keep, [%1, 256]          \n"
484       "subs        %w3, %w3, #8                  \n"
485       "st4         {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
486       "b.gt        1b                            \n"
487     : "+r"(src_y),     // %0
488       "+r"(src_vu),    // %1
489       "+r"(dst_argb),  // %2
490       "+r"(width)      // %3
491     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
492       [kUVToG]"r"(&yuvconstants->kUVToG),
493       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
494       [kYToRgb]"r"(&yuvconstants->kYToRgb)
495     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
496       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
497   );
498 }
499 
NV12ToRGB24Row_NEON(const uint8_t * src_y,const uint8_t * src_uv,uint8_t * dst_rgb24,const struct YuvConstants * yuvconstants,int width)500 void NV12ToRGB24Row_NEON(const uint8_t* src_y,
501                          const uint8_t* src_uv,
502                          uint8_t* dst_rgb24,
503                          const struct YuvConstants* yuvconstants,
504                          int width) {
505   asm volatile (
506     YUVTORGB_SETUP
507       "1:                                        \n"
508     READNV12
509       "prfm        pldl1keep, [%0, 448]          \n"
510     YUVTORGB(v22, v21, v20)
511       "prfm        pldl1keep, [%1, 256]          \n"
512       "subs        %w3, %w3, #8                  \n"
513       "st3         {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
514       "b.gt        1b                            \n"
515     : "+r"(src_y),     // %0
516       "+r"(src_uv),    // %1
517       "+r"(dst_rgb24),  // %2
518       "+r"(width)      // %3
519     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
520       [kUVToG]"r"(&yuvconstants->kUVToG),
521       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
522       [kYToRgb]"r"(&yuvconstants->kYToRgb)
523     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
524       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
525   );
526 }
527 
NV21ToRGB24Row_NEON(const uint8_t * src_y,const uint8_t * src_vu,uint8_t * dst_rgb24,const struct YuvConstants * yuvconstants,int width)528 void NV21ToRGB24Row_NEON(const uint8_t* src_y,
529                          const uint8_t* src_vu,
530                          uint8_t* dst_rgb24,
531                          const struct YuvConstants* yuvconstants,
532                          int width) {
533   asm volatile (
534     YUVTORGB_SETUP
535       "1:                                        \n"
536     READNV21
537       "prfm        pldl1keep, [%0, 448]          \n"
538     YUVTORGB(v22, v21, v20)
539       "prfm        pldl1keep, [%1, 256]          \n"
540       "subs        %w3, %w3, #8                  \n"
541       "st3         {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
542       "b.gt        1b                            \n"
543     : "+r"(src_y),     // %0
544       "+r"(src_vu),    // %1
545       "+r"(dst_rgb24),  // %2
546       "+r"(width)      // %3
547     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
548       [kUVToG]"r"(&yuvconstants->kUVToG),
549       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
550       [kYToRgb]"r"(&yuvconstants->kYToRgb)
551     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
552       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
553   );
554 }
555 
NV12ToRGB565Row_NEON(const uint8_t * src_y,const uint8_t * src_uv,uint8_t * dst_rgb565,const struct YuvConstants * yuvconstants,int width)556 void NV12ToRGB565Row_NEON(const uint8_t* src_y,
557                           const uint8_t* src_uv,
558                           uint8_t* dst_rgb565,
559                           const struct YuvConstants* yuvconstants,
560                           int width) {
561   asm volatile(
562       YUVTORGB_SETUP "1:                                        \n" READNV12
563                      "prfm        pldl1keep, [%0, 448]          \n" YUVTORGB(
564                          v22, v21, v20) ARGBTORGB565
565       "prfm        pldl1keep, [%1, 256]          \n"
566       "subs        %w3, %w3, #8                  \n"
567       "st1         {v0.8h}, [%2], 16             \n"  // store 8 pixels
568       "b.gt        1b                            \n"
569       : "+r"(src_y),       // %0
570         "+r"(src_uv),      // %1
571         "+r"(dst_rgb565),  // %2
572         "+r"(width)        // %3
573       : [kUVToRB] "r"(&yuvconstants->kUVToRB),
574         [kUVToG] "r"(&yuvconstants->kUVToG),
575         [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
576         [kYToRgb] "r"(&yuvconstants->kYToRgb)
577       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
578         "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
579 }
580 
YUY2ToARGBRow_NEON(const uint8_t * src_yuy2,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)581 void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
582                         uint8_t* dst_argb,
583                         const struct YuvConstants* yuvconstants,
584                         int width) {
585   asm volatile (
586     YUVTORGB_SETUP
587       "movi        v23.8b, #255                  \n"
588       "1:                                        \n"
589     READYUY2
590       "prfm        pldl1keep, [%0, 448]          \n"
591     YUVTORGB(v22, v21, v20)
592       "subs        %w2, %w2, #8                  \n"
593       "st4         {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
594       "b.gt        1b                            \n"
595     : "+r"(src_yuy2),  // %0
596       "+r"(dst_argb),  // %1
597       "+r"(width)      // %2
598     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
599       [kUVToG]"r"(&yuvconstants->kUVToG),
600       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
601       [kYToRgb]"r"(&yuvconstants->kYToRgb)
602     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
603       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
604   );
605 }
606 
UYVYToARGBRow_NEON(const uint8_t * src_uyvy,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)607 void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
608                         uint8_t* dst_argb,
609                         const struct YuvConstants* yuvconstants,
610                         int width) {
611   asm volatile (
612     YUVTORGB_SETUP
613       "movi        v23.8b, #255                  \n"
614       "1:                                        \n"
615     READUYVY
616     YUVTORGB(v22, v21, v20)
617       "prfm        pldl1keep, [%0, 448]          \n"
618       "subs        %w2, %w2, #8                  \n"
619       "st4         {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"
620       "b.gt        1b                            \n"
621     : "+r"(src_uyvy),  // %0
622       "+r"(dst_argb),  // %1
623       "+r"(width)      // %2
624     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
625       [kUVToG]"r"(&yuvconstants->kUVToG),
626       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
627       [kYToRgb]"r"(&yuvconstants->kYToRgb)
628     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
629       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
630   );
631 }
632 
// Reads 16 pairs of UV and writes even values to dst_u and odd to dst_v.
// Each iteration consumes 32 source bytes and produces 16 bytes per plane.
// The loop repeats while the decremented width is still > 0, so the row is
// always processed in whole 16-pair steps.
void SplitUVRow_NEON(const uint8_t* src_uv,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  asm volatile(
      "1:                                        \n"
      // ld2 de-interleaves: even bytes land in v0, odd bytes in v1.
      "ld2         {v0.16b,v1.16b}, [%0], #32    \n"  // load 16 pairs of UV
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch ahead of src
      "subs        %w3, %w3, #16                 \n"  // 16 processed per loop
      "st1         {v0.16b}, [%1], #16           \n"  // store U
      "st1         {v1.16b}, [%2], #16           \n"  // store V
      "b.gt        1b                            \n"
      : "+r"(src_uv),               // %0
        "+r"(dst_u),                // %1
        "+r"(dst_v),                // %2
        "+r"(width)                 // %3  // Output registers
      :                             // Input registers
      : "cc", "memory", "v0", "v1"  // Clobber List
  );
}
654 
// Reads 16 U's and V's and writes out 16 pairs of UV.
// Each iteration consumes 16 bytes from each source plane and writes 32
// interleaved bytes. The loop repeats while the decremented width is > 0, so
// the row is always processed in whole 16-pair steps.
void MergeUVRow_NEON(const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* dst_uv,
                     int width) {
  asm volatile(
      "1:                                        \n"
      "ld1         {v0.16b}, [%0], #16           \n"  // load U
      "ld1         {v1.16b}, [%1], #16           \n"  // load V
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch ahead of U
      "prfm        pldl1keep, [%1, 448]          \n"  // prefetch ahead of V
      "subs        %w3, %w3, #16                 \n"  // 16 processed per loop
      // st2 interleaves v0 and v1 byte by byte: U0 V0 U1 V1 ...
      "st2         {v0.16b,v1.16b}, [%2], #32    \n"  // store 16 pairs of UV
      "b.gt        1b                            \n"
      : "+r"(src_u),                // %0
        "+r"(src_v),                // %1
        "+r"(dst_uv),               // %2
        "+r"(width)                 // %3  // Output registers
      :                             // Input registers
      : "cc", "memory", "v0", "v1"  // Clobber List
  );
}
677 
// Reads 16 packed RGB pixels and writes them to planar dst_r, dst_g, dst_b.
// Each iteration consumes 48 source bytes and produces 16 bytes per plane.
// The loop repeats while the decremented width is > 0, so the row is always
// processed in whole 16-pixel steps.
void SplitRGBRow_NEON(const uint8_t* src_rgb,
                      uint8_t* dst_r,
                      uint8_t* dst_g,
                      uint8_t* dst_b,
                      int width) {
  asm volatile(
      "1:                                        \n"
      // ld3 de-interleaves: byte 0 of each pixel to v0, byte 1 to v1, byte 2
      // to v2.
      "ld3         {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 RGB
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch ahead of src
      "subs        %w4, %w4, #16                 \n"  // 16 processed per loop
      "st1         {v0.16b}, [%1], #16           \n"  // store R
      "st1         {v1.16b}, [%2], #16           \n"  // store G
      "st1         {v2.16b}, [%3], #16           \n"  // store B
      "b.gt        1b                            \n"
      : "+r"(src_rgb),                    // %0
        "+r"(dst_r),                      // %1
        "+r"(dst_g),                      // %2
        "+r"(dst_b),                      // %3
        "+r"(width)                       // %4
      :                                   // Input registers
      : "cc", "memory", "v0", "v1", "v2"  // Clobber List
  );
}
702 
703 // Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time
MergeRGBRow_NEON(const uint8_t * src_r,const uint8_t * src_g,const uint8_t * src_b,uint8_t * dst_rgb,int width)704 void MergeRGBRow_NEON(const uint8_t* src_r,
705                       const uint8_t* src_g,
706                       const uint8_t* src_b,
707                       uint8_t* dst_rgb,
708                       int width) {
709   asm volatile(
710       "1:                                        \n"
711       "ld1         {v0.16b}, [%0], #16           \n"  // load R
712       "ld1         {v1.16b}, [%1], #16           \n"  // load G
713       "ld1         {v2.16b}, [%2], #16           \n"  // load B
714       "prfm        pldl1keep, [%0, 448]          \n"
715       "prfm        pldl1keep, [%1, 448]          \n"
716       "prfm        pldl1keep, [%2, 448]          \n"
717       "subs        %w4, %w4, #16                 \n"  // 16 processed per loop
718       "st3         {v0.16b,v1.16b,v2.16b}, [%3], #48 \n"  // store 16 RGB
719       "prfm        pldl1keep, [%0, 448]          \n"
720       "b.gt        1b                            \n"
721       : "+r"(src_r),                      // %0
722         "+r"(src_g),                      // %1
723         "+r"(src_b),                      // %2
724         "+r"(dst_rgb),                    // %3
725         "+r"(width)                       // %4
726       :                                   // Input registers
727       : "cc", "memory", "v0", "v1", "v2"  // Clobber List
728   );
729 }
730 
// Copies bytes from src to dst in 32-byte steps (one q-register pair per
// iteration). The loop repeats while the decremented width is > 0, so the
// number of bytes copied is always a multiple of 32.
void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "1:                                        \n"
      "ldp         q0, q1, [%0], #32             \n"  // load 32 bytes
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch ahead of src
      "subs        %w2, %w2, #32                 \n"  // 32 processed per loop
      "stp         q0, q1, [%1], #32             \n"  // store 32 bytes
      "b.gt        1b                            \n"
      : "+r"(src),                  // %0
        "+r"(dst),                  // %1
        "+r"(width)                 // %2  // Output registers
      :                             // Input registers
      : "cc", "memory", "v0", "v1"  // Clobber List
  );
}
747 
// SetRow fills dst with the 8 bit value v8 repeated.
// Stores are 16 bytes wide and the loop repeats while the decremented width
// is > 0, so the number of bytes written is width rounded up to a multiple
// of 16 (and at least 16, because the store runs before the loop test).
void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
  asm volatile(
      "dup         v0.16b, %w2                   \n"  // duplicate 16 bytes
      "1:                                        \n"
      "subs        %w1, %w1, #16                 \n"  // 16 bytes per loop
      "st1         {v0.16b}, [%0], #16           \n"  // store
      "b.gt        1b                            \n"
      : "+r"(dst),   // %0
        "+r"(width)  // %1
      : "r"(v8)      // %2
      : "cc", "memory", "v0");
}
761 
// Fills dst with the 32 bit value v32 repeated; width counts 32 bit elements.
// Each store writes 4 elements (16 bytes) and the loop repeats while the
// decremented width is > 0, so the element count written is width rounded up
// to a multiple of 4 (and at least 4, because the store runs before the loop
// test).
void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
  asm volatile(
      "dup         v0.4s, %w2                    \n"  // duplicate 4 ints
      "1:                                        \n"
      "subs        %w1, %w1, #4                  \n"  // 4 ints per loop
      "st1         {v0.16b}, [%0], #16           \n"  // store
      "b.gt        1b                            \n"
      : "+r"(dst),   // %0
        "+r"(width)  // %1
      : "r"(v32)     // %2
      : "cc", "memory", "v0");
}
774 
// Shuffle table for reversing the bytes.
// Used as the index vector of a tbl instruction: output byte i takes source
// byte 15 - i, i.e. a full 16-byte reversal.
static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
                                     7u,  6u,  5u,  4u,  3u,  2u,  1u, 0u};
778 
// Writes the bytes of src to dst in reverse order, 32 bytes per iteration.
// src is first moved to the last 32 bytes of the row and then walks
// backwards while dst advances forwards. The loop repeats while the
// decremented width is > 0, so the row is processed in whole 32-byte steps.
void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      // Start at end of source row.
      "ld1         {v3.16b}, [%3]                \n"  // shuffler
      "add         %0, %0, %w2, sxtw             \n"  // src += width
      "sub         %0, %0, #32                   \n"  // back up one 32B chunk
      "1:                                        \n"
      "ldr         q2, [%0, 16]                  \n"  // upper 16 bytes
      "ldr         q1, [%0], -32                 \n"  // src -= 32
      "subs        %w2, %w2, #32                 \n"  // 32 pixels per loop.
      // Reverse each half; the upper half is emitted first.
      "tbl         v0.16b, {v2.16b}, v3.16b      \n"
      "tbl         v1.16b, {v1.16b}, v3.16b      \n"
      "st1         {v0.16b, v1.16b}, [%1], #32   \n"  // store 32 pixels
      "b.gt        1b                            \n"
      : "+r"(src),            // %0
        "+r"(dst),            // %1
        "+r"(width)           // %2
      : "r"(&kShuffleMirror)  // %3
      : "cc", "memory", "v0", "v1", "v2", "v3");
}
799 
// Shuffle table for reversing the UV.
// Used as the index vector of a tbl instruction: the eight 2-byte pairs are
// emitted in reverse order while the two bytes inside each pair keep their
// order.
static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
                                       6u,  7u,  4u,  5u,  2u,  3u,  0u, 1u};
803 
// Writes the 2-byte UV pairs of src_uv to dst_uv in reverse pair order,
// 16 pairs (32 bytes) per iteration; width counts pairs. src_uv is first
// moved to the last 32 bytes of the row and then walks backwards while
// dst_uv advances forwards. The loop repeats while the decremented width is
// > 0, so the row is processed in whole 16-pair steps.
void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
  asm volatile(
      // Start at end of source row.
      "ld1         {v4.16b}, [%3]                \n"  // shuffler
      "add         %0, %0, %w2, sxtw #1          \n"  // src += width * 2
      "sub         %0, %0, #32                   \n"  // back up one 32B chunk
      "1:                                        \n"
      "ldr         q1, [%0, 16]                  \n"  // upper 16 bytes
      "ldr         q0, [%0], -32                 \n"  // src -= 32
      "subs        %w2, %w2, #16                 \n"  // 16 pixels per loop.
      // Reverse the pairs of each half; the upper half is emitted first.
      "tbl         v2.16b, {v1.16b}, v4.16b      \n"
      "tbl         v3.16b, {v0.16b}, v4.16b      \n"
      "st1         {v2.16b, v3.16b}, [%1], #32   \n"  // dst += 32
      "b.gt        1b                            \n"
      : "+r"(src_uv),           // %0
        "+r"(dst_uv),           // %1
        "+r"(width)             // %2
      : "r"(&kShuffleMirrorUV)  // %3
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
}
824 
// Reverses the 2-byte UV pairs of src_uv and splits them into planes: the
// first byte of each pair goes to dst_u and the second to dst_v. Handles
// 16 pairs (32 source bytes, 16 bytes per plane) per iteration; width counts
// pairs. The loop repeats while the decremented width is > 0, so the row is
// processed in whole 16-pair steps.
void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
                           uint8_t* dst_u,
                           uint8_t* dst_v,
                           int width) {
  asm volatile(
      // Start at end of source row.
      "ld1         {v4.16b}, [%4]                \n"  // shuffler
      "add         %0, %0, %w3, sxtw #1          \n"  // src += width * 2
      "sub         %0, %0, #32                   \n"  // back up one 32B chunk
      "1:                                        \n"
      "ldr         q1, [%0, 16]                  \n"  // upper 16 bytes
      "ldr         q0, [%0], -32                 \n"  // src -= 32
      "subs        %w3, %w3, #16                 \n"  // 16 pixels per loop.
      // Reverse the pairs of each half; v2 holds the mirrored upper half.
      "tbl         v2.16b, {v1.16b}, v4.16b      \n"
      "tbl         v3.16b, {v0.16b}, v4.16b      \n"
      // De-interleave: even bytes of v2:v3 are U, odd bytes are V.
      "uzp1        v0.16b, v2.16b, v3.16b        \n"  // U
      "uzp2        v1.16b, v2.16b, v3.16b        \n"  // V
      "st1         {v0.16b}, [%1], #16           \n"  // dst += 16
      "st1         {v1.16b}, [%2], #16           \n"
      "b.gt        1b                            \n"
      : "+r"(src_uv),           // %0
        "+r"(dst_u),            // %1
        "+r"(dst_v),            // %2
        "+r"(width)             // %3
      : "r"(&kShuffleMirrorUV)  // %4
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
}
852 
// Shuffle table for reversing the ARGB.
// Used as the index vector of a tbl instruction: the four 4-byte groups are
// emitted in reverse order while the bytes inside each group keep their
// order.
static const uvec8 kShuffleMirrorARGB = {12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u,
                                         4u,  5u,  6u,  7u,  0u, 1u, 2u,  3u};
856 
// Writes the 4-byte pixels of src_argb to dst_argb in reverse pixel order,
// 8 pixels (32 bytes) per iteration; width counts pixels. src_argb is first
// moved to the last 32 bytes of the row and then walks backwards while
// dst_argb advances forwards. The loop repeats while the decremented width
// is > 0, so the row is processed in whole 8-pixel steps.
void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  asm volatile(
      // Start at end of source row.
      "ld1         {v4.16b}, [%3]                \n"  // shuffler
      "add         %0, %0, %w2, sxtw #2          \n"  // src += width * 4
      "sub         %0, %0, #32                   \n"  // back up one 32B chunk
      "1:                                        \n"
      "ldr         q1, [%0, 16]                  \n"  // upper 16 bytes
      "ldr         q0, [%0], -32                 \n"  // src -= 32
      "subs        %w2, %w2, #8                  \n"  // 8 pixels per loop.
      // Reverse the pixels of each half; the upper half is emitted first.
      "tbl         v2.16b, {v1.16b}, v4.16b      \n"
      "tbl         v3.16b, {v0.16b}, v4.16b      \n"
      "st1         {v2.16b, v3.16b}, [%1], #32   \n"  // dst += 32
      "b.gt        1b                            \n"
      : "+r"(src_argb),           // %0
        "+r"(dst_argb),           // %1
        "+r"(width)               // %2
      : "r"(&kShuffleMirrorARGB)  // %3
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
}
877 
// Writes the 3-byte pixels of src_rgb24 to dst_rgb24 in reverse pixel order,
// 16 pixels (48 bytes) per iteration; width counts pixels. The source pointer
// starts at the last 48 bytes of the row and steps backwards through the
// register operand %3 (-48). The loop repeats while the decremented width is
// > 0, so the row is processed in whole 16-pixel steps.
void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
                         uint8_t* dst_rgb24,
                         int width) {
  asm volatile(
      "ld1         {v3.16b}, [%4]                \n"  // shuffler
      "add         %0, %0, %w2, sxtw #1          \n"  // Start at end of row.
      "add         %0, %0, %w2, sxtw             \n"  // total: src += width*3
      "sub         %0, %0, #48                   \n"  // back up one 48B chunk

      "1:                                        \n"
      // ld3 splits the 16 pixels into one register per channel byte, so a
      // plain byte reversal of each register mirrors the pixel order while
      // st3 re-interleaves the channels in their original positions.
      "ld3         {v0.16b, v1.16b, v2.16b}, [%0], %3 \n"  // src -= 48
      "subs        %w2, %w2, #16                 \n"  // 16 pixels per loop.
      "tbl         v0.16b, {v0.16b}, v3.16b      \n"
      "tbl         v1.16b, {v1.16b}, v3.16b      \n"
      "tbl         v2.16b, {v2.16b}, v3.16b      \n"
      "st3         {v0.16b, v1.16b, v2.16b}, [%1], #48 \n"  // dst += 48
      "b.gt        1b                            \n"
      : "+r"(src_rgb24),      // %0
        "+r"(dst_rgb24),      // %1
        "+r"(width)           // %2
      : "r"((ptrdiff_t)-48),  // %3
        "r"(&kShuffleMirror)  // %4
      : "cc", "memory", "v0", "v1", "v2", "v3");
}
902 
// Expands 3-byte RGB24 pixels to 4-byte pixels by appending a constant 255
// as the fourth byte; the three source bytes keep their order. Handles
// 8 pixels (24 bytes in, 32 bytes out) per iteration. The loop repeats while
// the decremented width is > 0, so the row is processed in whole 8-pixel
// steps.
void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
                         uint8_t* dst_argb,
                         int width) {
  asm volatile(
      "movi        v4.8b, #255                   \n"  // Alpha
      "1:                                        \n"
      "ld3         {v1.8b,v2.8b,v3.8b}, [%0], #24 \n"  // load 8 pixels of
                                                       // RGB24.
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch ahead of src
      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
      "st4         {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n"  // store 8 ARGB
      "b.gt        1b                            \n"
      : "+r"(src_rgb24),  // %0
        "+r"(dst_argb),   // %1
        "+r"(width)       // %2
      :
      : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
  );
}
922 
// Converts 3-byte RAW pixels (bytes r, g, b) to 4-byte pixels stored as
// b, g, r, 255. Handles 8 pixels (24 bytes in, 32 bytes out) per iteration.
// The loop repeats while the decremented width is > 0, so the row is
// processed in whole 8-pixel steps.
void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
  asm volatile(
      "movi        v5.8b, #255                   \n"  // Alpha
      "1:                                        \n"
      "ld3         {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch ahead of src
      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
      // st4 needs consecutive registers, so g and r are copied into v3/v4
      // to follow b in v2.
      "orr         v3.8b, v1.8b, v1.8b           \n"  // move g
      "orr         v4.8b, v0.8b, v0.8b           \n"  // move r
      "st4         {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n"  // store b g r a
      "b.gt        1b                            \n"
      : "+r"(src_raw),   // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"  // Clobber List
  );
}
941 
// Converts 3-byte RAW pixels (bytes r, g, b) to 4-byte pixels stored as
// 255, b, g, r. Handles 8 pixels (24 bytes in, 32 bytes out) per iteration.
// The loop repeats while the decremented width is > 0, so the row is
// processed in whole 8-pixel steps.
void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
  asm volatile(
      "movi        v0.8b, #255                   \n"  // Alpha
      "1:                                        \n"
      "ld3         {v3.8b,v4.8b,v5.8b}, [%0], #24 \n"  // read r g b
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch ahead of src
      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
      // st4 needs consecutive registers, so g and b are copied into v2/v1
      // to sit between alpha in v0 and r in v3.
      "orr         v2.8b, v4.8b, v4.8b           \n"  // move g
      "orr         v1.8b, v5.8b, v5.8b           \n"  // move b
      "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store a b g r
      "b.gt        1b                            \n"
      : "+r"(src_raw),   // %0
        "+r"(dst_rgba),  // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"  // Clobber List
  );
}
960 
// Converts 3-byte RAW pixels (bytes r, g, b) to 3-byte pixels stored as
// b, g, r, i.e. swaps the first and third byte of every pixel. Handles
// 8 pixels (24 bytes) per iteration. The loop repeats while the decremented
// width is > 0, so the row is processed in whole 8-pixel steps.
void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
  asm volatile(
      "1:                                        \n"
      "ld3         {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch ahead of src
      "subs        %w2, %w2, #8                  \n"   // 8 processed per loop.
      // st3 needs consecutive registers, so g and r are copied into v3/v4
      // to follow b in v2.
      "orr         v3.8b, v1.8b, v1.8b           \n"   // move g
      "orr         v4.8b, v0.8b, v0.8b           \n"   // move r
      "st3         {v2.8b,v3.8b,v4.8b}, [%1], #24 \n"  // store b g r
      "b.gt        1b                            \n"
      : "+r"(src_raw),    // %0
        "+r"(dst_rgb24),  // %1
        "+r"(width)       // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
  );
}
978 
// Expands 8 RGB565 pixels held in v0.8h to 8 bit channels:
// b = v0.8b, g = v1.8b, r = v2.8b. Each channel's top bits are replicated
// into its low bits to fill 8 bits. Uses v4 and v6 as scratch; v3 is not
// touched.
#define RGB565TOARGB                                                        \
  "shrn       v6.8b, v0.8h, #5               \n" /* G xxGGGGGG           */ \
  "shl        v6.8b, v6.8b, #2               \n" /* G GGGGGG00 upper 6   */ \
  "ushr       v4.8b, v6.8b, #6               \n" /* G 000000GG lower 2   */ \
  "orr        v1.8b, v4.8b, v6.8b            \n" /* G                    */ \
  "xtn        v2.8b, v0.8h                   \n" /* B xxxBBBBB           */ \
  "ushr       v0.8h, v0.8h, #11              \n" /* R 000RRRRR           */ \
  "xtn2       v2.16b,v0.8h                   \n" /* R in upper part      */ \
  "shl        v2.16b, v2.16b, #3             \n" /* R,B BBBBB000 upper 5 */ \
  "ushr       v0.16b, v2.16b, #5             \n" /* R,B 00000BBB lower 3 */ \
  "orr        v0.16b, v0.16b, v2.16b         \n" /* R,B                  */ \
  "dup        v2.2D, v0.D[1]                 \n" /* R                    */
991 
// Converts 2-byte RGB565 pixels to 4-byte pixels stored as b, g, r, 255.
// Handles 8 pixels (16 bytes in, 32 bytes out) per iteration. The loop
// repeats while the decremented width is > 0, so the row is processed in
// whole 8-pixel steps.
void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      // Alpha is set once; RGB565TOARGB does not write v3.
      "movi        v3.8b, #255                   \n"  // Alpha
      "1:                                        \n"
      "ld1         {v0.16b}, [%0], #16           \n"  // load 8 RGB565 pixels.
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch ahead of src
      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
      RGB565TOARGB
      "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB
      "b.gt        1b                            \n"
      : "+r"(src_rgb565),  // %0
        "+r"(dst_argb),    // %1
        "+r"(width)        // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6"  // Clobber List
  );
}
1011 
// Expands 8 ARGB1555 pixels held in v0.8h to 8 bit channels:
// b = v0.8b, g = v1.8b, r = v2.8b, a = v3.8b. The 5 bit colour channels have
// their top 3 bits replicated into the low bits; the 1 bit alpha is
// sign-extended so it becomes 0 or 255. All of v0..v3 are overwritten.
#define ARGB1555TOARGB                                                      \
  "ushr       v2.8h, v0.8h, #10              \n" /* R xxxRRRRR           */ \
  "shl        v2.8h, v2.8h, #3               \n" /* R RRRRR000 upper 5   */ \
  "xtn        v3.8b, v2.8h                   \n" /* RRRRR000 AAAAAAAA    */ \
                                                                            \
  "sshr       v2.8h, v0.8h, #15              \n" /* A AAAAAAAA           */ \
  "xtn2       v3.16b, v2.8h                  \n"                            \
                                                                            \
  "xtn        v2.8b, v0.8h                   \n" /* B xxxBBBBB           */ \
  "shrn2      v2.16b,v0.8h, #5               \n" /* G xxxGGGGG           */ \
                                                                            \
  "ushr       v1.16b, v3.16b, #5             \n" /* R,A 00000RRR lower 3 */ \
  "shl        v0.16b, v2.16b, #3             \n" /* B,G BBBBB000 upper 5 */ \
  "ushr       v2.16b, v0.16b, #5             \n" /* B,G 00000BBB lower 3 */ \
                                                                            \
  "orr        v0.16b, v0.16b, v2.16b         \n" /* B,G                  */ \
  "orr        v2.16b, v1.16b, v3.16b         \n" /* R,A                  */ \
  "dup        v1.2D, v0.D[1]                 \n"                            \
  "dup        v3.2D, v2.D[1]                 \n"
1031 
// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
// Expands 8 pixels held in v0.8h to b = v0.8b, g = v1.8b, r = v2.8b.
// v3 is overwritten with intermediate data and does not hold alpha
// afterwards.
#define RGB555TOARGB                                                        \
  "ushr       v2.8h, v0.8h, #10              \n" /* R xxxRRRRR           */ \
  "shl        v2.8h, v2.8h, #3               \n" /* R RRRRR000 upper 5   */ \
  "xtn        v3.8b, v2.8h                   \n" /* RRRRR000             */ \
                                                                            \
  "xtn        v2.8b, v0.8h                   \n" /* B xxxBBBBB           */ \
  "shrn2      v2.16b,v0.8h, #5               \n" /* G xxxGGGGG           */ \
                                                                            \
  "ushr       v1.16b, v3.16b, #5             \n" /* R   00000RRR lower 3 */ \
  "shl        v0.16b, v2.16b, #3             \n" /* B,G BBBBB000 upper 5 */ \
  "ushr       v2.16b, v0.16b, #5             \n" /* B,G 00000BBB lower 3 */ \
                                                                            \
  "orr        v0.16b, v0.16b, v2.16b         \n" /* B,G                  */ \
  "orr        v2.16b, v1.16b, v3.16b         \n" /* R                    */ \
  "dup        v1.2D, v0.D[1]                 \n" /* G */
1048 
ARGB1555ToARGBRow_NEON(const uint8_t * src_argb1555,uint8_t * dst_argb,int width)1049 void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
1050                             uint8_t* dst_argb,
1051                             int width) {
1052   asm volatile(
1053       "movi        v3.8b, #255                   \n"  // Alpha
1054       "1:                                        \n"
1055       "ld1         {v0.16b}, [%0], #16           \n"  // load 8 ARGB1555 pixels.
1056       "prfm        pldl1keep, [%0, 448]          \n"
1057       "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
1058       ARGB1555TOARGB
1059       "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB
1060       "b.gt        1b                            \n"
1061       : "+r"(src_argb1555),  // %0
1062         "+r"(dst_argb),      // %1
1063         "+r"(width)          // %2
1064       :
1065       : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
1066   );
1067 }
1068 
// Convert v0.8h to b = v0.8b g = v1.8b r = v2.8b a = v3.8b
// Each 4 bit channel is duplicated into both nibbles of its output byte.
// All of v0..v3 are overwritten; no other registers are used.
#define ARGB4444TOARGB                                                      \
  "shrn       v1.8b,  v0.8h, #8              \n" /* v1(l) AR             */ \
  "xtn2       v1.16b, v0.8h                  \n" /* v1(h) GB             */ \
  "shl        v2.16b, v1.16b, #4             \n" /* B,R BBBB0000         */ \
  "ushr       v3.16b, v1.16b, #4             \n" /* G,A 0000GGGG         */ \
  "ushr       v0.16b, v2.16b, #4             \n" /* B,R 0000BBBB         */ \
  "shl        v1.16b, v3.16b, #4             \n" /* G,A GGGG0000         */ \
  "orr        v2.16b, v0.16b, v2.16b         \n" /* B,R BBBBBBBB         */ \
  "orr        v3.16b, v1.16b, v3.16b         \n" /* G,A GGGGGGGG         */ \
  "dup        v0.2D, v2.D[1]                 \n"                            \
  "dup        v1.2D, v3.D[1]                 \n"
1082 
// Converts 2-byte ARGB4444 pixels to 4-byte pixels stored as b, g, r, a.
// Handles 8 pixels (16 bytes in, 32 bytes out) per iteration. The loop
// repeats while the decremented width is > 0, so the row is processed in
// whole 8-pixel steps.
// All four output channels, alpha included, come from ARGB4444TOARGB.
void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
                            uint8_t* dst_argb,
                            int width) {
  asm volatile(
      "1:                                        \n"
      "ld1         {v0.16b}, [%0], #16           \n"  // load 8 ARGB4444 pixels.
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch ahead of src
      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
      ARGB4444TOARGB
      "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB
      "b.gt        1b                            \n"
      : "+r"(src_argb4444),  // %0
        "+r"(dst_argb),      // %1
        "+r"(width)          // %2
      :
      // NOTE(review): v4 is listed although neither this loop nor
      // ARGB4444TOARGB writes it; harmless, but could be dropped.
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
  );
}
1101 
// Converts 4-byte pixels to 3-byte pixels by dropping the fourth byte of
// every pixel; the first three bytes keep their order. Handles 8 pixels
// (32 bytes in, 24 bytes out) per iteration. The loop repeats while the
// decremented width is > 0, so the row is processed in whole 8-pixel steps.
void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
                         uint8_t* dst_rgb24,
                         int width) {
  asm volatile(
      "1:                                        \n"
      "ld4         {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load 8 ARGB
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch ahead of src
      "subs        %w2, %w2, #8                  \n"   // 8 processed per loop.
      // v4 (the fourth byte of each pixel) is loaded but not stored.
      "st3         {v1.8b,v2.8b,v3.8b}, [%1], #24 \n"  // store 8 pixels of
                                                       // RGB24
      "b.gt        1b                            \n"
      : "+r"(src_argb),   // %0
        "+r"(dst_rgb24),  // %1
        "+r"(width)       // %2
      :
      : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
  );
}
1120 
// Converts 4-byte pixels (bytes b, g, r, a) to 3-byte pixels stored as
// r, g, b; the alpha byte is dropped. Handles 8 pixels (32 bytes in,
// 24 bytes out) per iteration. The loop repeats while the decremented width
// is > 0, so the row is processed in whole 8-pixel steps.
void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
  asm volatile(
      "1:                                        \n"
      "ld4         {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load b g r a
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch ahead of src
      "subs        %w2, %w2, #8                  \n"   // 8 processed per loop.
      // st3 needs consecutive registers, so g and b are copied into v4/v5
      // to follow r in v3 (this overwrites the loaded alpha in v4).
      "orr         v4.8b, v2.8b, v2.8b           \n"   // mov g
      "orr         v5.8b, v1.8b, v1.8b           \n"   // mov b
      "st3         {v3.8b,v4.8b,v5.8b}, [%1], #24 \n"  // store r g b
      "b.gt        1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_raw),   // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v1", "v2", "v3", "v4", "v5"  // Clobber List
  );
}
1138 
// Extracts the Y samples from a row of packed YUY2: the even source bytes
// are written to dst_y. Handles 16 pixels (32 bytes in, 16 bytes out) per
// iteration. The loop repeats while the decremented width is > 0, so the row
// is processed in whole 16-pixel steps.
void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
  asm volatile(
      "1:                                        \n"
      // ld2 de-interleaves: even bytes land in v0, odd bytes in v1.
      "ld2         {v0.16b,v1.16b}, [%0], #32    \n"  // load 16 pixels of YUY2.
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch ahead of src
      "subs        %w2, %w2, #16                 \n"  // 16 processed per loop.
      "st1         {v0.16b}, [%1], #16           \n"  // store 16 pixels of Y.
      "b.gt        1b                            \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v0", "v1"  // Clobber List
  );
}
1154 
// Extracts the Y samples from a row of packed UYVY: the odd source bytes
// are written to dst_y. Handles 16 pixels (32 bytes in, 16 bytes out) per
// iteration. The loop repeats while the decremented width is > 0, so the row
// is processed in whole 16-pixel steps.
void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
  asm volatile(
      "1:                                        \n"
      // ld2 de-interleaves: even bytes land in v0, odd bytes in v1.
      "ld2         {v0.16b,v1.16b}, [%0], #32    \n"  // load 16 pixels of UYVY.
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch ahead of src
      "subs        %w2, %w2, #16                 \n"  // 16 processed per loop.
      "st1         {v1.16b}, [%1], #16           \n"  // store 16 pixels of Y.
      "b.gt        1b                            \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v0", "v1"  // Clobber List
  );
}
1170 
// Extracts the U and V samples from a row of packed YUY2 without vertical
// filtering: byte 1 of every 4-byte group goes to dst_u and byte 3 to dst_v.
// Handles 16 pixels (32 bytes in, 8 bytes per plane out) per iteration. The
// loop repeats while the decremented width is > 0, so the row is processed
// in whole 16-pixel steps.
void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      "1:                                        \n"
      // ld4 de-interleaves each 4-byte group into v0..v3.
      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 YUY2
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch ahead of src
      "subs        %w3, %w3, #16                 \n"  // 16 pixels = 8 UVs.
      "st1         {v1.8b}, [%1], #8             \n"  // store 8 U.
      "st1         {v3.8b}, [%2], #8             \n"  // store 8 V.
      "b.gt        1b                            \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
1191 
// Extracts the U and V samples from a row of packed UYVY without vertical
// filtering: byte 0 of every 4-byte group goes to dst_u and byte 2 to dst_v.
// Handles 16 pixels (32 bytes in, 8 bytes per plane out) per iteration. The
// loop repeats while the decremented width is > 0, so the row is processed
// in whole 16-pixel steps.
void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      "1:                                        \n"
      // ld4 de-interleaves each 4-byte group into v0..v3.
      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 UYVY
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch ahead of src
      "subs        %w3, %w3, #16                 \n"  // 16 pixels = 8 UVs.
      "st1         {v0.8b}, [%1], #8             \n"  // store 8 U.
      "st1         {v2.8b}, [%2], #8             \n"  // store 8 V.
      "b.gt        1b                            \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
1212 
// Extracts U and V from two rows of packed YUY2 and averages them
// vertically: the row at src_yuy2 and the row stride_yuy2 bytes after it are
// combined with a rounding halving add. Byte 1 of every 4-byte group feeds
// dst_u and byte 3 feeds dst_v. Handles 16 pixels (32 bytes per row in,
// 8 bytes per plane out) per iteration. The loop repeats while the
// decremented width is > 0, so the row is processed in whole 16-pixel steps.
void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
                      int stride_yuy2,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  // Second source row, stride_yuy2 bytes from the first.
  const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2;
  asm volatile(
      "1:                                        \n"
      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch ahead of row 0
      "subs        %w4, %w4, #16                 \n"  // 16 pixels = 8 UVs.
      "ld4         {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
      "urhadd      v1.8b, v1.8b, v5.8b           \n"  // average rows of U
      "urhadd      v3.8b, v3.8b, v7.8b           \n"  // average rows of V
      "st1         {v1.8b}, [%2], #8             \n"  // store 8 U.
      "st1         {v3.8b}, [%3], #8             \n"  // store 8 V.
      "b.gt        1b                            \n"
      : "+r"(src_yuy2),   // %0
        "+r"(src_yuy2b),  // %1
        "+r"(dst_u),      // %2
        "+r"(dst_v),      // %3
        "+r"(width)       // %4
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
        "v7"  // Clobber List
  );
}
1240 
UYVYToUVRow_NEON(const uint8_t * src_uyvy,int stride_uyvy,uint8_t * dst_u,uint8_t * dst_v,int width)1241 void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
1242                       int stride_uyvy,
1243                       uint8_t* dst_u,
1244                       uint8_t* dst_v,
1245                       int width) {
1246   const uint8_t* src_uyvyb = src_uyvy + stride_uyvy;
1247   asm volatile(
1248       "1:                                        \n"
1249       "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
1250       "prfm        pldl1keep, [%0, 448]          \n"
1251       "subs        %w4, %w4, #16                 \n"  // 16 pixels = 8 UVs.
1252       "ld4         {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
1253       "urhadd      v0.8b, v0.8b, v4.8b           \n"  // average rows of U
1254       "urhadd      v2.8b, v2.8b, v6.8b           \n"  // average rows of V
1255       "st1         {v0.8b}, [%2], #8             \n"  // store 8 U.
1256       "st1         {v2.8b}, [%3], #8             \n"  // store 8 V.
1257       "b.gt        1b                            \n"
1258       : "+r"(src_uyvy),   // %0
1259         "+r"(src_uyvyb),  // %1
1260         "+r"(dst_u),      // %2
1261         "+r"(dst_v),      // %3
1262         "+r"(width)       // %4
1263       :
1264       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
1265         "v7"  // Clobber List
1266   );
1267 }
1268 
1269 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
ARGBShuffleRow_NEON(const uint8_t * src_argb,uint8_t * dst_argb,const uint8_t * shuffler,int width)1270 void ARGBShuffleRow_NEON(const uint8_t* src_argb,
1271                          uint8_t* dst_argb,
1272                          const uint8_t* shuffler,
1273                          int width) {
1274   asm volatile(
1275       "ld1         {v2.16b}, [%3]                \n"  // shuffler
1276       "1:                                        \n"
1277       "ld1         {v0.16b}, [%0], #16           \n"  // load 4 pixels.
1278       "prfm        pldl1keep, [%0, 448]          \n"
1279       "subs        %w2, %w2, #4                  \n"  // 4 processed per loop
1280       "tbl         v1.16b, {v0.16b}, v2.16b      \n"  // look up 4 pixels
1281       "st1         {v1.16b}, [%1], #16           \n"  // store 4.
1282       "b.gt        1b                            \n"
1283       : "+r"(src_argb),                   // %0
1284         "+r"(dst_argb),                   // %1
1285         "+r"(width)                       // %2
1286       : "r"(shuffler)                     // %3
1287       : "cc", "memory", "v0", "v1", "v2"  // Clobber List
1288   );
1289 }
1290 
I422ToYUY2Row_NEON(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_yuy2,int width)1291 void I422ToYUY2Row_NEON(const uint8_t* src_y,
1292                         const uint8_t* src_u,
1293                         const uint8_t* src_v,
1294                         uint8_t* dst_yuy2,
1295                         int width) {
1296   asm volatile(
1297       "1:                                        \n"
1298       "ld2         {v0.8b, v1.8b}, [%0], #16     \n"  // load 16 Ys
1299       "prfm        pldl1keep, [%0, 448]          \n"
1300       "orr         v2.8b, v1.8b, v1.8b           \n"
1301       "ld1         {v1.8b}, [%1], #8             \n"         // load 8 Us
1302       "ld1         {v3.8b}, [%2], #8             \n"         // load 8 Vs
1303       "subs        %w4, %w4, #16                 \n"         // 16 pixels
1304       "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
1305       "b.gt        1b                            \n"
1306       : "+r"(src_y),     // %0
1307         "+r"(src_u),     // %1
1308         "+r"(src_v),     // %2
1309         "+r"(dst_yuy2),  // %3
1310         "+r"(width)      // %4
1311       :
1312       : "cc", "memory", "v0", "v1", "v2", "v3");
1313 }
1314 
I422ToUYVYRow_NEON(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_uyvy,int width)1315 void I422ToUYVYRow_NEON(const uint8_t* src_y,
1316                         const uint8_t* src_u,
1317                         const uint8_t* src_v,
1318                         uint8_t* dst_uyvy,
1319                         int width) {
1320   asm volatile(
1321       "1:                                        \n"
1322       "ld2         {v1.8b,v2.8b}, [%0], #16      \n"  // load 16 Ys
1323       "prfm        pldl1keep, [%0, 448]          \n"
1324       "orr         v3.8b, v2.8b, v2.8b           \n"
1325       "ld1         {v0.8b}, [%1], #8             \n"         // load 8 Us
1326       "ld1         {v2.8b}, [%2], #8             \n"         // load 8 Vs
1327       "subs        %w4, %w4, #16                 \n"         // 16 pixels
1328       "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
1329       "b.gt        1b                            \n"
1330       : "+r"(src_y),     // %0
1331         "+r"(src_u),     // %1
1332         "+r"(src_v),     // %2
1333         "+r"(dst_uyvy),  // %3
1334         "+r"(width)      // %4
1335       :
1336       : "cc", "memory", "v0", "v1", "v2", "v3");
1337 }
1338 
ARGBToRGB565Row_NEON(const uint8_t * src_argb,uint8_t * dst_rgb565,int width)1339 void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
1340                           uint8_t* dst_rgb565,
1341                           int width) {
1342   asm volatile(
1343       "1:                                        \n"
1344       "ld4         {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8
1345                                                                  // pixels
1346       "prfm        pldl1keep, [%0, 448]          \n"
1347       "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
1348       ARGBTORGB565
1349       "st1         {v0.16b}, [%1], #16           \n"  // store 8 pixels RGB565.
1350       "b.gt        1b                            \n"
1351       : "+r"(src_argb),    // %0
1352         "+r"(dst_rgb565),  // %1
1353         "+r"(width)        // %2
1354       :
1355       : "cc", "memory", "v0", "v20", "v21", "v22", "v23");
1356 }
1357 
ARGBToRGB565DitherRow_NEON(const uint8_t * src_argb,uint8_t * dst_rgb,const uint32_t dither4,int width)1358 void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
1359                                 uint8_t* dst_rgb,
1360                                 const uint32_t dither4,
1361                                 int width) {
1362   asm volatile(
1363       "dup         v1.4s, %w2                    \n"  // dither4
1364       "1:                                        \n"
1365       "ld4         {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"  // load 8
1366                                                                  // pixels
1367       "prfm        pldl1keep, [%0, 448]          \n"
1368       "subs        %w3, %w3, #8                  \n"  // 8 processed per loop.
1369       "uqadd       v20.8b, v20.8b, v1.8b         \n"
1370       "uqadd       v21.8b, v21.8b, v1.8b         \n"
1371       "uqadd       v22.8b, v22.8b, v1.8b         \n" ARGBTORGB565
1372       "st1         {v0.16b}, [%0], #16           \n"  // store 8 pixels RGB565.
1373       "b.gt        1b                            \n"
1374       : "+r"(dst_rgb)   // %0
1375       : "r"(src_argb),  // %1
1376         "r"(dither4),   // %2
1377         "r"(width)      // %3
1378       : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23");
1379 }
1380 
ARGBToARGB1555Row_NEON(const uint8_t * src_argb,uint8_t * dst_argb1555,int width)1381 void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
1382                             uint8_t* dst_argb1555,
1383                             int width) {
1384   asm volatile(
1385       "1:                                        \n"
1386       "ld4         {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8
1387                                                                  // pixels
1388       "prfm        pldl1keep, [%0, 448]          \n"
1389       "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
1390       ARGBTOARGB1555
1391       "st1         {v0.16b}, [%1], #16           \n"  // store 8 pixels
1392       "b.gt        1b                            \n"
1393       : "+r"(src_argb),      // %0
1394         "+r"(dst_argb1555),  // %1
1395         "+r"(width)          // %2
1396       :
1397       : "cc", "memory", "v0", "v20", "v21", "v22", "v23");
1398 }
1399 
ARGBToARGB4444Row_NEON(const uint8_t * src_argb,uint8_t * dst_argb4444,int width)1400 void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
1401                             uint8_t* dst_argb4444,
1402                             int width) {
1403   asm volatile(
1404       "movi        v4.16b, #0x0f                 \n"  // bits to clear with
1405                                                       // vbic.
1406       "1:                                        \n"
1407       "ld4         {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8
1408                                                                  // pixels
1409       "prfm        pldl1keep, [%0, 448]          \n"
1410       "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
1411       ARGBTOARGB4444
1412       "st1         {v0.16b}, [%1], #16           \n"  // store 8 pixels
1413       "b.gt        1b                            \n"
1414       : "+r"(src_argb),      // %0
1415         "+r"(dst_argb4444),  // %1
1416         "+r"(width)          // %2
1417       :
1418       : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23");
1419 }
1420 
ARGBToYRow_NEON(const uint8_t * src_argb,uint8_t * dst_y,int width)1421 void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
1422   asm volatile(
1423       "movi        v4.8b, #25                    \n"  // B * 0.1016 coefficient
1424       "movi        v5.8b, #129                   \n"  // G * 0.5078 coefficient
1425       "movi        v6.8b, #66                    \n"  // R * 0.2578 coefficient
1426       "movi        v7.8b, #16                    \n"  // Add 16 constant
1427       "1:                                        \n"
1428       "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
1429       "prfm        pldl1keep, [%0, 448]          \n"
1430       "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
1431       "umull       v3.8h, v0.8b, v4.8b           \n"  // B
1432       "umlal       v3.8h, v1.8b, v5.8b           \n"  // G
1433       "umlal       v3.8h, v2.8b, v6.8b           \n"  // R
1434       "uqrshrn     v0.8b, v3.8h, #8              \n"  // 16 bit to 8 bit Y
1435       "uqadd       v0.8b, v0.8b, v7.8b           \n"
1436       "st1         {v0.8b}, [%1], #8             \n"  // store 8 pixels Y.
1437       "b.gt        1b                            \n"
1438       : "+r"(src_argb),  // %0
1439         "+r"(dst_y),     // %1
1440         "+r"(width)      // %2
1441       :
1442       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
1443 }
1444 
ARGBExtractAlphaRow_NEON(const uint8_t * src_argb,uint8_t * dst_a,int width)1445 void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
1446                               uint8_t* dst_a,
1447                               int width) {
1448   asm volatile(
1449       "1:                                        \n"
1450       "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16
1451       "prfm        pldl1keep, [%0, 448]          \n"
1452       "subs        %w2, %w2, #16                 \n"  // 16 processed per loop
1453       "st1         {v3.16b}, [%1], #16           \n"  // store 16 A's.
1454       "b.gt        1b                            \n"
1455       : "+r"(src_argb),  // %0
1456         "+r"(dst_a),     // %1
1457         "+r"(width)      // %2
1458       :
1459       : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
1460   );
1461 }
1462 
ARGBToYJRow_NEON(const uint8_t * src_argb,uint8_t * dst_y,int width)1463 void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
1464   asm volatile(
1465       "movi        v4.8b, #29                    \n"  // B * 0.1140 coefficient
1466       "movi        v5.8b, #150                   \n"  // G * 0.5870 coefficient
1467       "movi        v6.8b, #77                    \n"  // R * 0.2990 coefficient
1468       "1:                                        \n"
1469       "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
1470       "prfm        pldl1keep, [%0, 448]          \n"
1471       "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
1472       "umull       v3.8h, v0.8b, v4.8b           \n"  // B
1473       "umlal       v3.8h, v1.8b, v5.8b           \n"  // G
1474       "umlal       v3.8h, v2.8b, v6.8b           \n"  // R
1475       "uqrshrn     v0.8b, v3.8h, #8              \n"  // 16 bit to 8 bit Y
1476       "st1         {v0.8b}, [%1], #8             \n"  // store 8 pixels Y.
1477       "b.gt        1b                            \n"
1478       : "+r"(src_argb),  // %0
1479         "+r"(dst_y),     // %1
1480         "+r"(width)      // %2
1481       :
1482       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
1483 }
1484 
RGBAToYJRow_NEON(const uint8_t * src_argb,uint8_t * dst_y,int width)1485 void RGBAToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
1486   asm volatile(
1487       "movi        v4.8b, #29                    \n"  // B * 0.1140 coefficient
1488       "movi        v5.8b, #150                   \n"  // G * 0.5870 coefficient
1489       "movi        v6.8b, #77                    \n"  // R * 0.2990 coefficient
1490       "1:                                        \n"
1491       "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 RGBA
1492       "prfm        pldl1keep, [%0, 448]          \n"
1493       "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
1494       "umull       v0.8h, v1.8b, v4.8b           \n"  // B
1495       "umlal       v0.8h, v2.8b, v5.8b           \n"  // G
1496       "umlal       v0.8h, v3.8b, v6.8b           \n"  // R
1497       "uqrshrn     v3.8b, v0.8h, #8              \n"  // 16 bit to 8 bit Y
1498       "st1         {v3.8b}, [%1], #8             \n"  // store 8 pixels Y.
1499       "b.gt        1b                            \n"
1500       : "+r"(src_argb),  // %0
1501         "+r"(dst_y),     // %1
1502         "+r"(width)      // %2
1503       :
1504       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
1505 }
1506 
1507 // 8x1 pixels.
ARGBToUV444Row_NEON(const uint8_t * src_argb,uint8_t * dst_u,uint8_t * dst_v,int width)1508 void ARGBToUV444Row_NEON(const uint8_t* src_argb,
1509                          uint8_t* dst_u,
1510                          uint8_t* dst_v,
1511                          int width) {
1512   asm volatile(
1513       "movi        v24.8b, #112                  \n"  // UB / VR 0.875
1514                                                       // coefficient
1515       "movi        v25.8b, #74                   \n"  // UG -0.5781 coefficient
1516       "movi        v26.8b, #38                   \n"  // UR -0.2969 coefficient
1517       "movi        v27.8b, #18                   \n"  // VB -0.1406 coefficient
1518       "movi        v28.8b, #94                   \n"  // VG -0.7344 coefficient
1519       "movi        v29.16b,#0x80                 \n"  // 128.5
1520       "1:                                        \n"
1521       "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
1522       "prfm        pldl1keep, [%0, 448]          \n"
1523       "subs        %w3, %w3, #8                  \n"  // 8 processed per loop.
1524       "umull       v4.8h, v0.8b, v24.8b          \n"  // B
1525       "umlsl       v4.8h, v1.8b, v25.8b          \n"  // G
1526       "umlsl       v4.8h, v2.8b, v26.8b          \n"  // R
1527       "add         v4.8h, v4.8h, v29.8h          \n"  // +128 -> unsigned
1528 
1529       "umull       v3.8h, v2.8b, v24.8b          \n"  // R
1530       "umlsl       v3.8h, v1.8b, v28.8b          \n"  // G
1531       "umlsl       v3.8h, v0.8b, v27.8b          \n"  // B
1532       "add         v3.8h, v3.8h, v29.8h          \n"  // +128 -> unsigned
1533 
1534       "uqshrn      v0.8b, v4.8h, #8              \n"  // 16 bit to 8 bit U
1535       "uqshrn      v1.8b, v3.8h, #8              \n"  // 16 bit to 8 bit V
1536 
1537       "st1         {v0.8b}, [%1], #8             \n"  // store 8 pixels U.
1538       "st1         {v1.8b}, [%2], #8             \n"  // store 8 pixels V.
1539       "b.gt        1b                            \n"
1540       : "+r"(src_argb),  // %0
1541         "+r"(dst_u),     // %1
1542         "+r"(dst_v),     // %2
1543         "+r"(width)      // %3
1544       :
1545       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26",
1546         "v27", "v28", "v29");
1547 }
1548 
// Loads the constants consumed by RGBTOUV: 16-bit coefficient lanes in
// v20..v24 and the bias (0x80 in every byte, i.e. 0x8080 per 16-bit lane)
// in v25.  Emit once before the loop; any asm statement using it must list
// v20..v25 as clobbered.
#define RGBTOUV_SETUP_REG                                                  \
  "movi       v20.8h, #56, lsl #0  \n" /* UB/VR coefficient (0.875) / 2 */ \
  "movi       v21.8h, #37, lsl #0  \n" /* UG coefficient (-0.5781) / 2  */ \
  "movi       v22.8h, #19, lsl #0  \n" /* UR coefficient (-0.2969) / 2  */ \
  "movi       v23.8h, #9,  lsl #0  \n" /* VB coefficient (-0.1406) / 2  */ \
  "movi       v24.8h, #47, lsl #0  \n" /* VG coefficient (-0.7344) / 2  */ \
  "movi       v25.16b, #0x80       \n" /* 128.5 (0x8080 in 16-bit)      */
1556 
// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
// RGBTOUV(QB, QG, QR) expands to the U/V computation shared by the
// *ToUVRow_NEON functions.  QB, QG and QR name the .8h registers that hold
// the blue, green and red inputs.  It reads the coefficients in v20..v24 and
// the bias in v25, uses v3 and v4 as 16-bit accumulators (so the arguments
// must not be v3 or v4), and leaves 8 U bytes in v0.8b and 8 V bytes in
// v1.8b.  The U and V instruction streams are interleaved.
// clang-format off
#define RGBTOUV(QB, QG, QR)                                                 \
  "mul        v3.8h, " #QB ",v20.8h          \n" /* B                    */ \
  "mul        v4.8h, " #QR ",v20.8h          \n" /* R                    */ \
  "mls        v3.8h, " #QG ",v21.8h          \n" /* G                    */ \
  "mls        v4.8h, " #QG ",v24.8h          \n" /* G                    */ \
  "mls        v3.8h, " #QR ",v22.8h          \n" /* R                    */ \
  "mls        v4.8h, " #QB ",v23.8h          \n" /* B                    */ \
  "add        v3.8h, v3.8h, v25.8h           \n" /* +128 -> unsigned     */ \
  "add        v4.8h, v4.8h, v25.8h           \n" /* +128 -> unsigned     */ \
  "uqshrn     v0.8b, v3.8h, #8               \n" /* 16 bit to 8 bit U    */ \
  "uqshrn     v1.8b, v4.8h, #8               \n" /* 16 bit to 8 bit V    */
// clang-format on
1571 
1572 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
1573 // TODO(fbarchard): consider ptrdiff_t for all strides.
1574 
ARGBToUVRow_NEON(const uint8_t * src_argb,int src_stride_argb,uint8_t * dst_u,uint8_t * dst_v,int width)1575 void ARGBToUVRow_NEON(const uint8_t* src_argb,
1576                       int src_stride_argb,
1577                       uint8_t* dst_u,
1578                       uint8_t* dst_v,
1579                       int width) {
1580   const uint8_t* src_argb_1 = src_argb + src_stride_argb;
1581   asm volatile (
1582     RGBTOUV_SETUP_REG
1583       "1:                                        \n"
1584       "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
1585       "prfm        pldl1keep, [%0, 448]          \n"
1586       "uaddlp      v0.8h, v0.16b                 \n"  // B 16 bytes -> 8 shorts.
1587       "uaddlp      v1.8h, v1.16b                 \n"  // G 16 bytes -> 8 shorts.
1588       "uaddlp      v2.8h, v2.16b                 \n"  // R 16 bytes -> 8 shorts.
1589 
1590       "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
1591       "prfm        pldl1keep, [%1, 448]          \n"
1592       "uadalp      v0.8h, v4.16b                 \n"  // B 16 bytes -> 8 shorts.
1593       "uadalp      v1.8h, v5.16b                 \n"  // G 16 bytes -> 8 shorts.
1594       "uadalp      v2.8h, v6.16b                 \n"  // R 16 bytes -> 8 shorts.
1595 
1596       "urshr       v0.8h, v0.8h, #1              \n"  // 2x average
1597       "urshr       v1.8h, v1.8h, #1              \n"
1598       "urshr       v2.8h, v2.8h, #1              \n"
1599 
1600       "subs        %w4, %w4, #16                 \n"  // 16 processed per loop.
1601     RGBTOUV(v0.8h, v1.8h, v2.8h)
1602       "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
1603       "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
1604       "b.gt        1b                            \n"
1605   : "+r"(src_argb),  // %0
1606     "+r"(src_argb_1),  // %1
1607     "+r"(dst_u),     // %2
1608     "+r"(dst_v),     // %3
1609     "+r"(width)        // %4
1610   :
1611   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1612     "v20", "v21", "v22", "v23", "v24", "v25"
1613   );
1614 }
1615 
ARGBToUVJRow_NEON(const uint8_t * src_argb,int src_stride_argb,uint8_t * dst_u,uint8_t * dst_v,int width)1616 void ARGBToUVJRow_NEON(const uint8_t* src_argb,
1617                        int src_stride_argb,
1618                        uint8_t* dst_u,
1619                        uint8_t* dst_v,
1620                        int width) {
1621   const uint8_t* src_argb_1 = src_argb + src_stride_argb;
1622   asm volatile (
1623       "movi        v20.8h, #63, lsl #0           \n"  // UB/VR coeff (0.500) / 2
1624       "movi        v21.8h, #42, lsl #0           \n"  // UG coeff (-0.33126) / 2
1625       "movi        v22.8h, #21, lsl #0           \n"  // UR coeff (-0.16874) / 2
1626       "movi        v23.8h, #10, lsl #0           \n"  // VB coeff (-0.08131) / 2
1627       "movi        v24.8h, #53, lsl #0           \n"  // VG coeff (-0.41869) / 2
1628       "movi        v25.16b, #0x80                \n"  // 128.5 (0x8080 in 16-bit)
1629       "1:                                        \n"
1630       "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
1631       "prfm        pldl1keep, [%0, 448]          \n"
1632       "uaddlp      v0.8h, v0.16b                 \n"  // B 16 bytes -> 8 shorts.
1633       "uaddlp      v1.8h, v1.16b                 \n"  // G 16 bytes -> 8 shorts.
1634       "uaddlp      v2.8h, v2.16b                 \n"  // R 16 bytes -> 8 shorts.
1635       "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
1636       "prfm        pldl1keep, [%1, 448]          \n"
1637       "uadalp      v0.8h, v4.16b                 \n"  // B 16 bytes -> 8 shorts.
1638       "uadalp      v1.8h, v5.16b                 \n"  // G 16 bytes -> 8 shorts.
1639       "uadalp      v2.8h, v6.16b                 \n"  // R 16 bytes -> 8 shorts.
1640 
1641       "urshr       v0.8h, v0.8h, #1              \n"  // 2x average
1642       "urshr       v1.8h, v1.8h, #1              \n"
1643       "urshr       v2.8h, v2.8h, #1              \n"
1644 
1645       "subs        %w4, %w4, #16                 \n"  // 32 processed per loop.
1646     RGBTOUV(v0.8h, v1.8h, v2.8h)
1647       "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
1648       "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
1649       "b.gt        1b                            \n"
1650   : "+r"(src_argb),  // %0
1651     "+r"(src_argb_1),  // %1
1652     "+r"(dst_u),     // %2
1653     "+r"(dst_v),     // %3
1654     "+r"(width)        // %4
1655   :
1656   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1657     "v20", "v21", "v22", "v23", "v24", "v25"
1658   );
1659 }
1660 
BGRAToUVRow_NEON(const uint8_t * src_bgra,int src_stride_bgra,uint8_t * dst_u,uint8_t * dst_v,int width)1661 void BGRAToUVRow_NEON(const uint8_t* src_bgra,
1662                       int src_stride_bgra,
1663                       uint8_t* dst_u,
1664                       uint8_t* dst_v,
1665                       int width) {
1666   const uint8_t* src_bgra_1 = src_bgra + src_stride_bgra;
1667   asm volatile (
1668     RGBTOUV_SETUP_REG
1669       "1:                                        \n"
1670       "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
1671       "prfm        pldl1keep, [%0, 448]          \n"
1672       "uaddlp      v0.8h, v3.16b                 \n"  // B 16 bytes -> 8 shorts.
1673       "uaddlp      v3.8h, v2.16b                 \n"  // G 16 bytes -> 8 shorts.
1674       "uaddlp      v2.8h, v1.16b                 \n"  // R 16 bytes -> 8 shorts.
1675       "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more
1676       "prfm        pldl1keep, [%1, 448]          \n"
1677       "uadalp      v0.8h, v7.16b                 \n"  // B 16 bytes -> 8 shorts.
1678       "uadalp      v3.8h, v6.16b                 \n"  // G 16 bytes -> 8 shorts.
1679       "uadalp      v2.8h, v5.16b                 \n"  // R 16 bytes -> 8 shorts.
1680 
1681       "urshr       v0.8h, v0.8h, #1              \n"  // 2x average
1682       "urshr       v1.8h, v3.8h, #1              \n"
1683       "urshr       v2.8h, v2.8h, #1              \n"
1684 
1685       "subs        %w4, %w4, #16                 \n"  // 32 processed per loop.
1686     RGBTOUV(v0.8h, v1.8h, v2.8h)
1687       "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
1688       "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
1689       "b.gt        1b                            \n"
1690   : "+r"(src_bgra),  // %0
1691     "+r"(src_bgra_1),  // %1
1692     "+r"(dst_u),     // %2
1693     "+r"(dst_v),     // %3
1694     "+r"(width)        // %4
1695   :
1696   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1697     "v20", "v21", "v22", "v23", "v24", "v25"
1698   );
1699 }
1700 
ABGRToUVRow_NEON(const uint8_t * src_abgr,int src_stride_abgr,uint8_t * dst_u,uint8_t * dst_v,int width)1701 void ABGRToUVRow_NEON(const uint8_t* src_abgr,
1702                       int src_stride_abgr,
1703                       uint8_t* dst_u,
1704                       uint8_t* dst_v,
1705                       int width) {
1706   const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr;
1707   asm volatile (
1708     RGBTOUV_SETUP_REG
1709       "1:                                        \n"
1710       "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
1711       "prfm        pldl1keep, [%0, 448]          \n"
1712       "uaddlp      v3.8h, v2.16b                 \n"  // B 16 bytes -> 8 shorts.
1713       "uaddlp      v2.8h, v1.16b                 \n"  // G 16 bytes -> 8 shorts.
1714       "uaddlp      v1.8h, v0.16b                 \n"  // R 16 bytes -> 8 shorts.
1715       "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
1716       "prfm        pldl1keep, [%1, 448]          \n"
1717       "uadalp      v3.8h, v6.16b                 \n"  // B 16 bytes -> 8 shorts.
1718       "uadalp      v2.8h, v5.16b                 \n"  // G 16 bytes -> 8 shorts.
1719       "uadalp      v1.8h, v4.16b                 \n"  // R 16 bytes -> 8 shorts.
1720 
1721       "urshr       v0.8h, v3.8h, #1              \n"  // 2x average
1722       "urshr       v2.8h, v2.8h, #1              \n"
1723       "urshr       v1.8h, v1.8h, #1              \n"
1724 
1725       "subs        %w4, %w4, #16                 \n"  // 32 processed per loop.
1726     RGBTOUV(v0.8h, v2.8h, v1.8h)
1727       "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
1728       "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
1729       "b.gt        1b                            \n"
1730   : "+r"(src_abgr),  // %0
1731     "+r"(src_abgr_1),  // %1
1732     "+r"(dst_u),     // %2
1733     "+r"(dst_v),     // %3
1734     "+r"(width)        // %4
1735   :
1736   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1737     "v20", "v21", "v22", "v23", "v24", "v25"
1738   );
1739 }
1740 
RGBAToUVRow_NEON(const uint8_t * src_rgba,int src_stride_rgba,uint8_t * dst_u,uint8_t * dst_v,int width)1741 void RGBAToUVRow_NEON(const uint8_t* src_rgba,
1742                       int src_stride_rgba,
1743                       uint8_t* dst_u,
1744                       uint8_t* dst_v,
1745                       int width) {
1746   const uint8_t* src_rgba_1 = src_rgba + src_stride_rgba;
1747   asm volatile (
1748     RGBTOUV_SETUP_REG
1749       "1:                                        \n"
1750       "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
1751       "prfm        pldl1keep, [%0, 448]          \n"
1752       "uaddlp      v0.8h, v1.16b                 \n"  // B 16 bytes -> 8 shorts.
1753       "uaddlp      v1.8h, v2.16b                 \n"  // G 16 bytes -> 8 shorts.
1754       "uaddlp      v2.8h, v3.16b                 \n"  // R 16 bytes -> 8 shorts.
1755       "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
1756       "prfm        pldl1keep, [%1, 448]          \n"
1757       "uadalp      v0.8h, v5.16b                 \n"  // B 16 bytes -> 8 shorts.
1758       "uadalp      v1.8h, v6.16b                 \n"  // G 16 bytes -> 8 shorts.
1759       "uadalp      v2.8h, v7.16b                 \n"  // R 16 bytes -> 8 shorts.
1760 
1761       "urshr       v0.8h, v0.8h, #1              \n"  // 2x average
1762       "urshr       v1.8h, v1.8h, #1              \n"
1763       "urshr       v2.8h, v2.8h, #1              \n"
1764 
1765       "subs        %w4, %w4, #16                 \n"  // 32 processed per loop.
1766     RGBTOUV(v0.8h, v1.8h, v2.8h)
1767       "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
1768       "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
1769       "b.gt        1b                            \n"
1770   : "+r"(src_rgba),  // %0
1771     "+r"(src_rgba_1),  // %1
1772     "+r"(dst_u),     // %2
1773     "+r"(dst_v),     // %3
1774     "+r"(width)        // %4
1775   :
1776   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1777     "v20", "v21", "v22", "v23", "v24", "v25"
1778   );
1779 }
1780 
// Produces one row of U and one row of V from two rows of RGB24, averaging
// each 2x2 group of source pixels before the colour conversion.
//   src_rgb24        - first source row (3 bytes per pixel).
//   src_stride_rgb24 - byte offset from the first row to the second row.
//   dst_u, dst_v     - destinations; 8 bytes are stored to each per iteration.
//   width            - pixel count; decremented by 16 per iteration.
// The loop body runs before the count is tested, so at least 16 pixels of
// each row are always read. RGBTOUV_SETUP_REG and RGBTOUV are macros defined
// outside this function.
void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
                       int src_stride_rgb24,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24;  // second row.
  asm volatile (
    RGBTOUV_SETUP_REG  // presumably loads the constants held in v20-v25.
      "1:                                        \n"
      "ld3         {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 pixels.
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch row 1 ahead.
      "uaddlp      v0.8h, v0.16b                 \n"  // B 16 bytes -> 8 shorts.
      "uaddlp      v1.8h, v1.16b                 \n"  // G 16 bytes -> 8 shorts.
      "uaddlp      v2.8h, v2.16b                 \n"  // R 16 bytes -> 8 shorts.
      "ld3         {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 16 more.
      "prfm        pldl1keep, [%1, 448]          \n"  // prefetch row 2 ahead.
      "uadalp      v0.8h, v4.16b                 \n"  // B += row 2 pair sums.
      "uadalp      v1.8h, v5.16b                 \n"  // G += row 2 pair sums.
      "uadalp      v2.8h, v6.16b                 \n"  // R += row 2 pair sums.

      // Each lane now holds the sum of a 2x2 block; a rounding shift by 1
      // leaves twice the average.
      "urshr       v0.8h, v0.8h, #1              \n"  // 2x average
      "urshr       v1.8h, v1.8h, #1              \n"
      "urshr       v2.8h, v2.8h, #1              \n"

      "subs        %w4, %w4, #16                 \n"  // 16 pixels of width
                                                      // (x2 rows) per loop.
    RGBTOUV(v0.8h, v1.8h, v2.8h)  // arguments are B, G, R; results are
                                  // stored from v0 (U) and v1 (V) below.
      "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
      "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
      "b.gt        1b                            \n"  // flags from subs above.
  : "+r"(src_rgb24),  // %0
    "+r"(src_rgb24_1),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
    "+r"(width)        // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
}
1820 
// Produces one row of U and one row of V from two rows of RAW pixels
// (3 bytes per pixel, loaded here as R, G, B), averaging each 2x2 group of
// source pixels before the colour conversion.
//   src_raw        - first source row.
//   src_stride_raw - byte offset from the first row to the second row.
//   dst_u, dst_v   - destinations; 8 bytes are stored to each per iteration.
//   width          - pixel count; decremented by 16 per iteration.
// The loop body runs before the count is tested, so at least 16 pixels of
// each row are always read. RGBTOUV_SETUP_REG and RGBTOUV are macros defined
// outside this function.
void RAWToUVRow_NEON(const uint8_t* src_raw,
                     int src_stride_raw,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  const uint8_t* src_raw_1 = src_raw + src_stride_raw;  // second row.
  asm volatile (
    RGBTOUV_SETUP_REG  // presumably loads the constants held in v20-v25.
      "1:                                        \n"
      "ld3         {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 RAW pixels.
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch row 1 ahead.
      "uaddlp      v2.8h, v2.16b                 \n"  // B 16 bytes -> 8 shorts.
      "uaddlp      v1.8h, v1.16b                 \n"  // G 16 bytes -> 8 shorts.
      "uaddlp      v0.8h, v0.16b                 \n"  // R 16 bytes -> 8 shorts.
      "ld3         {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 16 more RAW pixels
      "prfm        pldl1keep, [%1, 448]          \n"  // prefetch row 2 ahead.
      "uadalp      v2.8h, v6.16b                 \n"  // B += row 2 pair sums.
      "uadalp      v1.8h, v5.16b                 \n"  // G += row 2 pair sums.
      "uadalp      v0.8h, v4.16b                 \n"  // R += row 2 pair sums.

      // Each lane now holds the sum of a 2x2 block; a rounding shift by 1
      // leaves twice the average.
      "urshr       v2.8h, v2.8h, #1              \n"  // 2x average
      "urshr       v1.8h, v1.8h, #1              \n"
      "urshr       v0.8h, v0.8h, #1              \n"

      "subs        %w4, %w4, #16                 \n"  // 16 pixels of width
                                                      // (x2 rows) per loop.
    RGBTOUV(v2.8h, v1.8h, v0.8h)  // arguments are B, G, R (v2, v1, v0 here);
                                  // results are stored from v0 (U), v1 (V).
      "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
      "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
      "b.gt        1b                            \n"  // flags from subs above.
  : "+r"(src_raw),  // %0
    "+r"(src_raw_1),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
    "+r"(width)        // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
}
1860 
// 16x2 pixels -> 8x1.  width is number of rgb pixels. e.g. 16.
// Produces 8 U and 8 V bytes per iteration from two rows of RGB565
// (2 bytes per pixel). The second row starts src_stride_rgb565 bytes after
// the first. Each row is handled as two 8-pixel halves: the first half
// accumulates into the low 4 lanes of v16-v18, the second half into v26-v28,
// which are then merged into the high lanes of v16-v18.
// The loop body runs before width is tested, so at least 16 pixels of each
// row are always read. RGBTOUV_SETUP_REG, RGB565TOARGB and RGBTOUV are macros
// defined outside this function; RGB565TOARGB is expected to leave B, G, R
// bytes in v0, v1, v2.
void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
                        int src_stride_rgb565,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int width) {
  const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565;  // second row.
  asm volatile(
      RGBTOUV_SETUP_REG  // presumably loads the constants held in v20-v25.
      "1:                                        \n"
      // Row 1, pixels 0-7.
      "ld1         {v0.16b}, [%0], #16           \n"  // load 8 RGB565 pixels.
      "prfm        pldl1keep, [%0, 448]          \n"
      RGB565TOARGB
      "uaddlp      v16.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
      "uaddlp      v17.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
      "uaddlp      v18.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.
      // Row 1, pixels 8-15.
      "ld1         {v0.16b}, [%0], #16           \n"  // next 8 RGB565 pixels.
      RGB565TOARGB
      "uaddlp      v26.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
      "uaddlp      v27.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
      "uaddlp      v28.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.

      // Row 2, pixels 0-7: accumulate onto the row 1 pair sums.
      "ld1         {v0.16b}, [%1], #16           \n"  // load 8 RGB565 pixels.
      "prfm        pldl1keep, [%1, 448]          \n"
      RGB565TOARGB
      "uadalp      v16.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
      "uadalp      v17.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
      "uadalp      v18.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.
      // Row 2, pixels 8-15.
      "ld1         {v0.16b}, [%1], #16           \n"  // next 8 RGB565 pixels.
      RGB565TOARGB
      "uadalp      v26.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
      "uadalp      v27.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
      "uadalp      v28.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.

      // Merge the second-half sums into the upper 64 bits of v16-v18.
      "ins         v16.D[1], v26.D[0]            \n"
      "ins         v17.D[1], v27.D[0]            \n"
      "ins         v18.D[1], v28.D[0]            \n"

      // Lanes hold 2x2 block sums; rounding shift by 1 leaves 2x the average.
      "urshr       v0.8h, v16.8h, #1             \n"  // 2x average
      "urshr       v1.8h, v17.8h, #1             \n"
      "urshr       v2.8h, v18.8h, #1             \n"

      "subs        %w4, %w4, #16                 \n"  // 16 processed per loop.
      RGBTOUV(v0.8h, v1.8h, v2.8h)  // B, G, R in; U, V stored from v0, v1.
      "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
      "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
      "b.gt        1b                            \n"  // flags from subs above.
      : "+r"(src_rgb565),    // %0
        "+r"(src_rgb565_1),  // %1
        "+r"(dst_u),           // %2
        "+r"(dst_v),           // %3
        "+r"(width)            // %4
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
        "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
        "v28");
}
1918 
// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
// Produces 8 U and 8 V bytes per iteration from two rows of ARGB1555
// (2 bytes per pixel). The second row starts src_stride_argb1555 bytes after
// the first. Each row is handled as two 8-pixel halves: the first half
// accumulates into the low 4 lanes of v16-v18, the second half into v26-v28,
// which are then merged into the high lanes of v16-v18.
// The loop body runs before width is tested, so at least 16 pixels of each
// row are always read. RGBTOUV_SETUP_REG, RGB555TOARGB and RGBTOUV are macros
// defined outside this function; RGB555TOARGB is expected to leave B, G, R
// bytes in v0, v1, v2 (no alpha is used here).
void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
                          int src_stride_argb1555,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width) {
  const uint8_t* src_argb1555_1 = src_argb1555 + src_stride_argb1555;  // row 2.
  asm volatile(
      RGBTOUV_SETUP_REG  // presumably loads the constants held in v20-v25.
      "1:                                        \n"
      // Row 1, pixels 0-7.
      "ld1         {v0.16b}, [%0], #16           \n"  // load 8 ARGB1555 pixels.
      "prfm        pldl1keep, [%0, 448]          \n"
      RGB555TOARGB
      "uaddlp      v16.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
      "uaddlp      v17.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
      "uaddlp      v18.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.
      // Row 1, pixels 8-15.
      "ld1         {v0.16b}, [%0], #16           \n"  // next 8 ARGB1555 pixels.
      RGB555TOARGB
      "uaddlp      v26.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
      "uaddlp      v27.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
      "uaddlp      v28.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.

      // Row 2, pixels 0-7: accumulate onto the row 1 pair sums.
      "ld1         {v0.16b}, [%1], #16           \n"  // load 8 ARGB1555 pixels.
      "prfm        pldl1keep, [%1, 448]          \n"
      RGB555TOARGB
      "uadalp      v16.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
      "uadalp      v17.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
      "uadalp      v18.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.
      // Row 2, pixels 8-15.
      "ld1         {v0.16b}, [%1], #16           \n"  // next 8 ARGB1555 pixels.
      RGB555TOARGB
      "uadalp      v26.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
      "uadalp      v27.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
      "uadalp      v28.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.

      // Merge the second-half sums into the upper 64 bits of v16-v18.
      "ins         v16.D[1], v26.D[0]            \n"
      "ins         v17.D[1], v27.D[0]            \n"
      "ins         v18.D[1], v28.D[0]            \n"

      // Lanes hold 2x2 block sums; rounding shift by 1 leaves 2x the average.
      "urshr       v0.8h, v16.8h, #1             \n"  // 2x average
      "urshr       v1.8h, v17.8h, #1             \n"
      "urshr       v2.8h, v18.8h, #1             \n"

      "subs        %w4, %w4, #16                 \n"  // 16 processed per loop.
      RGBTOUV(v0.8h, v1.8h, v2.8h)  // B, G, R in; U, V stored from v0, v1.
      "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
      "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
      "b.gt        1b                            \n"  // flags from subs above.
      : "+r"(src_argb1555),    // %0
        "+r"(src_argb1555_1),  // %1
        "+r"(dst_u),           // %2
        "+r"(dst_v),           // %3
        "+r"(width)            // %4
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
        "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
        "v28");
}
1976 
// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
// Produces 8 U and 8 V bytes per iteration from two rows of ARGB4444
// (2 bytes per pixel). The second row starts src_stride_argb4444 bytes after
// the first. Each row is handled as two 8-pixel halves: the first half
// accumulates into the low 4 lanes of v16-v18, the second half into v26-v28,
// which are then merged into the high lanes of v16-v18.
// The loop body runs before width is tested, so at least 16 pixels of each
// row are always read. RGBTOUV_SETUP_REG, ARGB4444TOARGB and RGBTOUV are
// macros defined outside this function; ARGB4444TOARGB is expected to leave
// B, G, R bytes in v0, v1, v2.
void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
                          int src_stride_argb4444,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width) {
  const uint8_t* src_argb4444_1 = src_argb4444 + src_stride_argb4444;  // row 2.
  asm volatile(
      RGBTOUV_SETUP_REG  // sets v20-v25
      "1:                                        \n"
      // Row 1, pixels 0-7.
      "ld1         {v0.16b}, [%0], #16           \n"  // load 8 ARGB4444 pixels.
      "prfm        pldl1keep, [%0, 448]          \n"
      ARGB4444TOARGB
      "uaddlp      v16.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
      "uaddlp      v17.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
      "uaddlp      v18.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.
      // Row 1, pixels 8-15.
      "ld1         {v0.16b}, [%0], #16           \n"  // next 8 ARGB4444 pixels.
      ARGB4444TOARGB
      "uaddlp      v26.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
      "uaddlp      v27.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
      "uaddlp      v28.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.

      // Row 2, pixels 0-7: accumulate onto the row 1 pair sums.
      "ld1         {v0.16b}, [%1], #16           \n"  // load 8 ARGB4444 pixels.
      "prfm        pldl1keep, [%1, 448]          \n"
      ARGB4444TOARGB
      "uadalp      v16.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
      "uadalp      v17.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
      "uadalp      v18.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.
      // Row 2, pixels 8-15.
      "ld1         {v0.16b}, [%1], #16           \n"  // next 8 ARGB4444 pixels.
      ARGB4444TOARGB
      "uadalp      v26.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
      "uadalp      v27.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
      "uadalp      v28.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.

      // Merge the second-half sums into the upper 64 bits of v16-v18.
      "ins         v16.D[1], v26.D[0]            \n"
      "ins         v17.D[1], v27.D[0]            \n"
      "ins         v18.D[1], v28.D[0]            \n"

      // Lanes hold 2x2 block sums; rounding shift by 1 leaves 2x the average.
      "urshr       v0.8h, v16.8h, #1             \n"  // 2x average
      "urshr       v1.8h, v17.8h, #1             \n"
      "urshr       v2.8h, v18.8h, #1             \n"

      "subs        %w4, %w4, #16                 \n"  // 16 processed per loop.
      RGBTOUV(v0.8h, v1.8h, v2.8h)  // B, G, R in; U, V stored from v0, v1.
      "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
      "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
      "b.gt        1b                            \n"  // flags from subs above.
      : "+r"(src_argb4444),    // %0
        "+r"(src_argb4444_1),  // %1
        "+r"(dst_u),           // %2
        "+r"(dst_v),           // %3
        "+r"(width)            // %4
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
        "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
        "v28"

  );
}
2036 
// Converts a row of RGB565 pixels to 8-bit Y.
// Per pixel: Y = ((25 * B + 129 * G + 66 * R + 128) >> 8) + 16, with the
// final add saturating at 255. Handles 8 pixels (16 source bytes) per
// iteration; the loop body runs before width is tested, so at least 8 pixels
// are always processed. RGB565TOARGB is a macro defined outside this
// function and is expected to leave 8-bit B, G, R in v0, v1, v2.
void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
  asm volatile(
      "movi        v24.8b, #25                   \n"  // B coefficient (25/256)
      "movi        v25.8b, #129                  \n"  // G coefficient (129/256)
      "movi        v26.8b, #66                   \n"  // R coefficient (66/256)
      "movi        v27.8b, #16                   \n"  // Add 16 constant
      "1:                                        \n"
      "ld1         {v0.16b}, [%0], #16           \n"  // load 8 RGB565 pixels.
      "prfm        pldl1keep, [%0, 448]          \n"
      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
      RGB565TOARGB
      "umull       v3.8h, v0.8b, v24.8b          \n"  // B
      "umlal       v3.8h, v1.8b, v25.8b          \n"  // G
      "umlal       v3.8h, v2.8b, v26.8b          \n"  // R
      "uqrshrn     v0.8b, v3.8h, #8              \n"  // 16 bit to 8 bit Y
      "uqadd       v0.8b, v0.8b, v27.8b          \n"  // + 16, saturating
      "st1         {v0.8b}, [%1], #8             \n"  // store 8 pixels Y.
      "b.gt        1b                            \n"  // flags from subs above.
      : "+r"(src_rgb565),  // %0
        "+r"(dst_y),       // %1
        "+r"(width)        // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", "v24", "v25", "v26",
        "v27");
}
2062 
// Converts a row of ARGB1555 pixels to 8-bit Y.
// Per pixel: Y = ((25 * B + 129 * G + 66 * R + 128) >> 8) + 16, with the
// final add saturating at 255. Handles 8 pixels (16 source bytes) per
// iteration; the loop body runs before width is tested, so at least 8 pixels
// are always processed. ARGB1555TOARGB is a macro defined outside this
// function and is expected to leave 8-bit B, G, R in v0, v1, v2.
// NOTE(review): the coefficients live in v4-v7 across the macro, so this
// relies on ARGB1555TOARGB not writing those registers - confirm.
void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
                         uint8_t* dst_y,
                         int width) {
  asm volatile(
      "movi        v4.8b, #25                    \n"  // B coefficient (25/256)
      "movi        v5.8b, #129                   \n"  // G coefficient (129/256)
      "movi        v6.8b, #66                    \n"  // R coefficient (66/256)
      "movi        v7.8b, #16                    \n"  // Add 16 constant
      "1:                                        \n"
      "ld1         {v0.16b}, [%0], #16           \n"  // load 8 ARGB1555 pixels.
      "prfm        pldl1keep, [%0, 448]          \n"
      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
      ARGB1555TOARGB
      "umull       v3.8h, v0.8b, v4.8b           \n"  // B
      "umlal       v3.8h, v1.8b, v5.8b           \n"  // G
      "umlal       v3.8h, v2.8b, v6.8b           \n"  // R
      "uqrshrn     v0.8b, v3.8h, #8              \n"  // 16 bit to 8 bit Y
      "uqadd       v0.8b, v0.8b, v7.8b           \n"  // + 16, saturating
      "st1         {v0.8b}, [%1], #8             \n"  // store 8 pixels Y.
      "b.gt        1b                            \n"  // flags from subs above.
      : "+r"(src_argb1555),  // %0
        "+r"(dst_y),         // %1
        "+r"(width)          // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
2089 
// Converts a row of ARGB4444 pixels to 8-bit Y.
// Per pixel: Y = ((25 * B + 129 * G + 66 * R + 128) >> 8) + 16, with the
// final add saturating at 255. Handles 8 pixels (16 source bytes) per
// iteration; the loop body runs before width is tested, so at least 8 pixels
// are always processed. ARGB4444TOARGB is a macro defined outside this
// function and is expected to leave 8-bit B, G, R in v0, v1, v2.
void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
                         uint8_t* dst_y,
                         int width) {
  asm volatile(
      "movi        v24.8b, #25                   \n"  // B coefficient (25/256)
      "movi        v25.8b, #129                  \n"  // G coefficient (129/256)
      "movi        v26.8b, #66                   \n"  // R coefficient (66/256)
      "movi        v27.8b, #16                   \n"  // Add 16 constant
      "1:                                        \n"
      "ld1         {v0.16b}, [%0], #16           \n"  // load 8 ARGB4444 pixels.
      "prfm        pldl1keep, [%0, 448]          \n"
      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
      ARGB4444TOARGB
      "umull       v3.8h, v0.8b, v24.8b          \n"  // B
      "umlal       v3.8h, v1.8b, v25.8b          \n"  // G
      "umlal       v3.8h, v2.8b, v26.8b          \n"  // R
      "uqrshrn     v0.8b, v3.8h, #8              \n"  // 16 bit to 8 bit Y
      "uqadd       v0.8b, v0.8b, v27.8b          \n"  // + 16, saturating
      "st1         {v0.8b}, [%1], #8             \n"  // store 8 pixels Y.
      "b.gt        1b                            \n"  // flags from subs above.
      : "+r"(src_argb4444),  // %0
        "+r"(dst_y),         // %1
        "+r"(width)          // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27");
}
2116 
// Converts a row of 4-byte BGRA pixels to 8-bit Y.
// ld4 de-interleaves the four bytes of each pixel into v0-v3; bytes 1, 2, 3
// are used as R, G, B and byte 0 is ignored.
// Per pixel: Y = ((66 * R + 129 * G + 25 * B + 128) >> 8) + 16, with the
// final add saturating at 255. Handles 8 pixels (32 source bytes) per
// iteration; the loop body runs before width is tested, so at least 8 pixels
// are always processed.
void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
  asm volatile(
      "movi        v4.8b, #66                    \n"  // R coefficient (66/256)
      "movi        v5.8b, #129                   \n"  // G coefficient (129/256)
      "movi        v6.8b, #25                    \n"  // B coefficient (25/256)
      "movi        v7.8b, #16                    \n"  // Add 16 constant
      "1:                                        \n"
      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
      "prfm        pldl1keep, [%0, 448]          \n"
      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
      "umull       v16.8h, v1.8b, v4.8b          \n"  // R
      "umlal       v16.8h, v2.8b, v5.8b          \n"  // G
      "umlal       v16.8h, v3.8b, v6.8b          \n"  // B
      "uqrshrn     v0.8b, v16.8h, #8             \n"  // 16 bit to 8 bit Y
      "uqadd       v0.8b, v0.8b, v7.8b           \n"  // + 16, saturating
      "st1         {v0.8b}, [%1], #8             \n"  // store 8 pixels Y.
      "b.gt        1b                            \n"  // flags from subs above.
      : "+r"(src_bgra),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
}
2140 
// Converts a row of 4-byte ABGR pixels to 8-bit Y.
// ld4 de-interleaves the four bytes of each pixel into v0-v3; bytes 0, 1, 2
// are used as R, G, B and byte 3 is ignored.
// Per pixel: Y = ((66 * R + 129 * G + 25 * B + 128) >> 8) + 16, with the
// final add saturating at 255. Handles 8 pixels (32 source bytes) per
// iteration; the loop body runs before width is tested, so at least 8 pixels
// are always processed.
void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
  asm volatile(
      "movi        v6.8b, #25                    \n"  // B coefficient (25/256)
      "movi        v5.8b, #129                   \n"  // G coefficient (129/256)
      "movi        v4.8b, #66                    \n"  // R coefficient (66/256)
      "movi        v7.8b, #16                    \n"  // Add 16 constant
      "1:                                        \n"
      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
      "prfm        pldl1keep, [%0, 448]          \n"
      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
      "umull       v16.8h, v0.8b, v4.8b          \n"  // R
      "umlal       v16.8h, v1.8b, v5.8b          \n"  // G
      "umlal       v16.8h, v2.8b, v6.8b          \n"  // B
      "uqrshrn     v0.8b, v16.8h, #8             \n"  // 16 bit to 8 bit Y
      "uqadd       v0.8b, v0.8b, v7.8b           \n"  // + 16, saturating
      "st1         {v0.8b}, [%1], #8             \n"  // store 8 pixels Y.
      "b.gt        1b                            \n"  // flags from subs above.
      : "+r"(src_abgr),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
}
2164 
// Converts a row of 4-byte RGBA pixels to 8-bit Y.
// ld4 de-interleaves the four bytes of each pixel into v0-v3; bytes 1, 2, 3
// are used as B, G, R and byte 0 is ignored.
// Per pixel: Y = ((25 * B + 129 * G + 66 * R + 128) >> 8) + 16, with the
// final add saturating at 255. Handles 8 pixels (32 source bytes) per
// iteration; the loop body runs before width is tested, so at least 8 pixels
// are always processed.
void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  asm volatile(
      "movi        v4.8b, #25                    \n"  // B coefficient (25/256)
      "movi        v5.8b, #129                   \n"  // G coefficient (129/256)
      "movi        v6.8b, #66                    \n"  // R coefficient (66/256)
      "movi        v7.8b, #16                    \n"  // Add 16 constant
      "1:                                        \n"
      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
      "prfm        pldl1keep, [%0, 448]          \n"
      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
      "umull       v16.8h, v1.8b, v4.8b          \n"  // B
      "umlal       v16.8h, v2.8b, v5.8b          \n"  // G
      "umlal       v16.8h, v3.8b, v6.8b          \n"  // R
      "uqrshrn     v0.8b, v16.8h, #8             \n"  // 16 bit to 8 bit Y
      "uqadd       v0.8b, v0.8b, v7.8b           \n"  // + 16, saturating
      "st1         {v0.8b}, [%1], #8             \n"  // store 8 pixels Y.
      "b.gt        1b                            \n"  // flags from subs above.
      : "+r"(src_rgba),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
}
2188 
// Converts a row of 3-byte RGB24 pixels to 8-bit Y.
// ld3 de-interleaves the three bytes of each pixel into v0-v2, used here as
// B, G, R in that order.
// Per pixel: Y = ((25 * B + 129 * G + 66 * R + 128) >> 8) + 16, with the
// final add saturating at 255. Handles 8 pixels (24 source bytes) per
// iteration; the loop body runs before width is tested, so at least 8 pixels
// are always processed.
void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
  asm volatile(
      "movi        v4.8b, #25                    \n"  // B coefficient (25/256)
      "movi        v5.8b, #129                   \n"  // G coefficient (129/256)
      "movi        v6.8b, #66                    \n"  // R coefficient (66/256)
      "movi        v7.8b, #16                    \n"  // Add 16 constant
      "1:                                        \n"
      "ld3         {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
      "prfm        pldl1keep, [%0, 448]          \n"
      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
      "umull       v16.8h, v0.8b, v4.8b          \n"  // B
      "umlal       v16.8h, v1.8b, v5.8b          \n"  // G
      "umlal       v16.8h, v2.8b, v6.8b          \n"  // R
      "uqrshrn     v0.8b, v16.8h, #8             \n"  // 16 bit to 8 bit Y
      "uqadd       v0.8b, v0.8b, v7.8b           \n"  // + 16, saturating
      "st1         {v0.8b}, [%1], #8             \n"  // store 8 pixels Y.
      "b.gt        1b                            \n"  // flags from subs above.
      : "+r"(src_rgb24),  // %0
        "+r"(dst_y),      // %1
        "+r"(width)       // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
}
2212 
// Converts a row of 3-byte RAW pixels to 8-bit Y.
// ld3 de-interleaves the three bytes of each pixel into v0-v2; byte 0 is
// multiplied by the R coefficient and byte 2 by the B coefficient, i.e. the
// bytes are treated as R, G, B in that order.
// Per pixel: Y = ((66 * R + 129 * G + 25 * B + 128) >> 8) + 16, with the
// final add saturating at 255. Handles 8 pixels (24 source bytes) per
// iteration; the loop body runs before width is tested, so at least 8 pixels
// are always processed.
void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
  asm volatile(
      "movi        v6.8b, #25                    \n"  // B coefficient (25/256)
      "movi        v5.8b, #129                   \n"  // G coefficient (129/256)
      "movi        v4.8b, #66                    \n"  // R coefficient (66/256)
      "movi        v7.8b, #16                    \n"  // Add 16 constant
      "1:                                        \n"
      "ld3         {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
      "prfm        pldl1keep, [%0, 448]          \n"
      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
      "umull       v16.8h, v0.8b, v4.8b          \n"  // R (byte 0 * 66)
      "umlal       v16.8h, v1.8b, v5.8b          \n"  // G
      "umlal       v16.8h, v2.8b, v6.8b          \n"  // B (byte 2 * 25)
      "uqrshrn     v0.8b, v16.8h, #8             \n"  // 16 bit to 8 bit Y
      "uqadd       v0.8b, v0.8b, v7.8b           \n"  // + 16, saturating
      "st1         {v0.8b}, [%1], #8             \n"  // store 8 pixels Y.
      "b.gt        1b                            \n"  // flags from subs above.
      : "+r"(src_raw),  // %0
        "+r"(dst_y),    // %1
        "+r"(width)     // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
}
2236 
// Converts a row of 3-byte RGB24 pixels to 8-bit YJ (no +16 offset is added,
// unlike the ToYRow variants in this file).
// ld3 de-interleaves the three bytes of each pixel into v0-v2, used here as
// B, G, R in that order.
// Per pixel: YJ = (29 * B + 150 * G + 77 * R + 128) >> 8.
// Handles 8 pixels (24 source bytes) per iteration; the loop body runs
// before width is tested, so at least 8 pixels are always processed.
void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
  asm volatile(
      "movi        v4.8b, #29                    \n"  // B * 0.1140 coefficient
      "movi        v5.8b, #150                   \n"  // G * 0.5870 coefficient
      "movi        v6.8b, #77                    \n"  // R * 0.2990 coefficient
      "1:                                        \n"
      "ld3         {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
      "prfm        pldl1keep, [%0, 448]          \n"
      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
      "umull       v0.8h, v0.8b, v4.8b           \n"  // B
      "umlal       v0.8h, v1.8b, v5.8b           \n"  // G
      "umlal       v0.8h, v2.8b, v6.8b           \n"  // R
      "uqrshrn     v0.8b, v0.8h, #8              \n"  // 16 bit to 8 bit Y
      "st1         {v0.8b}, [%1], #8             \n"  // store 8 pixels Y.
      "b.gt        1b                            \n"  // flags from subs above.
      : "+r"(src_rgb24),  // %0
        "+r"(dst_yj),     // %1
        "+r"(width)       // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
}
2258 
// Converts a row of 3-byte RAW pixels to 8-bit YJ (no +16 offset is added,
// unlike the ToYRow variants in this file).
// ld3 de-interleaves the three bytes of each pixel into v0-v2; byte 0 is
// multiplied by the R coefficient and byte 2 by the B coefficient, i.e. the
// bytes are treated as R, G, B in that order.
// Per pixel: YJ = (77 * R + 150 * G + 29 * B + 128) >> 8.
// Handles 8 pixels (24 source bytes) per iteration; the loop body runs
// before width is tested, so at least 8 pixels are always processed.
void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
  asm volatile(
      "movi        v6.8b, #29                    \n"  // B * 0.1140 coefficient
      "movi        v5.8b, #150                   \n"  // G * 0.5870 coefficient
      "movi        v4.8b, #77                    \n"  // R * 0.2990 coefficient
      "1:                                        \n"
      "ld3         {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
      "prfm        pldl1keep, [%0, 448]          \n"
      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
      "umull       v0.8h, v0.8b, v4.8b           \n"  // R (byte 0 * 77)
      "umlal       v0.8h, v1.8b, v5.8b           \n"  // G
      "umlal       v0.8h, v2.8b, v6.8b           \n"  // B (byte 2 * 29)
      "uqrshrn     v0.8b, v0.8h, #8              \n"  // 16 bit to 8 bit Y
      "st1         {v0.8b}, [%1], #8             \n"  // store 8 pixels Y.
      "b.gt        1b                            \n"  // flags from subs above.
      : "+r"(src_raw),  // %0
        "+r"(dst_yj),   // %1
        "+r"(width)     // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
}
2280 
2281 // Bilinear filter 16x2 -> 16x1
InterpolateRow_NEON(uint8_t * dst_ptr,const uint8_t * src_ptr,ptrdiff_t src_stride,int dst_width,int source_y_fraction)2282 void InterpolateRow_NEON(uint8_t* dst_ptr,
2283                          const uint8_t* src_ptr,
2284                          ptrdiff_t src_stride,
2285                          int dst_width,
2286                          int source_y_fraction) {
2287   int y1_fraction = source_y_fraction;
2288   int y0_fraction = 256 - y1_fraction;
2289   const uint8_t* src_ptr1 = src_ptr + src_stride;
2290   asm volatile(
2291       "cmp         %w4, #0                       \n"
2292       "b.eq        100f                          \n"
2293       "cmp         %w4, #128                     \n"
2294       "b.eq        50f                           \n"
2295 
2296       "dup         v5.16b, %w4                   \n"
2297       "dup         v4.16b, %w5                   \n"
2298       // General purpose row blend.
2299       "1:                                        \n"
2300       "ld1         {v0.16b}, [%1], #16           \n"
2301       "ld1         {v1.16b}, [%2], #16           \n"
2302       "prfm        pldl1keep, [%1, 448]          \n"
2303       "prfm        pldl1keep, [%2, 448]          \n"
2304       "subs        %w3, %w3, #16                 \n"
2305       "umull       v2.8h, v0.8b,  v4.8b          \n"
2306       "umull2      v3.8h, v0.16b, v4.16b         \n"
2307       "umlal       v2.8h, v1.8b,  v5.8b          \n"
2308       "umlal2      v3.8h, v1.16b, v5.16b         \n"
2309       "rshrn       v0.8b,  v2.8h, #8             \n"
2310       "rshrn2      v0.16b, v3.8h, #8             \n"
2311       "st1         {v0.16b}, [%0], #16           \n"
2312       "b.gt        1b                            \n"
2313       "b           99f                           \n"
2314 
2315       // Blend 50 / 50.
2316       "50:                                       \n"
2317       "ld1         {v0.16b}, [%1], #16           \n"
2318       "ld1         {v1.16b}, [%2], #16           \n"
2319       "prfm        pldl1keep, [%1, 448]          \n"
2320       "prfm        pldl1keep, [%2, 448]          \n"
2321       "subs        %w3, %w3, #16                 \n"
2322       "urhadd      v0.16b, v0.16b, v1.16b        \n"
2323       "st1         {v0.16b}, [%0], #16           \n"
2324       "b.gt        50b                           \n"
2325       "b           99f                           \n"
2326 
2327       // Blend 100 / 0 - Copy row unchanged.
2328       "100:                                      \n"
2329       "ld1         {v0.16b}, [%1], #16           \n"
2330       "prfm        pldl1keep, [%1, 448]          \n"
2331       "subs        %w3, %w3, #16                 \n"
2332       "st1         {v0.16b}, [%0], #16           \n"
2333       "b.gt        100b                          \n"
2334 
2335       "99:                                       \n"
2336       : "+r"(dst_ptr),      // %0
2337         "+r"(src_ptr),      // %1
2338         "+r"(src_ptr1),     // %2
2339         "+r"(dst_width),    // %3
2340         "+r"(y1_fraction),  // %4
2341         "+r"(y0_fraction)   // %5
2342       :
2343       : "cc", "memory", "v0", "v1", "v3", "v4", "v5");
2344 }
2345 
// Alpha-blend a row of src_argb0 over src_argb1 into dst_argb.
// For each colour channel (s = src_argb0, d = src_argb1, sa = alpha byte of
// the src_argb0 pixel) the asm computes, with unsigned saturation:
//   dst = s + (d - ((d * sa + 128) >> 8))
// which approximates
//   dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
// The destination alpha is always written as 255.
// Pixels are processed 8 at a time while at least 8 remain; a 1 pixel loop
// then handles the remaining 0..7 pixels, so any width >= 0 is handled.
void ARGBBlendRow_NEON(const uint8_t* src_argb0,
                       const uint8_t* src_argb1,
                       uint8_t* dst_argb,
                       int width) {
  asm volatile(
      // Pre-decrement by 8 so the flags tell whether a full group of 8 exists.
      "subs        %w3, %w3, #8                  \n"
      "b.lt        89f                           \n"
      // Blend 8 pixels.
      "8:                                        \n"
      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB0
      "ld4         {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 ARGB1
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 448 bytes ahead
      "prfm        pldl1keep, [%1, 448]          \n"
      "subs        %w3, %w3, #8                  \n"  // 8 processed per loop.
      "umull       v16.8h, v4.8b, v3.8b          \n"  // db * a
      "umull       v17.8h, v5.8b, v3.8b          \n"  // dg * a
      "umull       v18.8h, v6.8b, v3.8b          \n"  // dr * a
      "uqrshrn     v16.8b, v16.8h, #8            \n"  // db >>= 8 (rounded)
      "uqrshrn     v17.8b, v17.8h, #8            \n"  // dg >>= 8 (rounded)
      "uqrshrn     v18.8b, v18.8h, #8            \n"  // dr >>= 8 (rounded)
      "uqsub       v4.8b, v4.8b, v16.8b          \n"  // db - (db * a / 256)
      "uqsub       v5.8b, v5.8b, v17.8b          \n"  // dg - (dg * a / 256)
      "uqsub       v6.8b, v6.8b, v18.8b          \n"  // dr - (dr * a / 256)
      "uqadd       v0.8b, v0.8b, v4.8b           \n"  // + sb
      "uqadd       v1.8b, v1.8b, v5.8b           \n"  // + sg
      "uqadd       v2.8b, v2.8b, v6.8b           \n"  // + sr
      "movi        v3.8b, #255                   \n"  // a = 255
      "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
                                                             // pixels
      "b.ge        8b                            \n"

      // Here width holds (remaining - 8); adding 7 leaves (remaining - 1), so
      // a negative result means no pixels are left.
      "89:                                       \n"
      "adds        %w3, %w3, #8-1                \n"
      "b.lt        99f                           \n"

      // Blend 1 pixel. Only lane 0 is loaded and stored; the arithmetic still
      // runs on all 8 lanes, the other 7 results are simply discarded.
      "1:                                        \n"
      "ld4         {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n"  // load 1 pixel
                                                           // ARGB0.
      "ld4         {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n"  // load 1 pixel
                                                           // ARGB1.
      "prfm        pldl1keep, [%0, 448]          \n"
      "prfm        pldl1keep, [%1, 448]          \n"
      "subs        %w3, %w3, #1                  \n"  // 1 processed per loop.
      "umull       v16.8h, v4.8b, v3.8b          \n"  // db * a
      "umull       v17.8h, v5.8b, v3.8b          \n"  // dg * a
      "umull       v18.8h, v6.8b, v3.8b          \n"  // dr * a
      "uqrshrn     v16.8b, v16.8h, #8            \n"  // db >>= 8 (rounded)
      "uqrshrn     v17.8b, v17.8h, #8            \n"  // dg >>= 8 (rounded)
      "uqrshrn     v18.8b, v18.8h, #8            \n"  // dr >>= 8 (rounded)
      "uqsub       v4.8b, v4.8b, v16.8b          \n"  // db - (db * a / 256)
      "uqsub       v5.8b, v5.8b, v17.8b          \n"  // dg - (dg * a / 256)
      "uqsub       v6.8b, v6.8b, v18.8b          \n"  // dr - (dr * a / 256)
      "uqadd       v0.8b, v0.8b, v4.8b           \n"  // + sb
      "uqadd       v1.8b, v1.8b, v5.8b           \n"  // + sg
      "uqadd       v2.8b, v2.8b, v6.8b           \n"  // + sr
      "movi        v3.8b, #255                   \n"  // a = 255
      "st4         {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n"  // store 1 pixel.
      "b.ge        1b                            \n"

      "99:                                       \n"

      : "+r"(src_argb0),  // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
        "v17", "v18");
}
2417 
// Attenuate 8 pixels at a time.
// Each of B, G and R is replaced by (channel * alpha + 128) >> 8 (rounded,
// saturating narrow); the alpha byte is stored back unchanged.
// The loop body runs before width is tested and consumes 8 pixels per pass
// with no tail loop, so a width that is not a positive multiple of 8 still
// reads and writes whole groups of 8.
// NOTE(review): presumably callers round width up or handle the remainder
// elsewhere - confirm.
void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
                           uint8_t* dst_argb,
                           int width) {
  asm volatile(
      // Attenuate 8 pixels.
      "1:                                        \n"
      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 448 bytes ahead
      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
      "umull       v4.8h, v0.8b, v3.8b           \n"  // b * a
      "umull       v5.8h, v1.8b, v3.8b           \n"  // g * a
      "umull       v6.8h, v2.8b, v3.8b           \n"  // r * a
      "uqrshrn     v0.8b, v4.8h, #8              \n"  // b >>= 8 (rounded)
      "uqrshrn     v1.8b, v5.8h, #8              \n"  // g >>= 8 (rounded)
      "uqrshrn     v2.8b, v6.8h, #8              \n"  // r >>= 8 (rounded)
      "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB
      "b.gt        1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
}
2442 
// Quantize 8 ARGB pixels (32 bytes) per loop, in place.
// dst = (dst * scale >> 16) * interval_size + interval_offset;
// Applied to B, G and R; the alpha byte is stored back unchanged.
// The three int parameters are broadcast into 16 bit lanes, so only their low
// 16 bits take part in the arithmetic. scale is halved up front because
// sqdmulh doubles the product before taking the high half.
// The load at the top of the loop does not advance dst_argb; the post-indexed
// store at the bottom does, which is what makes the update in place.
// No tail loop: pixels are always processed in groups of 8.
void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
                          int scale,
                          int interval_size,
                          int interval_offset,
                          int width) {
  asm volatile(
      "dup         v4.8h, %w2                    \n"
      "ushr        v4.8h, v4.8h, #1              \n"  // scale >>= 1
      "dup         v5.8h, %w3                    \n"  // interval multiply.
      "dup         v6.8h, %w4                    \n"  // interval add

      // 8 pixel loop.
      "1:                                        \n"
      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n"  // load 8  ARGB.
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 448 bytes ahead
      "subs        %w1, %w1, #8                  \n"  // 8 processed per loop.
      "uxtl        v0.8h, v0.8b                  \n"  // b (0 .. 255)
      "uxtl        v1.8h, v1.8b                  \n"  // g
      "uxtl        v2.8h, v2.8b                  \n"  // r
      "sqdmulh     v0.8h, v0.8h, v4.8h           \n"  // b * scale
      "sqdmulh     v1.8h, v1.8h, v4.8h           \n"  // g
      "sqdmulh     v2.8h, v2.8h, v4.8h           \n"  // r
      "mul         v0.8h, v0.8h, v5.8h           \n"  // b * interval_size
      "mul         v1.8h, v1.8h, v5.8h           \n"  // g
      "mul         v2.8h, v2.8h, v5.8h           \n"  // r
      "add         v0.8h, v0.8h, v6.8h           \n"  // b + interval_offset
      "add         v1.8h, v1.8h, v6.8h           \n"  // g
      "add         v2.8h, v2.8h, v6.8h           \n"  // r
      "uqxtn       v0.8b, v0.8h                  \n"  // saturate to 8 bit
      "uqxtn       v1.8b, v1.8h                  \n"
      "uqxtn       v2.8b, v2.8h                  \n"
      "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 ARGB
      "b.gt        1b                            \n"
      : "+r"(dst_argb),       // %0
        "+r"(width)           // %1
      : "r"(scale),           // %2
        "r"(interval_size),   // %3
        "r"(interval_offset)  // %4
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
}
2485 
// Shade 8 pixels at a time by specified value.
// The four bytes of value scale the four channels of every pixel: byte 0 of
// value scales byte 0 of each pixel (B), byte 1 scales G, byte 2 scales R and
// byte 3 scales A.
// Each byte v of value is widened to the 16 bit pattern 0xvvvv and halved;
// sqrdmulh then doubles the product, so the result is approximately
// channel * v / 255.
// NOTE (ARMv7 syntax, kept from the 32 bit version): vqrdmulh.s16 q10, q10,
// d0[0] must use a scalar register from 0 to 8. In this version the
// by-element multiplier lives in v0.
// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
// No tail loop: pixels are always processed in groups of 8.
void ARGBShadeRow_NEON(const uint8_t* src_argb,
                       uint8_t* dst_argb,
                       int width,
                       uint32_t value) {
  asm volatile(
      "dup         v0.4s, %w3                    \n"  // duplicate scale value.
      "zip1        v0.8b, v0.8b, v0.8b           \n"  // v0.8b aarrggbb.
      "ushr        v0.8h, v0.8h, #1              \n"  // scale / 2.

      // 8 pixel loop.
      "1:                                        \n"
      "ld4         {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n"  // load 8 ARGB
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 448 bytes ahead
      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
      "uxtl        v4.8h, v4.8b                  \n"  // b (0 .. 255)
      "uxtl        v5.8h, v5.8b                  \n"  // g
      "uxtl        v6.8h, v6.8b                  \n"  // r
      "uxtl        v7.8h, v7.8b                  \n"  // a
      "sqrdmulh    v4.8h, v4.8h, v0.h[0]         \n"  // b * scale * 2
      "sqrdmulh    v5.8h, v5.8h, v0.h[1]         \n"  // g
      "sqrdmulh    v6.8h, v6.8h, v0.h[2]         \n"  // r
      "sqrdmulh    v7.8h, v7.8h, v0.h[3]         \n"  // a
      "uqxtn       v4.8b, v4.8h                  \n"  // saturate to 8 bit
      "uqxtn       v5.8b, v5.8h                  \n"
      "uqxtn       v6.8b, v6.8h                  \n"
      "uqxtn       v7.8b, v7.8h                  \n"
      "st4         {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // store 8 ARGB
      "b.gt        1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(value)       // %3
      : "cc", "memory", "v0", "v4", "v5", "v6", "v7");
}
2523 
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
// Similar to ARGBToYJ but stores ARGB.
// C code is (29 * b + 150 * g + 77 * r + 128) >> 8;
// The gray value is written to B, G and R; the alpha byte is stored back
// unchanged. No tail loop: pixels are always processed in groups of 8.
void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  asm volatile(
      "movi        v24.8b, #29                   \n"  // B * 0.1140 coefficient
      "movi        v25.8b, #150                  \n"  // G * 0.5870 coefficient
      "movi        v26.8b, #77                   \n"  // R * 0.2990 coefficient
      "1:                                        \n"
      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 448 bytes ahead
      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
      "umull       v4.8h, v0.8b, v24.8b          \n"  // B
      "umlal       v4.8h, v1.8b, v25.8b          \n"  // G
      "umlal       v4.8h, v2.8b, v26.8b          \n"  // R
      "uqrshrn     v0.8b, v4.8h, #8              \n"  // 16 bit to 8 bit B
      "orr         v1.8b, v0.8b, v0.8b           \n"  // G (register copy)
      "orr         v2.8b, v0.8b, v0.8b           \n"  // R (register copy)
      "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 pixels.
      "b.gt        1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26");
}
2550 
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels, in place.
//    b = (r * 35 + g * 68 + b * 17) >> 7
//    g = (r * 45 + g * 88 + b * 22) >> 7
//    r = (r * 50 + g * 98 + b * 24) >> 7
// The shift truncates (no rounding) and the narrow saturates to 255. The
// alpha byte is stored back unchanged.
// The load at the top of the loop does not advance dst_argb; the post-indexed
// store at the bottom does. No tail loop: pixels are always processed in
// groups of 8.

void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
  asm volatile(
      "movi        v20.8b, #17                   \n"  // BB coefficient
      "movi        v21.8b, #68                   \n"  // BG coefficient
      "movi        v22.8b, #35                   \n"  // BR coefficient
      "movi        v24.8b, #22                   \n"  // GB coefficient
      "movi        v25.8b, #88                   \n"  // GG coefficient
      "movi        v26.8b, #45                   \n"  // GR coefficient
      "movi        v28.8b, #24                   \n"  // RB coefficient
      "movi        v29.8b, #98                   \n"  // RG coefficient
      "movi        v30.8b, #50                   \n"  // RR coefficient
      "1:                                        \n"
      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n"  // load 8 ARGB pixels.
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 448 bytes ahead
      "subs        %w1, %w1, #8                  \n"  // 8 processed per loop.
      "umull       v4.8h, v0.8b, v20.8b          \n"  // B to Sepia B
      "umlal       v4.8h, v1.8b, v21.8b          \n"  // G
      "umlal       v4.8h, v2.8b, v22.8b          \n"  // R
      "umull       v5.8h, v0.8b, v24.8b          \n"  // B to Sepia G
      "umlal       v5.8h, v1.8b, v25.8b          \n"  // G
      "umlal       v5.8h, v2.8b, v26.8b          \n"  // R
      "umull       v6.8h, v0.8b, v28.8b          \n"  // B to Sepia R
      "umlal       v6.8h, v1.8b, v29.8b          \n"  // G
      "umlal       v6.8h, v2.8b, v30.8b          \n"  // R
      "uqshrn      v0.8b, v4.8h, #7              \n"  // 16 bit to 8 bit B
      "uqshrn      v1.8b, v5.8h, #7              \n"  // 16 bit to 8 bit G
      "uqshrn      v2.8b, v6.8h, #7              \n"  // 16 bit to 8 bit R
      "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 pixels.
      "b.gt        1b                            \n"
      : "+r"(dst_argb),  // %0
        "+r"(width)      // %1
      :
      // v7 is listed as clobbered although the asm above never references it.
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
        "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30");
}
2591 
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// matrix_argb points at 16 signed 8 bit coefficients, read as four groups of
// four: coefficients 0..3 produce the output B, 4..7 the output G, 8..11 the
// output R and 12..15 the output A. Within a group the coefficients multiply
// the input B, G, R and A in that order. Each output is
//   sat_u8((b * m0 + g * m1 + r * m2 + a * m3) >> 6)
// with the partial products summed using signed saturating 16 bit adds.
// TODO(fbarchard): Was same as Sepia except matrix is provided.  This function
// needs to saturate.  Consider doing a non-saturating version.
// No tail loop: pixels are always processed in groups of 8.
void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
                             uint8_t* dst_argb,
                             const int8_t* matrix_argb,
                             int width) {
  asm volatile(
      "ld1         {v2.16b}, [%3]                \n"  // load 16 coefficients.
      "sxtl        v0.8h, v2.8b                  \n"  // B,G coefficients s16.
      "sxtl2       v1.8h, v2.16b                 \n"  // R,A coefficients s16.

      "1:                                        \n"
      "ld4         {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n"  // load 8 ARGB
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 448 bytes ahead
      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
      "uxtl        v16.8h, v16.8b                \n"  // b (0 .. 255) 16 bit
      "uxtl        v17.8h, v17.8b                \n"  // g
      "uxtl        v18.8h, v18.8b                \n"  // r
      "uxtl        v19.8h, v19.8b                \n"  // a
      "mul         v22.8h, v16.8h, v0.h[0]       \n"  // B = B * Matrix B
      "mul         v23.8h, v16.8h, v0.h[4]       \n"  // G = B * Matrix G
      "mul         v24.8h, v16.8h, v1.h[0]       \n"  // R = B * Matrix R
      "mul         v25.8h, v16.8h, v1.h[4]       \n"  // A = B * Matrix A
      "mul         v4.8h, v17.8h, v0.h[1]        \n"  // G * Matrix B (partial)
      "mul         v5.8h, v17.8h, v0.h[5]        \n"  // G * Matrix G (partial)
      "mul         v6.8h, v17.8h, v1.h[1]        \n"  // G * Matrix R (partial)
      "mul         v7.8h, v17.8h, v1.h[5]        \n"  // G * Matrix A (partial)
      "sqadd       v22.8h, v22.8h, v4.8h         \n"  // Accumulate B
      "sqadd       v23.8h, v23.8h, v5.8h         \n"  // Accumulate G
      "sqadd       v24.8h, v24.8h, v6.8h         \n"  // Accumulate R
      "sqadd       v25.8h, v25.8h, v7.8h         \n"  // Accumulate A
      "mul         v4.8h, v18.8h, v0.h[2]        \n"  // R * Matrix B (partial)
      "mul         v5.8h, v18.8h, v0.h[6]        \n"  // R * Matrix G (partial)
      "mul         v6.8h, v18.8h, v1.h[2]        \n"  // R * Matrix R (partial)
      "mul         v7.8h, v18.8h, v1.h[6]        \n"  // R * Matrix A (partial)
      "sqadd       v22.8h, v22.8h, v4.8h         \n"  // Accumulate B
      "sqadd       v23.8h, v23.8h, v5.8h         \n"  // Accumulate G
      "sqadd       v24.8h, v24.8h, v6.8h         \n"  // Accumulate R
      "sqadd       v25.8h, v25.8h, v7.8h         \n"  // Accumulate A
      "mul         v4.8h, v19.8h, v0.h[3]        \n"  // A * Matrix B (partial)
      "mul         v5.8h, v19.8h, v0.h[7]        \n"  // A * Matrix G (partial)
      "mul         v6.8h, v19.8h, v1.h[3]        \n"  // A * Matrix R (partial)
      "mul         v7.8h, v19.8h, v1.h[7]        \n"  // A * Matrix A (partial)
      "sqadd       v22.8h, v22.8h, v4.8h         \n"  // Accumulate B
      "sqadd       v23.8h, v23.8h, v5.8h         \n"  // Accumulate G
      "sqadd       v24.8h, v24.8h, v6.8h         \n"  // Accumulate R
      "sqadd       v25.8h, v25.8h, v7.8h         \n"  // Accumulate A
      "sqshrun     v16.8b, v22.8h, #6            \n"  // 16 bit to 8 bit B
      "sqshrun     v17.8b, v23.8h, #6            \n"  // 16 bit to 8 bit G
      "sqshrun     v18.8b, v24.8h, #6            \n"  // 16 bit to 8 bit R
      "sqshrun     v19.8b, v25.8h, #6            \n"  // 16 bit to 8 bit A
      "st4         {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n"  // store 8 ARGB
      "b.gt        1b                            \n"
      : "+r"(src_argb),   // %0
        "+r"(dst_argb),   // %1
        "+r"(width)       // %2
      : "r"(matrix_argb)  // %3
      // v3 is listed as clobbered although the asm above never references it.
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
        "v17", "v18", "v19", "v22", "v23", "v24", "v25");
}
2653 
// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
// Every channel, alpha included, becomes (c0 * c1 + 128) >> 8 (rounding
// narrow). No tail loop: pixels are always processed in groups of 8.
void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      // 8 pixel loop.
      "1:                                        \n"
      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
      "ld4         {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 448 bytes ahead
      "prfm        pldl1keep, [%1, 448]          \n"
      "subs        %w3, %w3, #8                  \n"  // 8 processed per loop.
      "umull       v0.8h, v0.8b, v4.8b           \n"  // multiply B
      "umull       v1.8h, v1.8b, v5.8b           \n"  // multiply G
      "umull       v2.8h, v2.8b, v6.8b           \n"  // multiply R
      "umull       v3.8h, v3.8b, v7.8b           \n"  // multiply A
      "rshrn       v0.8b, v0.8h, #8              \n"  // 16 bit to 8 bit B
      "rshrn       v1.8b, v1.8h, #8              \n"  // 16 bit to 8 bit G
      "rshrn       v2.8b, v2.8h, #8              \n"  // 16 bit to 8 bit R
      "rshrn       v3.8b, v3.8h, #8              \n"  // 16 bit to 8 bit A
      "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
      "b.gt        1b                            \n"
      : "+r"(src_argb0),  // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
2685 
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
// Every channel, alpha included, is added with unsigned saturation (results
// above 255 clamp to 255). No tail loop: pixels are always processed in
// groups of 8.
void ARGBAddRow_NEON(const uint8_t* src_argb0,
                     const uint8_t* src_argb1,
                     uint8_t* dst_argb,
                     int width) {
  asm volatile(
      // 8 pixel loop.
      "1:                                        \n"
      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
      "ld4         {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 448 bytes ahead
      "prfm        pldl1keep, [%1, 448]          \n"
      "subs        %w3, %w3, #8                  \n"  // 8 processed per loop.
      "uqadd       v0.8b, v0.8b, v4.8b           \n"  // B
      "uqadd       v1.8b, v1.8b, v5.8b           \n"  // G
      "uqadd       v2.8b, v2.8b, v6.8b           \n"  // R
      "uqadd       v3.8b, v3.8b, v7.8b           \n"  // A
      "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
      "b.gt        1b                            \n"
      : "+r"(src_argb0),  // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
2712 
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
// Computes src_argb0 - src_argb1 for every channel, alpha included, with
// unsigned saturation (results below 0 clamp to 0). No tail loop: pixels are
// always processed in groups of 8.
void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      // 8 pixel loop.
      "1:                                        \n"
      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
      "ld4         {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 448 bytes ahead
      "prfm        pldl1keep, [%1, 448]          \n"
      "subs        %w3, %w3, #8                  \n"  // 8 processed per loop.
      "uqsub       v0.8b, v0.8b, v4.8b           \n"  // B
      "uqsub       v1.8b, v1.8b, v5.8b           \n"  // G
      "uqsub       v2.8b, v2.8b, v6.8b           \n"  // R
      "uqsub       v3.8b, v3.8b, v7.8b           \n"  // A
      "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
      "b.gt        1b                            \n"
      : "+r"(src_argb0),  // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
2739 
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
// Sobel is the saturating 8 bit sum of the two inputs. Each loop reads 8 bytes
// from each source and writes 8 ARGB pixels (32 bytes). No tail loop: pixels
// are always processed in groups of 8.
void SobelRow_NEON(const uint8_t* src_sobelx,
                   const uint8_t* src_sobely,
                   uint8_t* dst_argb,
                   int width) {
  asm volatile(
      "movi        v3.8b, #255                   \n"  // alpha
      // 8 pixel loop.
      "1:                                        \n"
      "ld1         {v0.8b}, [%0], #8             \n"  // load 8 sobelx.
      "ld1         {v1.8b}, [%1], #8             \n"  // load 8 sobely.
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 448 bytes ahead
      "prfm        pldl1keep, [%1, 448]          \n"
      "subs        %w3, %w3, #8                  \n"  // 8 processed per loop.
      "uqadd       v0.8b, v0.8b, v1.8b           \n"  // add
      "orr         v1.8b, v0.8b, v0.8b           \n"  // copy sum to G
      "orr         v2.8b, v0.8b, v0.8b           \n"  // copy sum to R
      "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
      "b.gt        1b                            \n"
      : "+r"(src_sobelx),  // %0
        "+r"(src_sobely),  // %1
        "+r"(dst_argb),    // %2
        "+r"(width)        // %3
      :
      : "cc", "memory", "v0", "v1", "v2", "v3");
}
2770 
// Adds Sobel X and Sobel Y and stores Sobel into plane.
// dst_y receives the saturating 8 bit sum of the two inputs, 16 bytes per
// loop. No tail loop: pixels are always processed in groups of 16.
void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
                          const uint8_t* src_sobely,
                          uint8_t* dst_y,
                          int width) {
  asm volatile(
      // 16 pixel loop.
      "1:                                        \n"
      "ld1         {v0.16b}, [%0], #16           \n"  // load 16 sobelx.
      "ld1         {v1.16b}, [%1], #16           \n"  // load 16 sobely.
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 448 bytes ahead
      "prfm        pldl1keep, [%1, 448]          \n"
      "subs        %w3, %w3, #16                 \n"  // 16 processed per loop.
      "uqadd       v0.16b, v0.16b, v1.16b        \n"  // add
      "st1         {v0.16b}, [%2], #16           \n"  // store 16 pixels.
      "b.gt        1b                            \n"
      : "+r"(src_sobelx),  // %0
        "+r"(src_sobely),  // %1
        "+r"(dst_y),       // %2
        "+r"(width)        // %3
      :
      : "cc", "memory", "v0", "v1");
}
2794 
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
// Sobel is the saturating 8 bit sum of Sobel X and Sobel Y. The inputs are
// loaded straight into the registers that st4 writes as R (v2) and B (v0), so
// no copies are needed. No tail loop: pixels are always processed in groups
// of 8.
void SobelXYRow_NEON(const uint8_t* src_sobelx,
                     const uint8_t* src_sobely,
                     uint8_t* dst_argb,
                     int width) {
  asm volatile(
      "movi        v3.8b, #255                   \n"  // alpha
      // 8 pixel loop.
      "1:                                        \n"
      "ld1         {v2.8b}, [%0], #8             \n"  // load 8 sobelx.
      "ld1         {v0.8b}, [%1], #8             \n"  // load 8 sobely.
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 448 bytes ahead
      "prfm        pldl1keep, [%1, 448]          \n"
      "subs        %w3, %w3, #8                  \n"  // 8 processed per loop.
      "uqadd       v1.8b, v0.8b, v2.8b           \n"  // add
      "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
      "b.gt        1b                            \n"
      : "+r"(src_sobelx),  // %0
        "+r"(src_sobely),  // %1
        "+r"(dst_argb),    // %2
        "+r"(width)        // %3
      :
      : "cc", "memory", "v0", "v1", "v2", "v3");
}
2823 
2824 // SobelX as a matrix is
2825 // -1  0  1
2826 // -2  0  2
2827 // -1  0  1
SobelXRow_NEON(const uint8_t * src_y0,const uint8_t * src_y1,const uint8_t * src_y2,uint8_t * dst_sobelx,int width)2828 void SobelXRow_NEON(const uint8_t* src_y0,
2829                     const uint8_t* src_y1,
2830                     const uint8_t* src_y2,
2831                     uint8_t* dst_sobelx,
2832                     int width) {
2833   asm volatile(
2834       "1:                                        \n"
2835       "ld1         {v0.8b}, [%0],%5              \n"  // top
2836       "ld1         {v1.8b}, [%0],%6              \n"
2837       "prfm        pldl1keep, [%0, 448]          \n"
2838       "usubl       v0.8h, v0.8b, v1.8b           \n"
2839       "ld1         {v2.8b}, [%1],%5              \n"  // center * 2
2840       "ld1         {v3.8b}, [%1],%6              \n"
2841       "prfm        pldl1keep, [%1, 448]          \n"
2842       "usubl       v1.8h, v2.8b, v3.8b           \n"
2843       "add         v0.8h, v0.8h, v1.8h           \n"
2844       "add         v0.8h, v0.8h, v1.8h           \n"
2845       "ld1         {v2.8b}, [%2],%5              \n"  // bottom
2846       "ld1         {v3.8b}, [%2],%6              \n"
2847       "prfm        pldl1keep, [%2, 448]          \n"
2848       "subs        %w4, %w4, #8                  \n"  // 8 pixels
2849       "usubl       v1.8h, v2.8b, v3.8b           \n"
2850       "add         v0.8h, v0.8h, v1.8h           \n"
2851       "abs         v0.8h, v0.8h                  \n"
2852       "uqxtn       v0.8b, v0.8h                  \n"
2853       "st1         {v0.8b}, [%3], #8             \n"  // store 8 sobelx
2854       "b.gt        1b                            \n"
2855       : "+r"(src_y0),                           // %0
2856         "+r"(src_y1),                           // %1
2857         "+r"(src_y2),                           // %2
2858         "+r"(dst_sobelx),                       // %3
2859         "+r"(width)                             // %4
2860       : "r"(2LL),                               // %5
2861         "r"(6LL)                                // %6
2862       : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
2863   );
2864 }
2865 
// SobelY as a matrix is
// -1 -2 -1
//  0  0  0
//  1  2  1
// src_y0 and src_y1 are the two rows being differenced. For each output x the
// asm computes
//   abs((y0[x] - y1[x]) + 2 * (y0[x+1] - y1[x+1]) + (y0[x+2] - y1[x+2]))
// and saturates it to 8 bits.
// Both row pointers are advanced by 1, 1 and then 6 (operands %4 and %5), so
// they move 8 bytes per loop while the three loads see columns x, x+1 and
// x+2. Each loop therefore reads 10 bytes per row to produce 8 outputs.
// No tail loop: outputs are always produced in groups of 8.
void SobelYRow_NEON(const uint8_t* src_y0,
                    const uint8_t* src_y1,
                    uint8_t* dst_sobely,
                    int width) {
  asm volatile(
      "1:                                        \n"
      "ld1         {v0.8b}, [%0],%4              \n"  // left
      "ld1         {v1.8b}, [%1],%4              \n"
      "usubl       v0.8h, v0.8b, v1.8b           \n"
      "ld1         {v2.8b}, [%0],%4              \n"  // center * 2
      "ld1         {v3.8b}, [%1],%4              \n"
      "usubl       v1.8h, v2.8b, v3.8b           \n"
      "add         v0.8h, v0.8h, v1.8h           \n"  // added twice: weight 2
      "add         v0.8h, v0.8h, v1.8h           \n"
      "ld1         {v2.8b}, [%0],%5              \n"  // right
      "ld1         {v3.8b}, [%1],%5              \n"
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 448 bytes ahead
      "prfm        pldl1keep, [%1, 448]          \n"
      "subs        %w3, %w3, #8                  \n"  // 8 pixels
      "usubl       v1.8h, v2.8b, v3.8b           \n"
      "add         v0.8h, v0.8h, v1.8h           \n"
      "abs         v0.8h, v0.8h                  \n"
      "uqxtn       v0.8b, v0.8h                  \n"  // saturate to 8 bit
      "st1         {v0.8b}, [%2], #8             \n"  // store 8 sobely
      "b.gt        1b                            \n"
      : "+r"(src_y0),                           // %0
        "+r"(src_y1),                           // %1
        "+r"(dst_sobely),                       // %2
        "+r"(width)                             // %3
      : "r"(1LL),                               // %4
        "r"(6LL)                                // %5
      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
2904 
// Convert 8 unsigned 16 bit values per loop to half floats with no scaling;
// the float parameter is unused.
// Caveat - rounds float to half float whereas scaling version truncates.
// Values are widened to 32 bit, converted to float, then narrowed with fcvtn.
// No tail loop: values are always processed in groups of 8.
void HalfFloat1Row_NEON(const uint16_t* src,
                        uint16_t* dst,
                        float /*unused*/,
                        int width) {
  asm volatile(
      "1:                                        \n"
      "ld1         {v1.16b}, [%0], #16           \n"  // load 8 shorts
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 448 bytes ahead
      "subs        %w2, %w2, #8                  \n"  // 8 pixels per loop
      "uxtl        v2.4s, v1.4h                  \n"  // 8 int's
      "uxtl2       v3.4s, v1.8h                  \n"
      "scvtf       v2.4s, v2.4s                  \n"  // 8 floats
      "scvtf       v3.4s, v3.4s                  \n"
      "fcvtn       v1.4h, v2.4s                  \n"  // 8 half floats
      "fcvtn2      v1.8h, v3.4s                  \n"
      "st1         {v1.16b}, [%1], #16           \n"  // store 8 shorts
      "b.gt        1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "cc", "memory", "v1", "v2", "v3");
}
2929 
HalfFloatRow_NEON(const uint16_t * src,uint16_t * dst,float scale,int width)2930 void HalfFloatRow_NEON(const uint16_t* src,
2931                        uint16_t* dst,
2932                        float scale,
2933                        int width) {
2934   asm volatile(
2935       "1:                                        \n"
2936       "ld1         {v1.16b}, [%0], #16           \n"  // load 8 shorts
2937       "prfm        pldl1keep, [%0, 448]          \n"
2938       "subs        %w2, %w2, #8                  \n"  // 8 pixels per loop
2939       "uxtl        v2.4s, v1.4h                  \n"  // 8 int's
2940       "uxtl2       v3.4s, v1.8h                  \n"
2941       "scvtf       v2.4s, v2.4s                  \n"  // 8 floats
2942       "scvtf       v3.4s, v3.4s                  \n"
2943       "fmul        v2.4s, v2.4s, %3.s[0]         \n"  // adjust exponent
2944       "fmul        v3.4s, v3.4s, %3.s[0]         \n"
2945       "uqshrn      v1.4h, v2.4s, #13             \n"  // isolate halffloat
2946       "uqshrn2     v1.8h, v3.4s, #13             \n"
2947       "st1         {v1.16b}, [%1], #16           \n"  // store 8 shorts
2948       "b.gt        1b                            \n"
2949       : "+r"(src),                      // %0
2950         "+r"(dst),                      // %1
2951         "+r"(width)                     // %2
2952       : "w"(scale * 1.9259299444e-34f)  // %3
2953       : "cc", "memory", "v1", "v2", "v3");
2954 }
2955 
ByteToFloatRow_NEON(const uint8_t * src,float * dst,float scale,int width)2956 void ByteToFloatRow_NEON(const uint8_t* src,
2957                          float* dst,
2958                          float scale,
2959                          int width) {
2960   asm volatile(
2961       "1:                                        \n"
2962       "ld1         {v1.8b}, [%0], #8             \n"  // load 8 bytes
2963       "prfm        pldl1keep, [%0, 448]          \n"
2964       "subs        %w2, %w2, #8                  \n"  // 8 pixels per loop
2965       "uxtl        v1.8h, v1.8b                  \n"  // 8 shorts
2966       "uxtl        v2.4s, v1.4h                  \n"  // 8 ints
2967       "uxtl2       v3.4s, v1.8h                  \n"
2968       "scvtf       v2.4s, v2.4s                  \n"  // 8 floats
2969       "scvtf       v3.4s, v3.4s                  \n"
2970       "fmul        v2.4s, v2.4s, %3.s[0]         \n"  // scale
2971       "fmul        v3.4s, v3.4s, %3.s[0]         \n"
2972       "st1         {v2.16b, v3.16b}, [%1], #32   \n"  // store 8 floats
2973       "b.gt        1b                            \n"
2974       : "+r"(src),   // %0
2975         "+r"(dst),   // %1
2976         "+r"(width)  // %2
2977       : "w"(scale)   // %3
2978       : "cc", "memory", "v1", "v2", "v3");
2979 }
2980 
ScaleMaxSamples_NEON(const float * src,float * dst,float scale,int width)2981 float ScaleMaxSamples_NEON(const float* src,
2982                            float* dst,
2983                            float scale,
2984                            int width) {
2985   float fmax;
2986   asm volatile(
2987       "movi        v5.4s, #0                     \n"  // max
2988       "movi        v6.4s, #0                     \n"
2989 
2990       "1:                                        \n"
2991       "ld1         {v1.4s, v2.4s}, [%0], #32     \n"  // load 8 samples
2992       "prfm        pldl1keep, [%0, 448]          \n"
2993       "subs        %w2, %w2, #8                  \n"  // 8 processed per loop
2994       "fmul        v3.4s, v1.4s, %4.s[0]         \n"  // scale
2995       "fmul        v4.4s, v2.4s, %4.s[0]         \n"  // scale
2996       "fmax        v5.4s, v5.4s, v1.4s           \n"  // max
2997       "fmax        v6.4s, v6.4s, v2.4s           \n"
2998       "st1         {v3.4s, v4.4s}, [%1], #32     \n"  // store 8 samples
2999       "b.gt        1b                            \n"
3000       "fmax        v5.4s, v5.4s, v6.4s           \n"  // max
3001       "fmaxv       %s3, v5.4s                    \n"  // signed max acculator
3002       : "+r"(src),                                    // %0
3003         "+r"(dst),                                    // %1
3004         "+r"(width),                                  // %2
3005         "=w"(fmax)                                    // %3
3006       : "w"(scale)                                    // %4
3007       : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
3008   return fmax;
3009 }
3010 
ScaleSumSamples_NEON(const float * src,float * dst,float scale,int width)3011 float ScaleSumSamples_NEON(const float* src,
3012                            float* dst,
3013                            float scale,
3014                            int width) {
3015   float fsum;
3016   asm volatile(
3017       "movi        v5.4s, #0                     \n"  // max
3018       "movi        v6.4s, #0                     \n"  // max
3019 
3020       "1:                                        \n"
3021       "ld1         {v1.4s, v2.4s}, [%0], #32     \n"  // load 8 samples
3022       "prfm        pldl1keep, [%0, 448]          \n"
3023       "subs        %w2, %w2, #8                  \n"  // 8 processed per loop
3024       "fmul        v3.4s, v1.4s, %4.s[0]         \n"  // scale
3025       "fmul        v4.4s, v2.4s, %4.s[0]         \n"
3026       "fmla        v5.4s, v1.4s, v1.4s           \n"  // sum of squares
3027       "fmla        v6.4s, v2.4s, v2.4s           \n"
3028       "st1         {v3.4s, v4.4s}, [%1], #32     \n"  // store 8 samples
3029       "b.gt        1b                            \n"
3030       "faddp       v5.4s, v5.4s, v6.4s           \n"
3031       "faddp       v5.4s, v5.4s, v5.4s           \n"
3032       "faddp       %3.4s, v5.4s, v5.4s           \n"  // sum
3033       : "+r"(src),                                    // %0
3034         "+r"(dst),                                    // %1
3035         "+r"(width),                                  // %2
3036         "=w"(fsum)                                    // %3
3037       : "w"(scale)                                    // %4
3038       : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
3039   return fsum;
3040 }
3041 
ScaleSamples_NEON(const float * src,float * dst,float scale,int width)3042 void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
3043   asm volatile(
3044       "1:                                        \n"
3045       "ld1         {v1.4s, v2.4s}, [%0], #32     \n"  // load 8 samples
3046       "prfm        pldl1keep, [%0, 448]          \n"
3047       "subs        %w2, %w2, #8                  \n"  // 8 processed per loop
3048       "fmul        v1.4s, v1.4s, %3.s[0]         \n"  // scale
3049       "fmul        v2.4s, v2.4s, %3.s[0]         \n"  // scale
3050       "st1         {v1.4s, v2.4s}, [%1], #32     \n"  // store 8 samples
3051       "b.gt        1b                            \n"
3052       : "+r"(src),   // %0
3053         "+r"(dst),   // %1
3054         "+r"(width)  // %2
3055       : "w"(scale)   // %3
3056       : "cc", "memory", "v1", "v2");
3057 }
3058 
3059 // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
GaussCol_NEON(const uint16_t * src0,const uint16_t * src1,const uint16_t * src2,const uint16_t * src3,const uint16_t * src4,uint32_t * dst,int width)3060 void GaussCol_NEON(const uint16_t* src0,
3061                    const uint16_t* src1,
3062                    const uint16_t* src2,
3063                    const uint16_t* src3,
3064                    const uint16_t* src4,
3065                    uint32_t* dst,
3066                    int width) {
3067   asm volatile(
3068       "movi        v6.8h, #4                     \n"  // constant 4
3069       "movi        v7.8h, #6                     \n"  // constant 6
3070 
3071       "1:                                        \n"
3072       "ld1         {v1.8h}, [%0], #16            \n"  // load 8 samples, 5 rows
3073       "ld1         {v2.8h}, [%4], #16            \n"
3074       "uaddl       v0.4s, v1.4h, v2.4h           \n"  // * 1
3075       "prfm        pldl1keep, [%0, 448]          \n"
3076       "uaddl2      v1.4s, v1.8h, v2.8h           \n"  // * 1
3077       "ld1         {v2.8h}, [%1], #16            \n"
3078       "umlal       v0.4s, v2.4h, v6.4h           \n"  // * 4
3079       "prfm        pldl1keep, [%1, 448]          \n"
3080       "umlal2      v1.4s, v2.8h, v6.8h           \n"  // * 4
3081       "ld1         {v2.8h}, [%2], #16            \n"
3082       "umlal       v0.4s, v2.4h, v7.4h           \n"  // * 6
3083       "prfm        pldl1keep, [%2, 448]          \n"
3084       "umlal2      v1.4s, v2.8h, v7.8h           \n"  // * 6
3085       "ld1         {v2.8h}, [%3], #16            \n"
3086       "umlal       v0.4s, v2.4h, v6.4h           \n"  // * 4
3087       "prfm        pldl1keep, [%3, 448]          \n"
3088       "umlal2      v1.4s, v2.8h, v6.8h           \n"  // * 4
3089       "subs        %w6, %w6, #8                  \n"  // 8 processed per loop
3090       "st1         {v0.4s,v1.4s}, [%5], #32      \n"  // store 8 samples
3091       "prfm        pldl1keep, [%4, 448]          \n"
3092       "b.gt        1b                            \n"
3093       : "+r"(src0),  // %0
3094         "+r"(src1),  // %1
3095         "+r"(src2),  // %2
3096         "+r"(src3),  // %3
3097         "+r"(src4),  // %4
3098         "+r"(dst),   // %5
3099         "+r"(width)  // %6
3100       :
3101       : "cc", "memory", "v0", "v1", "v2", "v6", "v7");
3102 }
3103 
3104 // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
GaussRow_NEON(const uint32_t * src,uint16_t * dst,int width)3105 void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
3106   const uint32_t* src1 = src + 1;
3107   const uint32_t* src2 = src + 2;
3108   const uint32_t* src3 = src + 3;
3109   asm volatile(
3110       "movi        v6.4s, #4                     \n"  // constant 4
3111       "movi        v7.4s, #6                     \n"  // constant 6
3112 
3113       "1:                                        \n"
3114       "ld1         {v0.4s,v1.4s,v2.4s}, [%0], %6 \n"  // load 12 source samples
3115       "add         v0.4s, v0.4s, v1.4s           \n"  // * 1
3116       "add         v1.4s, v1.4s, v2.4s           \n"  // * 1
3117       "ld1         {v2.4s,v3.4s}, [%2], #32      \n"
3118       "mla         v0.4s, v2.4s, v7.4s           \n"  // * 6
3119       "mla         v1.4s, v3.4s, v7.4s           \n"  // * 6
3120       "ld1         {v2.4s,v3.4s}, [%1], #32      \n"
3121       "ld1         {v4.4s,v5.4s}, [%3], #32      \n"
3122       "add         v2.4s, v2.4s, v4.4s           \n"  // add rows for * 4
3123       "add         v3.4s, v3.4s, v5.4s           \n"
3124       "prfm        pldl1keep, [%0, 448]          \n"
3125       "mla         v0.4s, v2.4s, v6.4s           \n"  // * 4
3126       "mla         v1.4s, v3.4s, v6.4s           \n"  // * 4
3127       "subs        %w5, %w5, #8                  \n"  // 8 processed per loop
3128       "uqrshrn     v0.4h, v0.4s, #8              \n"  // round and pack
3129       "uqrshrn2    v0.8h, v1.4s, #8              \n"
3130       "st1         {v0.8h}, [%4], #16            \n"  // store 8 samples
3131       "b.gt        1b                            \n"
3132       : "+r"(src),   // %0
3133         "+r"(src1),  // %1
3134         "+r"(src2),  // %2
3135         "+r"(src3),  // %3
3136         "+r"(dst),   // %4
3137         "+r"(width)  // %5
3138       : "r"(32LL)    // %6
3139       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
3140 }
3141 
// Constants for the F32 Gauss filters: tap weights 4 and 6, the 1/256
// scaling factor, and a zero in the last lane. GaussCol_F32_NEON reads the
// first two entries; GaussRow_F32_NEON reads the first three.
static const vecf32 kGaussCoefficients = {4.0f, 6.0f, 1.0f / 256.0f, 0.0f};
3143 
3144 // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
GaussCol_F32_NEON(const float * src0,const float * src1,const float * src2,const float * src3,const float * src4,float * dst,int width)3145 void GaussCol_F32_NEON(const float* src0,
3146                        const float* src1,
3147                        const float* src2,
3148                        const float* src3,
3149                        const float* src4,
3150                        float* dst,
3151                        int width) {
3152   asm volatile(
3153       "ld2r        {v6.4s, v7.4s}, [%7]          \n"  // constants 4 and 6
3154 
3155       "1:                                        \n"
3156       "ld1         {v0.4s, v1.4s}, [%0], #32     \n"  // load 8 samples, 5 rows
3157       "ld1         {v2.4s, v3.4s}, [%1], #32     \n"
3158       "fmla        v0.4s, v2.4s, v6.4s           \n"  // * 4
3159       "ld1         {v4.4s, v5.4s}, [%2], #32     \n"
3160       "fmla        v1.4s, v3.4s, v6.4s           \n"
3161       "prfm        pldl1keep, [%0, 448]          \n"
3162       "fmla        v0.4s, v4.4s, v7.4s           \n"  // * 6
3163       "ld1         {v2.4s, v3.4s}, [%3], #32     \n"
3164       "fmla        v1.4s, v5.4s, v7.4s           \n"
3165       "prfm        pldl1keep, [%1, 448]          \n"
3166       "fmla        v0.4s, v2.4s, v6.4s           \n"  // * 4
3167       "ld1         {v4.4s, v5.4s}, [%4], #32     \n"
3168       "fmla        v1.4s, v3.4s, v6.4s           \n"
3169       "prfm        pldl1keep, [%2, 448]          \n"
3170       "fadd        v0.4s, v0.4s, v4.4s           \n"  // * 1
3171       "prfm        pldl1keep, [%3, 448]          \n"
3172       "fadd        v1.4s, v1.4s, v5.4s           \n"
3173       "prfm        pldl1keep, [%4, 448]          \n"
3174       "subs        %w6, %w6, #8                  \n"  // 8 processed per loop
3175       "st1         {v0.4s, v1.4s}, [%5], #32     \n"  // store 8 samples
3176       "b.gt        1b                            \n"
3177       : "+r"(src0),               // %0
3178         "+r"(src1),               // %1
3179         "+r"(src2),               // %2
3180         "+r"(src3),               // %3
3181         "+r"(src4),               // %4
3182         "+r"(dst),                // %5
3183         "+r"(width)               // %6
3184       : "r"(&kGaussCoefficients)  // %7
3185       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
3186 }
3187 
3188 // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
GaussRow_F32_NEON(const float * src,float * dst,int width)3189 void GaussRow_F32_NEON(const float* src, float* dst, int width) {
3190   asm volatile(
3191       "ld3r        {v6.4s, v7.4s, v8.4s}, [%3]   \n"  // constants 4, 6, 1/256
3192 
3193       "1:                                        \n"
3194       "ld1         {v0.4s, v1.4s, v2.4s}, [%0], %4 \n"  // load 12 samples, 5
3195                                                         // rows
3196       "fadd        v0.4s, v0.4s, v1.4s           \n"    // * 1
3197       "ld1         {v4.4s, v5.4s}, [%0], %5      \n"
3198       "fadd        v1.4s, v1.4s, v2.4s           \n"
3199       "fmla        v0.4s, v4.4s, v7.4s           \n"  // * 6
3200       "ld1         {v2.4s, v3.4s}, [%0], %4      \n"
3201       "fmla        v1.4s, v5.4s, v7.4s           \n"
3202       "ld1         {v4.4s, v5.4s}, [%0], %6      \n"
3203       "fadd        v2.4s, v2.4s, v4.4s           \n"
3204       "fadd        v3.4s, v3.4s, v5.4s           \n"
3205       "fmla        v0.4s, v2.4s, v6.4s           \n"  // * 4
3206       "fmla        v1.4s, v3.4s, v6.4s           \n"
3207       "prfm        pldl1keep, [%0, 448]          \n"
3208       "fmul        v0.4s, v0.4s, v8.4s           \n"  // / 256
3209       "fmul        v1.4s, v1.4s, v8.4s           \n"
3210       "subs        %w2, %w2, #8                  \n"  // 8 processed per loop
3211       "st1         {v0.4s, v1.4s}, [%1], #32     \n"  // store 8 samples
3212       "b.gt        1b                            \n"
3213       : "+r"(src),                 // %0
3214         "+r"(dst),                 // %1
3215         "+r"(width)                // %2
3216       : "r"(&kGaussCoefficients),  // %3
3217         "r"(8LL),                  // %4
3218         "r"(-4LL),                 // %5
3219         "r"(20LL)                  // %6
3220       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8");
3221 }
3222 
3223 // Convert biplanar NV21 to packed YUV24
NV21ToYUV24Row_NEON(const uint8_t * src_y,const uint8_t * src_vu,uint8_t * dst_yuv24,int width)3224 void NV21ToYUV24Row_NEON(const uint8_t* src_y,
3225                          const uint8_t* src_vu,
3226                          uint8_t* dst_yuv24,
3227                          int width) {
3228   asm volatile(
3229       "1:                                        \n"
3230       "ld1         {v2.16b}, [%0], #16           \n"  // load 16 Y values
3231       "ld2         {v0.8b, v1.8b}, [%1], #16     \n"  // load 8 VU values
3232       "prfm        pldl1keep, [%0, 448]          \n"
3233       "prfm        pldl1keep, [%1, 448]          \n"
3234       "zip1        v0.16b, v0.16b, v0.16b        \n"      // replicate V values
3235       "zip1        v1.16b, v1.16b, v1.16b        \n"      // replicate U values
3236       "subs        %w3, %w3, #16                 \n"      // 16 pixels per loop
3237       "st3         {v0.16b,v1.16b,v2.16b}, [%2], #48 \n"  // store 16 YUV pixels
3238       "b.gt        1b                            \n"
3239       : "+r"(src_y),      // %0
3240         "+r"(src_vu),     // %1
3241         "+r"(dst_yuv24),  // %2
3242         "+r"(width)       // %3
3243       :
3244       : "cc", "memory", "v0", "v1", "v2");
3245 }
3246 
AYUVToUVRow_NEON(const uint8_t * src_ayuv,int src_stride_ayuv,uint8_t * dst_uv,int width)3247 void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
3248                       int src_stride_ayuv,
3249                       uint8_t* dst_uv,
3250                       int width) {
3251   const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv;
3252   asm volatile(
3253 
3254       "1:                                        \n"
3255       "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 ayuv
3256       "prfm        pldl1keep, [%0, 448]          \n"
3257       "uaddlp      v0.8h, v0.16b                 \n"  // V 16 bytes -> 8 shorts.
3258       "uaddlp      v1.8h, v1.16b                 \n"  // U 16 bytes -> 8 shorts.
3259       "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
3260       "prfm        pldl1keep, [%1, 448]          \n"
3261       "uadalp      v0.8h, v4.16b                 \n"  // V 16 bytes -> 8 shorts.
3262       "uadalp      v1.8h, v5.16b                 \n"  // U 16 bytes -> 8 shorts.
3263       "uqrshrn     v3.8b, v0.8h, #2              \n"  // 2x2 average
3264       "uqrshrn     v2.8b, v1.8h, #2              \n"
3265       "subs        %w3, %w3, #16                 \n"  // 16 processed per loop.
3266       "st2         {v2.8b,v3.8b}, [%2], #16      \n"  // store 8 pixels UV.
3267       "b.gt        1b                            \n"
3268       : "+r"(src_ayuv),    // %0
3269         "+r"(src_ayuv_1),  // %1
3270         "+r"(dst_uv),      // %2
3271         "+r"(width)        // %3
3272       :
3273       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
3274 }
3275 
AYUVToVURow_NEON(const uint8_t * src_ayuv,int src_stride_ayuv,uint8_t * dst_vu,int width)3276 void AYUVToVURow_NEON(const uint8_t* src_ayuv,
3277                       int src_stride_ayuv,
3278                       uint8_t* dst_vu,
3279                       int width) {
3280   const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv;
3281   asm volatile(
3282 
3283       "1:                                        \n"
3284       "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 ayuv
3285       "prfm        pldl1keep, [%0, 448]          \n"
3286       "uaddlp      v0.8h, v0.16b                 \n"  // V 16 bytes -> 8 shorts.
3287       "uaddlp      v1.8h, v1.16b                 \n"  // U 16 bytes -> 8 shorts.
3288       "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
3289       "prfm        pldl1keep, [%1, 448]          \n"
3290       "uadalp      v0.8h, v4.16b                 \n"  // V 16 bytes -> 8 shorts.
3291       "uadalp      v1.8h, v5.16b                 \n"  // U 16 bytes -> 8 shorts.
3292       "uqrshrn     v0.8b, v0.8h, #2              \n"  // 2x2 average
3293       "uqrshrn     v1.8b, v1.8h, #2              \n"
3294       "subs        %w3, %w3, #16                 \n"  // 16 processed per loop.
3295       "st2         {v0.8b,v1.8b}, [%2], #16      \n"  // store 8 pixels VU.
3296       "b.gt        1b                            \n"
3297       : "+r"(src_ayuv),    // %0
3298         "+r"(src_ayuv_1),  // %1
3299         "+r"(dst_vu),      // %2
3300         "+r"(width)        // %3
3301       :
3302       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
3303 }
3304 
3305 // Copy row of AYUV Y's into Y
AYUVToYRow_NEON(const uint8_t * src_ayuv,uint8_t * dst_y,int width)3306 void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
3307   asm volatile(
3308       "1:                                        \n"
3309       "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16
3310       "prfm        pldl1keep, [%0, 448]          \n"
3311       "subs        %w2, %w2, #16                 \n"  // 16 pixels per loop
3312       "st1         {v2.16b}, [%1], #16           \n"  // store 16 Y pixels
3313       "b.gt        1b                            \n"
3314       : "+r"(src_ayuv),  // %0
3315         "+r"(dst_y),     // %1
3316         "+r"(width)      // %2
3317       :
3318       : "cc", "memory", "v0", "v1", "v2", "v3");
3319 }
3320 
// Shuffle table for swapping UV bytes: output byte i is taken from input
// byte i ^ 1, so each adjacent pair of bytes within a 16-byte vector is
// exchanged. Used as the index vector for the tbl lookups in SwapUVRow_NEON.
static const uvec8 kShuffleSwapUV = {1u, 0u, 3u,  2u,  5u,  4u,  7u,  6u,
                                     9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u};
3324 
3325 // Convert UV plane of NV12 to VU of NV21.
SwapUVRow_NEON(const uint8_t * src_uv,uint8_t * dst_vu,int width)3326 void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
3327   asm volatile(
3328       "ld1         {v2.16b}, [%3]                \n"  // shuffler
3329       "1:                                        \n"
3330       "ld1         {v0.16b}, [%0], 16            \n"  // load 16 UV values
3331       "ld1         {v1.16b}, [%0], 16            \n"
3332       "prfm        pldl1keep, [%0, 448]          \n"
3333       "subs        %w2, %w2, #16                 \n"  // 16 pixels per loop
3334       "tbl         v0.16b, {v0.16b}, v2.16b      \n"
3335       "tbl         v1.16b, {v1.16b}, v2.16b      \n"
3336       "stp         q0, q1, [%1], 32              \n"  // store 16 VU pixels
3337       "b.gt        1b                            \n"
3338       : "+r"(src_uv),         // %0
3339         "+r"(dst_vu),         // %1
3340         "+r"(width)           // %2
3341       : "r"(&kShuffleSwapUV)  // %3
3342       : "cc", "memory", "v0", "v1", "v2");
3343 }
3344 
HalfMergeUVRow_NEON(const uint8_t * src_u,int src_stride_u,const uint8_t * src_v,int src_stride_v,uint8_t * dst_uv,int width)3345 void HalfMergeUVRow_NEON(const uint8_t* src_u,
3346                          int src_stride_u,
3347                          const uint8_t* src_v,
3348                          int src_stride_v,
3349                          uint8_t* dst_uv,
3350                          int width) {
3351   const uint8_t* src_u_1 = src_u + src_stride_u;
3352   const uint8_t* src_v_1 = src_v + src_stride_v;
3353   asm volatile(
3354       "1:                                        \n"
3355       "ld1         {v0.16b}, [%0], #16           \n"  // load 16 U values
3356       "ld1         {v1.16b}, [%2], #16           \n"  // load 16 V values
3357       "ld1         {v2.16b}, [%1], #16           \n"
3358       "ld1         {v3.16b}, [%3], #16           \n"
3359       "uaddlp      v0.8h, v0.16b                 \n"  // half size
3360       "prfm        pldl1keep, [%0, 448]          \n"
3361       "uaddlp      v1.8h, v1.16b                 \n"
3362       "prfm        pldl1keep, [%2, 448]          \n"
3363       "uadalp      v0.8h, v2.16b                 \n"
3364       "prfm        pldl1keep, [%1, 448]          \n"
3365       "uadalp      v1.8h, v3.16b                 \n"
3366       "prfm        pldl1keep, [%3, 448]          \n"
3367       "uqrshrn     v0.8b, v0.8h, #2              \n"
3368       "uqrshrn     v1.8b, v1.8h, #2              \n"
3369       "subs        %w5, %w5, #16                 \n"  // 16 src pixels per loop
3370       "st2         {v0.8b, v1.8b}, [%4], #16     \n"  // store 8 UV pixels
3371       "b.gt        1b                            \n"
3372       : "+r"(src_u),    // %0
3373         "+r"(src_u_1),  // %1
3374         "+r"(src_v),    // %2
3375         "+r"(src_v_1),  // %3
3376         "+r"(dst_uv),   // %4
3377         "+r"(width)     // %5
3378       :
3379       : "cc", "memory", "v0", "v1", "v2", "v3");
3380 }
3381 
3382 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
3383 
3384 #ifdef __cplusplus
3385 }  // extern "C"
3386 }  // namespace libyuv
3387 #endif
3388