1 /*
2 * Copyright 2014 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "libyuv/row.h"
12
13 #ifdef __cplusplus
14 namespace libyuv {
15 extern "C" {
16 #endif
17
18 // This module is for GCC Neon armv8 64 bit.
19 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
20
// Read 8 Y, 4 U and 4 V from 422.
// Operands: %0 = Y pointer, %1 = U pointer, %2 = V pointer; each pointer is
// post-incremented past the bytes it supplied.
// Result: v0.8b = 8 Y, v1.8b = 4 U (lane s[0]) followed by 4 V (lane s[1]).
#define READYUV422                                                             \
  "ld1 {v0.8b}, [%0], #8 \n"                                                   \
  "ld1 {v1.s}[0], [%1], #4 \n"                                                 \
  "ld1 {v1.s}[1], [%2], #4 \n"
26
// Read 8 Y, 8 U and 8 V from 444.
// Operands: %0 = Y pointer, %1 = U pointer, %2 = V pointer (post-incremented).
// The 8 U and 8 V bytes are reduced to 4 U and 4 V by summing adjacent pairs
// (uaddlp) and halving with rounding (rshrn #1), so the result has the same
// layout as READYUV422: v0.8b = 8 Y, v1.8b = 4 U followed by 4 V.
#define READYUV444                                                             \
  "ld1 {v0.8b}, [%0], #8 \n"                                                   \
  "ld1 {v1.d}[0], [%1], #8 \n"                                                 \
  "ld1 {v1.d}[1], [%2], #8 \n"                                                 \
  "uaddlp v1.8h, v1.16b \n"                                                    \
  "rshrn v1.8b, v1.8h, #1 \n"
34
// Read 8 Y, and set 4 U and 4 V to 128.
// Operand: %0 = Y pointer (post-incremented by 8).
// Result: v0.8b = 8 Y, v1.8b = eight bytes of 128.
#define READYUV400                                                             \
  "ld1 {v0.8b}, [%0], #8 \n"                                                   \
  "movi v1.8b , #128 \n"
39
// Read 8 Y and 4 UV from NV12.
// Operands: %0 = Y pointer, %1 = interleaved UV pointer (both post-incremented
// by 8).
// The even bytes of the UV load (U) go to the low half of v1 and the odd bytes
// (V) are inserted into its high half, giving v1.8b = 4 U followed by 4 V.
// Scratch: v2, v3.
#define READNV12                                                               \
  "ld1 {v0.8b}, [%0], #8 \n"                                                   \
  "ld1 {v2.8b}, [%1], #8 \n"                                                   \
  "uzp1 v1.8b, v2.8b, v2.8b \n"                                                \
  "uzp2 v3.8b, v2.8b, v2.8b \n"                                                \
  "ins v1.s[1], v3.s[0] \n"
47
// Read 8 Y and 4 VU from NV21.
// Operands: %0 = Y pointer, %1 = interleaved VU pointer (both post-incremented
// by 8).
// Same as READNV12 with the roles of even and odd bytes swapped: the odd bytes
// (U) land in the low half of v1 and the even bytes (V) in the high half, so
// v1.8b = 4 U followed by 4 V.
// Scratch: v2, v3.
#define READNV21                                                               \
  "ld1 {v0.8b}, [%0], #8 \n"                                                   \
  "ld1 {v2.8b}, [%1], #8 \n"                                                   \
  "uzp1 v3.8b, v2.8b, v2.8b \n"                                                \
  "uzp2 v1.8b, v2.8b, v2.8b \n"                                                \
  "ins v1.s[1], v3.s[0] \n"
55
// Read 8 YUY2 pixels (16 bytes).
// Operand: %0 = packed YUY2 pointer (post-incremented by 16).
// ld2 de-interleaves the bytes: even bytes (Y) into v0, odd bytes (alternating
// U and V) into v1. The U/V bytes are then separated so that
// v1.8b = 4 U followed by 4 V.
// Scratch: v3.
#define READYUY2                                                               \
  "ld2 {v0.8b, v1.8b}, [%0], #16 \n"                                           \
  "uzp2 v3.8b, v1.8b, v1.8b \n"                                                \
  "uzp1 v1.8b, v1.8b, v1.8b \n"                                                \
  "ins v1.s[1], v3.s[0] \n"
62
// Read 8 UYVY pixels (16 bytes).
// Operand: %0 = packed UYVY pointer (post-incremented by 16).
// ld2 de-interleaves the bytes: even bytes (alternating U and V) into v2, odd
// bytes (Y) into v3. Y is copied to v0 and the U/V bytes are separated so that
// v1.8b = 4 U followed by 4 V.
// Scratch: v2, v3.
#define READUYVY                                                               \
  "ld2 {v2.8b, v3.8b}, [%0], #16 \n"                                           \
  "orr v0.8b, v3.8b, v3.8b \n"                                                 \
  "uzp1 v1.8b, v2.8b, v2.8b \n"                                                \
  "uzp2 v3.8b, v2.8b, v2.8b \n"                                                \
  "ins v1.s[1], v3.s[0] \n"
70
// Loads the conversion constants used by YUVTORGB. Requires the named asm
// operands kUVBiasBGR, kYToRgb, kUVToRB and kUVToG to hold the addresses of
// the corresponding tables.
//   v24, v25, v26 <= three consecutive 16-bit values at kUVBiasBGR, each
//                    replicated across all lanes
//   v31           <= one 32-bit value at kYToRgb, replicated across all lanes
//   v27, v28      <= 16-bit elements at kUVToRB, de-interleaved (even / odd)
//   v29, v30      <= 16-bit elements at kUVToG, de-interleaved (even / odd)
// Writes v24-v31.
#define YUVTORGB_SETUP                                                         \
  "ld3r {v24.8h, v25.8h, v26.8h}, [%[kUVBiasBGR]] \n"                          \
  "ld1r {v31.4s}, [%[kYToRgb]] \n"                                             \
  "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n"                                      \
  "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n"
76
77 // clang-format off
78
// Converts 8 pixels of YUV held in registers to 8-bit R, G and B.
// Inputs:  v0.8b = 8 Y, v1.8b = 4 U followed by 4 V (the layout produced by
//          the READ* macros), v24-v31 = constants from YUVTORGB_SETUP.
// Outputs: vR, vG, vB (8b each), narrowed with unsigned saturation.
// Steps:   Y is scaled by v31 and shifted right by 16; each U and V sample is
//          duplicated so it covers two pixels; the bias (v24/v25/v26) and the
//          U/V products (v27-v30) are combined with saturating adds/subtracts
//          and the result is shifted right by 6.
// Scratch: v0, v1, v2, v3, v5, v6, v7.
#define YUVTORGB(vR, vG, vB)                                                   \
  "uxtl v0.8h, v0.8b \n" /* Extract Y */                                       \
  "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */                                \
  "ushll2 v3.4s, v0.8h, #0 \n" /* Y */                                         \
  "ushll v0.4s, v0.4h, #0 \n"                                                  \
  "mul v3.4s, v3.4s, v31.4s \n"                                                \
  "mul v0.4s, v0.4s, v31.4s \n"                                                \
  "sqshrun v0.4h, v0.4s, #16 \n"                                               \
  "sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */                                      \
  "uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */                            \
  "mov v2.d[0], v1.d[1] \n" /* Extract V */                                    \
  "uxtl v2.8h, v2.8b \n"                                                       \
  "uxtl v1.8h, v1.8b \n" /* Extract U */                                       \
  "mul v3.8h, v27.8h, v1.8h \n"                                                \
  "mul v5.8h, v29.8h, v1.8h \n"                                                \
  "mul v6.8h, v30.8h, v2.8h \n"                                                \
  "mul v7.8h, v28.8h, v2.8h \n"                                                \
  "sqadd v6.8h, v6.8h, v5.8h \n"                                               \
  "sqadd " #vB ".8h, v24.8h, v0.8h \n" /* B */                                 \
  "sqadd " #vG ".8h, v25.8h, v0.8h \n" /* G */                                 \
  "sqadd " #vR ".8h, v26.8h, v0.8h \n" /* R */                                 \
  "sqadd " #vB ".8h, " #vB ".8h, v3.8h \n" /* B */                             \
  "sqsub " #vG ".8h, " #vG ".8h, v6.8h \n" /* G */                             \
  "sqadd " #vR ".8h, " #vR ".8h, v7.8h \n" /* R */                             \
  "sqshrun " #vB ".8b, " #vB ".8h, #6 \n" /* B */                              \
  "sqshrun " #vG ".8b, " #vG ".8h, #6 \n" /* G */                              \
  "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */
106
107 // clang-format on
108
I444ToARGBRow_NEON(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)109 void I444ToARGBRow_NEON(const uint8_t* src_y,
110 const uint8_t* src_u,
111 const uint8_t* src_v,
112 uint8_t* dst_argb,
113 const struct YuvConstants* yuvconstants,
114 int width) {
115 asm volatile (
116 YUVTORGB_SETUP
117 "movi v23.8b, #255 \n" /* A */
118 "1: \n"
119 READYUV444
120 "prfm pldl1keep, [%0, 448] \n"
121 YUVTORGB(v22, v21, v20)
122 "prfm pldl1keep, [%1, 448] \n"
123 "prfm pldl1keep, [%2, 448] \n"
124 "subs %w4, %w4, #8 \n"
125 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
126 "b.gt 1b \n"
127 : "+r"(src_y), // %0
128 "+r"(src_u), // %1
129 "+r"(src_v), // %2
130 "+r"(dst_argb), // %3
131 "+r"(width) // %4
132 : [kUVToRB]"r"(&yuvconstants->kUVToRB),
133 [kUVToG]"r"(&yuvconstants->kUVToG),
134 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
135 [kYToRgb]"r"(&yuvconstants->kYToRgb)
136 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
137 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
138 );
139 }
140
I422ToARGBRow_NEON(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)141 void I422ToARGBRow_NEON(const uint8_t* src_y,
142 const uint8_t* src_u,
143 const uint8_t* src_v,
144 uint8_t* dst_argb,
145 const struct YuvConstants* yuvconstants,
146 int width) {
147 asm volatile (
148 YUVTORGB_SETUP
149 "movi v23.8b, #255 \n" /* A */
150
151 "1: \n"
152 READYUV422
153 "prfm pldl1keep, [%0, 448] \n"
154 YUVTORGB(v22, v21, v20)
155 "prfm pldl1keep, [%1, 128] \n"
156 "prfm pldl1keep, [%2, 128] \n"
157 "subs %w4, %w4, #8 \n"
158 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
159 "b.gt 1b \n"
160 : "+r"(src_y), // %0
161 "+r"(src_u), // %1
162 "+r"(src_v), // %2
163 "+r"(dst_argb), // %3
164 "+r"(width) // %4
165 : [kUVToRB]"r"(&yuvconstants->kUVToRB),
166 [kUVToG]"r"(&yuvconstants->kUVToG),
167 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
168 [kYToRgb]"r"(&yuvconstants->kYToRgb)
169 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
170 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
171 );
172 }
173
I422AlphaToARGBRow_NEON(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,const uint8_t * src_a,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)174 void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
175 const uint8_t* src_u,
176 const uint8_t* src_v,
177 const uint8_t* src_a,
178 uint8_t* dst_argb,
179 const struct YuvConstants* yuvconstants,
180 int width) {
181 asm volatile (
182 YUVTORGB_SETUP
183 "1: \n"
184 READYUV422
185 "prfm pldl1keep, [%0, 448] \n"
186 YUVTORGB(v22, v21, v20)
187 "ld1 {v23.8b}, [%3], #8 \n"
188 "prfm pldl1keep, [%1, 128] \n"
189 "prfm pldl1keep, [%2, 128] \n"
190 "prfm pldl1keep, [%3, 448] \n"
191 "subs %w5, %w5, #8 \n"
192 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n"
193 "b.gt 1b \n"
194 : "+r"(src_y), // %0
195 "+r"(src_u), // %1
196 "+r"(src_v), // %2
197 "+r"(src_a), // %3
198 "+r"(dst_argb), // %4
199 "+r"(width) // %5
200 : [kUVToRB]"r"(&yuvconstants->kUVToRB),
201 [kUVToG]"r"(&yuvconstants->kUVToG),
202 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
203 [kYToRgb]"r"(&yuvconstants->kYToRgb)
204 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
205 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
206 );
207 }
208
I422ToRGBARow_NEON(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_rgba,const struct YuvConstants * yuvconstants,int width)209 void I422ToRGBARow_NEON(const uint8_t* src_y,
210 const uint8_t* src_u,
211 const uint8_t* src_v,
212 uint8_t* dst_rgba,
213 const struct YuvConstants* yuvconstants,
214 int width) {
215 asm volatile (
216 YUVTORGB_SETUP
217 "movi v20.8b, #255 \n" /* A */
218 "1: \n"
219 READYUV422
220 "prfm pldl1keep, [%0, 448] \n"
221 YUVTORGB(v23, v22, v21)
222 "prfm pldl1keep, [%1, 128] \n"
223 "prfm pldl1keep, [%2, 128] \n"
224 "subs %w4, %w4, #8 \n"
225 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
226 "b.gt 1b \n"
227 : "+r"(src_y), // %0
228 "+r"(src_u), // %1
229 "+r"(src_v), // %2
230 "+r"(dst_rgba), // %3
231 "+r"(width) // %4
232 : [kUVToRB]"r"(&yuvconstants->kUVToRB),
233 [kUVToG]"r"(&yuvconstants->kUVToG),
234 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
235 [kYToRgb]"r"(&yuvconstants->kYToRgb)
236 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
237 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
238 );
239 }
240
I422ToRGB24Row_NEON(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_rgb24,const struct YuvConstants * yuvconstants,int width)241 void I422ToRGB24Row_NEON(const uint8_t* src_y,
242 const uint8_t* src_u,
243 const uint8_t* src_v,
244 uint8_t* dst_rgb24,
245 const struct YuvConstants* yuvconstants,
246 int width) {
247 asm volatile (
248 YUVTORGB_SETUP
249 "1: \n"
250 READYUV422
251 "prfm pldl1keep, [%0, 448] \n"
252 YUVTORGB(v22, v21, v20)
253 "prfm pldl1keep, [%1, 128] \n"
254 "prfm pldl1keep, [%2, 128] \n"
255 "subs %w4, %w4, #8 \n"
256 "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
257 "b.gt 1b \n"
258 : "+r"(src_y), // %0
259 "+r"(src_u), // %1
260 "+r"(src_v), // %2
261 "+r"(dst_rgb24), // %3
262 "+r"(width) // %4
263 : [kUVToRB]"r"(&yuvconstants->kUVToRB),
264 [kUVToG]"r"(&yuvconstants->kUVToG),
265 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
266 [kYToRgb]"r"(&yuvconstants->kYToRgb)
267 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
268 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
269 );
270 }
271
// Packs 8 pixels into 16-bit RGB565.
// Inputs:  v20.8b = B, v21.8b = G, v22.8b = R.
// Output:  v0.8h, with R in the top 5 bits, G in the next 6 and B in the low 5.
// Each channel is first moved to the top byte of a 16-bit lane (shll #8) and
// then merged with shift-right-and-insert (sri). v20 and v21 are overwritten
// with their widened values.
#define ARGBTORGB565                                                           \
  "shll v0.8h, v22.8b, #8 \n" /* R */                                          \
  "shll v21.8h, v21.8b, #8 \n" /* G */                                         \
  "shll v20.8h, v20.8b, #8 \n" /* B */                                         \
  "sri v0.8h, v21.8h, #5 \n" /* RG */                                          \
  "sri v0.8h, v20.8h, #11 \n" /* RGB */
278
279 // clang-format off
280
I422ToRGB565Row_NEON(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_rgb565,const struct YuvConstants * yuvconstants,int width)281 void I422ToRGB565Row_NEON(const uint8_t* src_y,
282 const uint8_t* src_u,
283 const uint8_t* src_v,
284 uint8_t* dst_rgb565,
285 const struct YuvConstants* yuvconstants,
286 int width) {
287 asm volatile(
288 YUVTORGB_SETUP
289 "1: \n"
290 READYUV422
291 YUVTORGB(v22, v21, v20)
292 "prfm pldl1keep, [%0, 448] \n"
293 "subs %w4, %w4, #8 \n"
294 ARGBTORGB565
295 "prfm pldl1keep, [%1, 128] \n"
296 "prfm pldl1keep, [%2, 128] \n"
297 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565.
298 "b.gt 1b \n"
299 : "+r"(src_y), // %0
300 "+r"(src_u), // %1
301 "+r"(src_v), // %2
302 "+r"(dst_rgb565), // %3
303 "+r"(width) // %4
304 : [kUVToRB] "r"(&yuvconstants->kUVToRB),
305 [kUVToG] "r"(&yuvconstants->kUVToG),
306 [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
307 [kYToRgb] "r"(&yuvconstants->kYToRgb)
308 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
309 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
310 }
311
// Packs 8 pixels into 16-bit ARGB1555.
// Inputs:  v20.8b = B, v21.8b = G, v22.8b = R, v23.8b = A.
// Output:  v0.8h, with A in the top bit, then 5 bits each of R, G and B.
// Each channel is first moved to the top byte of a 16-bit lane (shll #8) and
// then merged with shift-right-and-insert (sri). v20, v21 and v22 are
// overwritten with their widened values.
#define ARGBTOARGB1555                                                         \
  "shll v0.8h, v23.8b, #8 \n" /* A */                                          \
  "shll v22.8h, v22.8b, #8 \n" /* R */                                         \
  "shll v21.8h, v21.8b, #8 \n" /* G */                                         \
  "shll v20.8h, v20.8b, #8 \n" /* B */                                         \
  "sri v0.8h, v22.8h, #1 \n" /* AR */                                          \
  "sri v0.8h, v21.8h, #6 \n" /* ARG */                                         \
  "sri v0.8h, v20.8h, #11 \n" /* ARGB */
320
I422ToARGB1555Row_NEON(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_argb1555,const struct YuvConstants * yuvconstants,int width)321 void I422ToARGB1555Row_NEON(const uint8_t* src_y,
322 const uint8_t* src_u,
323 const uint8_t* src_v,
324 uint8_t* dst_argb1555,
325 const struct YuvConstants* yuvconstants,
326 int width) {
327 asm volatile(
328 YUVTORGB_SETUP
329 "movi v23.8b, #255 \n"
330 "1: \n"
331 READYUV422
332 YUVTORGB(v22, v21, v20)
333 "prfm pldl1keep, [%0, 448] \n"
334 "subs %w4, %w4, #8 \n"
335 ARGBTOARGB1555
336 "prfm pldl1keep, [%1, 128] \n"
337 "prfm pldl1keep, [%2, 128] \n"
338 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565.
339 "b.gt 1b \n"
340 : "+r"(src_y), // %0
341 "+r"(src_u), // %1
342 "+r"(src_v), // %2
343 "+r"(dst_argb1555), // %3
344 "+r"(width) // %4
345 : [kUVToRB] "r"(&yuvconstants->kUVToRB),
346 [kUVToG] "r"(&yuvconstants->kUVToG),
347 [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
348 [kYToRgb] "r"(&yuvconstants->kYToRgb)
349 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
350 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
351 }
352 // clang-format on
353
// Packs 8 pixels into 16-bit ARGB4444.
// B and R are shifted down to their top 4 bits; G and A keep their top 4 bits
// in place (low nibble cleared with the 0x0f mask in v4). The B|G and R|A
// bytes are then interleaved so each pixel is a BG byte followed by an RA
// byte. Output is in v0.16b; v1 and v20-v23 are overwritten.
#define ARGBTOARGB4444                                                         \
  /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */          \
  "ushr v20.8b, v20.8b, #4 \n" /* B */                                         \
  "bic v21.8b, v21.8b, v4.8b \n" /* G */                                       \
  "ushr v22.8b, v22.8b, #4 \n" /* R */                                         \
  "bic v23.8b, v23.8b, v4.8b \n" /* A */                                       \
  "orr v0.8b, v20.8b, v21.8b \n" /* BG */                                      \
  "orr v1.8b, v22.8b, v23.8b \n" /* RA */                                      \
  "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */
363
I422ToARGB4444Row_NEON(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_argb4444,const struct YuvConstants * yuvconstants,int width)364 void I422ToARGB4444Row_NEON(const uint8_t* src_y,
365 const uint8_t* src_u,
366 const uint8_t* src_v,
367 uint8_t* dst_argb4444,
368 const struct YuvConstants* yuvconstants,
369 int width) {
370 asm volatile (
371 YUVTORGB_SETUP
372 "movi v4.16b, #0x0f \n" // bits to clear with vbic.
373 "1: \n"
374 READYUV422
375 YUVTORGB(v22, v21, v20)
376 "prfm pldl1keep, [%0, 448] \n"
377 "subs %w4, %w4, #8 \n"
378 "movi v23.8b, #255 \n"
379 ARGBTOARGB4444
380 "prfm pldl1keep, [%1, 128] \n"
381 "prfm pldl1keep, [%2, 128] \n"
382 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444.
383 "b.gt 1b \n"
384 : "+r"(src_y), // %0
385 "+r"(src_u), // %1
386 "+r"(src_v), // %2
387 "+r"(dst_argb4444), // %3
388 "+r"(width) // %4
389 : [kUVToRB]"r"(&yuvconstants->kUVToRB),
390 [kUVToG]"r"(&yuvconstants->kUVToG),
391 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
392 [kYToRgb]"r"(&yuvconstants->kYToRgb)
393 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
394 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
395 );
396 }
397
I400ToARGBRow_NEON(const uint8_t * src_y,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)398 void I400ToARGBRow_NEON(const uint8_t* src_y,
399 uint8_t* dst_argb,
400 const struct YuvConstants* yuvconstants,
401 int width) {
402 asm volatile (
403 YUVTORGB_SETUP
404 "movi v23.8b, #255 \n"
405 "1: \n"
406 READYUV400
407 YUVTORGB(v22, v21, v20)
408 "prfm pldl1keep, [%0, 448] \n"
409 "subs %w2, %w2, #8 \n"
410 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
411 "b.gt 1b \n"
412 : "+r"(src_y), // %0
413 "+r"(dst_argb), // %1
414 "+r"(width) // %2
415 : [kUVToRB]"r"(&yuvconstants->kUVToRB),
416 [kUVToG]"r"(&yuvconstants->kUVToG),
417 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
418 [kYToRgb]"r"(&yuvconstants->kYToRgb)
419 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
420 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
421 );
422 }
423
J400ToARGBRow_NEON(const uint8_t * src_y,uint8_t * dst_argb,int width)424 void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
425 asm volatile(
426 "movi v23.8b, #255 \n"
427 "1: \n"
428 "ld1 {v20.8b}, [%0], #8 \n"
429 "prfm pldl1keep, [%0, 448] \n"
430 "orr v21.8b, v20.8b, v20.8b \n"
431 "orr v22.8b, v20.8b, v20.8b \n"
432 "subs %w2, %w2, #8 \n"
433 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
434 "b.gt 1b \n"
435 : "+r"(src_y), // %0
436 "+r"(dst_argb), // %1
437 "+r"(width) // %2
438 :
439 : "cc", "memory", "v20", "v21", "v22", "v23");
440 }
441
NV12ToARGBRow_NEON(const uint8_t * src_y,const uint8_t * src_uv,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)442 void NV12ToARGBRow_NEON(const uint8_t* src_y,
443 const uint8_t* src_uv,
444 uint8_t* dst_argb,
445 const struct YuvConstants* yuvconstants,
446 int width) {
447 asm volatile (
448 YUVTORGB_SETUP
449 "movi v23.8b, #255 \n"
450 "1: \n"
451 READNV12
452 "prfm pldl1keep, [%0, 448] \n"
453 YUVTORGB(v22, v21, v20)
454 "prfm pldl1keep, [%1, 256] \n"
455 "subs %w3, %w3, #8 \n"
456 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
457 "b.gt 1b \n"
458 : "+r"(src_y), // %0
459 "+r"(src_uv), // %1
460 "+r"(dst_argb), // %2
461 "+r"(width) // %3
462 : [kUVToRB]"r"(&yuvconstants->kUVToRB),
463 [kUVToG]"r"(&yuvconstants->kUVToG),
464 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
465 [kYToRgb]"r"(&yuvconstants->kYToRgb)
466 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
467 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
468 );
469 }
470
NV21ToARGBRow_NEON(const uint8_t * src_y,const uint8_t * src_vu,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)471 void NV21ToARGBRow_NEON(const uint8_t* src_y,
472 const uint8_t* src_vu,
473 uint8_t* dst_argb,
474 const struct YuvConstants* yuvconstants,
475 int width) {
476 asm volatile (
477 YUVTORGB_SETUP
478 "movi v23.8b, #255 \n"
479 "1: \n"
480 READNV21
481 "prfm pldl1keep, [%0, 448] \n"
482 YUVTORGB(v22, v21, v20)
483 "prfm pldl1keep, [%1, 256] \n"
484 "subs %w3, %w3, #8 \n"
485 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
486 "b.gt 1b \n"
487 : "+r"(src_y), // %0
488 "+r"(src_vu), // %1
489 "+r"(dst_argb), // %2
490 "+r"(width) // %3
491 : [kUVToRB]"r"(&yuvconstants->kUVToRB),
492 [kUVToG]"r"(&yuvconstants->kUVToG),
493 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
494 [kYToRgb]"r"(&yuvconstants->kYToRgb)
495 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
496 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
497 );
498 }
499
NV12ToRGB24Row_NEON(const uint8_t * src_y,const uint8_t * src_uv,uint8_t * dst_rgb24,const struct YuvConstants * yuvconstants,int width)500 void NV12ToRGB24Row_NEON(const uint8_t* src_y,
501 const uint8_t* src_uv,
502 uint8_t* dst_rgb24,
503 const struct YuvConstants* yuvconstants,
504 int width) {
505 asm volatile (
506 YUVTORGB_SETUP
507 "1: \n"
508 READNV12
509 "prfm pldl1keep, [%0, 448] \n"
510 YUVTORGB(v22, v21, v20)
511 "prfm pldl1keep, [%1, 256] \n"
512 "subs %w3, %w3, #8 \n"
513 "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
514 "b.gt 1b \n"
515 : "+r"(src_y), // %0
516 "+r"(src_uv), // %1
517 "+r"(dst_rgb24), // %2
518 "+r"(width) // %3
519 : [kUVToRB]"r"(&yuvconstants->kUVToRB),
520 [kUVToG]"r"(&yuvconstants->kUVToG),
521 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
522 [kYToRgb]"r"(&yuvconstants->kYToRgb)
523 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
524 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
525 );
526 }
527
NV21ToRGB24Row_NEON(const uint8_t * src_y,const uint8_t * src_vu,uint8_t * dst_rgb24,const struct YuvConstants * yuvconstants,int width)528 void NV21ToRGB24Row_NEON(const uint8_t* src_y,
529 const uint8_t* src_vu,
530 uint8_t* dst_rgb24,
531 const struct YuvConstants* yuvconstants,
532 int width) {
533 asm volatile (
534 YUVTORGB_SETUP
535 "1: \n"
536 READNV21
537 "prfm pldl1keep, [%0, 448] \n"
538 YUVTORGB(v22, v21, v20)
539 "prfm pldl1keep, [%1, 256] \n"
540 "subs %w3, %w3, #8 \n"
541 "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
542 "b.gt 1b \n"
543 : "+r"(src_y), // %0
544 "+r"(src_vu), // %1
545 "+r"(dst_rgb24), // %2
546 "+r"(width) // %3
547 : [kUVToRB]"r"(&yuvconstants->kUVToRB),
548 [kUVToG]"r"(&yuvconstants->kUVToG),
549 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
550 [kYToRgb]"r"(&yuvconstants->kYToRgb)
551 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
552 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
553 );
554 }
555
NV12ToRGB565Row_NEON(const uint8_t * src_y,const uint8_t * src_uv,uint8_t * dst_rgb565,const struct YuvConstants * yuvconstants,int width)556 void NV12ToRGB565Row_NEON(const uint8_t* src_y,
557 const uint8_t* src_uv,
558 uint8_t* dst_rgb565,
559 const struct YuvConstants* yuvconstants,
560 int width) {
561 asm volatile(
562 YUVTORGB_SETUP "1: \n" READNV12
563 "prfm pldl1keep, [%0, 448] \n" YUVTORGB(
564 v22, v21, v20) ARGBTORGB565
565 "prfm pldl1keep, [%1, 256] \n"
566 "subs %w3, %w3, #8 \n"
567 "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels
568 "b.gt 1b \n"
569 : "+r"(src_y), // %0
570 "+r"(src_uv), // %1
571 "+r"(dst_rgb565), // %2
572 "+r"(width) // %3
573 : [kUVToRB] "r"(&yuvconstants->kUVToRB),
574 [kUVToG] "r"(&yuvconstants->kUVToG),
575 [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
576 [kYToRgb] "r"(&yuvconstants->kYToRgb)
577 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
578 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
579 }
580
YUY2ToARGBRow_NEON(const uint8_t * src_yuy2,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)581 void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
582 uint8_t* dst_argb,
583 const struct YuvConstants* yuvconstants,
584 int width) {
585 asm volatile (
586 YUVTORGB_SETUP
587 "movi v23.8b, #255 \n"
588 "1: \n"
589 READYUY2
590 "prfm pldl1keep, [%0, 448] \n"
591 YUVTORGB(v22, v21, v20)
592 "subs %w2, %w2, #8 \n"
593 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
594 "b.gt 1b \n"
595 : "+r"(src_yuy2), // %0
596 "+r"(dst_argb), // %1
597 "+r"(width) // %2
598 : [kUVToRB]"r"(&yuvconstants->kUVToRB),
599 [kUVToG]"r"(&yuvconstants->kUVToG),
600 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
601 [kYToRgb]"r"(&yuvconstants->kYToRgb)
602 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
603 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
604 );
605 }
606
UYVYToARGBRow_NEON(const uint8_t * src_uyvy,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)607 void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
608 uint8_t* dst_argb,
609 const struct YuvConstants* yuvconstants,
610 int width) {
611 asm volatile (
612 YUVTORGB_SETUP
613 "movi v23.8b, #255 \n"
614 "1: \n"
615 READUYVY
616 YUVTORGB(v22, v21, v20)
617 "prfm pldl1keep, [%0, 448] \n"
618 "subs %w2, %w2, #8 \n"
619 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"
620 "b.gt 1b \n"
621 : "+r"(src_uyvy), // %0
622 "+r"(dst_argb), // %1
623 "+r"(width) // %2
624 : [kUVToRB]"r"(&yuvconstants->kUVToRB),
625 [kUVToG]"r"(&yuvconstants->kUVToG),
626 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
627 [kYToRgb]"r"(&yuvconstants->kYToRgb)
628 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
629 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
630 );
631 }
632
633 // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
SplitUVRow_NEON(const uint8_t * src_uv,uint8_t * dst_u,uint8_t * dst_v,int width)634 void SplitUVRow_NEON(const uint8_t* src_uv,
635 uint8_t* dst_u,
636 uint8_t* dst_v,
637 int width) {
638 asm volatile(
639 "1: \n"
640 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
641 "prfm pldl1keep, [%0, 448] \n"
642 "subs %w3, %w3, #16 \n" // 16 processed per loop
643 "st1 {v0.16b}, [%1], #16 \n" // store U
644 "st1 {v1.16b}, [%2], #16 \n" // store V
645 "b.gt 1b \n"
646 : "+r"(src_uv), // %0
647 "+r"(dst_u), // %1
648 "+r"(dst_v), // %2
649 "+r"(width) // %3 // Output registers
650 : // Input registers
651 : "cc", "memory", "v0", "v1" // Clobber List
652 );
653 }
654
655 // Reads 16 U's and V's and writes out 16 pairs of UV.
MergeUVRow_NEON(const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_uv,int width)656 void MergeUVRow_NEON(const uint8_t* src_u,
657 const uint8_t* src_v,
658 uint8_t* dst_uv,
659 int width) {
660 asm volatile(
661 "1: \n"
662 "ld1 {v0.16b}, [%0], #16 \n" // load U
663 "ld1 {v1.16b}, [%1], #16 \n" // load V
664 "prfm pldl1keep, [%0, 448] \n"
665 "prfm pldl1keep, [%1, 448] \n"
666 "subs %w3, %w3, #16 \n" // 16 processed per loop
667 "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
668 "b.gt 1b \n"
669 : "+r"(src_u), // %0
670 "+r"(src_v), // %1
671 "+r"(dst_uv), // %2
672 "+r"(width) // %3 // Output registers
673 : // Input registers
674 : "cc", "memory", "v0", "v1" // Clobber List
675 );
676 }
677
678 // Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b.
SplitRGBRow_NEON(const uint8_t * src_rgb,uint8_t * dst_r,uint8_t * dst_g,uint8_t * dst_b,int width)679 void SplitRGBRow_NEON(const uint8_t* src_rgb,
680 uint8_t* dst_r,
681 uint8_t* dst_g,
682 uint8_t* dst_b,
683 int width) {
684 asm volatile(
685 "1: \n"
686 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB
687 "prfm pldl1keep, [%0, 448] \n"
688 "subs %w4, %w4, #16 \n" // 16 processed per loop
689 "st1 {v0.16b}, [%1], #16 \n" // store R
690 "st1 {v1.16b}, [%2], #16 \n" // store G
691 "st1 {v2.16b}, [%3], #16 \n" // store B
692 "b.gt 1b \n"
693 : "+r"(src_rgb), // %0
694 "+r"(dst_r), // %1
695 "+r"(dst_g), // %2
696 "+r"(dst_b), // %3
697 "+r"(width) // %4
698 : // Input registers
699 : "cc", "memory", "v0", "v1", "v2" // Clobber List
700 );
701 }
702
703 // Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time
MergeRGBRow_NEON(const uint8_t * src_r,const uint8_t * src_g,const uint8_t * src_b,uint8_t * dst_rgb,int width)704 void MergeRGBRow_NEON(const uint8_t* src_r,
705 const uint8_t* src_g,
706 const uint8_t* src_b,
707 uint8_t* dst_rgb,
708 int width) {
709 asm volatile(
710 "1: \n"
711 "ld1 {v0.16b}, [%0], #16 \n" // load R
712 "ld1 {v1.16b}, [%1], #16 \n" // load G
713 "ld1 {v2.16b}, [%2], #16 \n" // load B
714 "prfm pldl1keep, [%0, 448] \n"
715 "prfm pldl1keep, [%1, 448] \n"
716 "prfm pldl1keep, [%2, 448] \n"
717 "subs %w4, %w4, #16 \n" // 16 processed per loop
718 "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB
719 "prfm pldl1keep, [%0, 448] \n"
720 "b.gt 1b \n"
721 : "+r"(src_r), // %0
722 "+r"(src_g), // %1
723 "+r"(src_b), // %2
724 "+r"(dst_rgb), // %3
725 "+r"(width) // %4
726 : // Input registers
727 : "cc", "memory", "v0", "v1", "v2" // Clobber List
728 );
729 }
730
731 // Copy multiple of 32.
CopyRow_NEON(const uint8_t * src,uint8_t * dst,int width)732 void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
733 asm volatile(
734 "1: \n"
735 "ldp q0, q1, [%0], #32 \n"
736 "prfm pldl1keep, [%0, 448] \n"
737 "subs %w2, %w2, #32 \n" // 32 processed per loop
738 "stp q0, q1, [%1], #32 \n"
739 "b.gt 1b \n"
740 : "+r"(src), // %0
741 "+r"(dst), // %1
742 "+r"(width) // %2 // Output registers
743 : // Input registers
744 : "cc", "memory", "v0", "v1" // Clobber List
745 );
746 }
747
748 // SetRow writes 'width' bytes using an 8 bit value repeated.
SetRow_NEON(uint8_t * dst,uint8_t v8,int width)749 void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
750 asm volatile(
751 "dup v0.16b, %w2 \n" // duplicate 16 bytes
752 "1: \n"
753 "subs %w1, %w1, #16 \n" // 16 bytes per loop
754 "st1 {v0.16b}, [%0], #16 \n" // store
755 "b.gt 1b \n"
756 : "+r"(dst), // %0
757 "+r"(width) // %1
758 : "r"(v8) // %2
759 : "cc", "memory", "v0");
760 }
761
ARGBSetRow_NEON(uint8_t * dst,uint32_t v32,int width)762 void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
763 asm volatile(
764 "dup v0.4s, %w2 \n" // duplicate 4 ints
765 "1: \n"
766 "subs %w1, %w1, #4 \n" // 4 ints per loop
767 "st1 {v0.16b}, [%0], #16 \n" // store
768 "b.gt 1b \n"
769 : "+r"(dst), // %0
770 "+r"(width) // %1
771 : "r"(v32) // %2
772 : "cc", "memory", "v0");
773 }
774
// Shuffle table for reversing the bytes.
// Used as the index vector of a tbl instruction: output byte i is taken from
// input byte 15 - i, so a 16-byte vector comes out in reverse order.
static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
                                     7u,  6u,  5u,  4u,  3u,  2u,  1u, 0u};
778
MirrorRow_NEON(const uint8_t * src,uint8_t * dst,int width)779 void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
780 asm volatile(
781 // Start at end of source row.
782 "ld1 {v3.16b}, [%3] \n" // shuffler
783 "add %0, %0, %w2, sxtw \n"
784 "sub %0, %0, #32 \n"
785 "1: \n"
786 "ldr q2, [%0, 16] \n"
787 "ldr q1, [%0], -32 \n" // src -= 32
788 "subs %w2, %w2, #32 \n" // 32 pixels per loop.
789 "tbl v0.16b, {v2.16b}, v3.16b \n"
790 "tbl v1.16b, {v1.16b}, v3.16b \n"
791 "st1 {v0.16b, v1.16b}, [%1], #32 \n" // store 32 pixels
792 "b.gt 1b \n"
793 : "+r"(src), // %0
794 "+r"(dst), // %1
795 "+r"(width) // %2
796 : "r"(&kShuffleMirror) // %3
797 : "cc", "memory", "v0", "v1", "v2", "v3");
798 }
799
// Shuffle table for reversing the UV.
// Used as the index vector of a tbl instruction: the eight 2-byte pairs of a
// 16-byte vector come out in reverse order while the two bytes inside each
// pair keep their order.
static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
                                       6u,  7u,  4u,  5u,  2u,  3u,  0u, 1u};
803
MirrorUVRow_NEON(const uint8_t * src_uv,uint8_t * dst_uv,int width)804 void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
805 asm volatile(
806 // Start at end of source row.
807 "ld1 {v4.16b}, [%3] \n" // shuffler
808 "add %0, %0, %w2, sxtw #1 \n"
809 "sub %0, %0, #32 \n"
810 "1: \n"
811 "ldr q1, [%0, 16] \n"
812 "ldr q0, [%0], -32 \n" // src -= 32
813 "subs %w2, %w2, #16 \n" // 16 pixels per loop.
814 "tbl v2.16b, {v1.16b}, v4.16b \n"
815 "tbl v3.16b, {v0.16b}, v4.16b \n"
816 "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32
817 "b.gt 1b \n"
818 : "+r"(src_uv), // %0
819 "+r"(dst_uv), // %1
820 "+r"(width) // %2
821 : "r"(&kShuffleMirrorUV) // %3
822 : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
823 }
824
MirrorSplitUVRow_NEON(const uint8_t * src_uv,uint8_t * dst_u,uint8_t * dst_v,int width)825 void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
826 uint8_t* dst_u,
827 uint8_t* dst_v,
828 int width) {
829 asm volatile(
830 // Start at end of source row.
831 "ld1 {v4.16b}, [%4] \n" // shuffler
832 "add %0, %0, %w3, sxtw #1 \n"
833 "sub %0, %0, #32 \n"
834 "1: \n"
835 "ldr q1, [%0, 16] \n"
836 "ldr q0, [%0], -32 \n" // src -= 32
837 "subs %w3, %w3, #16 \n" // 16 pixels per loop.
838 "tbl v2.16b, {v1.16b}, v4.16b \n"
839 "tbl v3.16b, {v0.16b}, v4.16b \n"
840 "uzp1 v0.16b, v2.16b, v3.16b \n" // U
841 "uzp2 v1.16b, v2.16b, v3.16b \n" // V
842 "st1 {v0.16b}, [%1], #16 \n" // dst += 16
843 "st1 {v1.16b}, [%2], #16 \n"
844 "b.gt 1b \n"
845 : "+r"(src_uv), // %0
846 "+r"(dst_u), // %1
847 "+r"(dst_v), // %2
848 "+r"(width) // %3
849 : "r"(&kShuffleMirrorUV) // %4
850 : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
851 }
852
// Shuffle table for reversing the ARGB.
// Used as the index vector of a tbl instruction: the four 4-byte pixels of a
// 16-byte vector come out in reverse order while the bytes inside each pixel
// keep their order.
static const uvec8 kShuffleMirrorARGB = {12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u,
                                         4u,  5u,  6u,  7u,  0u, 1u, 2u,  3u};
856
ARGBMirrorRow_NEON(const uint8_t * src_argb,uint8_t * dst_argb,int width)857 void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
858 asm volatile(
859 // Start at end of source row.
860 "ld1 {v4.16b}, [%3] \n" // shuffler
861 "add %0, %0, %w2, sxtw #2 \n"
862 "sub %0, %0, #32 \n"
863 "1: \n"
864 "ldr q1, [%0, 16] \n"
865 "ldr q0, [%0], -32 \n" // src -= 32
866 "subs %w2, %w2, #8 \n" // 8 pixels per loop.
867 "tbl v2.16b, {v1.16b}, v4.16b \n"
868 "tbl v3.16b, {v0.16b}, v4.16b \n"
869 "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32
870 "b.gt 1b \n"
871 : "+r"(src_argb), // %0
872 "+r"(dst_argb), // %1
873 "+r"(width) // %2
874 : "r"(&kShuffleMirrorARGB) // %3
875 : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
876 }
877
RGB24MirrorRow_NEON(const uint8_t * src_rgb24,uint8_t * dst_rgb24,int width)878 void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
879 uint8_t* dst_rgb24,
880 int width) {
881 asm volatile(
882 "ld1 {v3.16b}, [%4] \n" // shuffler
883 "add %0, %0, %w2, sxtw #1 \n" // Start at end of row.
884 "add %0, %0, %w2, sxtw \n"
885 "sub %0, %0, #48 \n"
886
887 "1: \n"
888 "ld3 {v0.16b, v1.16b, v2.16b}, [%0], %3 \n" // src -= 48
889 "subs %w2, %w2, #16 \n" // 16 pixels per loop.
890 "tbl v0.16b, {v0.16b}, v3.16b \n"
891 "tbl v1.16b, {v1.16b}, v3.16b \n"
892 "tbl v2.16b, {v2.16b}, v3.16b \n"
893 "st3 {v0.16b, v1.16b, v2.16b}, [%1], #48 \n" // dst += 48
894 "b.gt 1b \n"
895 : "+r"(src_rgb24), // %0
896 "+r"(dst_rgb24), // %1
897 "+r"(width) // %2
898 : "r"((ptrdiff_t)-48), // %3
899 "r"(&kShuffleMirror) // %4
900 : "cc", "memory", "v0", "v1", "v2", "v3");
901 }
902
RGB24ToARGBRow_NEON(const uint8_t * src_rgb24,uint8_t * dst_argb,int width)903 void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
904 uint8_t* dst_argb,
905 int width) {
906 asm volatile(
907 "movi v4.8b, #255 \n" // Alpha
908 "1: \n"
909 "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of
910 // RGB24.
911 "prfm pldl1keep, [%0, 448] \n"
912 "subs %w2, %w2, #8 \n" // 8 processed per loop.
913 "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB
914 "b.gt 1b \n"
915 : "+r"(src_rgb24), // %0
916 "+r"(dst_argb), // %1
917 "+r"(width) // %2
918 :
919 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
920 );
921 }
922
RAWToARGBRow_NEON(const uint8_t * src_raw,uint8_t * dst_argb,int width)923 void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
924 asm volatile(
925 "movi v5.8b, #255 \n" // Alpha
926 "1: \n"
927 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
928 "prfm pldl1keep, [%0, 448] \n"
929 "subs %w2, %w2, #8 \n" // 8 processed per loop.
930 "orr v3.8b, v1.8b, v1.8b \n" // move g
931 "orr v4.8b, v0.8b, v0.8b \n" // move r
932 "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
933 "b.gt 1b \n"
934 : "+r"(src_raw), // %0
935 "+r"(dst_argb), // %1
936 "+r"(width) // %2
937 :
938 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
939 );
940 }
941
RAWToRGBARow_NEON(const uint8_t * src_raw,uint8_t * dst_rgba,int width)942 void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
943 asm volatile(
944 "movi v0.8b, #255 \n" // Alpha
945 "1: \n"
946 "ld3 {v3.8b,v4.8b,v5.8b}, [%0], #24 \n" // read r g b
947 "prfm pldl1keep, [%0, 448] \n"
948 "subs %w2, %w2, #8 \n" // 8 processed per loop.
949 "orr v2.8b, v4.8b, v4.8b \n" // move g
950 "orr v1.8b, v5.8b, v5.8b \n" // move r
951 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store a b g r
952 "b.gt 1b \n"
953 : "+r"(src_raw), // %0
954 "+r"(dst_rgba), // %1
955 "+r"(width) // %2
956 :
957 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
958 );
959 }
960
RAWToRGB24Row_NEON(const uint8_t * src_raw,uint8_t * dst_rgb24,int width)961 void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
962 asm volatile(
963 "1: \n"
964 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
965 "prfm pldl1keep, [%0, 448] \n"
966 "subs %w2, %w2, #8 \n" // 8 processed per loop.
967 "orr v3.8b, v1.8b, v1.8b \n" // move g
968 "orr v4.8b, v0.8b, v0.8b \n" // move r
969 "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r
970 "b.gt 1b \n"
971 : "+r"(src_raw), // %0
972 "+r"(dst_rgb24), // %1
973 "+r"(width) // %2
974 :
975 : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
976 );
977 }
978
979 #define RGB565TOARGB \
980 "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \
981 "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \
982 "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \
983 "orr v1.8b, v4.8b, v6.8b \n" /* G */ \
984 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
985 "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \
986 "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \
987 "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \
988 "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \
989 "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \
990 "dup v2.2D, v0.D[1] \n" /* R */
991
RGB565ToARGBRow_NEON(const uint8_t * src_rgb565,uint8_t * dst_argb,int width)992 void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
993 uint8_t* dst_argb,
994 int width) {
995 asm volatile(
996 "movi v3.8b, #255 \n" // Alpha
997 "1: \n"
998 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
999 "prfm pldl1keep, [%0, 448] \n"
1000 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1001 RGB565TOARGB
1002 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
1003 "b.gt 1b \n"
1004 : "+r"(src_rgb565), // %0
1005 "+r"(dst_argb), // %1
1006 "+r"(width) // %2
1007 :
1008 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List
1009 );
1010 }
1011
1012 #define ARGB1555TOARGB \
1013 "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
1014 "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
1015 "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \
1016 \
1017 "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \
1018 "xtn2 v3.16b, v2.8h \n" \
1019 \
1020 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
1021 "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
1022 \
1023 "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \
1024 "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
1025 "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
1026 \
1027 "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
1028 "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \
1029 "dup v1.2D, v0.D[1] \n" \
1030 "dup v3.2D, v2.D[1] \n"
1031
1032 // RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
1033 #define RGB555TOARGB \
1034 "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
1035 "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
1036 "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \
1037 \
1038 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
1039 "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
1040 \
1041 "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \
1042 "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
1043 "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
1044 \
1045 "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
1046 "orr v2.16b, v1.16b, v3.16b \n" /* R */ \
1047 "dup v1.2D, v0.D[1] \n" /* G */
1048
ARGB1555ToARGBRow_NEON(const uint8_t * src_argb1555,uint8_t * dst_argb,int width)1049 void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
1050 uint8_t* dst_argb,
1051 int width) {
1052 asm volatile(
1053 "movi v3.8b, #255 \n" // Alpha
1054 "1: \n"
1055 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
1056 "prfm pldl1keep, [%0, 448] \n"
1057 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1058 ARGB1555TOARGB
1059 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
1060 "b.gt 1b \n"
1061 : "+r"(src_argb1555), // %0
1062 "+r"(dst_argb), // %1
1063 "+r"(width) // %2
1064 :
1065 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
1066 );
1067 }
1068
1069 // Convert v0.8h to b = v0.8b g = v1.8b r = v2.8b
1070 // clobbers v3
1071 #define ARGB4444TOARGB \
1072 "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \
1073 "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \
1074 "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \
1075 "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \
1076 "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \
1077 "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \
1078 "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \
1079 "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \
1080 "dup v0.2D, v2.D[1] \n" \
1081 "dup v1.2D, v3.D[1] \n"
1082
ARGB4444ToARGBRow_NEON(const uint8_t * src_argb4444,uint8_t * dst_argb,int width)1083 void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
1084 uint8_t* dst_argb,
1085 int width) {
1086 asm volatile(
1087 "1: \n"
1088 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
1089 "prfm pldl1keep, [%0, 448] \n"
1090 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1091 ARGB4444TOARGB
1092 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
1093 "b.gt 1b \n"
1094 : "+r"(src_argb4444), // %0
1095 "+r"(dst_argb), // %1
1096 "+r"(width) // %2
1097 :
1098 : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
1099 );
1100 }
1101
ARGBToRGB24Row_NEON(const uint8_t * src_argb,uint8_t * dst_rgb24,int width)1102 void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
1103 uint8_t* dst_rgb24,
1104 int width) {
1105 asm volatile(
1106 "1: \n"
1107 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB
1108 "prfm pldl1keep, [%0, 448] \n"
1109 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1110 "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of
1111 // RGB24
1112 "b.gt 1b \n"
1113 : "+r"(src_argb), // %0
1114 "+r"(dst_rgb24), // %1
1115 "+r"(width) // %2
1116 :
1117 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
1118 );
1119 }
1120
ARGBToRAWRow_NEON(const uint8_t * src_argb,uint8_t * dst_raw,int width)1121 void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
1122 asm volatile(
1123 "1: \n"
1124 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
1125 "prfm pldl1keep, [%0, 448] \n"
1126 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1127 "orr v4.8b, v2.8b, v2.8b \n" // mov g
1128 "orr v5.8b, v1.8b, v1.8b \n" // mov b
1129 "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b
1130 "b.gt 1b \n"
1131 : "+r"(src_argb), // %0
1132 "+r"(dst_raw), // %1
1133 "+r"(width) // %2
1134 :
1135 : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List
1136 );
1137 }
1138
YUY2ToYRow_NEON(const uint8_t * src_yuy2,uint8_t * dst_y,int width)1139 void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
1140 asm volatile(
1141 "1: \n"
1142 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
1143 "prfm pldl1keep, [%0, 448] \n"
1144 "subs %w2, %w2, #16 \n" // 16 processed per loop.
1145 "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
1146 "b.gt 1b \n"
1147 : "+r"(src_yuy2), // %0
1148 "+r"(dst_y), // %1
1149 "+r"(width) // %2
1150 :
1151 : "cc", "memory", "v0", "v1" // Clobber List
1152 );
1153 }
1154
UYVYToYRow_NEON(const uint8_t * src_uyvy,uint8_t * dst_y,int width)1155 void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
1156 asm volatile(
1157 "1: \n"
1158 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
1159 "prfm pldl1keep, [%0, 448] \n"
1160 "subs %w2, %w2, #16 \n" // 16 processed per loop.
1161 "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
1162 "b.gt 1b \n"
1163 : "+r"(src_uyvy), // %0
1164 "+r"(dst_y), // %1
1165 "+r"(width) // %2
1166 :
1167 : "cc", "memory", "v0", "v1" // Clobber List
1168 );
1169 }
1170
YUY2ToUV422Row_NEON(const uint8_t * src_yuy2,uint8_t * dst_u,uint8_t * dst_v,int width)1171 void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
1172 uint8_t* dst_u,
1173 uint8_t* dst_v,
1174 int width) {
1175 asm volatile(
1176 "1: \n"
1177 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2
1178 "prfm pldl1keep, [%0, 448] \n"
1179 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
1180 "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
1181 "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
1182 "b.gt 1b \n"
1183 : "+r"(src_yuy2), // %0
1184 "+r"(dst_u), // %1
1185 "+r"(dst_v), // %2
1186 "+r"(width) // %3
1187 :
1188 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
1189 );
1190 }
1191
UYVYToUV422Row_NEON(const uint8_t * src_uyvy,uint8_t * dst_u,uint8_t * dst_v,int width)1192 void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
1193 uint8_t* dst_u,
1194 uint8_t* dst_v,
1195 int width) {
1196 asm volatile(
1197 "1: \n"
1198 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY
1199 "prfm pldl1keep, [%0, 448] \n"
1200 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
1201 "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
1202 "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
1203 "b.gt 1b \n"
1204 : "+r"(src_uyvy), // %0
1205 "+r"(dst_u), // %1
1206 "+r"(dst_v), // %2
1207 "+r"(width) // %3
1208 :
1209 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
1210 );
1211 }
1212
YUY2ToUVRow_NEON(const uint8_t * src_yuy2,int stride_yuy2,uint8_t * dst_u,uint8_t * dst_v,int width)1213 void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
1214 int stride_yuy2,
1215 uint8_t* dst_u,
1216 uint8_t* dst_v,
1217 int width) {
1218 const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2;
1219 asm volatile(
1220 "1: \n"
1221 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
1222 "prfm pldl1keep, [%0, 448] \n"
1223 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
1224 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
1225 "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
1226 "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
1227 "st1 {v1.8b}, [%2], #8 \n" // store 8 U.
1228 "st1 {v3.8b}, [%3], #8 \n" // store 8 V.
1229 "b.gt 1b \n"
1230 : "+r"(src_yuy2), // %0
1231 "+r"(src_yuy2b), // %1
1232 "+r"(dst_u), // %2
1233 "+r"(dst_v), // %3
1234 "+r"(width) // %4
1235 :
1236 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
1237 "v7" // Clobber List
1238 );
1239 }
1240
UYVYToUVRow_NEON(const uint8_t * src_uyvy,int stride_uyvy,uint8_t * dst_u,uint8_t * dst_v,int width)1241 void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
1242 int stride_uyvy,
1243 uint8_t* dst_u,
1244 uint8_t* dst_v,
1245 int width) {
1246 const uint8_t* src_uyvyb = src_uyvy + stride_uyvy;
1247 asm volatile(
1248 "1: \n"
1249 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
1250 "prfm pldl1keep, [%0, 448] \n"
1251 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
1252 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
1253 "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
1254 "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
1255 "st1 {v0.8b}, [%2], #8 \n" // store 8 U.
1256 "st1 {v2.8b}, [%3], #8 \n" // store 8 V.
1257 "b.gt 1b \n"
1258 : "+r"(src_uyvy), // %0
1259 "+r"(src_uyvyb), // %1
1260 "+r"(dst_u), // %2
1261 "+r"(dst_v), // %3
1262 "+r"(width) // %4
1263 :
1264 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
1265 "v7" // Clobber List
1266 );
1267 }
1268
1269 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
ARGBShuffleRow_NEON(const uint8_t * src_argb,uint8_t * dst_argb,const uint8_t * shuffler,int width)1270 void ARGBShuffleRow_NEON(const uint8_t* src_argb,
1271 uint8_t* dst_argb,
1272 const uint8_t* shuffler,
1273 int width) {
1274 asm volatile(
1275 "ld1 {v2.16b}, [%3] \n" // shuffler
1276 "1: \n"
1277 "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
1278 "prfm pldl1keep, [%0, 448] \n"
1279 "subs %w2, %w2, #4 \n" // 4 processed per loop
1280 "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
1281 "st1 {v1.16b}, [%1], #16 \n" // store 4.
1282 "b.gt 1b \n"
1283 : "+r"(src_argb), // %0
1284 "+r"(dst_argb), // %1
1285 "+r"(width) // %2
1286 : "r"(shuffler) // %3
1287 : "cc", "memory", "v0", "v1", "v2" // Clobber List
1288 );
1289 }
1290
I422ToYUY2Row_NEON(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_yuy2,int width)1291 void I422ToYUY2Row_NEON(const uint8_t* src_y,
1292 const uint8_t* src_u,
1293 const uint8_t* src_v,
1294 uint8_t* dst_yuy2,
1295 int width) {
1296 asm volatile(
1297 "1: \n"
1298 "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
1299 "prfm pldl1keep, [%0, 448] \n"
1300 "orr v2.8b, v1.8b, v1.8b \n"
1301 "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
1302 "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
1303 "subs %w4, %w4, #16 \n" // 16 pixels
1304 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
1305 "b.gt 1b \n"
1306 : "+r"(src_y), // %0
1307 "+r"(src_u), // %1
1308 "+r"(src_v), // %2
1309 "+r"(dst_yuy2), // %3
1310 "+r"(width) // %4
1311 :
1312 : "cc", "memory", "v0", "v1", "v2", "v3");
1313 }
1314
I422ToUYVYRow_NEON(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_uyvy,int width)1315 void I422ToUYVYRow_NEON(const uint8_t* src_y,
1316 const uint8_t* src_u,
1317 const uint8_t* src_v,
1318 uint8_t* dst_uyvy,
1319 int width) {
1320 asm volatile(
1321 "1: \n"
1322 "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
1323 "prfm pldl1keep, [%0, 448] \n"
1324 "orr v3.8b, v2.8b, v2.8b \n"
1325 "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
1326 "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
1327 "subs %w4, %w4, #16 \n" // 16 pixels
1328 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
1329 "b.gt 1b \n"
1330 : "+r"(src_y), // %0
1331 "+r"(src_u), // %1
1332 "+r"(src_v), // %2
1333 "+r"(dst_uyvy), // %3
1334 "+r"(width) // %4
1335 :
1336 : "cc", "memory", "v0", "v1", "v2", "v3");
1337 }
1338
ARGBToRGB565Row_NEON(const uint8_t * src_argb,uint8_t * dst_rgb565,int width)1339 void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
1340 uint8_t* dst_rgb565,
1341 int width) {
1342 asm volatile(
1343 "1: \n"
1344 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8
1345 // pixels
1346 "prfm pldl1keep, [%0, 448] \n"
1347 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1348 ARGBTORGB565
1349 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
1350 "b.gt 1b \n"
1351 : "+r"(src_argb), // %0
1352 "+r"(dst_rgb565), // %1
1353 "+r"(width) // %2
1354 :
1355 : "cc", "memory", "v0", "v20", "v21", "v22", "v23");
1356 }
1357
ARGBToRGB565DitherRow_NEON(const uint8_t * src_argb,uint8_t * dst_rgb,const uint32_t dither4,int width)1358 void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
1359 uint8_t* dst_rgb,
1360 const uint32_t dither4,
1361 int width) {
1362 asm volatile(
1363 "dup v1.4s, %w2 \n" // dither4
1364 "1: \n"
1365 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8
1366 // pixels
1367 "prfm pldl1keep, [%0, 448] \n"
1368 "subs %w3, %w3, #8 \n" // 8 processed per loop.
1369 "uqadd v20.8b, v20.8b, v1.8b \n"
1370 "uqadd v21.8b, v21.8b, v1.8b \n"
1371 "uqadd v22.8b, v22.8b, v1.8b \n" ARGBTORGB565
1372 "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565.
1373 "b.gt 1b \n"
1374 : "+r"(dst_rgb) // %0
1375 : "r"(src_argb), // %1
1376 "r"(dither4), // %2
1377 "r"(width) // %3
1378 : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23");
1379 }
1380
ARGBToARGB1555Row_NEON(const uint8_t * src_argb,uint8_t * dst_argb1555,int width)1381 void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
1382 uint8_t* dst_argb1555,
1383 int width) {
1384 asm volatile(
1385 "1: \n"
1386 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8
1387 // pixels
1388 "prfm pldl1keep, [%0, 448] \n"
1389 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1390 ARGBTOARGB1555
1391 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
1392 "b.gt 1b \n"
1393 : "+r"(src_argb), // %0
1394 "+r"(dst_argb1555), // %1
1395 "+r"(width) // %2
1396 :
1397 : "cc", "memory", "v0", "v20", "v21", "v22", "v23");
1398 }
1399
ARGBToARGB4444Row_NEON(const uint8_t * src_argb,uint8_t * dst_argb4444,int width)1400 void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
1401 uint8_t* dst_argb4444,
1402 int width) {
1403 asm volatile(
1404 "movi v4.16b, #0x0f \n" // bits to clear with
1405 // vbic.
1406 "1: \n"
1407 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8
1408 // pixels
1409 "prfm pldl1keep, [%0, 448] \n"
1410 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1411 ARGBTOARGB4444
1412 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
1413 "b.gt 1b \n"
1414 : "+r"(src_argb), // %0
1415 "+r"(dst_argb4444), // %1
1416 "+r"(width) // %2
1417 :
1418 : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23");
1419 }
1420
ARGBToYRow_NEON(const uint8_t * src_argb,uint8_t * dst_y,int width)1421 void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
1422 asm volatile(
1423 "movi v4.8b, #25 \n" // B * 0.1016 coefficient
1424 "movi v5.8b, #129 \n" // G * 0.5078 coefficient
1425 "movi v6.8b, #66 \n" // R * 0.2578 coefficient
1426 "movi v7.8b, #16 \n" // Add 16 constant
1427 "1: \n"
1428 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
1429 "prfm pldl1keep, [%0, 448] \n"
1430 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1431 "umull v3.8h, v0.8b, v4.8b \n" // B
1432 "umlal v3.8h, v1.8b, v5.8b \n" // G
1433 "umlal v3.8h, v2.8b, v6.8b \n" // R
1434 "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
1435 "uqadd v0.8b, v0.8b, v7.8b \n"
1436 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
1437 "b.gt 1b \n"
1438 : "+r"(src_argb), // %0
1439 "+r"(dst_y), // %1
1440 "+r"(width) // %2
1441 :
1442 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
1443 }
1444
ARGBExtractAlphaRow_NEON(const uint8_t * src_argb,uint8_t * dst_a,int width)1445 void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
1446 uint8_t* dst_a,
1447 int width) {
1448 asm volatile(
1449 "1: \n"
1450 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
1451 "prfm pldl1keep, [%0, 448] \n"
1452 "subs %w2, %w2, #16 \n" // 16 processed per loop
1453 "st1 {v3.16b}, [%1], #16 \n" // store 16 A's.
1454 "b.gt 1b \n"
1455 : "+r"(src_argb), // %0
1456 "+r"(dst_a), // %1
1457 "+r"(width) // %2
1458 :
1459 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
1460 );
1461 }
1462
ARGBToYJRow_NEON(const uint8_t * src_argb,uint8_t * dst_y,int width)1463 void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
1464 asm volatile(
1465 "movi v4.8b, #29 \n" // B * 0.1140 coefficient
1466 "movi v5.8b, #150 \n" // G * 0.5870 coefficient
1467 "movi v6.8b, #77 \n" // R * 0.2990 coefficient
1468 "1: \n"
1469 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
1470 "prfm pldl1keep, [%0, 448] \n"
1471 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1472 "umull v3.8h, v0.8b, v4.8b \n" // B
1473 "umlal v3.8h, v1.8b, v5.8b \n" // G
1474 "umlal v3.8h, v2.8b, v6.8b \n" // R
1475 "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
1476 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
1477 "b.gt 1b \n"
1478 : "+r"(src_argb), // %0
1479 "+r"(dst_y), // %1
1480 "+r"(width) // %2
1481 :
1482 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
1483 }
1484
RGBAToYJRow_NEON(const uint8_t * src_argb,uint8_t * dst_y,int width)1485 void RGBAToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
1486 asm volatile(
1487 "movi v4.8b, #29 \n" // B * 0.1140 coefficient
1488 "movi v5.8b, #150 \n" // G * 0.5870 coefficient
1489 "movi v6.8b, #77 \n" // R * 0.2990 coefficient
1490 "1: \n"
1491 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 RGBA
1492 "prfm pldl1keep, [%0, 448] \n"
1493 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1494 "umull v0.8h, v1.8b, v4.8b \n" // B
1495 "umlal v0.8h, v2.8b, v5.8b \n" // G
1496 "umlal v0.8h, v3.8b, v6.8b \n" // R
1497 "uqrshrn v3.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y
1498 "st1 {v3.8b}, [%1], #8 \n" // store 8 pixels Y.
1499 "b.gt 1b \n"
1500 : "+r"(src_argb), // %0
1501 "+r"(dst_y), // %1
1502 "+r"(width) // %2
1503 :
1504 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
1505 }
1506
1507 // 8x1 pixels.
ARGBToUV444Row_NEON(const uint8_t * src_argb,uint8_t * dst_u,uint8_t * dst_v,int width)1508 void ARGBToUV444Row_NEON(const uint8_t* src_argb,
1509 uint8_t* dst_u,
1510 uint8_t* dst_v,
1511 int width) {
1512 asm volatile(
1513 "movi v24.8b, #112 \n" // UB / VR 0.875
1514 // coefficient
1515 "movi v25.8b, #74 \n" // UG -0.5781 coefficient
1516 "movi v26.8b, #38 \n" // UR -0.2969 coefficient
1517 "movi v27.8b, #18 \n" // VB -0.1406 coefficient
1518 "movi v28.8b, #94 \n" // VG -0.7344 coefficient
1519 "movi v29.16b,#0x80 \n" // 128.5
1520 "1: \n"
1521 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
1522 "prfm pldl1keep, [%0, 448] \n"
1523 "subs %w3, %w3, #8 \n" // 8 processed per loop.
1524 "umull v4.8h, v0.8b, v24.8b \n" // B
1525 "umlsl v4.8h, v1.8b, v25.8b \n" // G
1526 "umlsl v4.8h, v2.8b, v26.8b \n" // R
1527 "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned
1528
1529 "umull v3.8h, v2.8b, v24.8b \n" // R
1530 "umlsl v3.8h, v1.8b, v28.8b \n" // G
1531 "umlsl v3.8h, v0.8b, v27.8b \n" // B
1532 "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned
1533
1534 "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U
1535 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
1536
1537 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
1538 "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
1539 "b.gt 1b \n"
1540 : "+r"(src_argb), // %0
1541 "+r"(dst_u), // %1
1542 "+r"(dst_v), // %2
1543 "+r"(width) // %3
1544 :
1545 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26",
1546 "v27", "v28", "v29");
1547 }
1548
1549 #define RGBTOUV_SETUP_REG \
1550 "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \
1551 "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \
1552 "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \
1553 "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \
1554 "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \
1555 "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */
1556
1557 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
1558 // clang-format off
1559 #define RGBTOUV(QB, QG, QR) \
1560 "mul v3.8h, " #QB ",v20.8h \n" /* B */ \
1561 "mul v4.8h, " #QR ",v20.8h \n" /* R */ \
1562 "mls v3.8h, " #QG ",v21.8h \n" /* G */ \
1563 "mls v4.8h, " #QG ",v24.8h \n" /* G */ \
1564 "mls v3.8h, " #QR ",v22.8h \n" /* R */ \
1565 "mls v4.8h, " #QB ",v23.8h \n" /* B */ \
1566 "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \
1567 "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \
1568 "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \
1569 "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */
1570 // clang-format on
1571
1572 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
1573 // TODO(fbarchard): consider ptrdiff_t for all strides.
1574
ARGBToUVRow_NEON(const uint8_t * src_argb,int src_stride_argb,uint8_t * dst_u,uint8_t * dst_v,int width)1575 void ARGBToUVRow_NEON(const uint8_t* src_argb,
1576 int src_stride_argb,
1577 uint8_t* dst_u,
1578 uint8_t* dst_v,
1579 int width) {
1580 const uint8_t* src_argb_1 = src_argb + src_stride_argb;
1581 asm volatile (
1582 RGBTOUV_SETUP_REG
1583 "1: \n"
1584 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
1585 "prfm pldl1keep, [%0, 448] \n"
1586 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
1587 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
1588 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
1589
1590 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
1591 "prfm pldl1keep, [%1, 448] \n"
1592 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
1593 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
1594 "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
1595
1596 "urshr v0.8h, v0.8h, #1 \n" // 2x average
1597 "urshr v1.8h, v1.8h, #1 \n"
1598 "urshr v2.8h, v2.8h, #1 \n"
1599
1600 "subs %w4, %w4, #16 \n" // 16 processed per loop.
1601 RGBTOUV(v0.8h, v1.8h, v2.8h)
1602 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1603 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1604 "b.gt 1b \n"
1605 : "+r"(src_argb), // %0
1606 "+r"(src_argb_1), // %1
1607 "+r"(dst_u), // %2
1608 "+r"(dst_v), // %3
1609 "+r"(width) // %4
1610 :
1611 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1612 "v20", "v21", "v22", "v23", "v24", "v25"
1613 );
1614 }
1615
ARGBToUVJRow_NEON(const uint8_t * src_argb,int src_stride_argb,uint8_t * dst_u,uint8_t * dst_v,int width)1616 void ARGBToUVJRow_NEON(const uint8_t* src_argb,
1617 int src_stride_argb,
1618 uint8_t* dst_u,
1619 uint8_t* dst_v,
1620 int width) {
1621 const uint8_t* src_argb_1 = src_argb + src_stride_argb;
1622 asm volatile (
1623 "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
1624 "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
1625 "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
1626 "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
1627 "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
1628 "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
1629 "1: \n"
1630 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
1631 "prfm pldl1keep, [%0, 448] \n"
1632 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
1633 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
1634 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
1635 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
1636 "prfm pldl1keep, [%1, 448] \n"
1637 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
1638 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
1639 "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
1640
1641 "urshr v0.8h, v0.8h, #1 \n" // 2x average
1642 "urshr v1.8h, v1.8h, #1 \n"
1643 "urshr v2.8h, v2.8h, #1 \n"
1644
1645 "subs %w4, %w4, #16 \n" // 32 processed per loop.
1646 RGBTOUV(v0.8h, v1.8h, v2.8h)
1647 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1648 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1649 "b.gt 1b \n"
1650 : "+r"(src_argb), // %0
1651 "+r"(src_argb_1), // %1
1652 "+r"(dst_u), // %2
1653 "+r"(dst_v), // %3
1654 "+r"(width) // %4
1655 :
1656 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1657 "v20", "v21", "v22", "v23", "v24", "v25"
1658 );
1659 }
1660
BGRAToUVRow_NEON(const uint8_t * src_bgra,int src_stride_bgra,uint8_t * dst_u,uint8_t * dst_v,int width)1661 void BGRAToUVRow_NEON(const uint8_t* src_bgra,
1662 int src_stride_bgra,
1663 uint8_t* dst_u,
1664 uint8_t* dst_v,
1665 int width) {
1666 const uint8_t* src_bgra_1 = src_bgra + src_stride_bgra;
1667 asm volatile (
1668 RGBTOUV_SETUP_REG
1669 "1: \n"
1670 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
1671 "prfm pldl1keep, [%0, 448] \n"
1672 "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts.
1673 "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
1674 "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts.
1675 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more
1676 "prfm pldl1keep, [%1, 448] \n"
1677 "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts.
1678 "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
1679 "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts.
1680
1681 "urshr v0.8h, v0.8h, #1 \n" // 2x average
1682 "urshr v1.8h, v3.8h, #1 \n"
1683 "urshr v2.8h, v2.8h, #1 \n"
1684
1685 "subs %w4, %w4, #16 \n" // 32 processed per loop.
1686 RGBTOUV(v0.8h, v1.8h, v2.8h)
1687 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1688 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1689 "b.gt 1b \n"
1690 : "+r"(src_bgra), // %0
1691 "+r"(src_bgra_1), // %1
1692 "+r"(dst_u), // %2
1693 "+r"(dst_v), // %3
1694 "+r"(width) // %4
1695 :
1696 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1697 "v20", "v21", "v22", "v23", "v24", "v25"
1698 );
1699 }
1700
ABGRToUVRow_NEON(const uint8_t * src_abgr,int src_stride_abgr,uint8_t * dst_u,uint8_t * dst_v,int width)1701 void ABGRToUVRow_NEON(const uint8_t* src_abgr,
1702 int src_stride_abgr,
1703 uint8_t* dst_u,
1704 uint8_t* dst_v,
1705 int width) {
1706 const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr;
1707 asm volatile (
1708 RGBTOUV_SETUP_REG
1709 "1: \n"
1710 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
1711 "prfm pldl1keep, [%0, 448] \n"
1712 "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
1713 "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
1714 "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
1715 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
1716 "prfm pldl1keep, [%1, 448] \n"
1717 "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
1718 "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
1719 "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
1720
1721 "urshr v0.8h, v3.8h, #1 \n" // 2x average
1722 "urshr v2.8h, v2.8h, #1 \n"
1723 "urshr v1.8h, v1.8h, #1 \n"
1724
1725 "subs %w4, %w4, #16 \n" // 32 processed per loop.
1726 RGBTOUV(v0.8h, v2.8h, v1.8h)
1727 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1728 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1729 "b.gt 1b \n"
1730 : "+r"(src_abgr), // %0
1731 "+r"(src_abgr_1), // %1
1732 "+r"(dst_u), // %2
1733 "+r"(dst_v), // %3
1734 "+r"(width) // %4
1735 :
1736 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1737 "v20", "v21", "v22", "v23", "v24", "v25"
1738 );
1739 }
1740
RGBAToUVRow_NEON(const uint8_t * src_rgba,int src_stride_rgba,uint8_t * dst_u,uint8_t * dst_v,int width)1741 void RGBAToUVRow_NEON(const uint8_t* src_rgba,
1742 int src_stride_rgba,
1743 uint8_t* dst_u,
1744 uint8_t* dst_v,
1745 int width) {
1746 const uint8_t* src_rgba_1 = src_rgba + src_stride_rgba;
1747 asm volatile (
1748 RGBTOUV_SETUP_REG
1749 "1: \n"
1750 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
1751 "prfm pldl1keep, [%0, 448] \n"
1752 "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts.
1753 "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
1754 "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts.
1755 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
1756 "prfm pldl1keep, [%1, 448] \n"
1757 "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts.
1758 "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
1759 "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts.
1760
1761 "urshr v0.8h, v0.8h, #1 \n" // 2x average
1762 "urshr v1.8h, v1.8h, #1 \n"
1763 "urshr v2.8h, v2.8h, #1 \n"
1764
1765 "subs %w4, %w4, #16 \n" // 32 processed per loop.
1766 RGBTOUV(v0.8h, v1.8h, v2.8h)
1767 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1768 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1769 "b.gt 1b \n"
1770 : "+r"(src_rgba), // %0
1771 "+r"(src_rgba_1), // %1
1772 "+r"(dst_u), // %2
1773 "+r"(dst_v), // %3
1774 "+r"(width) // %4
1775 :
1776 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1777 "v20", "v21", "v22", "v23", "v24", "v25"
1778 );
1779 }
1780
// Produces one row of subsampled U and V from two rows of RGB24 pixels.
// Each iteration loads 16 three-byte pixels from src_rgb24 and 16 from the
// row src_stride_rgb24 bytes after it, sums each 2x2 group per channel, and
// stores 8 U bytes and 8 V bytes.
// width is counted in source pixels and is reduced by 16 per iteration; the
// loop body always executes at least once.
// RGBTOUV_SETUP_REG and RGBTOUV are macros defined elsewhere in this file.
void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
                       int src_stride_rgb24,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  // Second source row.
  const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
  asm volatile (
      RGBTOUV_SETUP_REG
      "1: \n"
      "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 pixels.
      "prfm pldl1keep, [%0, 448] \n"  // prefetch hint, 448 bytes ahead.
      "uaddlp v0.8h, v0.16b \n"  // B 16 bytes -> 8 shorts.
      "uaddlp v1.8h, v1.16b \n"  // G 16 bytes -> 8 shorts.
      "uaddlp v2.8h, v2.16b \n"  // R 16 bytes -> 8 shorts.
      "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 16 more.
      "prfm pldl1keep, [%1, 448] \n"
      // Accumulate the pairwise sums of the second row.
      "uadalp v0.8h, v4.16b \n"  // B 16 bytes -> 8 shorts.
      "uadalp v1.8h, v5.16b \n"  // G 16 bytes -> 8 shorts.
      "uadalp v2.8h, v6.16b \n"  // R 16 bytes -> 8 shorts.

      // Sum of 4 samples, rounding shift right by 1.
      "urshr v0.8h, v0.8h, #1 \n"  // 2x average
      "urshr v1.8h, v1.8h, #1 \n"
      "urshr v2.8h, v2.8h, #1 \n"

      "subs %w4, %w4, #16 \n"  // 16 processed per loop.
      RGBTOUV(v0.8h, v1.8h, v2.8h)
      "st1 {v0.8b}, [%2], #8 \n"  // store 8 pixels U.
      "st1 {v1.8b}, [%3], #8 \n"  // store 8 pixels V.
      "b.gt 1b \n"
  : "+r"(src_rgb24),  // %0
    "+r"(src_rgb24_1),  // %1
    "+r"(dst_u),  // %2
    "+r"(dst_v),  // %3
    "+r"(width)  // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
}
1820
// Produces one row of subsampled U and V from two rows of RAW pixels.
// Same structure as the RGB24 variant, but the first loaded byte plane (v0)
// is treated as R and the third (v2) as B, so RGBTOUV is invoked with the
// registers in (v2, v1, v0) order.
// Each iteration loads 16 three-byte pixels from each of the two rows and
// stores 8 U bytes and 8 V bytes. width is counted in source pixels and is
// reduced by 16 per iteration; the loop body always executes at least once.
// RGBTOUV_SETUP_REG and RGBTOUV are macros defined elsewhere in this file.
void RAWToUVRow_NEON(const uint8_t* src_raw,
                     int src_stride_raw,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  // Second source row.
  const uint8_t* src_raw_1 = src_raw + src_stride_raw;
  asm volatile (
      RGBTOUV_SETUP_REG
      "1: \n"
      "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 RAW pixels.
      "prfm pldl1keep, [%0, 448] \n"  // prefetch hint, 448 bytes ahead.
      "uaddlp v2.8h, v2.16b \n"  // B 16 bytes -> 8 shorts.
      "uaddlp v1.8h, v1.16b \n"  // G 16 bytes -> 8 shorts.
      "uaddlp v0.8h, v0.16b \n"  // R 16 bytes -> 8 shorts.
      "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 16 more RAW pixels
      "prfm pldl1keep, [%1, 448] \n"
      // Accumulate the pairwise sums of the second row.
      "uadalp v2.8h, v6.16b \n"  // B 16 bytes -> 8 shorts.
      "uadalp v1.8h, v5.16b \n"  // G 16 bytes -> 8 shorts.
      "uadalp v0.8h, v4.16b \n"  // R 16 bytes -> 8 shorts.

      // Sum of 4 samples, rounding shift right by 1.
      "urshr v2.8h, v2.8h, #1 \n"  // 2x average
      "urshr v1.8h, v1.8h, #1 \n"
      "urshr v0.8h, v0.8h, #1 \n"

      "subs %w4, %w4, #16 \n"  // 16 processed per loop.
      RGBTOUV(v2.8h, v1.8h, v0.8h)
      "st1 {v0.8b}, [%2], #8 \n"  // store 8 pixels U.
      "st1 {v1.8b}, [%3], #8 \n"  // store 8 pixels V.
      "b.gt 1b \n"
  : "+r"(src_raw),  // %0
    "+r"(src_raw_1),  // %1
    "+r"(dst_u),  // %2
    "+r"(dst_v),  // %3
    "+r"(width)  // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
}
1860
// 16x2 pixels -> 8x1. width is number of rgb pixels. e.g. 16.
// Each iteration reads 16 RGB565 pixels (two 16-byte loads) from src_rgb565
// and 16 from the row src_stride_rgb565 bytes after it. Every 8-pixel group
// is expanded by the RGB565TOARGB macro (defined elsewhere in this file);
// the code here reads the expanded B, G, R bytes from v0, v1, v2.
// Pairwise sums for the first 8 pixels are kept in v16-v18 and for the next
// 8 pixels in v26-v28, then merged into 8-lane vectors before RGBTOUV.
// The loop body always executes at least once.
void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
                        int src_stride_rgb565,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int width) {
  // Second source row.
  const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
  asm volatile(
      RGBTOUV_SETUP_REG
      "1: \n"
      "ld1 {v0.16b}, [%0], #16 \n"  // load 8 RGB565 pixels.
      "prfm pldl1keep, [%0, 448] \n"  // prefetch hint, 448 bytes ahead.
      RGB565TOARGB
      "uaddlp v16.4h, v0.8b \n"  // B 8 bytes -> 4 shorts.
      "uaddlp v17.4h, v1.8b \n"  // G 8 bytes -> 4 shorts.
      "uaddlp v18.4h, v2.8b \n"  // R 8 bytes -> 4 shorts.
      "ld1 {v0.16b}, [%0], #16 \n"  // next 8 RGB565 pixels.
      RGB565TOARGB
      "uaddlp v26.4h, v0.8b \n"  // B 8 bytes -> 4 shorts.
      "uaddlp v27.4h, v1.8b \n"  // G 8 bytes -> 4 shorts.
      "uaddlp v28.4h, v2.8b \n"  // R 8 bytes -> 4 shorts.

      // Second row: accumulate into the same sums.
      "ld1 {v0.16b}, [%1], #16 \n"  // load 8 RGB565 pixels.
      "prfm pldl1keep, [%1, 448] \n"
      RGB565TOARGB
      "uadalp v16.4h, v0.8b \n"  // B 8 bytes -> 4 shorts.
      "uadalp v17.4h, v1.8b \n"  // G 8 bytes -> 4 shorts.
      "uadalp v18.4h, v2.8b \n"  // R 8 bytes -> 4 shorts.
      "ld1 {v0.16b}, [%1], #16 \n"  // next 8 RGB565 pixels.
      RGB565TOARGB
      "uadalp v26.4h, v0.8b \n"  // B 8 bytes -> 4 shorts.
      "uadalp v27.4h, v1.8b \n"  // G 8 bytes -> 4 shorts.
      "uadalp v28.4h, v2.8b \n"  // R 8 bytes -> 4 shorts.

      // Move the second group's 4 sums into the upper half of v16-v18.
      "ins v16.D[1], v26.D[0] \n"
      "ins v17.D[1], v27.D[0] \n"
      "ins v18.D[1], v28.D[0] \n"

      // Sum of 4 samples, rounding shift right by 1.
      "urshr v0.8h, v16.8h, #1 \n"  // 2x average
      "urshr v1.8h, v17.8h, #1 \n"
      "urshr v2.8h, v18.8h, #1 \n"

      "subs %w4, %w4, #16 \n"  // 16 processed per loop.
      RGBTOUV(v0.8h, v1.8h, v2.8h)
      "st1 {v0.8b}, [%2], #8 \n"  // store 8 pixels U.
      "st1 {v1.8b}, [%3], #8 \n"  // store 8 pixels V.
      "b.gt 1b \n"
      : "+r"(src_rgb565),    // %0
        "+r"(src_rgb565_1),  // %1
        "+r"(dst_u),         // %2
        "+r"(dst_v),         // %3
        "+r"(width)          // %4
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
        "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
        "v28");
}
1918
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
// Each iteration reads 16 ARGB1555 pixels (two 16-byte loads) from
// src_argb1555 and 16 from the row src_stride_argb1555 bytes after it. Every
// 8-pixel group is expanded by the RGB555TOARGB macro (defined elsewhere in
// this file); the code here reads the expanded B, G, R bytes from v0, v1, v2.
// Pairwise sums for the first 8 pixels are kept in v16-v18 and for the next
// 8 pixels in v26-v28, then merged into 8-lane vectors before RGBTOUV.
// The loop body always executes at least once.
void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
                          int src_stride_argb1555,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width) {
  // Second source row.
  const uint8_t* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
  asm volatile(
      RGBTOUV_SETUP_REG
      "1: \n"
      "ld1 {v0.16b}, [%0], #16 \n"  // load 8 ARGB1555 pixels.
      "prfm pldl1keep, [%0, 448] \n"  // prefetch hint, 448 bytes ahead.
      RGB555TOARGB
      "uaddlp v16.4h, v0.8b \n"  // B 8 bytes -> 4 shorts.
      "uaddlp v17.4h, v1.8b \n"  // G 8 bytes -> 4 shorts.
      "uaddlp v18.4h, v2.8b \n"  // R 8 bytes -> 4 shorts.
      "ld1 {v0.16b}, [%0], #16 \n"  // next 8 ARGB1555 pixels.
      RGB555TOARGB
      "uaddlp v26.4h, v0.8b \n"  // B 8 bytes -> 4 shorts.
      "uaddlp v27.4h, v1.8b \n"  // G 8 bytes -> 4 shorts.
      "uaddlp v28.4h, v2.8b \n"  // R 8 bytes -> 4 shorts.

      // Second row: accumulate into the same sums.
      "ld1 {v0.16b}, [%1], #16 \n"  // load 8 ARGB1555 pixels.
      "prfm pldl1keep, [%1, 448] \n"
      RGB555TOARGB
      "uadalp v16.4h, v0.8b \n"  // B 8 bytes -> 4 shorts.
      "uadalp v17.4h, v1.8b \n"  // G 8 bytes -> 4 shorts.
      "uadalp v18.4h, v2.8b \n"  // R 8 bytes -> 4 shorts.
      "ld1 {v0.16b}, [%1], #16 \n"  // next 8 ARGB1555 pixels.
      RGB555TOARGB
      "uadalp v26.4h, v0.8b \n"  // B 8 bytes -> 4 shorts.
      "uadalp v27.4h, v1.8b \n"  // G 8 bytes -> 4 shorts.
      "uadalp v28.4h, v2.8b \n"  // R 8 bytes -> 4 shorts.

      // Move the second group's 4 sums into the upper half of v16-v18.
      "ins v16.D[1], v26.D[0] \n"
      "ins v17.D[1], v27.D[0] \n"
      "ins v18.D[1], v28.D[0] \n"

      // Sum of 4 samples, rounding shift right by 1.
      "urshr v0.8h, v16.8h, #1 \n"  // 2x average
      "urshr v1.8h, v17.8h, #1 \n"
      "urshr v2.8h, v18.8h, #1 \n"

      "subs %w4, %w4, #16 \n"  // 16 processed per loop.
      RGBTOUV(v0.8h, v1.8h, v2.8h)
      "st1 {v0.8b}, [%2], #8 \n"  // store 8 pixels U.
      "st1 {v1.8b}, [%3], #8 \n"  // store 8 pixels V.
      "b.gt 1b \n"
      : "+r"(src_argb1555),    // %0
        "+r"(src_argb1555_1),  // %1
        "+r"(dst_u),           // %2
        "+r"(dst_v),           // %3
        "+r"(width)            // %4
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
        "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
        "v28");
}
1976
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
// Each iteration reads 16 ARGB4444 pixels (two 16-byte loads) from
// src_argb4444 and 16 from the row src_stride_argb4444 bytes after it. Every
// 8-pixel group is expanded by the ARGB4444TOARGB macro (defined elsewhere in
// this file); the code here reads the expanded B, G, R bytes from v0, v1, v2.
// Pairwise sums for the first 8 pixels are kept in v16-v18 and for the next
// 8 pixels in v26-v28, then merged into 8-lane vectors before RGBTOUV.
// The loop body always executes at least once.
void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
                          int src_stride_argb4444,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width) {
  // Second source row.
  const uint8_t* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
  asm volatile(
      RGBTOUV_SETUP_REG  // sets v20-v25
      "1: \n"
      "ld1 {v0.16b}, [%0], #16 \n"  // load 8 ARGB4444 pixels.
      "prfm pldl1keep, [%0, 448] \n"  // prefetch hint, 448 bytes ahead.
      ARGB4444TOARGB
      "uaddlp v16.4h, v0.8b \n"  // B 8 bytes -> 4 shorts.
      "uaddlp v17.4h, v1.8b \n"  // G 8 bytes -> 4 shorts.
      "uaddlp v18.4h, v2.8b \n"  // R 8 bytes -> 4 shorts.
      "ld1 {v0.16b}, [%0], #16 \n"  // next 8 ARGB4444 pixels.
      ARGB4444TOARGB
      "uaddlp v26.4h, v0.8b \n"  // B 8 bytes -> 4 shorts.
      "uaddlp v27.4h, v1.8b \n"  // G 8 bytes -> 4 shorts.
      "uaddlp v28.4h, v2.8b \n"  // R 8 bytes -> 4 shorts.

      // Second row: accumulate into the same sums.
      "ld1 {v0.16b}, [%1], #16 \n"  // load 8 ARGB4444 pixels.
      "prfm pldl1keep, [%1, 448] \n"
      ARGB4444TOARGB
      "uadalp v16.4h, v0.8b \n"  // B 8 bytes -> 4 shorts.
      "uadalp v17.4h, v1.8b \n"  // G 8 bytes -> 4 shorts.
      "uadalp v18.4h, v2.8b \n"  // R 8 bytes -> 4 shorts.
      "ld1 {v0.16b}, [%1], #16 \n"  // next 8 ARGB4444 pixels.
      ARGB4444TOARGB
      "uadalp v26.4h, v0.8b \n"  // B 8 bytes -> 4 shorts.
      "uadalp v27.4h, v1.8b \n"  // G 8 bytes -> 4 shorts.
      "uadalp v28.4h, v2.8b \n"  // R 8 bytes -> 4 shorts.

      // Move the second group's 4 sums into the upper half of v16-v18.
      "ins v16.D[1], v26.D[0] \n"
      "ins v17.D[1], v27.D[0] \n"
      "ins v18.D[1], v28.D[0] \n"

      // Sum of 4 samples, rounding shift right by 1.
      "urshr v0.8h, v16.8h, #1 \n"  // 2x average
      "urshr v1.8h, v17.8h, #1 \n"
      "urshr v2.8h, v18.8h, #1 \n"

      "subs %w4, %w4, #16 \n"  // 16 processed per loop.
      RGBTOUV(v0.8h, v1.8h, v2.8h)
      "st1 {v0.8b}, [%2], #8 \n"  // store 8 pixels U.
      "st1 {v1.8b}, [%3], #8 \n"  // store 8 pixels V.
      "b.gt 1b \n"
      : "+r"(src_argb4444),    // %0
        "+r"(src_argb4444_1),  // %1
        "+r"(dst_u),           // %2
        "+r"(dst_v),           // %3
        "+r"(width)            // %4
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
        "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
        "v28"

  );
}
2036
// Converts a row of RGB565 pixels to 8-bit Y.
// Per pixel: Y = sat8(((25 * B + 129 * G + 66 * R + 128) >> 8)) + 16, with
// the final add saturating. B, G and R are the 8-bit values produced in
// v0, v1, v2 by the RGB565TOARGB macro (defined elsewhere in this file).
// Processes 8 pixels (16 source bytes) per iteration; the loop body always
// executes at least once.
void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
  asm volatile(
      "movi v24.8b, #25 \n"  // B * 0.1016 coefficient
      "movi v25.8b, #129 \n"  // G * 0.5078 coefficient
      "movi v26.8b, #66 \n"  // R * 0.2578 coefficient
      "movi v27.8b, #16 \n"  // Add 16 constant
      "1: \n"
      "ld1 {v0.16b}, [%0], #16 \n"  // load 8 RGB565 pixels.
      "prfm pldl1keep, [%0, 448] \n"  // prefetch hint, 448 bytes ahead.
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      RGB565TOARGB
      "umull v3.8h, v0.8b, v24.8b \n"  // B
      "umlal v3.8h, v1.8b, v25.8b \n"  // G
      "umlal v3.8h, v2.8b, v26.8b \n"  // R
      "uqrshrn v0.8b, v3.8h, #8 \n"  // 16 bit to 8 bit Y
      "uqadd v0.8b, v0.8b, v27.8b \n"  // saturating + 16
      "st1 {v0.8b}, [%1], #8 \n"  // store 8 pixels Y.
      "b.gt 1b \n"
      : "+r"(src_rgb565),  // %0
        "+r"(dst_y),       // %1
        "+r"(width)        // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", "v24", "v25", "v26",
        "v27");
}
2062
// Converts a row of ARGB1555 pixels to 8-bit Y.
// Per pixel: Y = sat8(((25 * B + 129 * G + 66 * R + 128) >> 8)) + 16, with
// the final add saturating. B, G and R are the 8-bit values produced in
// v0, v1, v2 by the ARGB1555TOARGB macro (defined elsewhere in this file).
// NOTE(review): the coefficients live in v4-v7 for the whole loop, so this
// relies on ARGB1555TOARGB not writing v4-v7 - confirm against the macro.
// Processes 8 pixels (16 source bytes) per iteration; the loop body always
// executes at least once.
void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
                         uint8_t* dst_y,
                         int width) {
  asm volatile(
      "movi v4.8b, #25 \n"  // B * 0.1016 coefficient
      "movi v5.8b, #129 \n"  // G * 0.5078 coefficient
      "movi v6.8b, #66 \n"  // R * 0.2578 coefficient
      "movi v7.8b, #16 \n"  // Add 16 constant
      "1: \n"
      "ld1 {v0.16b}, [%0], #16 \n"  // load 8 ARGB1555 pixels.
      "prfm pldl1keep, [%0, 448] \n"  // prefetch hint, 448 bytes ahead.
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      ARGB1555TOARGB
      "umull v3.8h, v0.8b, v4.8b \n"  // B
      "umlal v3.8h, v1.8b, v5.8b \n"  // G
      "umlal v3.8h, v2.8b, v6.8b \n"  // R
      "uqrshrn v0.8b, v3.8h, #8 \n"  // 16 bit to 8 bit Y
      "uqadd v0.8b, v0.8b, v7.8b \n"  // saturating + 16
      "st1 {v0.8b}, [%1], #8 \n"  // store 8 pixels Y.
      "b.gt 1b \n"
      : "+r"(src_argb1555),  // %0
        "+r"(dst_y),         // %1
        "+r"(width)          // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
2089
// Converts a row of ARGB4444 pixels to 8-bit Y.
// Per pixel: Y = sat8(((25 * B + 129 * G + 66 * R + 128) >> 8)) + 16, with
// the final add saturating. B, G and R are the 8-bit values produced in
// v0, v1, v2 by the ARGB4444TOARGB macro (defined elsewhere in this file).
// Processes 8 pixels (16 source bytes) per iteration; the loop body always
// executes at least once.
void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
                         uint8_t* dst_y,
                         int width) {
  asm volatile(
      "movi v24.8b, #25 \n"  // B * 0.1016 coefficient
      "movi v25.8b, #129 \n"  // G * 0.5078 coefficient
      "movi v26.8b, #66 \n"  // R * 0.2578 coefficient
      "movi v27.8b, #16 \n"  // Add 16 constant
      "1: \n"
      "ld1 {v0.16b}, [%0], #16 \n"  // load 8 ARGB4444 pixels.
      "prfm pldl1keep, [%0, 448] \n"  // prefetch hint, 448 bytes ahead.
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      ARGB4444TOARGB
      "umull v3.8h, v0.8b, v24.8b \n"  // B
      "umlal v3.8h, v1.8b, v25.8b \n"  // G
      "umlal v3.8h, v2.8b, v26.8b \n"  // R
      "uqrshrn v0.8b, v3.8h, #8 \n"  // 16 bit to 8 bit Y
      "uqadd v0.8b, v0.8b, v27.8b \n"  // saturating + 16
      "st1 {v0.8b}, [%1], #8 \n"  // store 8 pixels Y.
      "b.gt 1b \n"
      : "+r"(src_argb4444),  // %0
        "+r"(dst_y),         // %1
        "+r"(width)          // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27");
}
2116
// Converts a row of BGRA pixels to 8-bit Y.
// ld4 de-interleaves each 4-byte pixel into v0..v3; the second, third and
// fourth bytes (v1, v2, v3) are used as R, G, B and the first byte (v0) is
// ignored.
// Per pixel: Y = sat8(((66 * R + 129 * G + 25 * B + 128) >> 8)) + 16, with
// the final add saturating.
// Processes 8 pixels (32 source bytes) per iteration; the loop body always
// executes at least once.
void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
  asm volatile(
      "movi v4.8b, #66 \n"  // R * 0.2578 coefficient
      "movi v5.8b, #129 \n"  // G * 0.5078 coefficient
      "movi v6.8b, #25 \n"  // B * 0.1016 coefficient
      "movi v7.8b, #16 \n"  // Add 16 constant
      "1: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
      "prfm pldl1keep, [%0, 448] \n"  // prefetch hint, 448 bytes ahead.
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      "umull v16.8h, v1.8b, v4.8b \n"  // R
      "umlal v16.8h, v2.8b, v5.8b \n"  // G
      "umlal v16.8h, v3.8b, v6.8b \n"  // B
      "uqrshrn v0.8b, v16.8h, #8 \n"  // 16 bit to 8 bit Y
      "uqadd v0.8b, v0.8b, v7.8b \n"  // saturating + 16
      "st1 {v0.8b}, [%1], #8 \n"  // store 8 pixels Y.
      "b.gt 1b \n"
      : "+r"(src_bgra),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
}
2140
// Converts a row of ABGR pixels to 8-bit Y.
// ld4 de-interleaves each 4-byte pixel into v0..v3; the first three bytes
// (v0, v1, v2) are used as R, G, B and the fourth byte (v3) is ignored.
// Per pixel: Y = sat8(((66 * R + 129 * G + 25 * B + 128) >> 8)) + 16, with
// the final add saturating.
// Processes 8 pixels (32 source bytes) per iteration; the loop body always
// executes at least once.
void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
  asm volatile(
      "movi v6.8b, #25 \n"  // B * 0.1016 coefficient
      "movi v5.8b, #129 \n"  // G * 0.5078 coefficient
      "movi v4.8b, #66 \n"  // R * 0.2578 coefficient
      "movi v7.8b, #16 \n"  // Add 16 constant
      "1: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
      "prfm pldl1keep, [%0, 448] \n"  // prefetch hint, 448 bytes ahead.
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      "umull v16.8h, v0.8b, v4.8b \n"  // R
      "umlal v16.8h, v1.8b, v5.8b \n"  // G
      "umlal v16.8h, v2.8b, v6.8b \n"  // B
      "uqrshrn v0.8b, v16.8h, #8 \n"  // 16 bit to 8 bit Y
      "uqadd v0.8b, v0.8b, v7.8b \n"  // saturating + 16
      "st1 {v0.8b}, [%1], #8 \n"  // store 8 pixels Y.
      "b.gt 1b \n"
      : "+r"(src_abgr),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
}
2164
// Converts a row of RGBA pixels to 8-bit Y.
// ld4 de-interleaves each 4-byte pixel into v0..v3; the second, third and
// fourth bytes (v1, v2, v3) are used as B, G, R and the first byte (v0) is
// ignored.
// Per pixel: Y = sat8(((25 * B + 129 * G + 66 * R + 128) >> 8)) + 16, with
// the final add saturating.
// Processes 8 pixels (32 source bytes) per iteration; the loop body always
// executes at least once.
void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  asm volatile(
      "movi v4.8b, #25 \n"  // B * 0.1016 coefficient
      "movi v5.8b, #129 \n"  // G * 0.5078 coefficient
      "movi v6.8b, #66 \n"  // R * 0.2578 coefficient
      "movi v7.8b, #16 \n"  // Add 16 constant
      "1: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
      "prfm pldl1keep, [%0, 448] \n"  // prefetch hint, 448 bytes ahead.
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      "umull v16.8h, v1.8b, v4.8b \n"  // B
      "umlal v16.8h, v2.8b, v5.8b \n"  // G
      "umlal v16.8h, v3.8b, v6.8b \n"  // R
      "uqrshrn v0.8b, v16.8h, #8 \n"  // 16 bit to 8 bit Y
      "uqadd v0.8b, v0.8b, v7.8b \n"  // saturating + 16
      "st1 {v0.8b}, [%1], #8 \n"  // store 8 pixels Y.
      "b.gt 1b \n"
      : "+r"(src_rgba),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
}
2188
// Converts a row of RGB24 pixels to 8-bit Y.
// ld3 de-interleaves each 3-byte pixel into v0, v1, v2, used as B, G, R.
// Per pixel: Y = sat8(((25 * B + 129 * G + 66 * R + 128) >> 8)) + 16, with
// the final add saturating.
// Processes 8 pixels (24 source bytes) per iteration; the loop body always
// executes at least once.
void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
  asm volatile(
      "movi v4.8b, #25 \n"  // B * 0.1016 coefficient
      "movi v5.8b, #129 \n"  // G * 0.5078 coefficient
      "movi v6.8b, #66 \n"  // R * 0.2578 coefficient
      "movi v7.8b, #16 \n"  // Add 16 constant
      "1: \n"
      "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
      "prfm pldl1keep, [%0, 448] \n"  // prefetch hint, 448 bytes ahead.
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      "umull v16.8h, v0.8b, v4.8b \n"  // B
      "umlal v16.8h, v1.8b, v5.8b \n"  // G
      "umlal v16.8h, v2.8b, v6.8b \n"  // R
      "uqrshrn v0.8b, v16.8h, #8 \n"  // 16 bit to 8 bit Y
      "uqadd v0.8b, v0.8b, v7.8b \n"  // saturating + 16
      "st1 {v0.8b}, [%1], #8 \n"  // store 8 pixels Y.
      "b.gt 1b \n"
      : "+r"(src_rgb24),  // %0
        "+r"(dst_y),      // %1
        "+r"(width)       // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
}
2212
// Converts a row of RAW pixels to 8-bit Y.
// ld3 de-interleaves each 3-byte pixel into v0, v1, v2. The first byte (v0)
// is multiplied by the R coefficient (v4 = 66) and the third byte (v2) by the
// B coefficient (v6 = 25), i.e. the channel order is the reverse of the
// RGB24 variant.
// Per pixel: Y = sat8(((66 * R + 129 * G + 25 * B + 128) >> 8)) + 16, with
// the final add saturating.
// Processes 8 pixels (24 source bytes) per iteration; the loop body always
// executes at least once.
void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
  asm volatile(
      "movi v6.8b, #25 \n"  // B * 0.1016 coefficient
      "movi v5.8b, #129 \n"  // G * 0.5078 coefficient
      "movi v4.8b, #66 \n"  // R * 0.2578 coefficient
      "movi v7.8b, #16 \n"  // Add 16 constant
      "1: \n"
      "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
      "prfm pldl1keep, [%0, 448] \n"  // prefetch hint, 448 bytes ahead.
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      "umull v16.8h, v0.8b, v4.8b \n"  // R (first byte * R coefficient)
      "umlal v16.8h, v1.8b, v5.8b \n"  // G
      "umlal v16.8h, v2.8b, v6.8b \n"  // B (third byte * B coefficient)
      "uqrshrn v0.8b, v16.8h, #8 \n"  // 16 bit to 8 bit Y
      "uqadd v0.8b, v0.8b, v7.8b \n"  // saturating + 16
      "st1 {v0.8b}, [%1], #8 \n"  // store 8 pixels Y.
      "b.gt 1b \n"
      : "+r"(src_raw),  // %0
        "+r"(dst_y),    // %1
        "+r"(width)     // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
}
2236
// Converts a row of RGB24 pixels to 8-bit YJ; unlike RGB24ToYRow_NEON no
// +16 offset is added.
// ld3 de-interleaves each 3-byte pixel into v0, v1, v2, used as B, G, R.
// Per pixel: YJ = sat8((29 * B + 150 * G + 77 * R + 128) >> 8).
// Processes 8 pixels (24 source bytes) per iteration; the loop body always
// executes at least once.
void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
  asm volatile(
      "movi v4.8b, #29 \n"  // B * 0.1140 coefficient
      "movi v5.8b, #150 \n"  // G * 0.5870 coefficient
      "movi v6.8b, #77 \n"  // R * 0.2990 coefficient
      "1: \n"
      "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
      "prfm pldl1keep, [%0, 448] \n"  // prefetch hint, 448 bytes ahead.
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      // v0 is both the B source and the 16-bit accumulator.
      "umull v0.8h, v0.8b, v4.8b \n"  // B
      "umlal v0.8h, v1.8b, v5.8b \n"  // G
      "umlal v0.8h, v2.8b, v6.8b \n"  // R
      "uqrshrn v0.8b, v0.8h, #8 \n"  // 16 bit to 8 bit Y
      "st1 {v0.8b}, [%1], #8 \n"  // store 8 pixels Y.
      "b.gt 1b \n"
      : "+r"(src_rgb24),  // %0
        "+r"(dst_yj),     // %1
        "+r"(width)       // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
}
2258
// Converts a row of RAW pixels to 8-bit YJ; unlike RAWToYRow_NEON no +16
// offset is added.
// ld3 de-interleaves each 3-byte pixel into v0, v1, v2. The first byte (v0)
// is multiplied by the R coefficient (v4 = 77) and the third byte (v2) by the
// B coefficient (v6 = 29).
// Per pixel: YJ = sat8((77 * R + 150 * G + 29 * B + 128) >> 8).
// Processes 8 pixels (24 source bytes) per iteration; the loop body always
// executes at least once.
void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
  asm volatile(
      "movi v6.8b, #29 \n"  // B * 0.1140 coefficient
      "movi v5.8b, #150 \n"  // G * 0.5870 coefficient
      "movi v4.8b, #77 \n"  // R * 0.2990 coefficient
      "1: \n"
      "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
      "prfm pldl1keep, [%0, 448] \n"  // prefetch hint, 448 bytes ahead.
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      // v0 is both the first-byte source and the 16-bit accumulator.
      "umull v0.8h, v0.8b, v4.8b \n"  // R (first byte * R coefficient)
      "umlal v0.8h, v1.8b, v5.8b \n"  // G
      "umlal v0.8h, v2.8b, v6.8b \n"  // B (third byte * B coefficient)
      "uqrshrn v0.8b, v0.8h, #8 \n"  // 16 bit to 8 bit Y
      "st1 {v0.8b}, [%1], #8 \n"  // store 8 pixels Y.
      "b.gt 1b \n"
      : "+r"(src_raw),  // %0
        "+r"(dst_yj),   // %1
        "+r"(width)     // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
}
2280
2281 // Bilinear filter 16x2 -> 16x1
InterpolateRow_NEON(uint8_t * dst_ptr,const uint8_t * src_ptr,ptrdiff_t src_stride,int dst_width,int source_y_fraction)2282 void InterpolateRow_NEON(uint8_t* dst_ptr,
2283 const uint8_t* src_ptr,
2284 ptrdiff_t src_stride,
2285 int dst_width,
2286 int source_y_fraction) {
2287 int y1_fraction = source_y_fraction;
2288 int y0_fraction = 256 - y1_fraction;
2289 const uint8_t* src_ptr1 = src_ptr + src_stride;
2290 asm volatile(
2291 "cmp %w4, #0 \n"
2292 "b.eq 100f \n"
2293 "cmp %w4, #128 \n"
2294 "b.eq 50f \n"
2295
2296 "dup v5.16b, %w4 \n"
2297 "dup v4.16b, %w5 \n"
2298 // General purpose row blend.
2299 "1: \n"
2300 "ld1 {v0.16b}, [%1], #16 \n"
2301 "ld1 {v1.16b}, [%2], #16 \n"
2302 "prfm pldl1keep, [%1, 448] \n"
2303 "prfm pldl1keep, [%2, 448] \n"
2304 "subs %w3, %w3, #16 \n"
2305 "umull v2.8h, v0.8b, v4.8b \n"
2306 "umull2 v3.8h, v0.16b, v4.16b \n"
2307 "umlal v2.8h, v1.8b, v5.8b \n"
2308 "umlal2 v3.8h, v1.16b, v5.16b \n"
2309 "rshrn v0.8b, v2.8h, #8 \n"
2310 "rshrn2 v0.16b, v3.8h, #8 \n"
2311 "st1 {v0.16b}, [%0], #16 \n"
2312 "b.gt 1b \n"
2313 "b 99f \n"
2314
2315 // Blend 50 / 50.
2316 "50: \n"
2317 "ld1 {v0.16b}, [%1], #16 \n"
2318 "ld1 {v1.16b}, [%2], #16 \n"
2319 "prfm pldl1keep, [%1, 448] \n"
2320 "prfm pldl1keep, [%2, 448] \n"
2321 "subs %w3, %w3, #16 \n"
2322 "urhadd v0.16b, v0.16b, v1.16b \n"
2323 "st1 {v0.16b}, [%0], #16 \n"
2324 "b.gt 50b \n"
2325 "b 99f \n"
2326
2327 // Blend 100 / 0 - Copy row unchanged.
2328 "100: \n"
2329 "ld1 {v0.16b}, [%1], #16 \n"
2330 "prfm pldl1keep, [%1, 448] \n"
2331 "subs %w3, %w3, #16 \n"
2332 "st1 {v0.16b}, [%0], #16 \n"
2333 "b.gt 100b \n"
2334
2335 "99: \n"
2336 : "+r"(dst_ptr), // %0
2337 "+r"(src_ptr), // %1
2338 "+r"(src_ptr1), // %2
2339 "+r"(dst_width), // %3
2340 "+r"(y1_fraction), // %4
2341 "+r"(y0_fraction) // %5
2342 :
2343 : "cc", "memory", "v0", "v1", "v3", "v4", "v5");
2344 }
2345
// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
// Blends src_argb0 over src_argb1 and writes the result to dst_argb.
// For each of B, G, R: dst = sat(s + sat(d - ((d * sa + 128) >> 8))), where
// s and sa come from src_argb0 and d from src_argb1; the output alpha byte is
// always 255. The alpha of src_argb1 is loaded but not used.
// Pixels are processed 8 at a time while at least 8 remain, then one at a
// time for the remainder; nothing is read or written when width <= 0.
void ARGBBlendRow_NEON(const uint8_t* src_argb0,
                       const uint8_t* src_argb1,
                       uint8_t* dst_argb,
                       int width) {
  asm volatile(
      // Bias width by -8 so the sign tells whether a full group remains.
      "subs %w3, %w3, #8 \n"
      "b.lt 89f \n"
      // Blend 8 pixels.
      "8: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB0
      "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 ARGB1
      "prfm pldl1keep, [%0, 448] \n"
      "prfm pldl1keep, [%1, 448] \n"
      "subs %w3, %w3, #8 \n"  // 8 processed per loop.
      "umull v16.8h, v4.8b, v3.8b \n"  // db * a
      "umull v17.8h, v5.8b, v3.8b \n"  // dg * a
      "umull v18.8h, v6.8b, v3.8b \n"  // dr * a
      "uqrshrn v16.8b, v16.8h, #8 \n"  // db >>= 8
      "uqrshrn v17.8b, v17.8h, #8 \n"  // dg >>= 8
      "uqrshrn v18.8b, v18.8h, #8 \n"  // dr >>= 8
      "uqsub v4.8b, v4.8b, v16.8b \n"  // db - (db * a / 256)
      "uqsub v5.8b, v5.8b, v17.8b \n"  // dg - (dg * a / 256)
      "uqsub v6.8b, v6.8b, v18.8b \n"  // dr - (dr * a / 256)
      "uqadd v0.8b, v0.8b, v4.8b \n"  // + sb
      "uqadd v1.8b, v1.8b, v5.8b \n"  // + sg
      "uqadd v2.8b, v2.8b, v6.8b \n"  // + sr
      "movi v3.8b, #255 \n"  // a = 255
      "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
                                                      // pixels
      "b.ge 8b \n"

      // Undo the bias minus one: width is now (remaining pixels - 1).
      "89: \n"
      "adds %w3, %w3, #8-1 \n"
      "b.lt 99f \n"

      // Blend 1 pixels.
      // Only lane 0 of each register is loaded and stored; the arithmetic
      // runs on all 8 lanes but the other lanes are discarded.
      "1: \n"
      "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n"  // load 1 pixel
                                                    // ARGB0.
      "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n"  // load 1 pixel
                                                    // ARGB1.
      "prfm pldl1keep, [%0, 448] \n"
      "prfm pldl1keep, [%1, 448] \n"
      "subs %w3, %w3, #1 \n"  // 1 processed per loop.
      "umull v16.8h, v4.8b, v3.8b \n"  // db * a
      "umull v17.8h, v5.8b, v3.8b \n"  // dg * a
      "umull v18.8h, v6.8b, v3.8b \n"  // dr * a
      "uqrshrn v16.8b, v16.8h, #8 \n"  // db >>= 8
      "uqrshrn v17.8b, v17.8h, #8 \n"  // dg >>= 8
      "uqrshrn v18.8b, v18.8h, #8 \n"  // dr >>= 8
      "uqsub v4.8b, v4.8b, v16.8b \n"  // db - (db * a / 256)
      "uqsub v5.8b, v5.8b, v17.8b \n"  // dg - (dg * a / 256)
      "uqsub v6.8b, v6.8b, v18.8b \n"  // dr - (dr * a / 256)
      "uqadd v0.8b, v0.8b, v4.8b \n"  // + sb
      "uqadd v1.8b, v1.8b, v5.8b \n"  // + sg
      "uqadd v2.8b, v2.8b, v6.8b \n"  // + sr
      "movi v3.8b, #255 \n"  // a = 255
      "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n"  // store 1 pixel.
      "b.ge 1b \n"

      "99: \n"

      : "+r"(src_argb0),  // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
        "v17", "v18");
}
2417
// Attenuate 8 pixels at a time.
// Multiplies B, G and R of each ARGB pixel by the pixel's own alpha:
// c = sat8((c * a + 128) >> 8). The alpha byte is stored unchanged.
// Processes 8 pixels (32 bytes) per iteration; the loop body always executes
// at least once.
void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
                           uint8_t* dst_argb,
                           int width) {
  asm volatile(
      // Attenuate 8 pixels.
      "1: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
      "prfm pldl1keep, [%0, 448] \n"  // prefetch hint, 448 bytes ahead.
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      "umull v4.8h, v0.8b, v3.8b \n"  // b * a
      "umull v5.8h, v1.8b, v3.8b \n"  // g * a
      "umull v6.8h, v2.8b, v3.8b \n"  // r * a
      "uqrshrn v0.8b, v4.8h, #8 \n"  // b >>= 8
      "uqrshrn v1.8b, v5.8h, #8 \n"  // g >>= 8
      "uqrshrn v2.8b, v6.8h, #8 \n"  // r >>= 8
      "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB
      "b.gt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
}
2442
// Quantize 8 ARGB pixels (32 bytes).
// dst = (dst * scale >> 16) * interval_size + interval_offset;
// Operates in place on dst_argb: B, G and R are quantized and narrowed back
// to bytes with unsigned saturation; the alpha byte is written back
// unchanged.
// scale, interval_size and interval_offset are broadcast into 16-bit lanes,
// so only their low 16 bits are used. scale is halved up front because
// sqdmulh doubles the product before taking the high half.
// Processes 8 pixels per iteration; the loop body always executes at least
// once.
void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
                          int scale,
                          int interval_size,
                          int interval_offset,
                          int width) {
  asm volatile(
      "dup v4.8h, %w2 \n"
      "ushr v4.8h, v4.8h, #1 \n"  // scale >>= 1
      "dup v5.8h, %w3 \n"  // interval multiply.
      "dup v6.8h, %w4 \n"  // interval add

      // 8 pixel loop.
      "1: \n"
      // No post-increment here: the pointer advances on the store below.
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n"  // load 8 ARGB.
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w1, %w1, #8 \n"  // 8 processed per loop.
      "uxtl v0.8h, v0.8b \n"  // b (0 .. 255)
      "uxtl v1.8h, v1.8b \n"
      "uxtl v2.8h, v2.8b \n"
      "sqdmulh v0.8h, v0.8h, v4.8h \n"  // b * scale
      "sqdmulh v1.8h, v1.8h, v4.8h \n"  // g
      "sqdmulh v2.8h, v2.8h, v4.8h \n"  // r
      "mul v0.8h, v0.8h, v5.8h \n"  // b * interval_size
      "mul v1.8h, v1.8h, v5.8h \n"  // g
      "mul v2.8h, v2.8h, v5.8h \n"  // r
      "add v0.8h, v0.8h, v6.8h \n"  // b + interval_offset
      "add v1.8h, v1.8h, v6.8h \n"  // g
      "add v2.8h, v2.8h, v6.8h \n"  // r
      "uqxtn v0.8b, v0.8h \n"  // narrow to bytes, unsigned saturating.
      "uqxtn v1.8b, v1.8h \n"
      "uqxtn v2.8b, v2.8h \n"
      "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 ARGB
      "b.gt 1b \n"
      : "+r"(dst_argb),       // %0
        "+r"(width)           // %1
      : "r"(scale),           // %2
        "r"(interval_size),   // %3
        "r"(interval_offset)  // %4
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
}
2485
// Shade 8 pixels at a time by specified value.
// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8.
// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
// value packs four per-channel scale bytes. Each byte is duplicated into both
// halves of a 16-bit lane (zip1 of v0 with itself) and halved; each channel,
// alpha included, is then multiplied by its lane with sqrdmulh and narrowed
// back to a byte with unsigned saturation.
// Processes 8 pixels (32 bytes) per iteration; the loop body always executes
// at least once.
void ARGBShadeRow_NEON(const uint8_t* src_argb,
                       uint8_t* dst_argb,
                       int width,
                       uint32_t value) {
  asm volatile(
      "dup v0.4s, %w3 \n"  // duplicate scale value.
      "zip1 v0.8b, v0.8b, v0.8b \n"  // v0.8b aarrggbb.
      "ushr v0.8h, v0.8h, #1 \n"  // scale / 2.

      // 8 pixel loop.
      "1: \n"
      "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n"  // load 8 ARGB
      "prfm pldl1keep, [%0, 448] \n"  // prefetch hint, 448 bytes ahead.
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      "uxtl v4.8h, v4.8b \n"  // b (0 .. 255)
      "uxtl v5.8h, v5.8b \n"
      "uxtl v6.8h, v6.8b \n"
      "uxtl v7.8h, v7.8b \n"
      "sqrdmulh v4.8h, v4.8h, v0.h[0] \n"  // b * scale * 2
      "sqrdmulh v5.8h, v5.8h, v0.h[1] \n"  // g
      "sqrdmulh v6.8h, v6.8h, v0.h[2] \n"  // r
      "sqrdmulh v7.8h, v7.8h, v0.h[3] \n"  // a
      "uqxtn v4.8b, v4.8h \n"
      "uqxtn v5.8b, v5.8h \n"
      "uqxtn v6.8b, v6.8h \n"
      "uqxtn v7.8b, v7.8h \n"
      "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // store 8 ARGB
      "b.gt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(value)       // %3
      : "cc", "memory", "v0", "v4", "v5", "v6", "v7");
}
2523
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
// Similar to ARGBToYJ but stores ARGB.
// C code is (29 * b + 150 * g + 77 * r + 128) >> 8;
// The gray value is written to the B, G and R bytes of each output pixel;
// the alpha byte is copied through unchanged.
// Processes 8 pixels per iteration; the loop body always executes at least
// once.
void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  asm volatile(
      "movi v24.8b, #29 \n"  // B * 0.1140 coefficient
      "movi v25.8b, #150 \n"  // G * 0.5870 coefficient
      "movi v26.8b, #77 \n"  // R * 0.2990 coefficient
      "1: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
      "prfm pldl1keep, [%0, 448] \n"  // prefetch hint, 448 bytes ahead.
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      "umull v4.8h, v0.8b, v24.8b \n"  // B
      "umlal v4.8h, v1.8b, v25.8b \n"  // G
      "umlal v4.8h, v2.8b, v26.8b \n"  // R
      "uqrshrn v0.8b, v4.8h, #8 \n"  // 16 bit to 8 bit B
      // orr of a register with itself is a register copy.
      "orr v1.8b, v0.8b, v0.8b \n"  // G
      "orr v2.8b, v0.8b, v0.8b \n"  // R
      "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 pixels.
      "b.gt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26");
}
2550
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
// b = (r * 35 + g * 68 + b * 17) >> 7
// g = (r * 45 + g * 88 + b * 22) >> 7
// r = (r * 50 + g * 98 + b * 24) >> 7
// Operates in place on dst_argb. Each result is narrowed to a byte with
// unsigned saturation and no rounding (uqshrn); the alpha byte is written
// back unchanged.
// Processes 8 pixels per iteration; the loop body always executes at least
// once.
void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
  asm volatile(
      "movi v20.8b, #17 \n"  // BB coefficient
      "movi v21.8b, #68 \n"  // BG coefficient
      "movi v22.8b, #35 \n"  // BR coefficient
      "movi v24.8b, #22 \n"  // GB coefficient
      "movi v25.8b, #88 \n"  // GG coefficient
      "movi v26.8b, #45 \n"  // GR coefficient
      "movi v28.8b, #24 \n"  // RB coefficient
      "movi v29.8b, #98 \n"  // RG coefficient
      "movi v30.8b, #50 \n"  // RR coefficient
      "1: \n"
      // No post-increment here: the pointer advances on the store below.
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n"  // load 8 ARGB pixels.
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w1, %w1, #8 \n"  // 8 processed per loop.
      "umull v4.8h, v0.8b, v20.8b \n"  // B to Sepia B
      "umlal v4.8h, v1.8b, v21.8b \n"  // G
      "umlal v4.8h, v2.8b, v22.8b \n"  // R
      "umull v5.8h, v0.8b, v24.8b \n"  // B to Sepia G
      "umlal v5.8h, v1.8b, v25.8b \n"  // G
      "umlal v5.8h, v2.8b, v26.8b \n"  // R
      "umull v6.8h, v0.8b, v28.8b \n"  // B to Sepia R
      "umlal v6.8h, v1.8b, v29.8b \n"  // G
      "umlal v6.8h, v2.8b, v30.8b \n"  // R
      "uqshrn v0.8b, v4.8h, #7 \n"  // 16 bit to 8 bit B
      "uqshrn v1.8b, v5.8h, #7 \n"  // 16 bit to 8 bit G
      "uqshrn v2.8b, v6.8h, #7 \n"  // 16 bit to 8 bit R
      "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 pixels.
      "b.gt 1b \n"
      : "+r"(dst_argb),  // %0
        "+r"(width)      // %1
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
        "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30");
}
2591
// Transform ARGB pixels with a 4x4 color matrix, 8 pixels (32 bytes) per
// loop iteration.
// matrix_argb supplies 16 signed 8 bit coefficients, read with one 16 byte
// load: entries 0..3 produce the B output, 4..7 the G output, 8..11 the R
// output and 12..15 the A output; within each group the entries weight the
// source B, G, R and A channels in that order.
// Products are summed in signed 16 bit lanes with saturating adds and then
// narrowed with an unsigned saturating shift right by 6.
// The loop test follows the body, so at least 8 pixels are always processed.
// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
// needs to saturate. Consider doing a non-saturating version.
void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
                             uint8_t* dst_argb,
                             const int8_t* matrix_argb,
                             int width) {
  asm volatile(
      "ld1 {v2.16b}, [%3] \n"  // load 16 matrix coefficients.
      "sxtl v0.8h, v2.8b \n"  // B,G coefficients s16.
      "sxtl2 v1.8h, v2.16b \n"  // R,A coefficients s16.

      "1: \n"
      "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n"  // load 8 ARGB
      "prfm pldl1keep, [%0, 448] \n"  // prefetch hint ahead of src
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      "uxtl v16.8h, v16.8b \n"  // b (0 .. 255) 16 bit
      "uxtl v17.8h, v17.8b \n"  // g
      "uxtl v18.8h, v18.8b \n"  // r
      "uxtl v19.8h, v19.8b \n"  // a
      "mul v22.8h, v16.8h, v0.h[0] \n"  // B = B * Matrix B
      "mul v23.8h, v16.8h, v0.h[4] \n"  // G = B * Matrix G
      "mul v24.8h, v16.8h, v1.h[0] \n"  // R = B * Matrix R
      "mul v25.8h, v16.8h, v1.h[4] \n"  // A = B * Matrix A
      "mul v4.8h, v17.8h, v0.h[1] \n"  // B += G * Matrix B
      "mul v5.8h, v17.8h, v0.h[5] \n"  // G += G * Matrix G
      "mul v6.8h, v17.8h, v1.h[1] \n"  // R += G * Matrix R
      "mul v7.8h, v17.8h, v1.h[5] \n"  // A += G * Matrix A
      "sqadd v22.8h, v22.8h, v4.8h \n"  // Accumulate B
      "sqadd v23.8h, v23.8h, v5.8h \n"  // Accumulate G
      "sqadd v24.8h, v24.8h, v6.8h \n"  // Accumulate R
      "sqadd v25.8h, v25.8h, v7.8h \n"  // Accumulate A
      "mul v4.8h, v18.8h, v0.h[2] \n"  // B += R * Matrix B
      "mul v5.8h, v18.8h, v0.h[6] \n"  // G += R * Matrix G
      "mul v6.8h, v18.8h, v1.h[2] \n"  // R += R * Matrix R
      "mul v7.8h, v18.8h, v1.h[6] \n"  // A += R * Matrix A
      "sqadd v22.8h, v22.8h, v4.8h \n"  // Accumulate B
      "sqadd v23.8h, v23.8h, v5.8h \n"  // Accumulate G
      "sqadd v24.8h, v24.8h, v6.8h \n"  // Accumulate R
      "sqadd v25.8h, v25.8h, v7.8h \n"  // Accumulate A
      "mul v4.8h, v19.8h, v0.h[3] \n"  // B += A * Matrix B
      "mul v5.8h, v19.8h, v0.h[7] \n"  // G += A * Matrix G
      "mul v6.8h, v19.8h, v1.h[3] \n"  // R += A * Matrix R
      "mul v7.8h, v19.8h, v1.h[7] \n"  // A += A * Matrix A
      "sqadd v22.8h, v22.8h, v4.8h \n"  // Accumulate B
      "sqadd v23.8h, v23.8h, v5.8h \n"  // Accumulate G
      "sqadd v24.8h, v24.8h, v6.8h \n"  // Accumulate R
      "sqadd v25.8h, v25.8h, v7.8h \n"  // Accumulate A
      "sqshrun v16.8b, v22.8h, #6 \n"  // 16 bit to 8 bit B
      "sqshrun v17.8b, v23.8h, #6 \n"  // 16 bit to 8 bit G
      "sqshrun v18.8b, v24.8h, #6 \n"  // 16 bit to 8 bit R
      "sqshrun v19.8b, v25.8h, #6 \n"  // 16 bit to 8 bit A
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n"  // store 8 ARGB
      "b.gt 1b \n"
      : "+r"(src_argb),   // %0
        "+r"(dst_argb),   // %1
        "+r"(width)       // %2
      : "r"(matrix_argb)  // %3
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
        "v17", "v18", "v19", "v22", "v23", "v24", "v25");
}
2653
// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
// NOTE(review): the body below narrows with rshrn, not vqshrun, so the TODO
// above may be stale - confirm before acting on it.
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
// Each channel is computed as the widened 8x8 bit product, rounded and
// shifted right by 8: dst = (a * b + 128) >> 8.
// The loop test follows the body, so at least 8 pixels are always processed.
void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      // 8 pixel loop.
      "1: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
      "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more
      "prfm pldl1keep, [%0, 448] \n"  // prefetch hints ahead of both sources
      "prfm pldl1keep, [%1, 448] \n"
      "subs %w3, %w3, #8 \n"  // 8 processed per loop.
      "umull v0.8h, v0.8b, v4.8b \n"  // multiply B
      "umull v1.8h, v1.8b, v5.8b \n"  // multiply G
      "umull v2.8h, v2.8b, v6.8b \n"  // multiply R
      "umull v3.8h, v3.8b, v7.8b \n"  // multiply A
      "rshrn v0.8b, v0.8h, #8 \n"  // 16 bit to 8 bit B (rounding)
      "rshrn v1.8b, v1.8h, #8 \n"  // 16 bit to 8 bit G
      "rshrn v2.8b, v2.8h, #8 \n"  // 16 bit to 8 bit R
      "rshrn v3.8b, v3.8h, #8 \n"  // 16 bit to 8 bit A
      "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
      "b.gt 1b \n"
      : "+r"(src_argb0),  // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
2685
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
// Every channel (B, G, R and A) uses an unsigned saturating add, so sums
// above 255 clamp to 255 instead of wrapping.
// The loop test follows the body, so at least 8 pixels are always processed.
void ARGBAddRow_NEON(const uint8_t* src_argb0,
                     const uint8_t* src_argb1,
                     uint8_t* dst_argb,
                     int width) {
  asm volatile(
      // 8 pixel loop.
      "1: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
      "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more
      "prfm pldl1keep, [%0, 448] \n"  // prefetch hints ahead of both sources
      "prfm pldl1keep, [%1, 448] \n"
      "subs %w3, %w3, #8 \n"  // 8 processed per loop.
      "uqadd v0.8b, v0.8b, v4.8b \n"  // saturating add of each channel
      "uqadd v1.8b, v1.8b, v5.8b \n"
      "uqadd v2.8b, v2.8b, v6.8b \n"
      "uqadd v3.8b, v3.8b, v7.8b \n"
      "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
      "b.gt 1b \n"
      : "+r"(src_argb0),  // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
2712
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
// Computes src_argb0 - src_argb1 per channel (B, G, R and A) with an
// unsigned saturating subtract, so negative results clamp to 0.
// The loop test follows the body, so at least 8 pixels are always processed.
void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      // 8 pixel loop.
      "1: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
      "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more
      "prfm pldl1keep, [%0, 448] \n"  // prefetch hints ahead of both sources
      "prfm pldl1keep, [%1, 448] \n"
      "subs %w3, %w3, #8 \n"  // 8 processed per loop.
      "uqsub v0.8b, v0.8b, v4.8b \n"  // saturating subtract of each channel
      "uqsub v1.8b, v1.8b, v5.8b \n"
      "uqsub v2.8b, v2.8b, v6.8b \n"
      "uqsub v3.8b, v3.8b, v7.8b \n"
      "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
      "b.gt 1b \n"
      : "+r"(src_argb0),  // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
2739
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
// Sobel is the unsigned saturating sum of the two inputs. 8 input bytes from
// each source become 8 ARGB pixels (32 bytes) per loop iteration.
// The loop test follows the body, so at least 8 pixels are always processed.
void SobelRow_NEON(const uint8_t* src_sobelx,
                   const uint8_t* src_sobely,
                   uint8_t* dst_argb,
                   int width) {
  asm volatile(
      "movi v3.8b, #255 \n"  // alpha, constant for the whole row
      // 8 pixel loop.
      "1: \n"
      "ld1 {v0.8b}, [%0], #8 \n"  // load 8 sobelx.
      "ld1 {v1.8b}, [%1], #8 \n"  // load 8 sobely.
      "prfm pldl1keep, [%0, 448] \n"
      "prfm pldl1keep, [%1, 448] \n"
      "subs %w3, %w3, #8 \n"  // 8 processed per loop.
      "uqadd v0.8b, v0.8b, v1.8b \n"  // add
      "orr v1.8b, v0.8b, v0.8b \n"  // copy sum to G
      "orr v2.8b, v0.8b, v0.8b \n"  // copy sum to R
      "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
      "b.gt 1b \n"
      : "+r"(src_sobelx),  // %0
        "+r"(src_sobely),  // %1
        "+r"(dst_argb),    // %2
        "+r"(width)        // %3
      :
      : "cc", "memory", "v0", "v1", "v2", "v3");
}
2770
// Adds Sobel X and Sobel Y and stores Sobel into plane.
// The sum is an unsigned saturating add, written one byte per pixel.
// 16 pixels are handled per loop iteration; the loop test follows the body,
// so at least 16 pixels are always processed.
void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
                          const uint8_t* src_sobely,
                          uint8_t* dst_y,
                          int width) {
  asm volatile(
      // 16 pixel loop.
      "1: \n"
      "ld1 {v0.16b}, [%0], #16 \n"  // load 16 sobelx.
      "ld1 {v1.16b}, [%1], #16 \n"  // load 16 sobely.
      "prfm pldl1keep, [%0, 448] \n"
      "prfm pldl1keep, [%1, 448] \n"
      "subs %w3, %w3, #16 \n"  // 16 processed per loop.
      "uqadd v0.16b, v0.16b, v1.16b \n"  // add
      "st1 {v0.16b}, [%2], #16 \n"  // store 16 pixels.
      "b.gt 1b \n"
      : "+r"(src_sobelx),  // %0
        "+r"(src_sobely),  // %1
        "+r"(dst_y),       // %2
        "+r"(width)        // %3
      :
      : "cc", "memory", "v0", "v1");
}
2794
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
// Sobel is the unsigned saturating sum of Sobel X and Sobel Y. The inputs
// are loaded straight into the registers that st4 stores as R and B, so no
// copies are needed. 8 pixels per loop iteration; the loop test follows the
// body, so at least 8 pixels are always processed.
void SobelXYRow_NEON(const uint8_t* src_sobelx,
                     const uint8_t* src_sobely,
                     uint8_t* dst_argb,
                     int width) {
  asm volatile(
      "movi v3.8b, #255 \n"  // alpha
      // 8 pixel loop.
      "1: \n"
      "ld1 {v2.8b}, [%0], #8 \n"  // load 8 sobelx.
      "ld1 {v0.8b}, [%1], #8 \n"  // load 8 sobely.
      "prfm pldl1keep, [%0, 448] \n"
      "prfm pldl1keep, [%1, 448] \n"
      "subs %w3, %w3, #8 \n"  // 8 processed per loop.
      "uqadd v1.8b, v0.8b, v2.8b \n"  // add
      "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
      "b.gt 1b \n"
      : "+r"(src_sobelx),  // %0
        "+r"(src_sobely),  // %1
        "+r"(dst_argb),    // %2
        "+r"(width)        // %3
      :
      : "cc", "memory", "v0", "v1", "v2", "v3");
}
2823
// SobelX as a matrix is
// -1 0 1
// -2 0 2
// -1 0 1
// For each of the three rows the code loads 8 bytes at x and 8 bytes at
// x + 2 (post-increments of 2 then 6 advance each pointer by 8 in total) and
// takes the widened difference row[x] - row[x + 2]. The middle row's
// difference is added twice. The 16 bit sum is made absolute and narrowed
// with unsigned saturation to 8 bits.
// Each iteration therefore reads 10 bytes from every source row to produce
// 8 outputs. The loop test follows the body, so at least 8 pixels are always
// processed.
void SobelXRow_NEON(const uint8_t* src_y0,
                    const uint8_t* src_y1,
                    const uint8_t* src_y2,
                    uint8_t* dst_sobelx,
                    int width) {
  asm volatile(
      "1: \n"
      "ld1 {v0.8b}, [%0],%5 \n"  // top
      "ld1 {v1.8b}, [%0],%6 \n"
      "prfm pldl1keep, [%0, 448] \n"
      "usubl v0.8h, v0.8b, v1.8b \n"
      "ld1 {v2.8b}, [%1],%5 \n"  // center * 2
      "ld1 {v3.8b}, [%1],%6 \n"
      "prfm pldl1keep, [%1, 448] \n"
      "usubl v1.8h, v2.8b, v3.8b \n"
      "add v0.8h, v0.8h, v1.8h \n"  // center difference added twice
      "add v0.8h, v0.8h, v1.8h \n"
      "ld1 {v2.8b}, [%2],%5 \n"  // bottom
      "ld1 {v3.8b}, [%2],%6 \n"
      "prfm pldl1keep, [%2, 448] \n"
      "subs %w4, %w4, #8 \n"  // 8 pixels
      "usubl v1.8h, v2.8b, v3.8b \n"
      "add v0.8h, v0.8h, v1.8h \n"
      "abs v0.8h, v0.8h \n"  // magnitude
      "uqxtn v0.8b, v0.8h \n"  // saturate to 8 bit
      "st1 {v0.8b}, [%3], #8 \n"  // store 8 sobelx
      "b.gt 1b \n"
      : "+r"(src_y0),      // %0
        "+r"(src_y1),      // %1
        "+r"(src_y2),      // %2
        "+r"(dst_sobelx),  // %3
        "+r"(width)        // %4
      : "r"(2LL),          // %5 step from x to x + 2
        "r"(6LL)           // %6 remaining step to x + 8
      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
2865
// SobelY as a matrix is
// -1 -2 -1
// 0 0 0
// 1 2 1
// Only two source rows are read; the middle matrix row has zero weight.
// For column offsets 0, 1 and 2 (post-increments of 1, 1 and 6 advance each
// pointer by 8 in total) the code takes the widened difference
// src_y0[x + k] - src_y1[x + k]; the offset 1 difference is added twice.
// The 16 bit sum is made absolute and narrowed with unsigned saturation.
// Each iteration reads 10 bytes from both rows to produce 8 outputs. The
// loop test follows the body, so at least 8 pixels are always processed.
void SobelYRow_NEON(const uint8_t* src_y0,
                    const uint8_t* src_y1,
                    uint8_t* dst_sobely,
                    int width) {
  asm volatile(
      "1: \n"
      "ld1 {v0.8b}, [%0],%4 \n"  // left
      "ld1 {v1.8b}, [%1],%4 \n"
      "usubl v0.8h, v0.8b, v1.8b \n"
      "ld1 {v2.8b}, [%0],%4 \n"  // center * 2
      "ld1 {v3.8b}, [%1],%4 \n"
      "usubl v1.8h, v2.8b, v3.8b \n"
      "add v0.8h, v0.8h, v1.8h \n"  // center difference added twice
      "add v0.8h, v0.8h, v1.8h \n"
      "ld1 {v2.8b}, [%0],%5 \n"  // right
      "ld1 {v3.8b}, [%1],%5 \n"
      "prfm pldl1keep, [%0, 448] \n"
      "prfm pldl1keep, [%1, 448] \n"
      "subs %w3, %w3, #8 \n"  // 8 pixels
      "usubl v1.8h, v2.8b, v3.8b \n"
      "add v0.8h, v0.8h, v1.8h \n"
      "abs v0.8h, v0.8h \n"  // magnitude
      "uqxtn v0.8b, v0.8h \n"  // saturate to 8 bit
      "st1 {v0.8b}, [%2], #8 \n"  // store 8 sobely
      "b.gt 1b \n"
      : "+r"(src_y0),      // %0
        "+r"(src_y1),      // %1
        "+r"(dst_sobely),  // %2
        "+r"(width)        // %3
      : "r"(1LL),          // %4 step of one column
        "r"(6LL)           // %5 remaining step to x + 8
      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
2904
// Convert a row of 16 bit unsigned integers to half floats without scaling.
// Each value is widened to 32 bits, converted to single precision and then
// narrowed to half precision with fcvtn. The float parameter is ignored.
// Caveat - rounds float to half float whereas scaling version truncates.
// 8 values per loop iteration; the loop test follows the body, so at least
// 8 values are always processed.
void HalfFloat1Row_NEON(const uint16_t* src,
                        uint16_t* dst,
                        float /*unused*/,
                        int width) {
  asm volatile(
      "1: \n"
      "ld1 {v1.16b}, [%0], #16 \n"  // load 8 shorts
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #8 \n"  // 8 pixels per loop
      "uxtl v2.4s, v1.4h \n"  // 8 int's
      "uxtl2 v3.4s, v1.8h \n"
      "scvtf v2.4s, v2.4s \n"  // 8 floats
      "scvtf v3.4s, v3.4s \n"
      "fcvtn v1.4h, v2.4s \n"  // 8 half floats
      "fcvtn2 v1.8h, v3.4s \n"
      "st1 {v1.16b}, [%1], #16 \n"  // store 8 shorts
      "b.gt 1b \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      :
      : "cc", "memory", "v1", "v2", "v3");
}
2929
// Convert a row of 16 bit unsigned integers to half floats, multiplying by
// scale on the way.
// Instead of a float to half conversion instruction, the value is multiplied
// by scale * 1.9259299444e-34f (about 2^-112, the difference between the
// single and half precision exponent biases) and the resulting single
// precision bit pattern is shifted right by 13 (the difference in mantissa
// widths) with unsigned saturation. The shift discards low mantissa bits, so
// the result is truncated rather than rounded.
// 8 values per loop iteration; the loop test follows the body, so at least
// 8 values are always processed.
void HalfFloatRow_NEON(const uint16_t* src,
                       uint16_t* dst,
                       float scale,
                       int width) {
  asm volatile(
      "1: \n"
      "ld1 {v1.16b}, [%0], #16 \n"  // load 8 shorts
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #8 \n"  // 8 pixels per loop
      "uxtl v2.4s, v1.4h \n"  // 8 int's
      "uxtl2 v3.4s, v1.8h \n"
      "scvtf v2.4s, v2.4s \n"  // 8 floats
      "scvtf v3.4s, v3.4s \n"
      "fmul v2.4s, v2.4s, %3.s[0] \n"  // adjust exponent
      "fmul v3.4s, v3.4s, %3.s[0] \n"
      "uqshrn v1.4h, v2.4s, #13 \n"  // isolate halffloat
      "uqshrn2 v1.8h, v3.4s, #13 \n"
      "st1 {v1.16b}, [%1], #16 \n"  // store 8 shorts
      "b.gt 1b \n"
      : "+r"(src),                      // %0
        "+r"(dst),                      // %1
        "+r"(width)                     // %2
      : "w"(scale * 1.9259299444e-34f)  // %3
      : "cc", "memory", "v1", "v2", "v3");
}
2955
// Convert a row of unsigned bytes to single precision floats multiplied by
// scale. Each byte is widened to 16 then 32 bits, converted to float and
// multiplied by scale.
// 8 values per loop iteration (8 bytes in, 32 bytes out); the loop test
// follows the body, so at least 8 values are always processed.
void ByteToFloatRow_NEON(const uint8_t* src,
                         float* dst,
                         float scale,
                         int width) {
  asm volatile(
      "1: \n"
      "ld1 {v1.8b}, [%0], #8 \n"  // load 8 bytes
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #8 \n"  // 8 pixels per loop
      "uxtl v1.8h, v1.8b \n"  // 8 shorts
      "uxtl v2.4s, v1.4h \n"  // 8 ints
      "uxtl2 v3.4s, v1.8h \n"
      "scvtf v2.4s, v2.4s \n"  // 8 floats
      "scvtf v3.4s, v3.4s \n"
      "fmul v2.4s, v2.4s, %3.s[0] \n"  // scale
      "fmul v3.4s, v3.4s, %3.s[0] \n"
      "st1 {v2.16b, v3.16b}, [%1], #32 \n"  // store 8 floats
      "b.gt 1b \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      : "w"(scale)    // %3
      : "cc", "memory", "v1", "v2", "v3");
}
2980
// Scale a row of float samples and return the maximum input sample.
// dst receives src * scale. The returned maximum is taken over the unscaled
// source values, and because both accumulators start at 0 the result is
// never below 0.
// 8 samples per loop iteration; the loop test follows the body, so at least
// 8 samples are always processed.
float ScaleMaxSamples_NEON(const float* src,
                           float* dst,
                           float scale,
                           int width) {
  float fmax;
  asm volatile(
      "movi v5.4s, #0 \n"  // max accumulators start at 0
      "movi v6.4s, #0 \n"

      "1: \n"
      "ld1 {v1.4s, v2.4s}, [%0], #32 \n"  // load 8 samples
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop
      "fmul v3.4s, v1.4s, %4.s[0] \n"  // scale
      "fmul v4.4s, v2.4s, %4.s[0] \n"  // scale
      "fmax v5.4s, v5.4s, v1.4s \n"  // max of unscaled samples
      "fmax v6.4s, v6.4s, v2.4s \n"
      "st1 {v3.4s, v4.4s}, [%1], #32 \n"  // store 8 samples
      "b.gt 1b \n"
      "fmax v5.4s, v5.4s, v6.4s \n"  // merge the two accumulators
      "fmaxv %s3, v5.4s \n"  // reduce the 4 lanes to the scalar result
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width),  // %2
        "=w"(fmax)    // %3
      : "w"(scale)    // %4
      : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
  return fmax;
}
3010
// Scale a row of float samples and return the sum of squares of the input.
// dst receives src * scale. The returned value accumulates src * src over
// the unscaled source values (despite the name it is not a plain sum).
// 8 samples per loop iteration; the loop test follows the body, so at least
// 8 samples are always processed.
float ScaleSumSamples_NEON(const float* src,
                           float* dst,
                           float scale,
                           int width) {
  float fsum;
  asm volatile(
      "movi v5.4s, #0 \n"  // sum of squares accumulators start at 0
      "movi v6.4s, #0 \n"

      "1: \n"
      "ld1 {v1.4s, v2.4s}, [%0], #32 \n"  // load 8 samples
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop
      "fmul v3.4s, v1.4s, %4.s[0] \n"  // scale
      "fmul v4.4s, v2.4s, %4.s[0] \n"
      "fmla v5.4s, v1.4s, v1.4s \n"  // sum of squares
      "fmla v6.4s, v2.4s, v2.4s \n"
      "st1 {v3.4s, v4.4s}, [%1], #32 \n"  // store 8 samples
      "b.gt 1b \n"
      "faddp v5.4s, v5.4s, v6.4s \n"  // pairwise adds fold 8 lanes ...
      "faddp v5.4s, v5.4s, v5.4s \n"  // ... down to 2 ...
      "faddp %3.4s, v5.4s, v5.4s \n"  // sum, total lands in lane 0
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width),  // %2
        "=w"(fsum)    // %3
      : "w"(scale)    // %4
      : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
  return fsum;
}
3041
// Multiply a row of float samples by scale: dst = src * scale.
// 8 samples per loop iteration; the loop test follows the body, so at least
// 8 samples are always processed.
void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
  asm volatile(
      "1: \n"
      "ld1 {v1.4s, v2.4s}, [%0], #32 \n"  // load 8 samples
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop
      "fmul v1.4s, v1.4s, %3.s[0] \n"  // scale
      "fmul v2.4s, v2.4s, %3.s[0] \n"  // scale
      "st1 {v1.4s, v2.4s}, [%1], #32 \n"  // store 8 samples
      "b.gt 1b \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      : "w"(scale)    // %3
      : "cc", "memory", "v1", "v2");
}
3058
// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
// Vertical pass: dst = src0 + 4 * src1 + 6 * src2 + 4 * src3 + src4, with
// 16 bit inputs widened into 32 bit outputs. No normalization or shift is
// applied here.
// 8 samples per loop iteration; the loop test follows the body, so at least
// 8 samples are always processed.
void GaussCol_NEON(const uint16_t* src0,
                   const uint16_t* src1,
                   const uint16_t* src2,
                   const uint16_t* src3,
                   const uint16_t* src4,
                   uint32_t* dst,
                   int width) {
  asm volatile(
      "movi v6.8h, #4 \n"  // constant 4
      "movi v7.8h, #6 \n"  // constant 6

      "1: \n"
      "ld1 {v1.8h}, [%0], #16 \n"  // load 8 samples, 5 rows
      "ld1 {v2.8h}, [%4], #16 \n"
      "uaddl v0.4s, v1.4h, v2.4h \n"  // * 1
      "prfm pldl1keep, [%0, 448] \n"
      "uaddl2 v1.4s, v1.8h, v2.8h \n"  // * 1
      "ld1 {v2.8h}, [%1], #16 \n"
      "umlal v0.4s, v2.4h, v6.4h \n"  // * 4
      "prfm pldl1keep, [%1, 448] \n"
      "umlal2 v1.4s, v2.8h, v6.8h \n"  // * 4
      "ld1 {v2.8h}, [%2], #16 \n"
      "umlal v0.4s, v2.4h, v7.4h \n"  // * 6
      "prfm pldl1keep, [%2, 448] \n"
      "umlal2 v1.4s, v2.8h, v7.8h \n"  // * 6
      "ld1 {v2.8h}, [%3], #16 \n"
      "umlal v0.4s, v2.4h, v6.4h \n"  // * 4
      "prfm pldl1keep, [%3, 448] \n"
      "umlal2 v1.4s, v2.8h, v6.8h \n"  // * 4
      "subs %w6, %w6, #8 \n"  // 8 processed per loop
      "st1 {v0.4s,v1.4s}, [%5], #32 \n"  // store 8 samples
      "prfm pldl1keep, [%4, 448] \n"
      "b.gt 1b \n"
      : "+r"(src0),  // %0
        "+r"(src1),  // %1
        "+r"(src2),  // %2
        "+r"(src3),  // %3
        "+r"(src4),  // %4
        "+r"(dst),   // %5
        "+r"(width)  // %6
      :
      : "cc", "memory", "v0", "v1", "v2", "v6", "v7");
}
3103
// filter 5 adjacent samples of one row with 1, 4, 6, 4, 1 coefficients.
// Horizontal pass: for each output x,
//   sum = src[x] + 4 * src[x + 1] + 6 * src[x + 2] + 4 * src[x + 3]
//         + src[x + 4]
// and dst[x] is sum rounded, shifted right by 8 and saturated to 16 bits.
// Four pointers (src, src + 1, src + 2, src + 3) walk the row in parallel.
// The first load reads 12 samples while advancing by only 8, so each
// iteration reads 4 samples beyond the 8 it produces.
// 8 samples per loop iteration; the loop test follows the body, so at least
// 8 samples are always processed.
void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
  const uint32_t* src1 = src + 1;
  const uint32_t* src2 = src + 2;
  const uint32_t* src3 = src + 3;
  asm volatile(
      "movi v6.4s, #4 \n"  // constant 4
      "movi v7.4s, #6 \n"  // constant 6

      "1: \n"
      "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n"  // load 12 source samples
      "add v0.4s, v0.4s, v1.4s \n"  // * 1 (src[x] + src[x + 4])
      "add v1.4s, v1.4s, v2.4s \n"  // * 1
      "ld1 {v2.4s,v3.4s}, [%2], #32 \n"
      "mla v0.4s, v2.4s, v7.4s \n"  // * 6
      "mla v1.4s, v3.4s, v7.4s \n"  // * 6
      "ld1 {v2.4s,v3.4s}, [%1], #32 \n"
      "ld1 {v4.4s,v5.4s}, [%3], #32 \n"
      "add v2.4s, v2.4s, v4.4s \n"  // add rows for * 4
      "add v3.4s, v3.4s, v5.4s \n"
      "prfm pldl1keep, [%0, 448] \n"
      "mla v0.4s, v2.4s, v6.4s \n"  // * 4
      "mla v1.4s, v3.4s, v6.4s \n"  // * 4
      "subs %w5, %w5, #8 \n"  // 8 processed per loop
      "uqrshrn v0.4h, v0.4s, #8 \n"  // round and pack
      "uqrshrn2 v0.8h, v1.4s, #8 \n"
      "st1 {v0.8h}, [%4], #16 \n"  // store 8 samples
      "b.gt 1b \n"
      : "+r"(src),    // %0
        "+r"(src1),   // %1
        "+r"(src2),   // %2
        "+r"(src3),   // %3
        "+r"(dst),    // %4
        "+r"(width)   // %5
      : "r"(32LL)     // %6 advance src by 8 samples after the 12 sample load
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
3141
// Constants for the float Gauss filters: the tap weights 4 and 6, the
// 1 / 256 normalization factor, and a trailing 0 that fills the fourth lane.
static const vecf32 kGaussCoefficients = {4.0f, 6.0f, 1.0f / 256.0f, 0.0f};
3143
// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
// Vertical float pass: dst = src0 + 4 * src1 + 6 * src2 + 4 * src3 + src4.
// No normalization is applied here. Only the first two entries of
// kGaussCoefficients (4 and 6) are read, each broadcast to all lanes.
// 8 samples per loop iteration; the loop test follows the body, so at least
// 8 samples are always processed.
void GaussCol_F32_NEON(const float* src0,
                       const float* src1,
                       const float* src2,
                       const float* src3,
                       const float* src4,
                       float* dst,
                       int width) {
  asm volatile(
      "ld2r {v6.4s, v7.4s}, [%7] \n"  // constants 4 and 6

      "1: \n"
      "ld1 {v0.4s, v1.4s}, [%0], #32 \n"  // load 8 samples, 5 rows
      "ld1 {v2.4s, v3.4s}, [%1], #32 \n"
      "fmla v0.4s, v2.4s, v6.4s \n"  // * 4
      "ld1 {v4.4s, v5.4s}, [%2], #32 \n"
      "fmla v1.4s, v3.4s, v6.4s \n"
      "prfm pldl1keep, [%0, 448] \n"
      "fmla v0.4s, v4.4s, v7.4s \n"  // * 6
      "ld1 {v2.4s, v3.4s}, [%3], #32 \n"
      "fmla v1.4s, v5.4s, v7.4s \n"
      "prfm pldl1keep, [%1, 448] \n"
      "fmla v0.4s, v2.4s, v6.4s \n"  // * 4
      "ld1 {v4.4s, v5.4s}, [%4], #32 \n"
      "fmla v1.4s, v3.4s, v6.4s \n"
      "prfm pldl1keep, [%2, 448] \n"
      "fadd v0.4s, v0.4s, v4.4s \n"  // * 1
      "prfm pldl1keep, [%3, 448] \n"
      "fadd v1.4s, v1.4s, v5.4s \n"
      "prfm pldl1keep, [%4, 448] \n"
      "subs %w6, %w6, #8 \n"  // 8 processed per loop
      "st1 {v0.4s, v1.4s}, [%5], #32 \n"  // store 8 samples
      "b.gt 1b \n"
      : "+r"(src0),               // %0
        "+r"(src1),               // %1
        "+r"(src2),               // %2
        "+r"(src3),               // %3
        "+r"(src4),               // %4
        "+r"(dst),                // %5
        "+r"(width)               // %6
      : "r"(&kGaussCoefficients)  // %7
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
3187
// filter 5 adjacent samples of one row with 1, 4, 6, 4, 1 coefficients.
// Horizontal float pass: for each output x,
//   dst[x] = (src[x] + 4 * src[x + 1] + 6 * src[x + 2] + 4 * src[x + 3]
//             + src[x + 4]) * (1 / 256)
// A single source pointer is stepped by 8, -4, 8 and 20 bytes so that the
// four loads read from sample offsets 0, 2, 1 and 3 and the pointer ends up
// 8 samples further on. The first load reads 12 samples, so each iteration
// reads 4 samples beyond the 8 it produces.
// 8 samples per loop iteration; the loop test follows the body, so at least
// 8 samples are always processed.
void GaussRow_F32_NEON(const float* src, float* dst, int width) {
  asm volatile(
      "ld3r {v6.4s, v7.4s, v8.4s}, [%3] \n"  // constants 4, 6, 1/256

      "1: \n"
      "ld1 {v0.4s, v1.4s, v2.4s}, [%0], %4 \n"  // load 12 samples at
                                                  // offset 0
      "fadd v0.4s, v0.4s, v1.4s \n"  // * 1 (src[x] + src[x + 4])
      "ld1 {v4.4s, v5.4s}, [%0], %5 \n"  // 8 samples at offset 2
      "fadd v1.4s, v1.4s, v2.4s \n"
      "fmla v0.4s, v4.4s, v7.4s \n"  // * 6
      "ld1 {v2.4s, v3.4s}, [%0], %4 \n"  // 8 samples at offset 1
      "fmla v1.4s, v5.4s, v7.4s \n"
      "ld1 {v4.4s, v5.4s}, [%0], %6 \n"  // 8 samples at offset 3
      "fadd v2.4s, v2.4s, v4.4s \n"
      "fadd v3.4s, v3.4s, v5.4s \n"
      "fmla v0.4s, v2.4s, v6.4s \n"  // * 4
      "fmla v1.4s, v3.4s, v6.4s \n"
      "prfm pldl1keep, [%0, 448] \n"
      "fmul v0.4s, v0.4s, v8.4s \n"  // / 256
      "fmul v1.4s, v1.4s, v8.4s \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop
      "st1 {v0.4s, v1.4s}, [%1], #32 \n"  // store 8 samples
      "b.gt 1b \n"
      : "+r"(src),                 // %0
        "+r"(dst),                 // %1
        "+r"(width)                // %2
      : "r"(&kGaussCoefficients),  // %3
        "r"(8LL),                  // %4 forward 2 samples
        "r"(-4LL),                 // %5 back 1 sample
        "r"(20LL)                  // %6 forward 5 samples
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8");
}
3222
// Convert biplanar NV21 to packed YUV24
// Each iteration reads 16 Y bytes and 8 interleaved VU pairs, duplicates
// every V and U so each pair serves two horizontally adjacent pixels, and
// stores 16 packed 3 byte pixels in V, U, Y byte order (48 bytes).
// The loop test follows the body, so at least 16 pixels are always processed.
void NV21ToYUV24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_vu,
                         uint8_t* dst_yuv24,
                         int width) {
  asm volatile(
      "1: \n"
      "ld1 {v2.16b}, [%0], #16 \n"  // load 16 Y values
      "ld2 {v0.8b, v1.8b}, [%1], #16 \n"  // load 8 VU values
      "prfm pldl1keep, [%0, 448] \n"
      "prfm pldl1keep, [%1, 448] \n"
      "zip1 v0.16b, v0.16b, v0.16b \n"  // replicate V values
      "zip1 v1.16b, v1.16b, v1.16b \n"  // replicate U values
      "subs %w3, %w3, #16 \n"  // 16 pixels per loop
      "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n"  // store 16 YUV pixels
      "b.gt 1b \n"
      : "+r"(src_y),      // %0
        "+r"(src_vu),     // %1
        "+r"(dst_yuv24),  // %2
        "+r"(width)       // %3
      :
      : "cc", "memory", "v0", "v1", "v2");
}
3246
// Subsample the chroma of two AYUV rows 2x2 and store it as interleaved UV.
// The second row is src_ayuv + src_stride_ayuv. ld4 splits each 4 byte pixel
// into channels; the first is treated as V and the second as U. Horizontal
// pairs from both rows are summed and the 4 sample total is rounded and
// shifted right by 2.
// Each iteration consumes 16 source pixels per row and writes 8 UV pairs,
// U first. The loop test follows the body, so at least 16 source pixels are
// always processed.
void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
                      int src_stride_ayuv,
                      uint8_t* dst_uv,
                      int width) {
  const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv;
  asm volatile(

      "1: \n"
      "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 ayuv
      "prfm pldl1keep, [%0, 448] \n"
      "uaddlp v0.8h, v0.16b \n"  // V 16 bytes -> 8 shorts.
      "uaddlp v1.8h, v1.16b \n"  // U 16 bytes -> 8 shorts.
      "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
      "prfm pldl1keep, [%1, 448] \n"
      "uadalp v0.8h, v4.16b \n"  // V 16 bytes -> 8 shorts.
      "uadalp v1.8h, v5.16b \n"  // U 16 bytes -> 8 shorts.
      "uqrshrn v3.8b, v0.8h, #2 \n"  // 2x2 average (V into v3)
      "uqrshrn v2.8b, v1.8h, #2 \n"  // U into v2 so st2 writes U first
      "subs %w3, %w3, #16 \n"  // 16 processed per loop.
      "st2 {v2.8b,v3.8b}, [%2], #16 \n"  // store 8 pixels UV.
      "b.gt 1b \n"
      : "+r"(src_ayuv),    // %0
        "+r"(src_ayuv_1),  // %1
        "+r"(dst_uv),      // %2
        "+r"(width)        // %3
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
3275
// Subsample the chroma of two AYUV rows 2x2 and store it as interleaved VU.
// The second row is src_ayuv + src_stride_ayuv. ld4 splits each 4 byte pixel
// into channels; the first is treated as V and the second as U. Horizontal
// pairs from both rows are summed and the 4 sample total is rounded and
// shifted right by 2.
// Each iteration consumes 16 source pixels per row and writes 8 VU pairs,
// V first. The loop test follows the body, so at least 16 source pixels are
// always processed.
void AYUVToVURow_NEON(const uint8_t* src_ayuv,
                      int src_stride_ayuv,
                      uint8_t* dst_vu,
                      int width) {
  const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv;
  asm volatile(

      "1: \n"
      "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 ayuv
      "prfm pldl1keep, [%0, 448] \n"
      "uaddlp v0.8h, v0.16b \n"  // V 16 bytes -> 8 shorts.
      "uaddlp v1.8h, v1.16b \n"  // U 16 bytes -> 8 shorts.
      "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
      "prfm pldl1keep, [%1, 448] \n"
      "uadalp v0.8h, v4.16b \n"  // V 16 bytes -> 8 shorts.
      "uadalp v1.8h, v5.16b \n"  // U 16 bytes -> 8 shorts.
      "uqrshrn v0.8b, v0.8h, #2 \n"  // 2x2 average
      "uqrshrn v1.8b, v1.8h, #2 \n"
      "subs %w3, %w3, #16 \n"  // 16 processed per loop.
      "st2 {v0.8b,v1.8b}, [%2], #16 \n"  // store 8 pixels VU.
      "b.gt 1b \n"
      : "+r"(src_ayuv),    // %0
        "+r"(src_ayuv_1),  // %1
        "+r"(dst_vu),      // %2
        "+r"(width)        // %3
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
3304
// Copy row of AYUV Y's into Y
// ld4 splits each 4 byte pixel into four channel registers and only the
// third one (v2) is stored, so the other three channels are discarded.
// 16 pixels per loop iteration (64 bytes in, 16 bytes out); the loop test
// follows the body, so at least 16 pixels are always processed.
void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
  asm volatile(
      "1: \n"
      "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #16 \n"  // 16 pixels per loop
      "st1 {v2.16b}, [%1], #16 \n"  // store 16 Y pixels
      "b.gt 1b \n"
      : "+r"(src_ayuv),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3");
}
3320
// Shuffle table for swapping UV bytes.
// Byte indices for a 16 byte table lookup: every even/odd index pair is
// exchanged, so the two bytes of each 2 byte pair trade places.
static const uvec8 kShuffleSwapUV = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u,
                                     9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u};
3324
// Convert UV plane of NV12 to VU of NV21.
// Swaps the two bytes of every UV pair using a table lookup with
// kShuffleSwapUV. width counts pairs: each iteration reads and writes 32
// bytes (16 pairs) and subtracts 16 from width.
// The loop test follows the body, so at least 16 pairs are always processed.
void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
  asm volatile(
      "ld1 {v2.16b}, [%3] \n"  // shuffler
      "1: \n"
      "ld1 {v0.16b}, [%0], 16 \n"  // load 16 UV values
      "ld1 {v1.16b}, [%0], 16 \n"
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #16 \n"  // 16 pixels per loop
      "tbl v0.16b, {v0.16b}, v2.16b \n"  // swap bytes within each pair
      "tbl v1.16b, {v1.16b}, v2.16b \n"
      "stp q0, q1, [%1], 32 \n"  // store 16 VU pixels
      "b.gt 1b \n"
      : "+r"(src_uv),         // %0
        "+r"(dst_vu),         // %1
        "+r"(width)           // %2
      : "r"(&kShuffleSwapUV)  // %3
      : "cc", "memory", "v0", "v1", "v2");
}
3344
// Downsample separate U and V planes 2x2 and merge them into interleaved UV.
// Two rows are read from each plane (src_u and src_u + src_stride_u, src_v
// and src_v + src_stride_v). Horizontal pairs from both rows are summed and
// the 4 sample total is rounded and shifted right by 2.
// Each iteration consumes 16 source samples per row of each plane and writes
// 8 UV pairs (16 bytes). The loop test follows the body, so at least 16
// source samples are always processed.
void HalfMergeUVRow_NEON(const uint8_t* src_u,
                         int src_stride_u,
                         const uint8_t* src_v,
                         int src_stride_v,
                         uint8_t* dst_uv,
                         int width) {
  const uint8_t* src_u_1 = src_u + src_stride_u;
  const uint8_t* src_v_1 = src_v + src_stride_v;
  asm volatile(
      "1: \n"
      "ld1 {v0.16b}, [%0], #16 \n"  // load 16 U values
      "ld1 {v1.16b}, [%2], #16 \n"  // load 16 V values
      "ld1 {v2.16b}, [%1], #16 \n"  // second row of U
      "ld1 {v3.16b}, [%3], #16 \n"  // second row of V
      "uaddlp v0.8h, v0.16b \n"  // half size
      "prfm pldl1keep, [%0, 448] \n"
      "uaddlp v1.8h, v1.16b \n"
      "prfm pldl1keep, [%2, 448] \n"
      "uadalp v0.8h, v2.16b \n"  // accumulate second row
      "prfm pldl1keep, [%1, 448] \n"
      "uadalp v1.8h, v3.16b \n"
      "prfm pldl1keep, [%3, 448] \n"
      "uqrshrn v0.8b, v0.8h, #2 \n"  // 2x2 average with rounding
      "uqrshrn v1.8b, v1.8h, #2 \n"
      "subs %w5, %w5, #16 \n"  // 16 src pixels per loop
      "st2 {v0.8b, v1.8b}, [%4], #16 \n"  // store 8 UV pixels
      "b.gt 1b \n"
      : "+r"(src_u),    // %0
        "+r"(src_u_1),  // %1
        "+r"(src_v),    // %2
        "+r"(src_v_1),  // %3
        "+r"(dst_uv),   // %4
        "+r"(width)     // %5
      :
      : "cc", "memory", "v0", "v1", "v2", "v3");
}
3381
3382 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
3383
3384 #ifdef __cplusplus
3385 } // extern "C"
3386 } // namespace libyuv
3387 #endif
3388