1 /*
2 * Copyright 2014 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "libyuv/row.h"
12
13 #ifdef __cplusplus
14 namespace libyuv {
15 extern "C" {
16 #endif
17
18 // This module is for GCC Neon armv8 64 bit.
19 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
20
// Load one 8-pixel group of planar 4:2:2 input.
//   [%0] -> 8 Y bytes into v0.8b
//   [%1] -> 4 U bytes into v1.s[0]
//   [%2] -> 4 V bytes into v1.s[1]
// Each pointer is post-incremented past the bytes it supplied.
#define READYUV422 \
    MEMACCESS(0) \
    "ld1 {v0.8b}, [%0], #8 \n" /* 8 Y */ \
    MEMACCESS(1) \
    "ld1 {v1.s}[0], [%1], #4 \n" /* 4 U */ \
    MEMACCESS(2) \
    "ld1 {v1.s}[1], [%2], #4 \n" /* 4 V */
29
// Load one 8-pixel group of planar 4:1:1 input.
//   [%0] -> 8 Y bytes into v0.8b
//   [%1] -> 2 U bytes into v2.h[0]
//   [%2] -> 2 V bytes into v2.h[1]
// zip1 then doubles every chroma byte, leaving v1.8b as 4 U followed by 4 V,
// which is the layout YUV422TORGB expects.
#define READYUV411 \
    MEMACCESS(0) \
    "ld1 {v0.8b}, [%0], #8 \n" /* 8 Y */ \
    MEMACCESS(1) \
    "ld1 {v2.h}[0], [%1], #2 \n" /* 2 U */ \
    MEMACCESS(2) \
    "ld1 {v2.h}[1], [%2], #2 \n" /* 2 V */ \
    "zip1 v1.8b, v2.8b, v2.8b \n" /* u0 u0 u1 u1 v0 v0 v1 v1 */
39
// Load one 8-pixel group of planar 4:4:4 input.
//   [%0] -> 8 Y bytes into v0.8b
//   [%1] -> 8 U bytes into v1.d[0]
//   [%2] -> 8 V bytes into v1.d[1]
// Adjacent chroma bytes are then summed pairwise (uaddlp) and halved with
// rounding (rshrn #1), so v1.8b ends up as 4 averaged U followed by
// 4 averaged V.
#define READYUV444 \
    MEMACCESS(0) \
    "ld1 {v0.8b}, [%0], #8 \n" /* 8 Y */ \
    MEMACCESS(1) \
    "ld1 {v1.d}[0], [%1], #8 \n" /* 8 U */ \
    MEMACCESS(2) \
    "ld1 {v1.d}[1], [%2], #8 \n" /* 8 V */ \
    "uaddlp v1.8h, v1.16b \n" /* pairwise sums */ \
    "rshrn v1.8b, v1.8h, #1 \n" /* rounded average */
50
// Load 8 Y bytes from [%0] into v0.8b (post-incrementing the pointer) and
// fill all eight chroma bytes of v1.8b (4 U + 4 V) with 128.
#define READYUV400 \
    MEMACCESS(0) \
    "ld1 {v0.8b}, [%0], #8 \n" /* 8 Y */ \
    "movi v1.8b , #128 \n" /* U = V = 128 */
56
// Load one 8-pixel group of NV12 input.
//   [%0] -> 8 Y bytes into v0.8b
//   [%1] -> 4 interleaved U,V pairs into v2.8b
// The pairs are de-interleaved so that v1.8b holds 4 U followed by 4 V.
// v3 is used as scratch.
#define READNV12 \
    MEMACCESS(0) \
    "ld1 {v0.8b}, [%0], #8 \n" /* 8 Y */ \
    MEMACCESS(1) \
    "ld1 {v2.8b}, [%1], #8 \n" /* U V U V U V U V */ \
    "uzp1 v1.8b, v2.8b, v2.8b \n" /* even bytes: U */ \
    "uzp2 v3.8b, v2.8b, v2.8b \n" /* odd bytes: V */ \
    "ins v1.s[1], v3.s[0] \n" /* v1 = UUUU VVVV */
66
// Load one 8-pixel group of NV21 input.
//   [%0] -> 8 Y bytes into v0.8b
//   [%1] -> 4 interleaved V,U pairs into v2.8b
// The pairs are de-interleaved so that v1.8b holds 4 U followed by 4 V.
// v3 is used as scratch.
#define READNV21 \
    MEMACCESS(0) \
    "ld1 {v0.8b}, [%0], #8 \n" /* 8 Y */ \
    MEMACCESS(1) \
    "ld1 {v2.8b}, [%1], #8 \n" /* V U V U V U V U */ \
    "uzp1 v3.8b, v2.8b, v2.8b \n" /* even bytes: V */ \
    "uzp2 v1.8b, v2.8b, v2.8b \n" /* odd bytes: U */ \
    "ins v1.s[1], v3.s[0] \n" /* v1 = UUUU VVVV */
76
// Load 8 packed YUY2 pixels (16 bytes) from [%0].
// ld2 splits the stream into even bytes (8 Y, v0.8b) and odd bytes
// (U V U V ..., v1.8b); the chroma is then regrouped so v1.8b holds 4 U
// followed by 4 V. v3 is used as scratch.
#define READYUY2 \
    MEMACCESS(0) \
    "ld2 {v0.8b, v1.8b}, [%0], #16 \n" /* v0 = Y, v1 = UVUV */ \
    "uzp2 v3.8b, v1.8b, v1.8b \n" /* odd chroma bytes: V */ \
    "uzp1 v1.8b, v1.8b, v1.8b \n" /* even chroma bytes: U */ \
    "ins v1.s[1], v3.s[0] \n" /* v1 = UUUU VVVV */
84
// Load 8 packed UYVY pixels (16 bytes) from [%0].
// ld2 splits the stream into even bytes (U V U V ..., v2.8b) and odd bytes
// (8 Y, v3.8b). Y is copied to v0 and the chroma regrouped so v1.8b holds
// 4 U followed by 4 V. v2 and v3 are used as scratch.
#define READUYVY \
    MEMACCESS(0) \
    "ld2 {v2.8b, v3.8b}, [%0], #16 \n" /* v2 = UVUV, v3 = Y */ \
    "orr v0.8b, v3.8b, v3.8b \n" /* v0 = Y */ \
    "uzp1 v1.8b, v2.8b, v2.8b \n" /* even chroma bytes: U */ \
    "uzp2 v3.8b, v2.8b, v2.8b \n" /* odd chroma bytes: V */ \
    "ins v1.s[1], v3.s[0] \n" /* v1 = UUUU VVVV */
93
// Load the constants used by YUV422TORGB. Must run once before the loop.
//   v24, v25, v26 = B, G, R bias: the first three int16 values at
//                   [kUVBiasBGR], each broadcast to all 8 lanes
//   v31           = Y scale: the first int32 at [kYToRgb], broadcast
//   v27..v30      = 128, 102, 25, 52 (U->B, V->R, U->G, V->G multipliers)
// A single ld3r is used for the biases so that the kUVBiasBGR pointer
// register is only read: callers bind it as an input-only "r" operand, and
// the asm must not write back to such an operand.
#define YUV422TORGB_SETUP_REG \
    "ld3r {v24.8h, v25.8h, v26.8h}, [%[kUVBiasBGR]] \n" \
    "ld1r {v31.4s}, [%[kYToRgb]] \n" \
    "movi v27.8h, #128 \n" \
    "movi v28.8h, #102 \n" \
    "movi v29.8h, #25 \n" \
    "movi v30.8h, #52 \n"
103
// Convert 8 pixels from YUV to 8-bit R, G and B.
//   Inputs:  v0.8b = 8 Y
//            v1.8b = 4 U (bytes 0-3) followed by 4 V (bytes 4-7)
//            v24-v31 as loaded by YUV422TORGB_SETUP_REG
//   Outputs: vR.8b, vG.8b, vB.8b (registers named by the caller)
//   Scratch: v0, v1, v2, v3, v5, v6, v7
// Every chroma byte is duplicated so that one U/V sample covers two pixels.
// Sums are formed with saturating 16-bit arithmetic and narrowed with an
// unsigned saturating shift right by 6.
// The final line deliberately has no trailing backslash so the definition
// ends here and cannot absorb whatever follows it.
#define YUV422TORGB(vR, vG, vB) \
    "uxtl v0.8h, v0.8b \n" /* Extract Y */ \
    "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \
    "ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \
    "ushll v0.4s, v0.4h, #0 \n" \
    "mul v3.4s, v3.4s, v31.4s \n" \
    "mul v0.4s, v0.4s, v31.4s \n" \
    "sqshrun v0.4h, v0.4s, #16 \n" \
    "sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */ \
    "uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \
    "mov v2.d[0], v1.d[1] \n" /* Extract V */ \
    "uxtl v2.8h, v2.8b \n" \
    "uxtl v1.8h, v1.8b \n" /* Extract U */ \
    "mul v3.8h, v1.8h, v27.8h \n" /* U * 128 */ \
    "mul v5.8h, v1.8h, v29.8h \n" /* U * 25 */ \
    "mul v6.8h, v2.8h, v30.8h \n" /* V * 52 */ \
    "mul v7.8h, v2.8h, v28.8h \n" /* V * 102 */ \
    "sqadd v6.8h, v6.8h, v5.8h \n" /* U * 25 + V * 52 */ \
    "sqadd " #vB ".8h, v24.8h, v0.8h \n" /* B */ \
    "sqadd " #vG ".8h, v25.8h, v0.8h \n" /* G */ \
    "sqadd " #vR ".8h, v26.8h, v0.8h \n" /* R */ \
    "sqadd " #vB ".8h, " #vB ".8h, v3.8h \n" /* B */ \
    "sqsub " #vG ".8h, " #vG ".8h, v6.8h \n" /* G */ \
    "sqadd " #vR ".8h, " #vR ".8h, v7.8h \n" /* R */ \
    "sqshrun " #vB ".8b, " #vB ".8h, #6 \n" /* B */ \
    "sqshrun " #vG ".8b, " #vG ".8h, #6 \n" /* G */ \
    "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */

132 // YUV to RGB conversion constants.
133 // Y contribution to R,G,B. Scale and bias.
134 #define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
135 #define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
136
137 // U and V contributions to R,G,B.
138 #define UB -128 /* -min(128, round(2.018 * 64)) */
139 #define UG 25 /* -round(-0.391 * 64) */
140 #define VG 52 /* -round(-0.813 * 64) */
141 #define VR -102 /* -round(1.596 * 64) */
142
143 // Bias values to subtract 16 from Y and 128 from U and V.
144 #define BB (UB * 128 - YGB)
145 #define BG (UG * 128 + VG * 128 - YGB)
146 #define BR (VR * 128 - YGB)
147
148 static vec16 kUVBiasBGR = { BB, BG, BR, 0, 0, 0, 0, 0 };
149 static vec32 kYToRgb = { 0x0101 * YG, 0, 0, 0 };
150
151 #undef YG
152 #undef YGB
153 #undef UB
154 #undef UG
155 #undef VG
156 #undef VR
157 #undef BB
158 #undef BG
159 #undef BR
160
// Load the RGB -> U/V coefficients into v20-v24 (each is the halved value
// noted on its line) and the rounding/offset constant 0x8080 into every
// 16-bit lane of v25.
#define RGBTOUV_SETUP_REG \
    "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \
    "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \
    "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \
    "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \
    "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \
    "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */
168
169
#ifdef HAS_I444TOARGBROW_NEON
// Convert one row of planar I444 to ARGB.
// Each iteration reads 8 Y, 8 U and 8 V bytes (READYUV444 averages adjacent
// chroma pairs) and stores 32 bytes to dst_argb in memory order B, G, R, A
// with A = 255. The count is tested after the store, so the loop body runs
// at least once and always writes whole 8-pixel groups.
void I444ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_argb,
                        int width) {
  asm volatile (
    YUV422TORGB_SETUP_REG
  "1: \n"
    READYUV444
    YUV422TORGB(v22, v21, v20)
    "subs %w4, %w4, #8 \n"
    "movi v23.8b, #255 \n" /* A */
    MEMACCESS(3)
    "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
    "b.gt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_argb),  // %3
      "+r"(width)      // %4
    : [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
    // v31 is written by YUV422TORGB_SETUP_REG, so it is listed as well.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
#endif  // HAS_I444TOARGBROW_NEON
198
#ifdef HAS_I422TOARGBROW_NEON
// Convert one row of planar I422 to ARGB.
// Each iteration reads 8 Y, 4 U and 4 V bytes and stores 32 bytes to
// dst_argb in memory order B, G, R, A with A = 255. The count is tested
// after the store, so the loop body runs at least once and always writes
// whole 8-pixel groups.
void I422ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_argb,
                        int width) {
  asm volatile (
    YUV422TORGB_SETUP_REG
  "1: \n"
    READYUV422
    YUV422TORGB(v22, v21, v20)
    "subs %w4, %w4, #8 \n"
    "movi v23.8b, #255 \n" /* A */
    MEMACCESS(3)
    "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
    "b.gt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_argb),  // %3
      "+r"(width)      // %4
    : [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
    // v31 is written by YUV422TORGB_SETUP_REG, so it is listed as well.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
#endif  // HAS_I422TOARGBROW_NEON
227
#ifdef HAS_I411TOARGBROW_NEON
// Convert one row of planar I411 to ARGB.
// Each iteration reads 8 Y, 2 U and 2 V bytes and stores 32 bytes to
// dst_argb in memory order B, G, R, A with A = 255. The count is tested
// after the store, so the loop body runs at least once and always writes
// whole 8-pixel groups.
void I411ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_argb,
                        int width) {
  asm volatile (
    YUV422TORGB_SETUP_REG
  "1: \n"
    READYUV411
    YUV422TORGB(v22, v21, v20)
    "subs %w4, %w4, #8 \n"
    "movi v23.8b, #255 \n" /* A */
    MEMACCESS(3)
    "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
    "b.gt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_argb),  // %3
      "+r"(width)      // %4
    : [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
    // v31 is written by YUV422TORGB_SETUP_REG, so it is listed as well.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
#endif  // HAS_I411TOARGBROW_NEON
256
#ifdef HAS_I422TOBGRAROW_NEON
// Convert one row of planar I422 to BGRA.
// Each iteration reads 8 Y, 4 U and 4 V bytes and stores 32 bytes to
// dst_bgra in memory order A, R, G, B with A = 255. The count is tested
// after the store, so the loop body runs at least once and always writes
// whole 8-pixel groups.
void I422ToBGRARow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_bgra,
                        int width) {
  asm volatile (
    YUV422TORGB_SETUP_REG
  "1: \n"
    READYUV422
    YUV422TORGB(v21, v22, v23)
    "subs %w4, %w4, #8 \n"
    "movi v20.8b, #255 \n" /* A */
    MEMACCESS(3)
    "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
    "b.gt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_bgra),  // %3
      "+r"(width)      // %4
    : [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
    // v31 is written by YUV422TORGB_SETUP_REG, so it is listed as well.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
#endif  // HAS_I422TOBGRAROW_NEON
285
#ifdef HAS_I422TOABGRROW_NEON
// Convert one row of planar I422 to ABGR.
// Each iteration reads 8 Y, 4 U and 4 V bytes and stores 32 bytes to
// dst_abgr in memory order R, G, B, A with A = 255. The count is tested
// after the store, so the loop body runs at least once and always writes
// whole 8-pixel groups.
void I422ToABGRRow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_abgr,
                        int width) {
  asm volatile (
    YUV422TORGB_SETUP_REG
  "1: \n"
    READYUV422
    YUV422TORGB(v20, v21, v22)
    "subs %w4, %w4, #8 \n"
    "movi v23.8b, #255 \n" /* A */
    MEMACCESS(3)
    "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
    "b.gt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_abgr),  // %3
      "+r"(width)      // %4
    : [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
    // v31 is written by YUV422TORGB_SETUP_REG, so it is listed as well.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
#endif  // HAS_I422TOABGRROW_NEON
314
#ifdef HAS_I422TORGBAROW_NEON
// Convert one row of planar I422 to RGBA.
// Each iteration reads 8 Y, 4 U and 4 V bytes and stores 32 bytes to
// dst_rgba in memory order A, B, G, R with A = 255. The count is tested
// after the store, so the loop body runs at least once and always writes
// whole 8-pixel groups.
void I422ToRGBARow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_rgba,
                        int width) {
  asm volatile (
    YUV422TORGB_SETUP_REG
  "1: \n"
    READYUV422
    YUV422TORGB(v23, v22, v21)
    "subs %w4, %w4, #8 \n"
    "movi v20.8b, #255 \n" /* A */
    MEMACCESS(3)
    "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
    "b.gt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_rgba),  // %3
      "+r"(width)      // %4
    : [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
    // v31 is written by YUV422TORGB_SETUP_REG, so it is listed as well.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
#endif  // HAS_I422TORGBAROW_NEON
343
#ifdef HAS_I422TORGB24ROW_NEON
// Convert one row of planar I422 to RGB24.
// Each iteration reads 8 Y, 4 U and 4 V bytes and stores 24 bytes to
// dst_rgb24 in memory order B, G, R. The count is tested after the store,
// so the loop body runs at least once and always writes whole 8-pixel
// groups.
void I422ToRGB24Row_NEON(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
                         uint8* dst_rgb24,
                         int width) {
  asm volatile (
    YUV422TORGB_SETUP_REG
  "1: \n"
    READYUV422
    YUV422TORGB(v22, v21, v20)
    "subs %w4, %w4, #8 \n"
    MEMACCESS(3)
    "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
    "b.gt 1b \n"
    : "+r"(src_y),      // %0
      "+r"(src_u),      // %1
      "+r"(src_v),      // %2
      "+r"(dst_rgb24),  // %3
      "+r"(width)       // %4
    : [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
    // v31 is written by YUV422TORGB_SETUP_REG, so it is listed as well.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
#endif  // HAS_I422TORGB24ROW_NEON
371
#ifdef HAS_I422TORAWROW_NEON
// Convert one row of planar I422 to RAW.
// Each iteration reads 8 Y, 4 U and 4 V bytes and stores 24 bytes to
// dst_raw in memory order R, G, B. The count is tested after the store, so
// the loop body runs at least once and always writes whole 8-pixel groups.
void I422ToRAWRow_NEON(const uint8* src_y,
                       const uint8* src_u,
                       const uint8* src_v,
                       uint8* dst_raw,
                       int width) {
  asm volatile (
    YUV422TORGB_SETUP_REG
  "1: \n"
    READYUV422
    YUV422TORGB(v20, v21, v22)
    "subs %w4, %w4, #8 \n"
    MEMACCESS(3)
    "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
    "b.gt 1b \n"
    : "+r"(src_y),    // %0
      "+r"(src_u),    // %1
      "+r"(src_v),    // %2
      "+r"(dst_raw),  // %3
      "+r"(width)     // %4
    : [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
    // v31 is written by YUV422TORGB_SETUP_REG, so it is listed as well.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
#endif  // HAS_I422TORAWROW_NEON
399
// Pack 8 pixels into RGB565.
//   Inputs: v20.8b = B, v21.8b = G, v22.8b = R
//   Output: v0.8h, one 16-bit RRRRRGGG GGGBBBBB value per pixel
// v20 and v21 are overwritten with their widened, left-shifted copies.
#define ARGBTORGB565 \
    "shll v0.8h, v22.8b, #8 \n" /* R in the top byte */ \
    "shll v20.8h, v20.8b, #8 \n" /* B in the top byte */ \
    "shll v21.8h, v21.8b, #8 \n" /* G in the top byte */ \
    "sri v0.8h, v21.8h, #5 \n" /* insert G below the 5 R bits */ \
    "sri v0.8h, v20.8h, #11 \n" /* insert B below R and G */
406
#ifdef HAS_I422TORGB565ROW_NEON
// Convert one row of planar I422 to RGB565.
// Each iteration reads 8 Y, 4 U and 4 V bytes and stores 8 packed 16-bit
// pixels (16 bytes) to dst_rgb565. The count is tested after the store, so
// the loop body runs at least once and always writes whole 8-pixel groups.
void I422ToRGB565Row_NEON(const uint8* src_y,
                          const uint8* src_u,
                          const uint8* src_v,
                          uint8* dst_rgb565,
                          int width) {
  asm volatile (
    YUV422TORGB_SETUP_REG
  "1: \n"
    READYUV422
    YUV422TORGB(v22, v21, v20)
    "subs %w4, %w4, #8 \n"
    ARGBTORGB565
    MEMACCESS(3)
    "st1 {v0.8h}, [%3], #16 \n"  // store 8 pixels RGB565.
    "b.gt 1b \n"
    : "+r"(src_y),       // %0
      "+r"(src_u),       // %1
      "+r"(src_v),       // %2
      "+r"(dst_rgb565),  // %3
      "+r"(width)        // %4
    : [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
    // v31 is written by YUV422TORGB_SETUP_REG, so it is listed as well.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
#endif  // HAS_I422TORGB565ROW_NEON
435
// Pack 8 pixels into ARGB1555.
//   Inputs: v20.8b = B, v21.8b = G, v22.8b = R, v23.8b = A
//   Output: v0.8h, one 16-bit ARRRRRGG GGGBBBBB value per pixel
// v20, v21 and v22 are overwritten with their widened, left-shifted copies.
#define ARGBTOARGB1555 \
    "shll v0.8h, v23.8b, #8 \n" /* A in the top byte */ \
    "shll v22.8h, v22.8b, #8 \n" /* R in the top byte */ \
    "shll v20.8h, v20.8b, #8 \n" /* B in the top byte */ \
    "shll v21.8h, v21.8b, #8 \n" /* G in the top byte */ \
    "sri v0.8h, v22.8h, #1 \n" /* AR */ \
    "sri v0.8h, v21.8h, #6 \n" /* ARG */ \
    "sri v0.8h, v20.8h, #11 \n" /* ARGB */
444
#ifdef HAS_I422TOARGB1555ROW_NEON
// Convert one row of planar I422 to ARGB1555.
// Each iteration reads 8 Y, 4 U and 4 V bytes and stores 8 packed 16-bit
// pixels (16 bytes) to dst_argb1555; alpha is fed in as 255, so the alpha
// bit is always set. The count is tested after the store, so the loop body
// runs at least once and always writes whole 8-pixel groups.
void I422ToARGB1555Row_NEON(const uint8* src_y,
                            const uint8* src_u,
                            const uint8* src_v,
                            uint8* dst_argb1555,
                            int width) {
  asm volatile (
    YUV422TORGB_SETUP_REG
  "1: \n"
    READYUV422
    YUV422TORGB(v22, v21, v20)
    "subs %w4, %w4, #8 \n"
    "movi v23.8b, #255 \n" /* A */
    ARGBTOARGB1555
    MEMACCESS(3)
    "st1 {v0.8h}, [%3], #16 \n"  // store 8 pixels ARGB1555.
    "b.gt 1b \n"
    : "+r"(src_y),         // %0
      "+r"(src_u),         // %1
      "+r"(src_v),         // %2
      "+r"(dst_argb1555),  // %3
      "+r"(width)          // %4
    : [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
    // v31 is written by YUV422TORGB_SETUP_REG, so it is listed as well.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
#endif  // HAS_I422TOARGB1555ROW_NEON
474
// Pack 8 pixels into ARGB4444.
//   Inputs: v20.8b = B, v21.8b = G, v22.8b = R, v23.8b = A,
//           v4.8b = 0x0f in every byte (mask of bits to clear)
//   Output: v0.16b, per pixel one byte GGGGBBBB followed by one byte
//           AAAARRRR
// v1 and v20-v23 are overwritten.
#define ARGBTOARGB4444 \
    "ushr v20.8b, v20.8b, #4 \n" /* B: 0000BBBB */ \
    "bic v21.8b, v21.8b, v4.8b \n" /* G: GGGG0000 */ \
    "ushr v22.8b, v22.8b, #4 \n" /* R: 0000RRRR */ \
    "bic v23.8b, v23.8b, v4.8b \n" /* A: AAAA0000 */ \
    "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \
    "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \
    "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */
484
#ifdef HAS_I422TOARGB4444ROW_NEON
// Convert one row of planar I422 to ARGB4444.
// Each iteration reads 8 Y, 4 U and 4 V bytes and stores 8 packed 16-bit
// pixels (16 bytes) to dst_argb4444, with alpha fed in as 255. The count is
// tested after the store, so the loop body runs at least once and always
// writes whole 8-pixel groups.
void I422ToARGB4444Row_NEON(const uint8* src_y,
                            const uint8* src_u,
                            const uint8* src_v,
                            uint8* dst_argb4444,
                            int width) {
  asm volatile (
    YUV422TORGB_SETUP_REG
    "movi v4.16b, #0x0f \n"  // bits to clear with bic.
  "1: \n"
    READYUV422
    YUV422TORGB(v22, v21, v20)
    "subs %w4, %w4, #8 \n"
    "movi v23.8b, #255 \n" /* A */
    ARGBTOARGB4444
    MEMACCESS(3)
    "st1 {v0.8h}, [%3], #16 \n"  // store 8 pixels ARGB4444.
    "b.gt 1b \n"
    : "+r"(src_y),         // %0
      "+r"(src_u),         // %1
      "+r"(src_v),         // %2
      "+r"(dst_argb4444),  // %3
      "+r"(width)          // %4
    : [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
    // v31 is written by YUV422TORGB_SETUP_REG, so it is listed as well.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
#endif  // HAS_I422TOARGB4444ROW_NEON
515
#ifdef HAS_I400TOARGBROW_NEON
// Convert one row of Y-only (I400) input to ARGB using the same YUV -> RGB
// math as the colour formats, with U and V fixed at 128.
// Each iteration reads 8 Y bytes and stores 32 bytes to dst_argb in memory
// order B, G, R, A with A = 255. The count is tested after the store, so
// the loop body runs at least once and always writes whole 8-pixel groups.
void I400ToARGBRow_NEON(const uint8* src_y,
                        uint8* dst_argb,
                        int width) {
  asm volatile (
    YUV422TORGB_SETUP_REG
  "1: \n"
    READYUV400
    YUV422TORGB(v22, v21, v20)
    "subs %w2, %w2, #8 \n"
    "movi v23.8b, #255 \n" /* A */
    MEMACCESS(1)
    "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
    "b.gt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2 (only the 32-bit view %w2 is used)
    : [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
    // v31 is written by YUV422TORGB_SETUP_REG, so it is listed as well.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
#endif  // HAS_I400TOARGBROW_NEON
541
#ifdef HAS_J400TOARGBROW_NEON
// Expand one row of 8-bit grey (J400) to ARGB without any scaling: every
// source byte is copied to B, G and R, and A is set to 255.
// Each iteration reads 8 bytes and stores 32. The count is tested after
// the store, so the loop body runs at least once and always writes whole
// 8-pixel groups.
void J400ToARGBRow_NEON(const uint8* src_y,
                        uint8* dst_argb,
                        int width) {
  asm volatile (
    "movi v23.8b, #255 \n"  // A, constant for the whole row
  "1: \n"
    MEMACCESS(0)
    "ld1 {v20.8b}, [%0], #8 \n"  // B = Y
    "mov v21.8b, v20.8b \n"      // G = Y
    "mov v22.8b, v20.8b \n"      // R = Y
    "subs %w2, %w2, #8 \n"       // 8 pixels per iteration
    MEMACCESS(1)
    "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
    "b.gt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2
    :
    : "cc", "memory", "v20", "v21", "v22", "v23"
  );
}
#endif  // HAS_J400TOARGBROW_NEON
565
#ifdef HAS_NV12TOARGBROW_NEON
// Convert one row of NV12 (Y plane plus interleaved U,V) to ARGB.
// Each iteration reads 8 Y bytes and 4 UV pairs and stores 32 bytes to
// dst_argb in memory order B, G, R, A with A = 255. The count is tested
// after the store, so the loop body runs at least once and always writes
// whole 8-pixel groups.
void NV12ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_uv,
                        uint8* dst_argb,
                        int width) {
  asm volatile (
    YUV422TORGB_SETUP_REG
  "1: \n"
    READNV12
    YUV422TORGB(v22, v21, v20)
    "subs %w3, %w3, #8 \n"
    "movi v23.8b, #255 \n" /* A */
    MEMACCESS(2)
    "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
    "b.gt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_uv),    // %1
      "+r"(dst_argb),  // %2
      "+r"(width)      // %3
    : [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
    // v31 is written by YUV422TORGB_SETUP_REG, so it is listed as well.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
#endif  // HAS_NV12TOARGBROW_NEON
592
#ifdef HAS_NV21TOARGBROW_NEON
// Convert one row of NV21 (Y plane plus interleaved V,U) to ARGB.
// Each iteration reads 8 Y bytes and 4 VU pairs and stores 32 bytes to
// dst_argb in memory order B, G, R, A with A = 255. The count is tested
// after the store, so the loop body runs at least once and always writes
// whole 8-pixel groups.
void NV21ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_uv,
                        uint8* dst_argb,
                        int width) {
  asm volatile (
    YUV422TORGB_SETUP_REG
  "1: \n"
    READNV21
    YUV422TORGB(v22, v21, v20)
    "subs %w3, %w3, #8 \n"
    "movi v23.8b, #255 \n" /* A */
    MEMACCESS(2)
    "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
    "b.gt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_uv),    // %1
      "+r"(dst_argb),  // %2
      "+r"(width)      // %3
    : [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
    // v31 is written by YUV422TORGB_SETUP_REG, so it is listed as well.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
#endif  // HAS_NV21TOARGBROW_NEON
619
#ifdef HAS_NV12TORGB565ROW_NEON
// Convert one row of NV12 (Y plane plus interleaved U,V) to RGB565.
// Each iteration reads 8 Y bytes and 4 UV pairs and stores 8 packed 16-bit
// pixels (16 bytes) to dst_rgb565. The count is tested after the store, so
// the loop body runs at least once and always writes whole 8-pixel groups.
void NV12ToRGB565Row_NEON(const uint8* src_y,
                          const uint8* src_uv,
                          uint8* dst_rgb565,
                          int width) {
  asm volatile (
    YUV422TORGB_SETUP_REG
  "1: \n"
    READNV12
    YUV422TORGB(v22, v21, v20)
    "subs %w3, %w3, #8 \n"
    ARGBTORGB565
    MEMACCESS(2)
    "st1 {v0.8h}, [%2], #16 \n"  // store 8 pixels RGB565.
    "b.gt 1b \n"
    : "+r"(src_y),       // %0
      "+r"(src_uv),      // %1
      "+r"(dst_rgb565),  // %2
      "+r"(width)        // %3
    : [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
    // v31 is written by YUV422TORGB_SETUP_REG, so it is listed as well.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
#endif  // HAS_NV12TORGB565ROW_NEON
646
#ifdef HAS_NV21TORGB565ROW_NEON
// Convert one row of NV21 (Y plane plus interleaved V,U) to RGB565.
// Each iteration reads 8 Y bytes and 4 VU pairs and stores 8 packed 16-bit
// pixels (16 bytes) to dst_rgb565. The count is tested after the store, so
// the loop body runs at least once and always writes whole 8-pixel groups.
void NV21ToRGB565Row_NEON(const uint8* src_y,
                          const uint8* src_uv,
                          uint8* dst_rgb565,
                          int width) {
  asm volatile (
    YUV422TORGB_SETUP_REG
  "1: \n"
    READNV21
    YUV422TORGB(v22, v21, v20)
    "subs %w3, %w3, #8 \n"
    ARGBTORGB565
    MEMACCESS(2)
    "st1 {v0.8h}, [%2], #16 \n"  // store 8 pixels RGB565.
    "b.gt 1b \n"
    : "+r"(src_y),       // %0
      "+r"(src_uv),      // %1
      "+r"(dst_rgb565),  // %2
      "+r"(width)        // %3
    : [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
    // v31 is written by YUV422TORGB_SETUP_REG, so it is listed as well.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
#endif  // HAS_NV21TORGB565ROW_NEON
673
#ifdef HAS_YUY2TOARGBROW_NEON
// Convert one row of packed YUY2 to ARGB.
// Each iteration reads 16 source bytes (8 pixels) and stores 32 bytes to
// dst_argb in memory order B, G, R, A with A = 255. The count is tested
// after the store, so the loop body runs at least once and always writes
// whole 8-pixel groups.
void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
                        uint8* dst_argb,
                        int width) {
  asm volatile (
    YUV422TORGB_SETUP_REG
  "1: \n"
    READYUY2
    YUV422TORGB(v22, v21, v20)
    "subs %w2, %w2, #8 \n"
    "movi v23.8b, #255 \n" /* A */
    MEMACCESS(1)
    "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
    "b.gt 1b \n"
    : "+r"(src_yuy2),  // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2 (only the 32-bit view %w2 is used)
    : [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
    // v31 is written by YUV422TORGB_SETUP_REG, so it is listed as well.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
#endif  // HAS_YUY2TOARGBROW_NEON
699
#ifdef HAS_UYVYTOARGBROW_NEON
// Convert one row of packed UYVY to ARGB.
// Each iteration reads 16 source bytes (8 pixels) and stores 32 bytes to
// dst_argb in memory order B, G, R, A with A = 255. The count is tested
// after the store, so the loop body runs at least once and always writes
// whole 8-pixel groups.
void UYVYToARGBRow_NEON(const uint8* src_uyvy,
                        uint8* dst_argb,
                        int width) {
  asm volatile (
    YUV422TORGB_SETUP_REG
  "1: \n"
    READUYVY
    YUV422TORGB(v22, v21, v20)
    "subs %w2, %w2, #8 \n"
    "movi v23.8b, #255 \n" /* A */
    MEMACCESS(1)
    "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
    "b.gt 1b \n"
    : "+r"(src_uyvy),  // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2 (only the 32-bit view %w2 is used)
    : [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
    // v31 is written by YUV422TORGB_SETUP_REG, so it is listed as well.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
#endif  // HAS_UYVYTOARGBROW_NEON
725
#ifdef HAS_SPLITUVROW_NEON
// De-interleave a UV row: per iteration, 16 byte pairs are read from src_uv,
// the first byte of each pair goes to dst_u and the second to dst_v.
// The count is tested after the stores, so the loop body runs at least once
// and always handles whole groups of 16 pairs.
void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                     int width) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "ld2 {v0.16b,v1.16b}, [%0], #32 \n"  // v0 = 16 U, v1 = 16 V
    "subs %w3, %w3, #16 \n"              // 16 pairs done
    MEMACCESS(1)
    "st1 {v0.16b}, [%1], #16 \n"         // write U
    MEMACCESS(2)
    "st1 {v1.16b}, [%2], #16 \n"         // write V
    "b.gt 1b \n"
    : "+r"(src_uv),  // %0
      "+r"(dst_u),   // %1
      "+r"(dst_v),   // %2
      "+r"(width)    // %3
    :
    : "cc", "memory", "v0", "v1"
  );
}
#endif  // HAS_SPLITUVROW_NEON
749
#ifdef HAS_MERGEUVROW_NEON
// Interleave separate U and V rows: per iteration, 16 bytes are read from
// each of src_u and src_v and written to dst_uv as 16 U,V byte pairs.
// The count is tested after the store, so the loop body runs at least once
// and always handles whole groups of 16 pairs.
void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                     int width) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n"         // 16 U
    MEMACCESS(1)
    "ld1 {v1.16b}, [%1], #16 \n"         // 16 V
    "subs %w3, %w3, #16 \n"              // 16 pairs done
    MEMACCESS(2)
    "st2 {v0.16b,v1.16b}, [%2], #32 \n"  // write U0 V0 U1 V1 ...
    "b.gt 1b \n"
    : "+r"(src_u),   // %0
      "+r"(src_v),   // %1
      "+r"(dst_uv),  // %2
      "+r"(width)    // %3
    :
    : "cc", "memory", "v0", "v1"
  );
}
#endif  // HAS_MERGEUVROW_NEON
774
#ifdef HAS_COPYROW_NEON
// Copy bytes from src to dst, 32 per iteration, using four 8-byte vector
// registers. The count is tested after the store, so the loop body runs at
// least once and always copies whole 32-byte groups.
void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // read 32 bytes
    "subs %w2, %w2, #32 \n"                        // 32 bytes done
    MEMACCESS(1)
    "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // write 32 bytes
    "b.gt 1b \n"
    : "+r"(src),   // %0
      "+r"(dst),   // %1
      "+r"(count)  // %2
    :
    : "cc", "memory", "v0", "v1", "v2", "v3"
  );
}
#endif  // HAS_COPYROW_NEON
794
795 // SetRow writes 'count' bytes using an 8 bit value repeated.
SetRow_NEON(uint8 * dst,uint8 v8,int count)796 void SetRow_NEON(uint8* dst, uint8 v8, int count) {
797 asm volatile (
798 "dup v0.16b, %w2 \n" // duplicate 16 bytes
799 "1: \n"
800 "subs %w1, %w1, #16 \n" // 16 bytes per loop
801 MEMACCESS(0)
802 "st1 {v0.16b}, [%0], #16 \n" // store
803 "b.gt 1b \n"
804 : "+r"(dst), // %0
805 "+r"(count) // %1
806 : "r"(v8) // %2
807 : "cc", "memory", "v0"
808 );
809 }
810
ARGBSetRow_NEON(uint8 * dst,uint32 v32,int count)811 void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
812 asm volatile (
813 "dup v0.4s, %w2 \n" // duplicate 4 ints
814 "1: \n"
815 "subs %w1, %w1, #4 \n" // 4 ints per loop
816 MEMACCESS(0)
817 "st1 {v0.16b}, [%0], #16 \n" // store
818 "b.gt 1b \n"
819 : "+r"(dst), // %0
820 "+r"(count) // %1
821 : "r"(v32) // %2
822 : "cc", "memory", "v0"
823 );
824 }
825
#ifdef HAS_MIRRORROW_NEON
// Write the bytes of src to dst in reverse order, 16 per iteration.
// The source pointer is first moved to the last 16 bytes of the row and
// then stepped backwards by the -16 held in %3, while dst advances. The
// width is widened to 64 bits because it is added to the source pointer.
// The count is tested after the stores, so the loop body runs at least
// once and always handles whole 16-byte groups.
void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
  int64 width64 = (int64) width;
  asm volatile (
    // Point at the final 16 bytes of the source row.
    "add %0, %0, %2 \n"
    "sub %0, %0, #16 \n"

  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], %3 \n"    // read 16 bytes, then src -= 16
    "subs %2, %2, #16 \n"          // 16 bytes done
    "rev64 v0.16b, v0.16b \n"      // reverse bytes inside each 64-bit half
    MEMACCESS(1)
    "st1 {v0.D}[1], [%1], #8 \n"   // upper half first ...
    MEMACCESS(1)
    "st1 {v0.D}[0], [%1], #8 \n"   // ... then lower half; dst += 16 overall
    "b.gt 1b \n"
    : "+r"(src),           // %0
      "+r"(dst),           // %1
      "+r"(width64)        // %2
    : "r"((ptrdiff_t)-16)  // %3
    : "cc", "memory", "v0"
  );
}
#endif  // HAS_MIRRORROW_NEON
852
#ifdef HAS_MIRRORUVROW_NEON
// Reverse an interleaved UV row while splitting it: per iteration, 8 byte
// pairs are read from the end of src_uv moving backwards, and the first and
// second bytes of the pairs are written, reversed, to dst_u and dst_v.
// The width is widened to 64 bits because it is scaled and added to the
// source pointer. The count is tested after the stores, so the loop body
// runs at least once and always handles whole groups of 8 pairs.
void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                      int width) {
  int64 width64 = (int64) width;
  asm volatile (
    // Point at the final 16 bytes (8 pairs) of the source row.
    "add %0, %0, %3, lsl #1 \n"
    "sub %0, %0, #16 \n"

  "1: \n"
    MEMACCESS(0)
    "ld2 {v0.8b, v1.8b}, [%0], %4 \n"  // read 8 pairs, then src -= 16
    "subs %3, %3, #8 \n"               // 8 pairs done
    "rev64 v0.8b, v0.8b \n"            // reverse the 8 first bytes
    "rev64 v1.8b, v1.8b \n"            // reverse the 8 second bytes
    MEMACCESS(1)
    "st1 {v0.8b}, [%1], #8 \n"         // dst_u += 8
    MEMACCESS(2)
    "st1 {v1.8b}, [%2], #8 \n"         // dst_v += 8
    "b.gt 1b \n"
    : "+r"(src_uv),        // %0
      "+r"(dst_u),         // %1
      "+r"(dst_v),         // %2
      "+r"(width64)        // %3
    : "r"((ptrdiff_t)-16)  // %4
    : "cc", "memory", "v0", "v1"
  );
}
#endif  // HAS_MIRRORUVROW_NEON
882
#ifdef HAS_ARGBMIRRORROW_NEON
// Write the 4-byte pixels of src to dst in reverse pixel order, 4 pixels
// (16 bytes) per iteration; the bytes inside each pixel keep their order.
// The width is widened to 64 bits because it is scaled and added to the
// source pointer. The count is tested after the stores, so the loop body
// runs at least once and always handles whole groups of 4 pixels.
void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
  int64 width64 = (int64) width;
  asm volatile (
    // Point at the final 16 bytes (4 pixels) of the source row.
    "add %0, %0, %2, lsl #2 \n"
    "sub %0, %0, #16 \n"

  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], %3 \n"    // read 4 pixels, then src -= 16
    "subs %2, %2, #4 \n"           // 4 pixels done
    "rev64 v0.4s, v0.4s \n"        // swap the two pixels in each 64-bit half
    MEMACCESS(1)
    "st1 {v0.D}[1], [%1], #8 \n"   // upper half first ...
    MEMACCESS(1)
    "st1 {v0.D}[0], [%1], #8 \n"   // ... then lower half; dst += 16 overall
    "b.gt 1b \n"
    : "+r"(src),           // %0
      "+r"(dst),           // %1
      "+r"(width64)        // %2
    : "r"((ptrdiff_t)-16)  // %3
    : "cc", "memory", "v0"
  );
}
#endif  // HAS_ARGBMIRRORROW_NEON
909
#ifdef HAS_RGB24TOARGBROW_NEON
// Expand RGB24 to ARGB: per iteration, 8 three-byte pixels are read and
// written back as 8 four-byte pixels with the same three bytes followed by
// a fourth byte of 255. The count is tested after the store, so the loop
// body runs at least once and always handles whole 8-pixel groups.
void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  asm volatile (
    "movi v4.8b, #255 \n"  // A, constant for the whole row
  "1: \n"
    MEMACCESS(0)
    "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n"        // read 8 RGB24 pixels
    "subs %w2, %w2, #8 \n"                         // 8 pixels done
    MEMACCESS(1)
    "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n"  // write 8 ARGB pixels
    "b.gt 1b \n"
    : "+r"(src_rgb24),  // %0
      "+r"(dst_argb),   // %1
      "+r"(pix)         // %2
    :
    : "cc", "memory", "v1", "v2", "v3", "v4"
  );
}
#endif  // HAS_RGB24TOARGBROW_NEON
929
#ifdef HAS_RAWTOARGBROW_NEON
// Expand RAW to ARGB: per iteration, 8 three-byte pixels are read, the
// first and third bytes of each pixel are swapped, and a fourth byte of 255
// is appended. The count is tested after the store, so the loop body runs
// at least once and always handles whole 8-pixel groups.
void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
  asm volatile (
    "movi v5.8b, #255 \n"  // A, constant for the whole row
  "1: \n"
    MEMACCESS(0)
    "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"        // v0 = R, v1 = G, v2 = B
    "subs %w2, %w2, #8 \n"                         // 8 pixels done
    "mov v3.8b, v1.8b \n"                          // G into store slot 2
    "mov v4.8b, v0.8b \n"                          // R into store slot 3
    MEMACCESS(1)
    "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n"  // write B, G, R, A
    "b.gt 1b \n"
    : "+r"(src_raw),   // %0
      "+r"(dst_argb),  // %1
      "+r"(pix)        // %2
    :
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"
  );
}
#endif  // HAS_RAWTOARGBROW_NEON
951
// Unpack 8 RGB565 pixels to 8 bits per channel.
//   Input:   v0.8h = 8 pixels, each RRRRRGGG GGGBBBBB
//   Outputs: v0.8b = B, v1.8b = G, v2.8b = R
//   Scratch: v4, v6
// Each channel is widened by copying its top bits into the vacated low
// bits. The caller supplies alpha.
#define RGB565TOARGB \
    "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \
    "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \
    "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \
    "orr v1.8b, v4.8b, v6.8b \n" /* G */ \
    "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
    "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \
    "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \
    "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \
    "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \
    "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \
    "dup v2.2D, v0.D[1] \n" /* R */
964
#ifdef HAS_RGB565TOARGBROW_NEON
// Convert RGB565 to ARGB: per iteration, 8 16-bit pixels are read and 32
// bytes are written in memory order B, G, R, A with A = 255. The count is
// tested after the store, so the loop body runs at least once and always
// handles whole 8-pixel groups.
void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
  asm volatile (
    "movi v3.8b, #255 \n"  // A, constant for the whole row
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n"                   // read 8 RGB565 pixels
    "subs %w2, %w2, #8 \n"                         // 8 pixels done
    RGB565TOARGB
    MEMACCESS(1)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // write 8 ARGB pixels
    "b.gt 1b \n"
    : "+r"(src_rgb565),  // %0
      "+r"(dst_argb),    // %1
      "+r"(pix)          // %2
    :
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6"
  );
}
#endif  // HAS_RGB565TOARGBROW_NEON
985
// Unpack 8 ARGB1555 pixels to 8 bits per channel.
//   Input:   v0.8h = 8 pixels, each ARRRRRGG GGGBBBBB
//   Outputs: v0.8b = B, v1.8b = G, v2.8b = R, v3.8b = A
// The 5-bit channels are widened by copying their top bits into the vacated
// low bits. Alpha is the sign-extended top bit, i.e. 0 or 255. v3 is
// written before it is read, so its previous contents do not matter.
#define ARGB1555TOARGB \
    "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
    "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
    "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \
    \
    "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \
    "xtn2 v3.16b, v2.8h \n" \
    \
    "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
    "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
    \
    "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \
    "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
    "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
    \
    "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
    "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \
    "dup v1.2D, v0.D[1] \n" /* G */ \
    "dup v3.2D, v2.D[1] \n" /* A */
1005
// RGB555TOARGB is the same as ARGB1555TOARGB but ignores the alpha bit.
// Input: 8 pixels as 16-bit lanes in v0. On exit the low 8 bytes of
// v0/v1/v2 hold B/G/R. v3 is used as scratch, so it does not contain a usable
// alpha value afterwards.
// The final line deliberately has no trailing backslash: the definition must
// end here rather than rely on a following blank line to terminate it.
#define RGB555TOARGB \
  "ushr v2.8h, v0.8h, #10 \n"     /* R xxxRRRRR                          */ \
  "shl v2.8h, v2.8h, #3 \n"       /* R RRRRR000 upper 5                  */ \
  "xtn v3.8b, v2.8h \n"           /* RRRRR000                            */ \
  "xtn v2.8b, v0.8h \n"           /* B xxxBBBBB                          */ \
  "shrn2 v2.16b,v0.8h, #5 \n"     /* G xxxGGGGG                          */ \
  "ushr v1.16b, v3.16b, #5 \n"    /* R 00000RRR lower 3                  */ \
  "shl v0.16b, v2.16b, #3 \n"     /* B,G BBBBB000 upper 5                */ \
  "ushr v2.16b, v0.16b, #5 \n"    /* B,G 00000BBB lower 3                */ \
  "orr v0.16b, v0.16b, v2.16b \n" /* B (low half), G (high half)         */ \
  "orr v2.16b, v1.16b, v3.16b \n" /* R                                   */ \
  "dup v1.2D, v0.D[1] \n"         /* G copied down from v0 high half     */
1022
#ifdef HAS_ARGB1555TOARGBROW_NEON
// Converts a row of ARGB1555 pixels to ARGB.
// Each pass consumes 16 source bytes (8 pixels) and emits 32 destination
// bytes. The count is tested after the work is done, so one pass always runs;
// callers presumably supply a positive multiple of 8 (not checked here).
//
// Alpha comes from the source pixel: ARGB1555TOARGB writes v3 before reading
// it, so no constant alpha needs to be preloaded ahead of the loop.
void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
                            int pix) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n"  // load 8 ARGB1555 pixels.
    "subs %w2, %w2, #8 \n"  // 8 processed per loop.
    ARGB1555TOARGB
    MEMACCESS(1)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
    "b.gt 1b \n"
  : "+r"(src_argb1555),  // %0
    "+r"(dst_argb),      // %1
    "+r"(pix)            // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
#endif  // HAS_ARGB1555TOARGBROW_NEON
1044
// Unpacks 8 ARGB4444 pixels held as 16-bit lanes in v0 into 8-bit channels.
// On exit the low 8 bytes of v0/v1/v2/v3 hold B/G/R/A. Each nibble is
// duplicated into both halves of its output byte.
#define ARGB4444TOARGB \
  "shrn v1.8b, v0.8h, #8 \n"      /* v1 low half  = high bytes (A,R)     */ \
  "xtn2 v1.16b, v0.8h \n"         /* v1 high half = low bytes (G,B)      */ \
  "shl v2.16b, v1.16b, #4 \n"     /* low nibbles up: R (low), B (high)   */ \
  "ushr v3.16b, v1.16b, #4 \n"    /* high nibbles down: A (low), G (high)*/ \
  "ushr v0.16b, v2.16b, #4 \n"    /* R|B back in the low nibble          */ \
  "shl v1.16b, v3.16b, #4 \n"     /* A|G in the high nibble              */ \
  "orr v2.16b, v0.16b, v2.16b \n" /* v2 = R (low), B (high), full bytes  */ \
  "orr v3.16b, v1.16b, v3.16b \n" /* v3 = A (low), G (high), full bytes  */ \
  "dup v0.2D, v2.D[1] \n"         /* v0 = B copied from v2 high half     */ \
  "dup v1.2D, v3.D[1] \n"         /* v1 = G copied from v3 high half     */
1056
#ifdef HAS_ARGB4444TOARGBROW_NEON
// Converts a row of ARGB4444 pixels to ARGB.
// Each pass consumes 16 source bytes (8 pixels) and emits 32 destination
// bytes. The count is tested after the work is done, so one pass always runs;
// callers presumably supply a positive multiple of 8 (not checked here).
void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
                            int pix) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n"  // load 8 ARGB4444 pixels.
    "subs %w2, %w2, #8 \n"  // 8 processed per loop.
    ARGB4444TOARGB
    MEMACCESS(1)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
    "b.gt 1b \n"
  : "+r"(src_argb4444),  // %0
    "+r"(dst_argb),      // %1
    "+r"(pix)            // %2
  :
  // Only v0-v3 are written by this loop and by ARGB4444TOARGB.
  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
#endif  // HAS_ARGB4444TOARGBROW_NEON
1077
#ifdef HAS_ARGBTORGB24ROW_NEON
// Drops the fourth byte of every ARGB pixel to produce RGB24.
// 8 pixels per pass: 32 bytes are read and 24 bytes are written. The loop is
// bottom-tested, so one pass always runs.
void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
  asm volatile(
      "1: \n"
      MEMACCESS(0)
      "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // de-interleave 8 ARGB
      "subs %w2, %w2, #8 \n"  // pix -= 8
      MEMACCESS(1)
      "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n"  // write first 3 planes only
      "b.gt 1b \n"  // repeat while pixels remain
      : "+r"(src_argb),   // %0
        "+r"(dst_rgb24),  // %1
        "+r"(pix)         // %2
      :
      : "cc", "memory", "v1", "v2", "v3", "v4");
}
#endif  // HAS_ARGBTORGB24ROW_NEON
1096
#ifdef HAS_ARGBTORAWROW_NEON
// Converts ARGB (bytes b,g,r,a) to RAW (bytes r,g,b): alpha is discarded and
// the color order is reversed. 8 pixels per pass; the loop is bottom-tested.
void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
  asm volatile(
      "1: \n"
      MEMACCESS(0)
      "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // v1=b v2=g v3=r v4=a
      "subs %w2, %w2, #8 \n"  // pix -= 8
      // st3 needs consecutive registers, so g and b are copied after r (v3).
      "orr v4.8b, v2.8b, v2.8b \n"  // v4 = g (overwrites alpha)
      "orr v5.8b, v1.8b, v1.8b \n"  // v5 = b
      MEMACCESS(1)
      "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n"  // write r,g,b interleaved
      "b.gt 1b \n"  // repeat while pixels remain
      : "+r"(src_argb),  // %0
        "+r"(dst_raw),   // %1
        "+r"(pix)        // %2
      :
      : "cc", "memory", "v1", "v2", "v3", "v4", "v5");
}
#endif  // HAS_ARGBTORAWROW_NEON
1117
#ifdef HAS_YUY2TOYROW_NEON
// Extracts the luma bytes from a YUY2 row. In YUY2 the even-indexed bytes are
// Y, so the first register of the 2-way de-interleaving load is stored.
// 16 pixels (32 source bytes) per pass; the loop is bottom-tested.
void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
  asm volatile(
      "1: \n"
      MEMACCESS(0)
      "ld2 {v0.16b,v1.16b}, [%0], #32 \n"  // v0 = even bytes, v1 = odd bytes
      "subs %w2, %w2, #16 \n"  // pix -= 16
      MEMACCESS(1)
      "st1 {v0.16b}, [%1], #16 \n"  // write 16 Y
      "b.gt 1b \n"  // repeat while pixels remain
      : "+r"(src_yuy2),  // %0
        "+r"(dst_y),     // %1
        "+r"(pix)        // %2
      :
      : "cc", "memory", "v0", "v1");
}
#endif  // HAS_YUY2TOYROW_NEON
1136
#ifdef HAS_UYVYTOYROW_NEON
// Extracts the luma bytes from a UYVY row. In UYVY the odd-indexed bytes are
// Y, so the second register of the 2-way de-interleaving load is stored.
// 16 pixels (32 source bytes) per pass; the loop is bottom-tested.
void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
  asm volatile(
      "1: \n"
      MEMACCESS(0)
      "ld2 {v0.16b,v1.16b}, [%0], #32 \n"  // v0 = even bytes, v1 = odd bytes
      "subs %w2, %w2, #16 \n"  // pix -= 16
      MEMACCESS(1)
      "st1 {v1.16b}, [%1], #16 \n"  // write 16 Y
      "b.gt 1b \n"  // repeat while pixels remain
      : "+r"(src_uyvy),  // %0
        "+r"(dst_y),     // %1
        "+r"(pix)        // %2
      :
      : "cc", "memory", "v0", "v1");
}
#endif  // HAS_UYVYTOYROW_NEON
1155
#ifdef HAS_YUY2TOUV422ROW_NEON
// Splits the chroma of one YUY2 row into separate U and V rows.
// The 4-way de-interleaving load yields v0=Y0, v1=U, v2=Y1, v3=V.
// 16 pixels in, 8 U and 8 V bytes out per pass; the loop is bottom-tested.
void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
                         int pix) {
  asm volatile(
      "1: \n"
      MEMACCESS(0)
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // fetch 16 YUY2 pixels
      "subs %w3, %w3, #16 \n"  // pix -= 16 (8 chroma pairs)
      MEMACCESS(1)
      "st1 {v1.8b}, [%1], #8 \n"  // write 8 U
      MEMACCESS(2)
      "st1 {v3.8b}, [%2], #8 \n"  // write 8 V
      "b.gt 1b \n"  // repeat while pixels remain
      : "+r"(src_yuy2),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(pix)        // %3
      :
      : "cc", "memory", "v0", "v1", "v2", "v3");
}
#endif  // HAS_YUY2TOUV422ROW_NEON
1178
#ifdef HAS_UYVYTOUV422ROW_NEON
// Splits the chroma of one UYVY row into separate U and V rows.
// The 4-way de-interleaving load yields v0=U, v1=Y0, v2=V, v3=Y1.
// 16 pixels in, 8 U and 8 V bytes out per pass; the loop is bottom-tested.
void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
                         int pix) {
  asm volatile(
      "1: \n"
      MEMACCESS(0)
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // fetch 16 UYVY pixels
      "subs %w3, %w3, #16 \n"  // pix -= 16 (8 chroma pairs)
      MEMACCESS(1)
      "st1 {v0.8b}, [%1], #8 \n"  // write 8 U
      MEMACCESS(2)
      "st1 {v2.8b}, [%2], #8 \n"  // write 8 V
      "b.gt 1b \n"  // repeat while pixels remain
      : "+r"(src_uyvy),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(pix)        // %3
      :
      : "cc", "memory", "v0", "v1", "v2", "v3");
}
#endif  // HAS_UYVYTOUV422ROW_NEON
1201
#ifdef HAS_YUY2TOUVROW_NEON
// Produces one row of U and one row of V from two vertically adjacent YUY2
// rows (src_yuy2 and src_yuy2 + stride_yuy2). Chroma from the two rows is
// combined with a rounding average. 16 pixels of width are consumed per pass,
// giving 8 U and 8 V bytes; the loop is bottom-tested.
void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
                      uint8* dst_u, uint8* dst_v, int pix) {
  const uint8* src_yuy2_next = src_yuy2 + stride_yuy2;  // second source row
  asm volatile(
      "1: \n"
      MEMACCESS(0)
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // row 0: v1=U, v3=V
      "subs %w4, %w4, #16 \n"  // pix -= 16 (8 chroma pairs)
      MEMACCESS(1)
      "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // row 1: v5=U, v7=V
      "urhadd v1.8b, v1.8b, v5.8b \n"  // U = rounded mean of both rows
      "urhadd v3.8b, v3.8b, v7.8b \n"  // V = rounded mean of both rows
      MEMACCESS(2)
      "st1 {v1.8b}, [%2], #8 \n"  // write 8 U
      MEMACCESS(3)
      "st1 {v3.8b}, [%3], #8 \n"  // write 8 V
      "b.gt 1b \n"  // repeat while pixels remain
      : "+r"(src_yuy2),       // %0
        "+r"(src_yuy2_next),  // %1
        "+r"(dst_u),          // %2
        "+r"(dst_v),          // %3
        "+r"(pix)             // %4
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
        "v5", "v6", "v7");
}
#endif  // HAS_YUY2TOUVROW_NEON
1231
#ifdef HAS_UYVYTOUVROW_NEON
// Produces one row of U and one row of V from two vertically adjacent UYVY
// rows (src_uyvy and src_uyvy + stride_uyvy). Chroma from the two rows is
// combined with a rounding average. 16 pixels of width are consumed per pass,
// giving 8 U and 8 V bytes; the loop is bottom-tested.
void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
                      uint8* dst_u, uint8* dst_v, int pix) {
  const uint8* src_uyvy_next = src_uyvy + stride_uyvy;  // second source row
  asm volatile(
      "1: \n"
      MEMACCESS(0)
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // row 0: v0=U, v2=V
      "subs %w4, %w4, #16 \n"  // pix -= 16 (8 chroma pairs)
      MEMACCESS(1)
      "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // row 1: v4=U, v6=V
      "urhadd v0.8b, v0.8b, v4.8b \n"  // U = rounded mean of both rows
      "urhadd v2.8b, v2.8b, v6.8b \n"  // V = rounded mean of both rows
      MEMACCESS(2)
      "st1 {v0.8b}, [%2], #8 \n"  // write 8 U
      MEMACCESS(3)
      "st1 {v2.8b}, [%3], #8 \n"  // write 8 V
      "b.gt 1b \n"  // repeat while pixels remain
      : "+r"(src_uyvy),       // %0
        "+r"(src_uyvy_next),  // %1
        "+r"(dst_u),          // %2
        "+r"(dst_v),          // %3
        "+r"(pix)             // %4
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
        "v5", "v6", "v7");
}
#endif  // HAS_UYVYTOUVROW_NEON
1261
// Byte-shuffle kernel used for BGRAToARGB, ABGRToARGB, RGBAToARGB and
// ARGBToRGBA.
#ifdef HAS_ARGBSHUFFLEROW_NEON
// Reorders the bytes of each 16-byte group (4 pixels) using a 16-entry table
// read once from `shuffler`: output byte i is input byte shuffler[i] of the
// same group. 4 pixels per pass; the loop is bottom-tested.
void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int pix) {
  asm volatile(
      MEMACCESS(3)
      "ld1 {v2.16b}, [%3] \n"  // v2 = 16 byte indices, loaded before the loop
      "1: \n"
      MEMACCESS(0)
      "ld1 {v0.16b}, [%0], #16 \n"  // fetch 4 pixels
      "subs %w2, %w2, #4 \n"  // pix -= 4
      "tbl v1.16b, {v0.16b}, v2.16b \n"  // v1[i] = v0[v2[i]]
      MEMACCESS(1)
      "st1 {v1.16b}, [%1], #16 \n"  // write 4 shuffled pixels
      "b.gt 1b \n"  // repeat while pixels remain
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(pix)        // %2
      : "r"(shuffler)    // %3 (read only, never advanced)
      : "cc", "memory", "v0", "v1", "v2");
}
#endif  // HAS_ARGBSHUFFLEROW_NEON
1285
#ifdef HAS_I422TOYUY2ROW_NEON
// Packs planar Y, U and V rows into YUY2 (byte order Y0 U Y1 V).
// Per pass: 16 Y, 8 U and 8 V bytes are read and 32 bytes are written. The
// loop is bottom-tested.
void I422ToYUY2Row_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_yuy2, int width) {
  asm volatile(
      "1: \n"
      MEMACCESS(0)
      "ld2 {v0.8b, v1.8b}, [%0], #16 \n"  // v0 = even Y, v1 = odd Y
      "orr v2.8b, v1.8b, v1.8b \n"  // move odd Y to v2, freeing v1 for U
      MEMACCESS(1)
      "ld1 {v1.8b}, [%1], #8 \n"  // v1 = 8 U
      MEMACCESS(2)
      "ld1 {v3.8b}, [%2], #8 \n"  // v3 = 8 V
      "subs %w4, %w4, #16 \n"  // width -= 16
      MEMACCESS(3)
      "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // interleave Y0,U,Y1,V
      "b.gt 1b \n"  // repeat while pixels remain
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_yuy2),  // %3
        "+r"(width)      // %4
      :
      : "cc", "memory", "v0", "v1", "v2", "v3");
}
#endif  // HAS_I422TOYUY2ROW_NEON
1314
#ifdef HAS_I422TOUYVYROW_NEON
// Packs planar Y, U and V rows into UYVY (byte order U Y0 V Y1).
// Per pass: 16 Y, 8 U and 8 V bytes are read and 32 bytes are written. The
// loop is bottom-tested.
void I422ToUYVYRow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_uyvy, int width) {
  asm volatile(
      "1: \n"
      MEMACCESS(0)
      "ld2 {v1.8b,v2.8b}, [%0], #16 \n"  // v1 = even Y, v2 = odd Y
      "orr v3.8b, v2.8b, v2.8b \n"  // move odd Y to v3, freeing v2 for V
      MEMACCESS(1)
      "ld1 {v0.8b}, [%1], #8 \n"  // v0 = 8 U
      MEMACCESS(2)
      "ld1 {v2.8b}, [%2], #8 \n"  // v2 = 8 V
      "subs %w4, %w4, #16 \n"  // width -= 16
      MEMACCESS(3)
      "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // interleave U,Y0,V,Y1
      "b.gt 1b \n"  // repeat while pixels remain
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_uyvy),  // %3
        "+r"(width)      // %4
      :
      : "cc", "memory", "v0", "v1", "v2", "v3");
}
#endif  // HAS_I422TOUYVYROW_NEON
1343
#ifdef HAS_ARGBTORGB565ROW_NEON
// Packs a row of ARGB pixels into RGB565.
// 8 pixels per pass: 32 bytes are read into v20-v23, the ARGBTORGB565 macro
// (defined elsewhere in this file) leaves the packed result in v0, and 16
// bytes are written. The loop is bottom-tested.
void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
  asm volatile(
      "1: \n"
      MEMACCESS(0)
      "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // fetch 8 ARGB pixels
      "subs %w2, %w2, #8 \n"  // pix -= 8
      ARGBTORGB565
      MEMACCESS(1)
      "st1 {v0.16b}, [%1], #16 \n"  // write 8 RGB565 pixels
      "b.gt 1b \n"  // repeat while pixels remain
      : "+r"(src_argb),    // %0
        "+r"(dst_rgb565),  // %1
        "+r"(pix)          // %2
      :
      : "cc", "memory", "v0", "v20", "v21", "v22", "v23");
}
#endif  // HAS_ARGBTORGB565ROW_NEON
1363
#ifdef HAS_ARGBTORGB565DITHERROW_NEON
// Packs a row of ARGB pixels into RGB565 after adding a dither pattern.
//
// dither4 holds 4 dither bytes; it is broadcast across v1 so that the low 8
// bytes repeat the 4-byte pattern twice. The pattern is added with unsigned
// saturation to the first three channel planes (v20-v22) before the
// ARGBTORGB565 macro (defined elsewhere in this file) packs them into v0.
// 8 pixels per pass; the loop is bottom-tested, so one pass always runs.
//
// src_argb and width are changed by the asm (post-incremented load and
// `subs`), so they are declared as read-write operands. GCC requires that
// input-only operands are never modified.
void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
                                const uint32 dither4, int width) {
  asm volatile (
    "dup v1.4s, %w3 \n"  // dither4
  "1: \n"
    MEMACCESS(1)
    "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"  // load 8 pixels
    "subs %w2, %w2, #8 \n"  // 8 processed per loop.
    "uqadd v20.8b, v20.8b, v1.8b \n"  // saturating add of the dither bytes
    "uqadd v21.8b, v21.8b, v1.8b \n"
    "uqadd v22.8b, v22.8b, v1.8b \n"
    ARGBTORGB565
    MEMACCESS(0)
    "st1 {v0.16b}, [%0], #16 \n"  // store 8 pixels RGB565.
    "b.gt 1b \n"
  : "+r"(dst_rgb),   // %0
    "+r"(src_argb),  // %1
    "+r"(width)      // %2
  : "r"(dither4)     // %3
  : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"
  );
}
#endif  // HAS_ARGBTORGB565DITHERROW_NEON
1388
#ifdef HAS_ARGBTOARGB1555ROW_NEON
// Packs a row of ARGB pixels into ARGB1555.
// 8 pixels per pass: 32 bytes are read into v20-v23, the ARGBTOARGB1555 macro
// (defined elsewhere in this file) leaves the packed result in v0, and 16
// bytes are written. The loop is bottom-tested.
void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
                            int pix) {
  asm volatile(
      "1: \n"
      MEMACCESS(0)
      "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // fetch 8 ARGB pixels
      "subs %w2, %w2, #8 \n"  // pix -= 8
      ARGBTOARGB1555
      MEMACCESS(1)
      "st1 {v0.16b}, [%1], #16 \n"  // write 8 ARGB1555 pixels
      "b.gt 1b \n"  // repeat while pixels remain
      : "+r"(src_argb),      // %0
        "+r"(dst_argb1555),  // %1
        "+r"(pix)            // %2
      :
      : "cc", "memory", "v0", "v20", "v21", "v22", "v23");
}
#endif  // HAS_ARGBTOARGB1555ROW_NEON
1409
#ifdef HAS_ARGBTOARGB4444ROW_NEON
// Packs a row of ARGB pixels into ARGB4444.
// v4 is preloaded with 0x0f in every byte as a mask constant for the
// ARGBTOARGB4444 macro (defined elsewhere in this file; presumably it also
// uses v1 as scratch, since v1 is listed as clobbered). 8 pixels per pass:
// 32 bytes in, 16 bytes out from v0. The loop is bottom-tested.
void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
                            int pix) {
  asm volatile(
      "movi v4.16b, #0x0f \n"  // low-nibble mask, set once before the loop
      "1: \n"
      MEMACCESS(0)
      "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // fetch 8 ARGB pixels
      "subs %w2, %w2, #8 \n"  // pix -= 8
      ARGBTOARGB4444
      MEMACCESS(1)
      "st1 {v0.16b}, [%1], #16 \n"  // write 8 ARGB4444 pixels
      "b.gt 1b \n"  // repeat while pixels remain
      : "+r"(src_argb),      // %0
        "+r"(dst_argb4444),  // %1
        "+r"(pix)            // %2
      :
      : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23");
}
#endif  // HAS_ARGBTOARGB4444ROW_NEON
1431
#ifdef HAS_ARGBTOYROW_NEON
// Computes luma for a row of ARGB pixels:
//   Y = sat8(round((13 * B + 65 * G + 33 * R) / 128)) + 16, saturated to 255.
// Alpha is loaded but not used. 8 pixels per pass; the loop is bottom-tested.
void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile(
      "movi v4.8b, #13 \n"  // blue weight  (about 0.1016 * 128)
      "movi v5.8b, #65 \n"  // green weight (about 0.5078 * 128)
      "movi v6.8b, #33 \n"  // red weight   (about 0.2578 * 128)
      "movi v7.8b, #16 \n"  // luma offset
      "1: \n"
      MEMACCESS(0)
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // v0=B v1=G v2=R v3=A
      "subs %w2, %w2, #8 \n"  // pix -= 8
      "umull v3.8h, v0.8b, v4.8b \n"  // acc  = B * 13 (alpha overwritten)
      "umlal v3.8h, v1.8b, v5.8b \n"  // acc += G * 65
      "umlal v3.8h, v2.8b, v6.8b \n"  // acc += R * 33
      "sqrshrun v0.8b, v3.8h, #7 \n"  // round, >> 7, narrow to 8 bits
      "uqadd v0.8b, v0.8b, v7.8b \n"  // add 16 with saturation
      MEMACCESS(1)
      "st1 {v0.8b}, [%1], #8 \n"  // write 8 Y
      "b.gt 1b \n"  // repeat while pixels remain
      : "+r"(src_argb),  // %0
        "+r"(dst_y),     // %1
        "+r"(pix)        // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
#endif  // HAS_ARGBTOYROW_NEON
1459
#ifdef HAS_ARGBTOYJROW_NEON
// Computes full-range (JPEG style) luma for a row of ARGB pixels:
//   Y = sat8(round((15 * B + 75 * G + 38 * R) / 128)), with no offset added.
// Alpha is loaded but not used. 8 pixels per pass; the loop is bottom-tested.
void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile(
      "movi v4.8b, #15 \n"  // blue weight  (about 0.114 * 128)
      "movi v5.8b, #75 \n"  // green weight (about 0.587 * 128)
      "movi v6.8b, #38 \n"  // red weight   (about 0.299 * 128)
      "1: \n"
      MEMACCESS(0)
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // v0=B v1=G v2=R v3=A
      "subs %w2, %w2, #8 \n"  // pix -= 8
      "umull v3.8h, v0.8b, v4.8b \n"  // acc  = B * 15 (alpha overwritten)
      "umlal v3.8h, v1.8b, v5.8b \n"  // acc += G * 75
      "umlal v3.8h, v2.8b, v6.8b \n"  // acc += R * 38
      "sqrshrun v0.8b, v3.8h, #7 \n"  // round, >> 7, narrow to 8 bits
      MEMACCESS(1)
      "st1 {v0.8b}, [%1], #8 \n"  // write 8 Y
      "b.gt 1b \n"  // repeat while pixels remain
      : "+r"(src_argb),  // %0
        "+r"(dst_y),     // %1
        "+r"(pix)        // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
}
#endif  // HAS_ARGBTOYJROW_NEON
1485
// 8x1 pixels in, 8 U and 8 V out (no chroma subsampling).
#ifdef HAS_ARGBTOUV444ROW_NEON
// Computes one U and one V byte for every ARGB pixel:
//   U = (112 * B -  74 * G - 38 * R + 0x8080) >> 8
//   V = (112 * R -  94 * G - 18 * B + 0x8080) >> 8
// The arithmetic is done in 16-bit lanes; 0x8080 is 128.5 in 8.8 fixed point
// and provides both the +128 bias and rounding. The loop is bottom-tested.
void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
                         int pix) {
  asm volatile(
      "movi v24.8b, #112 \n"  // weight of B in U and of R in V
      "movi v25.8b, #74 \n"  // weight of G in U (subtracted)
      "movi v26.8b, #38 \n"  // weight of R in U (subtracted)
      "movi v27.8b, #18 \n"  // weight of B in V (subtracted)
      "movi v28.8b, #94 \n"  // weight of G in V (subtracted)
      "movi v29.16b,#0x80 \n"  // every 16-bit lane = 0x8080 (128.5)
      "1: \n"
      MEMACCESS(0)
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // v0=B v1=G v2=R v3=A
      "subs %w3, %w3, #8 \n"  // pix -= 8
      "umull v4.8h, v0.8b, v24.8b \n"  // u  = B * 112
      "umlsl v4.8h, v1.8b, v25.8b \n"  // u -= G * 74
      "umlsl v4.8h, v2.8b, v26.8b \n"  // u -= R * 38
      "add v4.8h, v4.8h, v29.8h \n"  // u += 0x8080 (bias to unsigned)

      "umull v3.8h, v2.8b, v24.8b \n"  // v  = R * 112 (alpha overwritten)
      "umlsl v3.8h, v1.8b, v28.8b \n"  // v -= G * 94
      "umlsl v3.8h, v0.8b, v27.8b \n"  // v -= B * 18
      "add v3.8h, v3.8h, v29.8h \n"  // v += 0x8080 (bias to unsigned)

      "uqshrn v0.8b, v4.8h, #8 \n"  // U = high byte of u
      "uqshrn v1.8b, v3.8h, #8 \n"  // V = high byte of v

      MEMACCESS(1)
      "st1 {v0.8b}, [%1], #8 \n"  // write 8 U
      MEMACCESS(2)
      "st1 {v1.8b}, [%2], #8 \n"  // write 8 V
      "b.gt 1b \n"  // repeat while pixels remain
      : "+r"(src_argb),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(pix)        // %3
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
        "v24", "v25", "v26", "v27", "v28", "v29");
}
#endif  // HAS_ARGBTOUV444ROW_NEON
1529
// 16x1 pixels -> 8x1. pix is the number of ARGB pixels, e.g. 16.
#ifdef HAS_ARGBTOUV422ROW_NEON
// Computes U and V for each horizontal pair of ARGB pixels.
// Adjacent pixels are summed pairwise into 16-bit lanes (twice their
// average), then weighted with the constants in v20-v24 and biased with v25.
// Those registers are presumably initialised by RGBTOUV_SETUP_REG, which is
// defined elsewhere in this file. 16 pixels in, 8 U and 8 V out per pass;
// the loop is bottom-tested.
void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
                         int pix) {
  asm volatile(
      RGBTOUV_SETUP_REG
      "1: \n"
      MEMACCESS(0)
      "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // v0=B v1=G v2=R v3=A

      "uaddlp v0.8h, v0.16b \n"  // B: sum each pair of bytes
      "uaddlp v1.8h, v1.16b \n"  // G: sum each pair of bytes
      "uaddlp v2.8h, v2.16b \n"  // R: sum each pair of bytes

      "subs %w3, %w3, #16 \n"  // pix -= 16
      "mul v3.8h, v0.8h, v20.8h \n"  // u  = B * v20
      "mls v3.8h, v1.8h, v21.8h \n"  // u -= G * v21
      "mls v3.8h, v2.8h, v22.8h \n"  // u -= R * v22
      "add v3.8h, v3.8h, v25.8h \n"  // u += bias (to unsigned)

      "mul v4.8h, v2.8h, v20.8h \n"  // v  = R * v20
      "mls v4.8h, v1.8h, v24.8h \n"  // v -= G * v24
      "mls v4.8h, v0.8h, v23.8h \n"  // v -= B * v23
      "add v4.8h, v4.8h, v25.8h \n"  // v += bias (to unsigned)

      "uqshrn v0.8b, v3.8h, #8 \n"  // U = high byte of u
      "uqshrn v1.8b, v4.8h, #8 \n"  // V = high byte of v

      MEMACCESS(1)
      "st1 {v0.8b}, [%1], #8 \n"  // write 8 U
      MEMACCESS(2)
      "st1 {v1.8b}, [%2], #8 \n"  // write 8 V
      "b.gt 1b \n"  // repeat while pixels remain
      : "+r"(src_argb),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(pix)        // %3
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
        "v20", "v21", "v22", "v23", "v24", "v25");
}
#endif  // HAS_ARGBTOUV422ROW_NEON
1573
// 32x1 pixels -> 8x1. pix is the number of ARGB pixels, e.g. 32.
#ifdef HAS_ARGBTOUV411ROW_NEON
// Computes U and V for each horizontal group of 4 ARGB pixels.
// Two 16-pixel loads are reduced to 4-pixel sums (pairwise add of bytes,
// then pairwise add of the 16-bit sums) and halved with rounding, giving
// twice the average. The result is weighted with the constants in v20-v24
// and biased with v25, presumably set by RGBTOUV_SETUP_REG (defined
// elsewhere in this file). 32 pixels in, 8 U and 8 V out per pass; the loop
// is bottom-tested.
void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
                         int pix) {
  asm volatile(
      RGBTOUV_SETUP_REG
      "1: \n"
      MEMACCESS(0)
      "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // first 16 pixels
      "uaddlp v0.8h, v0.16b \n"  // B: sum each pair of bytes
      "uaddlp v1.8h, v1.16b \n"  // G: sum each pair of bytes
      "uaddlp v2.8h, v2.16b \n"  // R: sum each pair of bytes
      MEMACCESS(0)
      "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n"  // following 16 pixels
      "uaddlp v4.8h, v4.16b \n"  // B: sum each pair of bytes
      "uaddlp v5.8h, v5.16b \n"  // G: sum each pair of bytes
      "uaddlp v6.8h, v6.16b \n"  // R: sum each pair of bytes

      "addp v0.8h, v0.8h, v4.8h \n"  // B: 16 pair sums -> 8 sums of 4 pixels
      "addp v1.8h, v1.8h, v5.8h \n"  // G: 16 pair sums -> 8 sums of 4 pixels
      "addp v2.8h, v2.8h, v6.8h \n"  // R: 16 pair sums -> 8 sums of 4 pixels

      "urshr v0.8h, v0.8h, #1 \n"  // rounded halving: 2x the 4-pixel average
      "urshr v1.8h, v1.8h, #1 \n"
      "urshr v2.8h, v2.8h, #1 \n"

      "subs %w3, %w3, #32 \n"  // pix -= 32
      "mul v3.8h, v0.8h, v20.8h \n"  // u  = B * v20
      "mls v3.8h, v1.8h, v21.8h \n"  // u -= G * v21
      "mls v3.8h, v2.8h, v22.8h \n"  // u -= R * v22
      "add v3.8h, v3.8h, v25.8h \n"  // u += bias (to unsigned)
      "mul v4.8h, v2.8h, v20.8h \n"  // v  = R * v20
      "mls v4.8h, v1.8h, v24.8h \n"  // v -= G * v24
      "mls v4.8h, v0.8h, v23.8h \n"  // v -= B * v23
      "add v4.8h, v4.8h, v25.8h \n"  // v += bias (to unsigned)
      "uqshrn v0.8b, v3.8h, #8 \n"  // U = high byte of u
      "uqshrn v1.8b, v4.8h, #8 \n"  // V = high byte of v
      MEMACCESS(1)
      "st1 {v0.8b}, [%1], #8 \n"  // write 8 U
      MEMACCESS(2)
      "st1 {v1.8b}, [%2], #8 \n"  // write 8 V
      "b.gt 1b \n"  // repeat while pixels remain
      : "+r"(src_argb),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(pix)        // %3
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
        "v20", "v21", "v22", "v23", "v24", "v25");
}
#endif  // HAS_ARGBTOUV411ROW_NEON
1626
1627 // 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
// Shared U/V arithmetic for the two-row ...ToUVRow kernels.
// QB, QG and QR name the registers (with .8h arrangement) that hold the
// blue, green and red 16-bit inputs. Weights are read from v20-v24 and the
// bias from v25. Results: v0.8b = U, v1.8b = V.
// v3 and v4 are overwritten by the first two instructions, so none of QB,
// QG or QR may be v3 or v4.
#define RGBTOUV(QB, QG, QR) \
  "mul v3.8h, " #QB ",v20.8h \n"  /* u  = B * v20                        */ \
  "mul v4.8h, " #QR ",v20.8h \n"  /* v  = R * v20                        */ \
  "mls v3.8h, " #QG ",v21.8h \n"  /* u -= G * v21                        */ \
  "mls v4.8h, " #QG ",v24.8h \n"  /* v -= G * v24                        */ \
  "mls v3.8h, " #QR ",v22.8h \n"  /* u -= R * v22                        */ \
  "mls v4.8h, " #QB ",v23.8h \n"  /* v -= B * v23                        */ \
  "add v3.8h, v3.8h, v25.8h \n"   /* u += bias (to unsigned)             */ \
  "add v4.8h, v4.8h, v25.8h \n"   /* v += bias (to unsigned)             */ \
  "uqshrn v0.8b, v3.8h, #8 \n"    /* U = high byte of u                  */ \
  "uqshrn v1.8b, v4.8h, #8 \n"    /* V = high byte of v                  */
1639
1640 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
1641 // TODO(fbarchard): consider ptrdiff_t for all strides.
1642
#ifdef HAS_ARGBTOUVROW_NEON
// Computes one row of U and V from two vertically adjacent ARGB rows
// (src_argb and src_argb + src_stride_argb), one U/V pair per 2x2 block.
// Each channel is summed over the 2x2 block and halved with rounding (twice
// the average) before RGBTOUV applies the weights, presumably loaded into
// v20-v25 by RGBTOUV_SETUP_REG (defined elsewhere in this file).
// 16 pixels of width from each row per pass, 8 U and 8 V out; the loop is
// bottom-tested.
void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int pix) {
  const uint8* src_argb_row1 = src_argb + src_stride_argb;  // second row
  asm volatile(
      RGBTOUV_SETUP_REG
      "1: \n"
      MEMACCESS(0)
      "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // row 0: B,G,R,A
      "uaddlp v0.8h, v0.16b \n"  // B: sum each pair of bytes
      "uaddlp v1.8h, v1.16b \n"  // G: sum each pair of bytes
      "uaddlp v2.8h, v2.16b \n"  // R: sum each pair of bytes

      MEMACCESS(1)
      "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // row 1: B,G,R,A
      "uadalp v0.8h, v4.16b \n"  // B: accumulate row 1 pair sums
      "uadalp v1.8h, v5.16b \n"  // G: accumulate row 1 pair sums
      "uadalp v2.8h, v6.16b \n"  // R: accumulate row 1 pair sums

      "urshr v0.8h, v0.8h, #1 \n"  // rounded halving: 2x the 2x2 average
      "urshr v1.8h, v1.8h, #1 \n"
      "urshr v2.8h, v2.8h, #1 \n"

      "subs %w4, %w4, #16 \n"  // pix -= 16 (16 wide x 2 rows = 32 pixels)
      RGBTOUV(v0.8h, v1.8h, v2.8h)
      MEMACCESS(2)
      "st1 {v0.8b}, [%2], #8 \n"  // write 8 U
      MEMACCESS(3)
      "st1 {v1.8b}, [%3], #8 \n"  // write 8 V
      "b.gt 1b \n"  // repeat while pixels remain
      : "+r"(src_argb),       // %0
        "+r"(src_argb_row1),  // %1
        "+r"(dst_u),          // %2
        "+r"(dst_v),          // %3
        "+r"(pix)             // %4
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
        "v20", "v21", "v22", "v23", "v24", "v25");
}
#endif  // HAS_ARGBTOUVROW_NEON
1684
1685 // TODO(fbarchard): Subsample match C code.
#ifdef HAS_ARGBTOUVJROW_NEON
// Full-range (JPEG style) variant of the two-row ARGB to U/V kernel.
// It loads its own weights into v20-v25 instead of using RGBTOUV_SETUP_REG;
// the data flow is otherwise the same: each channel is summed over a 2x2
// block, halved with rounding, and passed to RGBTOUV.
// 16 pixels of width from each row per pass, 8 U and 8 V out; the loop is
// bottom-tested.
void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int pix) {
  const uint8* src_argb_row1 = src_argb + src_stride_argb;  // second row
  asm volatile(
      "movi v20.8h, #63, lsl #0 \n"  // B weight in U, R weight in V (0.500 / 2)
      "movi v21.8h, #42, lsl #0 \n"  // G weight in U (0.33126 / 2, subtracted)
      "movi v22.8h, #21, lsl #0 \n"  // R weight in U (0.16874 / 2, subtracted)
      "movi v23.8h, #10, lsl #0 \n"  // B weight in V (0.08131 / 2, subtracted)
      "movi v24.8h, #53, lsl #0 \n"  // G weight in V (0.41869 / 2, subtracted)
      "movi v25.16b, #0x80 \n"  // every 16-bit lane = 0x8080 (128.5)
      "1: \n"
      MEMACCESS(0)
      "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // row 0: B,G,R,A
      "uaddlp v0.8h, v0.16b \n"  // B: sum each pair of bytes
      "uaddlp v1.8h, v1.16b \n"  // G: sum each pair of bytes
      "uaddlp v2.8h, v2.16b \n"  // R: sum each pair of bytes
      MEMACCESS(1)
      "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // row 1: B,G,R,A
      "uadalp v0.8h, v4.16b \n"  // B: accumulate row 1 pair sums
      "uadalp v1.8h, v5.16b \n"  // G: accumulate row 1 pair sums
      "uadalp v2.8h, v6.16b \n"  // R: accumulate row 1 pair sums

      "urshr v0.8h, v0.8h, #1 \n"  // rounded halving: 2x the 2x2 average
      "urshr v1.8h, v1.8h, #1 \n"
      "urshr v2.8h, v2.8h, #1 \n"

      "subs %w4, %w4, #16 \n"  // pix -= 16 (16 wide x 2 rows = 32 pixels)
      RGBTOUV(v0.8h, v1.8h, v2.8h)
      MEMACCESS(2)
      "st1 {v0.8b}, [%2], #8 \n"  // write 8 U
      MEMACCESS(3)
      "st1 {v1.8b}, [%3], #8 \n"  // write 8 V
      "b.gt 1b \n"  // repeat while pixels remain
      : "+r"(src_argb),       // %0
        "+r"(src_argb_row1),  // %1
        "+r"(dst_u),          // %2
        "+r"(dst_v),          // %3
        "+r"(pix)             // %4
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
        "v20", "v21", "v22", "v23", "v24", "v25");
}
#endif  // HAS_ARGBTOUVJROW_NEON
1731
#ifdef HAS_BGRATOUVROW_NEON
// Two-row U/V kernel for BGRA input. Identical to the ARGB kernel except for
// which loaded planes are treated as blue, green and red: blue comes from
// the 4th plane, green from the 3rd and red from the 2nd.
// Weights in v20-v25 are presumably set by RGBTOUV_SETUP_REG (defined
// elsewhere in this file). 16 pixels of width from each row per pass, 8 U
// and 8 V out; the loop is bottom-tested.
void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
                      uint8* dst_u, uint8* dst_v, int pix) {
  const uint8* src_bgra_row1 = src_bgra + src_stride_bgra;  // second row
  asm volatile(
      RGBTOUV_SETUP_REG
      "1: \n"
      MEMACCESS(0)
      "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // row 0, 16 pixels
      // Order matters: each destination is a source of a later instruction.
      "uaddlp v0.8h, v3.16b \n"  // B pair sums -> v0
      "uaddlp v3.8h, v2.16b \n"  // G pair sums -> v3
      "uaddlp v2.8h, v1.16b \n"  // R pair sums -> v2
      MEMACCESS(1)
      "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // row 1, 16 pixels
      "uadalp v0.8h, v7.16b \n"  // B: accumulate row 1 pair sums
      "uadalp v3.8h, v6.16b \n"  // G: accumulate row 1 pair sums
      "uadalp v2.8h, v5.16b \n"  // R: accumulate row 1 pair sums

      "urshr v0.8h, v0.8h, #1 \n"  // rounded halving: 2x the 2x2 average
      "urshr v1.8h, v3.8h, #1 \n"  // green moves to v1, freeing v3 for RGBTOUV
      "urshr v2.8h, v2.8h, #1 \n"

      "subs %w4, %w4, #16 \n"  // pix -= 16 (16 wide x 2 rows = 32 pixels)
      RGBTOUV(v0.8h, v1.8h, v2.8h)
      MEMACCESS(2)
      "st1 {v0.8b}, [%2], #8 \n"  // write 8 U
      MEMACCESS(3)
      "st1 {v1.8b}, [%3], #8 \n"  // write 8 V
      "b.gt 1b \n"  // repeat while pixels remain
      : "+r"(src_bgra),       // %0
        "+r"(src_bgra_row1),  // %1
        "+r"(dst_u),          // %2
        "+r"(dst_v),          // %3
        "+r"(pix)             // %4
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
        "v20", "v21", "v22", "v23", "v24", "v25");
}
#endif  // HAS_BGRATOUVROW_NEON
1772
#ifdef HAS_ABGRTOUVROW_NEON
// Two-row U/V kernel for ABGR input. Identical to the ARGB kernel except for
// which loaded planes are treated as blue, green and red: blue comes from
// the 3rd plane, green from the 2nd and red from the 1st.
// Weights in v20-v25 are presumably set by RGBTOUV_SETUP_REG (defined
// elsewhere in this file). 16 pixels of width from each row per pass, 8 U
// and 8 V out; the loop is bottom-tested.
void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
                      uint8* dst_u, uint8* dst_v, int pix) {
  const uint8* src_abgr_row1 = src_abgr + src_stride_abgr;  // second row
  asm volatile(
      RGBTOUV_SETUP_REG
      "1: \n"
      MEMACCESS(0)
      "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // row 0, 16 pixels
      // Order matters: each destination is a source of a later instruction.
      "uaddlp v3.8h, v2.16b \n"  // B pair sums -> v3
      "uaddlp v2.8h, v1.16b \n"  // G pair sums -> v2
      "uaddlp v1.8h, v0.16b \n"  // R pair sums -> v1
      MEMACCESS(1)
      "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // row 1, 16 pixels
      "uadalp v3.8h, v6.16b \n"  // B: accumulate row 1 pair sums
      "uadalp v2.8h, v5.16b \n"  // G: accumulate row 1 pair sums
      "uadalp v1.8h, v4.16b \n"  // R: accumulate row 1 pair sums

      "urshr v0.8h, v3.8h, #1 \n"  // blue moves to v0, freeing v3 for RGBTOUV
      "urshr v2.8h, v2.8h, #1 \n"  // rounded halving: 2x the 2x2 average
      "urshr v1.8h, v1.8h, #1 \n"

      "subs %w4, %w4, #16 \n"  // pix -= 16 (16 wide x 2 rows = 32 pixels)
      RGBTOUV(v0.8h, v2.8h, v1.8h)
      MEMACCESS(2)
      "st1 {v0.8b}, [%2], #8 \n"  // write 8 U
      MEMACCESS(3)
      "st1 {v1.8b}, [%3], #8 \n"  // write 8 V
      "b.gt 1b \n"  // repeat while pixels remain
      : "+r"(src_abgr),       // %0
        "+r"(src_abgr_row1),  // %1
        "+r"(dst_u),          // %2
        "+r"(dst_v),          // %3
        "+r"(pix)             // %4
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
        "v20", "v21", "v22", "v23", "v24", "v25");
}
#endif  // HAS_ABGRTOUVROW_NEON
1813
#ifdef HAS_RGBATOUVROW_NEON
// Two-row U/V kernel for RGBA input. Identical to the ARGB kernel except for
// which loaded planes are treated as blue, green and red: blue comes from
// the 2nd plane, green from the 3rd and red from the 4th.
// Weights in v20-v25 are presumably set by RGBTOUV_SETUP_REG (defined
// elsewhere in this file). 16 pixels of width from each row per pass, 8 U
// and 8 V out; the loop is bottom-tested.
void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
                      uint8* dst_u, uint8* dst_v, int pix) {
  const uint8* src_rgba_row1 = src_rgba + src_stride_rgba;  // second row
  asm volatile(
      RGBTOUV_SETUP_REG
      "1: \n"
      MEMACCESS(0)
      "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // row 0, 16 pixels
      // Order matters: each destination is a source of an earlier instruction.
      "uaddlp v0.8h, v1.16b \n"  // B pair sums -> v0
      "uaddlp v1.8h, v2.16b \n"  // G pair sums -> v1
      "uaddlp v2.8h, v3.16b \n"  // R pair sums -> v2
      MEMACCESS(1)
      "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // row 1, 16 pixels
      "uadalp v0.8h, v5.16b \n"  // B: accumulate row 1 pair sums
      "uadalp v1.8h, v6.16b \n"  // G: accumulate row 1 pair sums
      "uadalp v2.8h, v7.16b \n"  // R: accumulate row 1 pair sums

      "urshr v0.8h, v0.8h, #1 \n"  // rounded halving: 2x the 2x2 average
      "urshr v1.8h, v1.8h, #1 \n"
      "urshr v2.8h, v2.8h, #1 \n"

      "subs %w4, %w4, #16 \n"  // pix -= 16 (16 wide x 2 rows = 32 pixels)
      RGBTOUV(v0.8h, v1.8h, v2.8h)
      MEMACCESS(2)
      "st1 {v0.8b}, [%2], #8 \n"  // write 8 U
      MEMACCESS(3)
      "st1 {v1.8b}, [%3], #8 \n"  // write 8 V
      "b.gt 1b \n"  // repeat while pixels remain
      : "+r"(src_rgba),       // %0
        "+r"(src_rgba_row1),  // %1
        "+r"(dst_u),          // %2
        "+r"(dst_v),          // %3
        "+r"(pix)             // %4
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
        "v20", "v21", "v22", "v23", "v24", "v25");
}
#endif  // HAS_RGBATOUVROW_NEON
1854
#ifdef HAS_RGB24TOUVROW_NEON
// Two-row U/V kernel for RGB24 input (3 bytes per pixel; the loaded planes
// are treated as blue, green, red in that order).
// Each channel is summed over a 2x2 block and halved with rounding before
// RGBTOUV applies the weights, presumably loaded into v20-v25 by
// RGBTOUV_SETUP_REG (defined elsewhere in this file). 16 pixels of width
// (48 bytes) from each row per pass, 8 U and 8 V out; the loop is
// bottom-tested.
void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
                       uint8* dst_u, uint8* dst_v, int pix) {
  const uint8* src_rgb24_row1 = src_rgb24 + src_stride_rgb24;  // second row
  asm volatile(
      RGBTOUV_SETUP_REG
      "1: \n"
      MEMACCESS(0)
      "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // row 0: v0=B v1=G v2=R
      "uaddlp v0.8h, v0.16b \n"  // B: sum each pair of bytes
      "uaddlp v1.8h, v1.16b \n"  // G: sum each pair of bytes
      "uaddlp v2.8h, v2.16b \n"  // R: sum each pair of bytes
      MEMACCESS(1)
      "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // row 1: v4=B v5=G v6=R
      "uadalp v0.8h, v4.16b \n"  // B: accumulate row 1 pair sums
      "uadalp v1.8h, v5.16b \n"  // G: accumulate row 1 pair sums
      "uadalp v2.8h, v6.16b \n"  // R: accumulate row 1 pair sums

      "urshr v0.8h, v0.8h, #1 \n"  // rounded halving: 2x the 2x2 average
      "urshr v1.8h, v1.8h, #1 \n"
      "urshr v2.8h, v2.8h, #1 \n"

      "subs %w4, %w4, #16 \n"  // pix -= 16 (16 wide x 2 rows = 32 pixels)
      RGBTOUV(v0.8h, v1.8h, v2.8h)
      MEMACCESS(2)
      "st1 {v0.8b}, [%2], #8 \n"  // write 8 U
      MEMACCESS(3)
      "st1 {v1.8b}, [%3], #8 \n"  // write 8 V
      "b.gt 1b \n"  // repeat while pixels remain
      : "+r"(src_rgb24),       // %0
        "+r"(src_rgb24_row1),  // %1
        "+r"(dst_u),           // %2
        "+r"(dst_v),           // %3
        "+r"(pix)              // %4
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
        "v20", "v21", "v22", "v23", "v24", "v25");
}
#endif  // HAS_RGB24TOUVROW_NEON
1895
1896 #ifdef HAS_RAWTOUVROW_NEON
// Produces one row of U and one row of V from two adjacent rows of RAW.
// Identical in structure to the RGB24 variant, except that the first
// de-interleaved byte (v0) is treated as R and the third (v2) as B, so the
// registers are handed to RGBTOUV in v2, v1, v0 order. The second source row
// is src_raw + src_stride_raw. Each iteration consumes 16 pixels (48 bytes)
// from each row and stores 8 U and 8 V bytes. pix drops by 16 per iteration
// and the loop repeats while it is still positive.
RAWToUVRow_NEON(const uint8 * src_raw,int src_stride_raw,uint8 * dst_u,uint8 * dst_v,int pix)1897 void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
1898 uint8* dst_u, uint8* dst_v, int pix) {
1899 const uint8* src_raw_1 = src_raw + src_stride_raw;
1900 asm volatile (
// NOTE(review): RGBTOUV_SETUP_REG and RGBTOUV are defined outside this view;
// presumably they set up v20-v25 and leave U in v0 and V in v1 - confirm.
1901 RGBTOUV_SETUP_REG
1902 "1: \n"
1903 MEMACCESS(0)
1904 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RAW pixels.
1905 "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
1906 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
1907 "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
1908 MEMACCESS(1)
1909 "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more RAW pixels
1910 "uadalp v2.8h, v6.16b \n" // B: accumulate pair sums of 2nd row.
1911 "uadalp v1.8h, v5.16b \n" // G: accumulate pair sums of 2nd row.
1912 "uadalp v0.8h, v4.16b \n" // R: accumulate pair sums of 2nd row.
1913
1914 "urshr v2.8h, v2.8h, #1 \n" // 2x average
1915 "urshr v1.8h, v1.8h, #1 \n"
1916 "urshr v0.8h, v0.8h, #1 \n"
1917
1918 "subs %w4, %w4, #16 \n" // 16 processed per loop.
1919 RGBTOUV(v2.8h, v1.8h, v0.8h)
1920 MEMACCESS(2)
1921 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1922 MEMACCESS(3)
1923 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1924 "b.gt 1b \n"
1925 : "+r"(src_raw), // %0
1926 "+r"(src_raw_1), // %1
1927 "+r"(dst_u), // %2
1928 "+r"(dst_v), // %3
1929 "+r"(pix) // %4
1930 :
1931 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1932 "v20", "v21", "v22", "v23", "v24", "v25"
1933 );
1934 }
1935 #endif // HAS_RAWTOUVROW_NEON
1936
1937 // 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
1938 #ifdef HAS_RGB565TOUVROW_NEON
// Produces one row of U and one row of V from two adjacent rows of RGB565.
// The second source row is src_rgb565 + src_stride_rgb565. Each iteration
// reads 16 pixels (2 x 16 bytes) from each row, expands every group of 8 with
// RGB565TOARGB, sums horizontal pairs across both rows (2x2 box sum), halves
// the sums with rounding, and then computes on 16-bit lanes:
//   U = (56 * B - 37 * G - 19 * R + 0x8080) >> 8
//   V = (56 * R - 47 * G -  9 * B + 0x8080) >> 8
// narrowed with unsigned saturation. 8 U bytes go to dst_u and 8 V bytes to
// dst_v. pix drops by 16 per iteration and the loop repeats while it is still
// positive, so the body runs at least once.
// NOTE(review): RGB565TOARGB is defined outside this view; it is assumed to
// leave B, G, R for 8 pixels in v0, v1, v2 - confirm.
RGB565ToUVRow_NEON(const uint8 * src_rgb565,int src_stride_rgb565,uint8 * dst_u,uint8 * dst_v,int pix)1939 void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
1940 uint8* dst_u, uint8* dst_v, int pix) {
1941 const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
1942 asm volatile (
1943 "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / 2
1944 "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2
1945 "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2
1946 "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2
1947 "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2
1948 "movi v27.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
1949 "1: \n"
// First row, pixels 0-7: pair sums go to the low halves of v16/v18/v20.
1950 MEMACCESS(0)
1951 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
1952 RGB565TOARGB
1953 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
1954 "uaddlp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
1955 "uaddlp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
// First row, pixels 8-15: pair sums go to v17/v19/v21.
1956 MEMACCESS(0)
1957 "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels.
1958 RGB565TOARGB
1959 "uaddlp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
1960 "uaddlp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
1961 "uaddlp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
1962
// Second row: accumulate its pair sums onto the first row's sums.
1963 MEMACCESS(1)
1964 "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels.
1965 RGB565TOARGB
1966 "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
1967 "uadalp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
1968 "uadalp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
1969 MEMACCESS(1)
1970 "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels.
1971 RGB565TOARGB
1972 "uadalp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
1973 "uadalp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
1974 "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
1975
// Merge the two 4-lane halves into full 8-lane B, G, R vectors.
1976 "ins v16.D[1], v17.D[0] \n"
1977 "ins v18.D[1], v19.D[0] \n"
1978 "ins v20.D[1], v21.D[0] \n"
1979
1980 "urshr v4.8h, v16.8h, #1 \n" // 2x average
1981 "urshr v5.8h, v18.8h, #1 \n"
1982 "urshr v6.8h, v20.8h, #1 \n"
1983
1984 "subs %w4, %w4, #16 \n" // 16 processed per loop.
1985 "mul v16.8h, v4.8h, v22.8h \n" // B
1986 "mls v16.8h, v5.8h, v23.8h \n" // G
1987 "mls v16.8h, v6.8h, v24.8h \n" // R
1988 "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned
1989 "mul v17.8h, v6.8h, v22.8h \n" // R
1990 "mls v17.8h, v5.8h, v26.8h \n" // G
1991 "mls v17.8h, v4.8h, v25.8h \n" // B
1992 "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned
1993 "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U
1994 "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V
1995 MEMACCESS(2)
1996 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1997 MEMACCESS(3)
1998 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1999 "b.gt 1b \n"
2000 : "+r"(src_rgb565), // %0
2001 "+r"(src_rgb565_1), // %1
2002 "+r"(dst_u), // %2
2003 "+r"(dst_v), // %3
2004 "+r"(pix) // %4
2005 :
2006 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2007 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
2008 "v25", "v26", "v27"
2009 );
2010 }
2011 #endif // HAS_RGB565TOUVROW_NEON
2012
2013 // 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
2014 #ifdef HAS_ARGB1555TOUVROW_NEON
// Produces one row of U and one row of V from two adjacent rows of ARGB1555.
// The second source row is src_argb1555 + src_stride_argb1555. Each iteration
// reads 16 pixels (2 x 16 bytes) from each row, expands every group of 8 with
// RGB555TOARGB, sums horizontal pairs across both rows (2x2 box sum), halves
// the sums with rounding, and applies the U/V multiply-subtract using the
// constants held in v20-v25. 8 U bytes go to dst_u and 8 V bytes to dst_v.
// pix drops by 16 per iteration and the loop repeats while it is still
// positive, so the body runs at least once.
// NOTE(review): RGBTOUV_SETUP_REG and RGB555TOARGB are defined outside this
// view. Presumably the former loads v20-v25 (v20 = UB/VR, v21 = UG, v22 = UR,
// v23 = VB, v24 = VG, v25 = bias, judging by how they are used below) and the
// latter leaves B, G, R in v0, v1, v2 - confirm.
ARGB1555ToUVRow_NEON(const uint8 * src_argb1555,int src_stride_argb1555,uint8 * dst_u,uint8 * dst_v,int pix)2015 void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
2016 uint8* dst_u, uint8* dst_v, int pix) {
2017 const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
2018 asm volatile (
2019 RGBTOUV_SETUP_REG
2020 "1: \n"
// First row, pixels 0-7: pair sums go to v16/v17/v18.
2021 MEMACCESS(0)
2022 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
2023 RGB555TOARGB
2024 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
2025 "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
2026 "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
// First row, pixels 8-15: pair sums go to v26/v27/v28.
2027 MEMACCESS(0)
2028 "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels.
2029 RGB555TOARGB
2030 "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
2031 "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
2032 "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
2033
// Second row: accumulate its pair sums onto the first row's sums.
2034 MEMACCESS(1)
2035 "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels.
2036 RGB555TOARGB
2037 "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
2038 "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
2039 "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
2040 MEMACCESS(1)
2041 "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels.
2042 RGB555TOARGB
2043 "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
2044 "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
2045 "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
2046
// Merge the two 4-lane halves into full 8-lane B, G, R vectors.
2047 "ins v16.D[1], v26.D[0] \n"
2048 "ins v17.D[1], v27.D[0] \n"
2049 "ins v18.D[1], v28.D[0] \n"
2050
2051 "urshr v4.8h, v16.8h, #1 \n" // 2x average
2052 "urshr v5.8h, v17.8h, #1 \n"
2053 "urshr v6.8h, v18.8h, #1 \n"
2054
2055 "subs %w4, %w4, #16 \n" // 16 processed per loop.
2056 "mul v2.8h, v4.8h, v20.8h \n" // B
2057 "mls v2.8h, v5.8h, v21.8h \n" // G
2058 "mls v2.8h, v6.8h, v22.8h \n" // R
2059 "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
2060 "mul v3.8h, v6.8h, v20.8h \n" // R
2061 "mls v3.8h, v5.8h, v24.8h \n" // G
2062 "mls v3.8h, v4.8h, v23.8h \n" // B
2063 "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
2064 "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
2065 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
2066 MEMACCESS(2)
2067 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
2068 MEMACCESS(3)
2069 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
2070 "b.gt 1b \n"
2071 : "+r"(src_argb1555), // %0
2072 "+r"(src_argb1555_1), // %1
2073 "+r"(dst_u), // %2
2074 "+r"(dst_v), // %3
2075 "+r"(pix) // %4
2076 :
2077 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
2078 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
2079 "v26", "v27", "v28"
2080 );
2081 }
2082 #endif // HAS_ARGB1555TOUVROW_NEON
2083
2084 // 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
2085 #ifdef HAS_ARGB4444TOUVROW_NEON
// Produces one row of U and one row of V from two adjacent rows of ARGB4444.
// The second source row is src_argb4444 + src_stride_argb4444. Each iteration
// reads 16 pixels (2 x 16 bytes) from each row, expands every group of 8 with
// ARGB4444TOARGB, sums horizontal pairs across both rows (2x2 box sum), halves
// the sums with rounding, and applies the U/V multiply-subtract using the
// constants held in v20-v25. 8 U bytes go to dst_u and 8 V bytes to dst_v.
// pix drops by 16 per iteration and the loop repeats while it is still
// positive, so the body runs at least once.
// NOTE(review): RGBTOUV_SETUP_REG and ARGB4444TOARGB are defined outside this
// view. Presumably the former loads v20-v25 (v20 = UB/VR, v21 = UG, v22 = UR,
// v23 = VB, v24 = VG, v25 = bias, judging by how they are used below) and the
// latter leaves B, G, R in v0, v1, v2 - confirm.
ARGB4444ToUVRow_NEON(const uint8 * src_argb4444,int src_stride_argb4444,uint8 * dst_u,uint8 * dst_v,int pix)2086 void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
2087 uint8* dst_u, uint8* dst_v, int pix) {
2088 const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
2089 asm volatile (
2090 RGBTOUV_SETUP_REG
2091 "1: \n"
// First row, pixels 0-7: pair sums go to v16/v17/v18.
2092 MEMACCESS(0)
2093 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
2094 ARGB4444TOARGB
2095 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
2096 "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
2097 "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
// First row, pixels 8-15: pair sums go to v26/v27/v28.
2098 MEMACCESS(0)
2099 "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels.
2100 ARGB4444TOARGB
2101 "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
2102 "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
2103 "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
2104
// Second row: accumulate its pair sums onto the first row's sums.
2105 MEMACCESS(1)
2106 "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels.
2107 ARGB4444TOARGB
2108 "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
2109 "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
2110 "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
2111 MEMACCESS(1)
2112 "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels.
2113 ARGB4444TOARGB
2114 "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
2115 "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
2116 "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
2117
// Merge the two 4-lane halves into full 8-lane B, G, R vectors.
2118 "ins v16.D[1], v26.D[0] \n"
2119 "ins v17.D[1], v27.D[0] \n"
2120 "ins v18.D[1], v28.D[0] \n"
2121
2122 "urshr v4.8h, v16.8h, #1 \n" // 2x average
2123 "urshr v5.8h, v17.8h, #1 \n"
2124 "urshr v6.8h, v18.8h, #1 \n"
2125
2126 "subs %w4, %w4, #16 \n" // 16 processed per loop.
2127 "mul v2.8h, v4.8h, v20.8h \n" // B
2128 "mls v2.8h, v5.8h, v21.8h \n" // G
2129 "mls v2.8h, v6.8h, v22.8h \n" // R
2130 "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
2131 "mul v3.8h, v6.8h, v20.8h \n" // R
2132 "mls v3.8h, v5.8h, v24.8h \n" // G
2133 "mls v3.8h, v4.8h, v23.8h \n" // B
2134 "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
2135 "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
2136 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
2137 MEMACCESS(2)
2138 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
2139 MEMACCESS(3)
2140 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
2141 "b.gt 1b \n"
2142 : "+r"(src_argb4444), // %0
2143 "+r"(src_argb4444_1), // %1
2144 "+r"(dst_u), // %2
2145 "+r"(dst_v), // %3
2146 "+r"(pix) // %4
2147 :
2148 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
2149 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
2150 "v26", "v27", "v28"
2151
2152 );
2153 }
2154 #endif // HAS_ARGB4444TOUVROW_NEON
2155
2156 #ifdef HAS_RGB565TOYROW_NEON
// Converts a row of RGB565 pixels to 8-bit luma. Each iteration loads
// 8 pixels (16 bytes), expands them with RGB565TOARGB, computes
//   Y = ((13 * B + 65 * G + 33 * R + 64) >> 7) + 16
// with unsigned saturation on both the narrowing shift and the final add,
// and stores 8 Y bytes to dst_y. pix drops by 8 per iteration and the loop
// repeats while it is still positive, so the body runs at least once.
// NOTE(review): RGB565TOARGB is defined outside this view; it is assumed to
// leave B, G, R in v0, v1, v2. v4 and v6 are in the clobber list although
// this function never names them, so the macro presumably uses them as
// scratch - confirm.
RGB565ToYRow_NEON(const uint8 * src_rgb565,uint8 * dst_y,int pix)2157 void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
2158 asm volatile (
2159 "movi v24.8b, #13 \n" // B * 0.1016 coefficient
2160 "movi v25.8b, #65 \n" // G * 0.5078 coefficient
2161 "movi v26.8b, #33 \n" // R * 0.2578 coefficient
2162 "movi v27.8b, #16 \n" // Add 16 constant
2163 "1: \n"
2164 MEMACCESS(0)
2165 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
2166 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2167 RGB565TOARGB
2168 "umull v3.8h, v0.8b, v24.8b \n" // B
2169 "umlal v3.8h, v1.8b, v25.8b \n" // G
2170 "umlal v3.8h, v2.8b, v26.8b \n" // R
2171 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
2172 "uqadd v0.8b, v0.8b, v27.8b \n" // Y += 16, saturating.
2173 MEMACCESS(1)
2174 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
2175 "b.gt 1b \n"
2176 : "+r"(src_rgb565), // %0
2177 "+r"(dst_y), // %1
2178 "+r"(pix) // %2
2179 :
2180 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6",
2181 "v24", "v25", "v26", "v27"
2182 );
2183 }
2184 #endif // HAS_RGB565TOYROW_NEON
2185
2186 #ifdef HAS_ARGB1555TOYROW_NEON
// Converts a row of ARGB1555 pixels to 8-bit luma. Each iteration loads
// 8 pixels (16 bytes), expands them with ARGB1555TOARGB, computes
//   Y = ((13 * B + 65 * G + 33 * R + 64) >> 7) + 16
// with unsigned saturation on both the narrowing shift and the final add,
// and stores 8 Y bytes to dst_y. pix drops by 8 per iteration and the loop
// repeats while it is still positive, so the body runs at least once.
// NOTE(review): ARGB1555TOARGB is defined outside this view; it is assumed to
// leave B, G, R in v0, v1, v2 and to leave the coefficient registers v4-v7
// untouched, since they are loaded once before the loop - confirm.
ARGB1555ToYRow_NEON(const uint8 * src_argb1555,uint8 * dst_y,int pix)2187 void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
2188 asm volatile (
2189 "movi v4.8b, #13 \n" // B * 0.1016 coefficient
2190 "movi v5.8b, #65 \n" // G * 0.5078 coefficient
2191 "movi v6.8b, #33 \n" // R * 0.2578 coefficient
2192 "movi v7.8b, #16 \n" // Add 16 constant
2193 "1: \n"
2194 MEMACCESS(0)
2195 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
2196 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2197 ARGB1555TOARGB
2198 "umull v3.8h, v0.8b, v4.8b \n" // B
2199 "umlal v3.8h, v1.8b, v5.8b \n" // G
2200 "umlal v3.8h, v2.8b, v6.8b \n" // R
2201 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
2202 "uqadd v0.8b, v0.8b, v7.8b \n" // Y += 16, saturating.
2203 MEMACCESS(1)
2204 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
2205 "b.gt 1b \n"
2206 : "+r"(src_argb1555), // %0
2207 "+r"(dst_y), // %1
2208 "+r"(pix) // %2
2209 :
2210 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
2211 );
2212 }
2213 #endif // HAS_ARGB1555TOYROW_NEON
2214
2215 #ifdef HAS_ARGB4444TOYROW_NEON
// Converts a row of ARGB4444 pixels to 8-bit luma. Each iteration loads
// 8 pixels (16 bytes), expands them with ARGB4444TOARGB, computes
//   Y = ((13 * B + 65 * G + 33 * R + 64) >> 7) + 16
// with unsigned saturation on both the narrowing shift and the final add,
// and stores 8 Y bytes to dst_y. pix drops by 8 per iteration and the loop
// repeats while it is still positive, so the body runs at least once.
// NOTE(review): ARGB4444TOARGB is defined outside this view; it is assumed to
// leave B, G, R in v0, v1, v2 and to touch only registers named in the
// clobber list - confirm.
ARGB4444ToYRow_NEON(const uint8 * src_argb4444,uint8 * dst_y,int pix)2216 void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
2217 asm volatile (
2218 "movi v24.8b, #13 \n" // B * 0.1016 coefficient
2219 "movi v25.8b, #65 \n" // G * 0.5078 coefficient
2220 "movi v26.8b, #33 \n" // R * 0.2578 coefficient
2221 "movi v27.8b, #16 \n" // Add 16 constant
2222 "1: \n"
2223 MEMACCESS(0)
2224 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
2225 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2226 ARGB4444TOARGB
2227 "umull v3.8h, v0.8b, v24.8b \n" // B
2228 "umlal v3.8h, v1.8b, v25.8b \n" // G
2229 "umlal v3.8h, v2.8b, v26.8b \n" // R
2230 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
2231 "uqadd v0.8b, v0.8b, v27.8b \n" // Y += 16, saturating.
2232 MEMACCESS(1)
2233 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
2234 "b.gt 1b \n"
2235 : "+r"(src_argb4444), // %0
2236 "+r"(dst_y), // %1
2237 "+r"(pix) // %2
2238 :
2239 : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"
2240 );
2241 }
2242 #endif // HAS_ARGB4444TOYROW_NEON
2243
2244 #ifdef HAS_BGRATOYROW_NEON
// Converts a row of BGRA pixels to 8-bit luma. Each iteration de-interleaves
// 8 pixels (32 bytes) into v0-v3 and uses bytes 1, 2, 3 of every pixel as
// R, G, B (byte 0 in v0 is not used), computing
//   Y = ((33 * R + 65 * G + 13 * B + 64) >> 7) + 16
// with unsigned saturation on both the narrowing shift and the final add.
// 8 Y bytes are stored to dst_y. pix drops by 8 per iteration and the loop
// repeats while it is still positive, so the body runs at least once.
BGRAToYRow_NEON(const uint8 * src_bgra,uint8 * dst_y,int pix)2245 void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
2246 asm volatile (
2247 "movi v4.8b, #33 \n" // R * 0.2578 coefficient
2248 "movi v5.8b, #65 \n" // G * 0.5078 coefficient
2249 "movi v6.8b, #13 \n" // B * 0.1016 coefficient
2250 "movi v7.8b, #16 \n" // Add 16 constant
2251 "1: \n"
2252 MEMACCESS(0)
2253 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
2254 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2255 "umull v16.8h, v1.8b, v4.8b \n" // R
2256 "umlal v16.8h, v2.8b, v5.8b \n" // G
2257 "umlal v16.8h, v3.8b, v6.8b \n" // B
2258 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
2259 "uqadd v0.8b, v0.8b, v7.8b \n" // Y += 16, saturating.
2260 MEMACCESS(1)
2261 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
2262 "b.gt 1b \n"
2263 : "+r"(src_bgra), // %0
2264 "+r"(dst_y), // %1
2265 "+r"(pix) // %2
2266 :
2267 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2268 );
2269 }
2270 #endif // HAS_BGRATOYROW_NEON
2271
2272 #ifdef HAS_ABGRTOYROW_NEON
// Converts a row of ABGR pixels to 8-bit luma. Each iteration de-interleaves
// 8 pixels (32 bytes) into v0-v3 and uses bytes 0, 1, 2 of every pixel as
// R, G, B (byte 3 in v3 is not used), computing
//   Y = ((33 * R + 65 * G + 13 * B + 64) >> 7) + 16
// with unsigned saturation on both the narrowing shift and the final add.
// 8 Y bytes are stored to dst_y. pix drops by 8 per iteration and the loop
// repeats while it is still positive, so the body runs at least once.
ABGRToYRow_NEON(const uint8 * src_abgr,uint8 * dst_y,int pix)2273 void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
2274 asm volatile (
2275 "movi v4.8b, #33 \n" // R * 0.2578 coefficient
2276 "movi v5.8b, #65 \n" // G * 0.5078 coefficient
2277 "movi v6.8b, #13 \n" // B * 0.1016 coefficient
2278 "movi v7.8b, #16 \n" // Add 16 constant
2279 "1: \n"
2280 MEMACCESS(0)
2281 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
2282 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2283 "umull v16.8h, v0.8b, v4.8b \n" // R
2284 "umlal v16.8h, v1.8b, v5.8b \n" // G
2285 "umlal v16.8h, v2.8b, v6.8b \n" // B
2286 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
2287 "uqadd v0.8b, v0.8b, v7.8b \n" // Y += 16, saturating.
2288 MEMACCESS(1)
2289 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
2290 "b.gt 1b \n"
2291 : "+r"(src_abgr), // %0
2292 "+r"(dst_y), // %1
2293 "+r"(pix) // %2
2294 :
2295 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2296 );
2297 }
2298 #endif // HAS_ABGRTOYROW_NEON
2299
2300 #ifdef HAS_RGBATOYROW_NEON
// Converts a row of RGBA pixels to 8-bit luma. Each iteration de-interleaves
// 8 pixels (32 bytes) into v0-v3 and uses bytes 1, 2, 3 of every pixel as
// B, G, R (byte 0 in v0 is not used), computing
//   Y = ((13 * B + 65 * G + 33 * R + 64) >> 7) + 16
// with unsigned saturation on both the narrowing shift and the final add.
// 8 Y bytes are stored to dst_y. pix drops by 8 per iteration and the loop
// repeats while it is still positive, so the body runs at least once.
RGBAToYRow_NEON(const uint8 * src_rgba,uint8 * dst_y,int pix)2301 void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
2302 asm volatile (
2303 "movi v4.8b, #13 \n" // B * 0.1016 coefficient
2304 "movi v5.8b, #65 \n" // G * 0.5078 coefficient
2305 "movi v6.8b, #33 \n" // R * 0.2578 coefficient
2306 "movi v7.8b, #16 \n" // Add 16 constant
2307 "1: \n"
2308 MEMACCESS(0)
2309 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
2310 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2311 "umull v16.8h, v1.8b, v4.8b \n" // B
2312 "umlal v16.8h, v2.8b, v5.8b \n" // G
2313 "umlal v16.8h, v3.8b, v6.8b \n" // R
2314 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
2315 "uqadd v0.8b, v0.8b, v7.8b \n" // Y += 16, saturating.
2316 MEMACCESS(1)
2317 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
2318 "b.gt 1b \n"
2319 : "+r"(src_rgba), // %0
2320 "+r"(dst_y), // %1
2321 "+r"(pix) // %2
2322 :
2323 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2324 );
2325 }
2326 #endif // HAS_RGBATOYROW_NEON
2327
2328 #ifdef HAS_RGB24TOYROW_NEON
// Converts a row of RGB24 pixels to 8-bit luma. Each iteration de-interleaves
// 8 pixels (24 bytes) into v0-v2, treated as B, G, R, and computes
//   Y = ((13 * B + 65 * G + 33 * R + 64) >> 7) + 16
// with unsigned saturation on both the narrowing shift and the final add.
// 8 Y bytes are stored to dst_y. pix drops by 8 per iteration and the loop
// repeats while it is still positive, so the body runs at least once.
RGB24ToYRow_NEON(const uint8 * src_rgb24,uint8 * dst_y,int pix)2329 void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
2330 asm volatile (
2331 "movi v4.8b, #13 \n" // B * 0.1016 coefficient
2332 "movi v5.8b, #65 \n" // G * 0.5078 coefficient
2333 "movi v6.8b, #33 \n" // R * 0.2578 coefficient
2334 "movi v7.8b, #16 \n" // Add 16 constant
2335 "1: \n"
2336 MEMACCESS(0)
2337 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
2338 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2339 "umull v16.8h, v0.8b, v4.8b \n" // B
2340 "umlal v16.8h, v1.8b, v5.8b \n" // G
2341 "umlal v16.8h, v2.8b, v6.8b \n" // R
2342 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
2343 "uqadd v0.8b, v0.8b, v7.8b \n" // Y += 16, saturating.
2344 MEMACCESS(1)
2345 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
2346 "b.gt 1b \n"
2347 : "+r"(src_rgb24), // %0
2348 "+r"(dst_y), // %1
2349 "+r"(pix) // %2
2350 :
2351 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2352 );
2353 }
2354 #endif // HAS_RGB24TOYROW_NEON
2355
2356 #ifdef HAS_RAWTOYROW_NEON
// Converts a row of RAW pixels to 8-bit luma. Each iteration de-interleaves
// 8 pixels (24 bytes) into v0-v2. The coefficients are applied in R, G, B
// order (v0 * 33, v1 * 65, v2 * 13), i.e. the channel order is the reverse of
// the RGB24 variant:
//   Y = ((33 * R + 65 * G + 13 * B + 64) >> 7) + 16
// with unsigned saturation on both the narrowing shift and the final add.
// 8 Y bytes are stored to dst_y. pix drops by 8 per iteration and the loop
// repeats while it is still positive, so the body runs at least once.
RAWToYRow_NEON(const uint8 * src_raw,uint8 * dst_y,int pix)2357 void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
2358 asm volatile (
2359 "movi v4.8b, #33 \n" // R * 0.2578 coefficient
2360 "movi v5.8b, #65 \n" // G * 0.5078 coefficient
2361 "movi v6.8b, #13 \n" // B * 0.1016 coefficient
2362 "movi v7.8b, #16 \n" // Add 16 constant
2363 "1: \n"
2364 MEMACCESS(0)
2365 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
2366 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2367 "umull v16.8h, v0.8b, v4.8b \n" // R
2368 "umlal v16.8h, v1.8b, v5.8b \n" // G
2369 "umlal v16.8h, v2.8b, v6.8b \n" // B
2370 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
2371 "uqadd v0.8b, v0.8b, v7.8b \n" // Y += 16, saturating.
2372 MEMACCESS(1)
2373 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
2374 "b.gt 1b \n"
2375 : "+r"(src_raw), // %0
2376 "+r"(dst_y), // %1
2377 "+r"(pix) // %2
2378 :
2379 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2380 );
2381 }
2382 #endif // HAS_RAWTOYROW_NEON
2383
2384 // Bilinear filter 16x2 -> 16x1
2385 #ifdef HAS_INTERPOLATEROW_NEON
// Vertically blends two source rows into dst_ptr, 16 bytes per iteration.
//   dst_ptr           - destination row.
//   src_ptr           - first source row; the second row is
//                       src_ptr + src_stride.
//   dst_width         - number of bytes to produce. It drops by 16 per
//                       iteration and each loop repeats while it is still
//                       positive, so every path runs at least once and works
//                       in whole groups of 16 bytes.
//   source_y_fraction - weight of the second row out of 256.
// Dispatch on source_y_fraction:
//   0   -> copy the first row unchanged,
//   64  -> 75% first row / 25% second row (two rounding halving adds),
//   128 -> 50% / 50% (one rounding halving add),
//   192 -> 25% first row / 75% second row (two rounding halving adds),
//   else   (row0 * (256 - f) + row1 * f + 128) >> 8 per byte.
// The general path keeps its 16-bit products in v2 and v3, so both registers
// must appear in the clobber list.
InterpolateRow_NEON(uint8 * dst_ptr,const uint8 * src_ptr,ptrdiff_t src_stride,int dst_width,int source_y_fraction)2386 void InterpolateRow_NEON(uint8* dst_ptr,
2387 const uint8* src_ptr, ptrdiff_t src_stride,
2388 int dst_width, int source_y_fraction) {
2389 int y1_fraction = source_y_fraction;
2390 int y0_fraction = 256 - y1_fraction;
2391 const uint8* src_ptr1 = src_ptr + src_stride;
2392 asm volatile (
2393 "cmp %w4, #0 \n"
2394 "b.eq 100f \n"
2395 "cmp %w4, #64 \n"
2396 "b.eq 75f \n"
2397 "cmp %w4, #128 \n"
2398 "b.eq 50f \n"
2399 "cmp %w4, #192 \n"
2400 "b.eq 25f \n"
2401
2402 "dup v5.16b, %w4 \n" // v5 = weight of second row.
2403 "dup v4.16b, %w5 \n" // v4 = weight of first row.
2404 // General purpose row blend.
2405 "1: \n"
2406 MEMACCESS(1)
2407 "ld1 {v0.16b}, [%1], #16 \n"
2408 MEMACCESS(2)
2409 "ld1 {v1.16b}, [%2], #16 \n"
2410 "subs %w3, %w3, #16 \n"
2411 "umull v2.8h, v0.8b, v4.8b \n" // low 8 bytes: row0 * y0
2412 "umull2 v3.8h, v0.16b, v4.16b \n" // high 8 bytes: row0 * y0
2413 "umlal v2.8h, v1.8b, v5.8b \n" // low: += row1 * y1
2414 "umlal2 v3.8h, v1.16b, v5.16b \n" // high: += row1 * y1
2415 "rshrn v0.8b, v2.8h, #8 \n" // rounding >> 8, narrow to bytes
2416 "rshrn2 v0.16b, v3.8h, #8 \n"
2417 MEMACCESS(0)
2418 "st1 {v0.16b}, [%0], #16 \n"
2419 "b.gt 1b \n"
2420 "b 99f \n"
2421
2422 // Blend 25 / 75.
2423 "25: \n"
2424 MEMACCESS(1)
2425 "ld1 {v0.16b}, [%1], #16 \n"
2426 MEMACCESS(2)
2427 "ld1 {v1.16b}, [%2], #16 \n"
2428 "subs %w3, %w3, #16 \n"
2429 "urhadd v0.16b, v0.16b, v1.16b \n" // avg(row0, row1)
2430 "urhadd v0.16b, v0.16b, v1.16b \n" // avg(that, row1)
2431 MEMACCESS(0)
2432 "st1 {v0.16b}, [%0], #16 \n"
2433 "b.gt 25b \n"
2434 "b 99f \n"
2435
2436 // Blend 50 / 50.
2437 "50: \n"
2438 MEMACCESS(1)
2439 "ld1 {v0.16b}, [%1], #16 \n"
2440 MEMACCESS(2)
2441 "ld1 {v1.16b}, [%2], #16 \n"
2442 "subs %w3, %w3, #16 \n"
2443 "urhadd v0.16b, v0.16b, v1.16b \n"
2444 MEMACCESS(0)
2445 "st1 {v0.16b}, [%0], #16 \n"
2446 "b.gt 50b \n"
2447 "b 99f \n"
2448
2449 // Blend 75 / 25. Rows are loaded into swapped registers (row0 -> v1,
2450 "75: \n"
// row1 -> v0) so the same two halving adds weight row0 by 75%.
2451 MEMACCESS(1)
2452 "ld1 {v1.16b}, [%1], #16 \n"
2453 MEMACCESS(2)
2454 "ld1 {v0.16b}, [%2], #16 \n"
2455 "subs %w3, %w3, #16 \n"
2456 "urhadd v0.16b, v0.16b, v1.16b \n"
2457 "urhadd v0.16b, v0.16b, v1.16b \n"
2458 MEMACCESS(0)
2459 "st1 {v0.16b}, [%0], #16 \n"
2460 "b.gt 75b \n"
2461 "b 99f \n"
2462
2463 // Blend 100 / 0 - Copy row unchanged.
2464 "100: \n"
2465 MEMACCESS(1)
2466 "ld1 {v0.16b}, [%1], #16 \n"
2467 "subs %w3, %w3, #16 \n"
2468 MEMACCESS(0)
2469 "st1 {v0.16b}, [%0], #16 \n"
2470 "b.gt 100b \n"
2471
2472 "99: \n"
2473 : "+r"(dst_ptr), // %0
2474 "+r"(src_ptr), // %1
2475 "+r"(src_ptr1), // %2
2476 "+r"(dst_width), // %3
2477 "+r"(y1_fraction), // %4
2478 "+r"(y0_fraction) // %5
2479 :
2480 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"
2481 );
2482 }
2483 #endif // HAS_INTERPOLATEROW_NEON
2484
2485 // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
2486 #ifdef HAS_ARGBBLENDROW_NEON
// Alpha-blends src_argb0 over src_argb1 and writes the result to dst_argb.
// For each of B, G, R, with s from src_argb0, d from src_argb1 and a the
// alpha byte of src_argb0:
//   out = sat(s + sat(d - ((d * a + 128) >> 8)))
// and the output alpha is forced to 255. width is the pixel count. Groups of
// 8 pixels are handled by the first loop; any remaining 1-7 pixels are
// handled one at a time by the second loop, so no rounding up of width
// occurs. A width of 0 or less falls through both loops.
ARGBBlendRow_NEON(const uint8 * src_argb0,const uint8 * src_argb1,uint8 * dst_argb,int width)2487 void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
2488 uint8* dst_argb, int width) {
2489 asm volatile (
2490 "subs %w3, %w3, #8 \n" // fewer than 8 pixels? skip 8x loop.
2491 "b.lt 89f \n"
2492 // Blend 8 pixels.
2493 "8: \n"
2494 MEMACCESS(0)
2495 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 pixels
2496 MEMACCESS(1)
2497 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 pixels
2498 "subs %w3, %w3, #8 \n" // 8 processed per loop.
2499 "umull v16.8h, v4.8b, v3.8b \n" // db * a
2500 "umull v17.8h, v5.8b, v3.8b \n" // dg * a
2501 "umull v18.8h, v6.8b, v3.8b \n" // dr * a
2502 "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
2503 "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
2504 "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
2505 "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
2506 "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
2507 "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
2508 "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
2509 "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
2510 "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
2511 "movi v3.8b, #255 \n" // a = 255
2512 MEMACCESS(2)
2513 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
2514 "b.ge 8b \n" // flags are from the subs above.
2515
2516 "89: \n"
2517 "adds %w3, %w3, #8-1 \n" // remaining pixels minus 1.
2518 "b.lt 99f \n" // nothing left over.
2519
2520 // Blend 1 pixels.
2521 "1: \n"
2522 MEMACCESS(0)
2523 "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
2524 MEMACCESS(1)
2525 "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
2526 "subs %w3, %w3, #1 \n" // 1 processed per loop.
2527 "umull v16.8h, v4.8b, v3.8b \n" // db * a
2528 "umull v17.8h, v5.8b, v3.8b \n" // dg * a
2529 "umull v18.8h, v6.8b, v3.8b \n" // dr * a
2530 "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
2531 "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
2532 "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
2533 "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
2534 "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
2535 "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
2536 "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
2537 "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
2538 "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
2539 "movi v3.8b, #255 \n" // a = 255
2540 MEMACCESS(2)
2541 "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel.
2542 "b.ge 1b \n"
2543
2544 "99: \n"
2545
2546 : "+r"(src_argb0), // %0
2547 "+r"(src_argb1), // %1
2548 "+r"(dst_argb), // %2
2549 "+r"(width) // %3
2550 :
2551 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2552 "v16", "v17", "v18"
2553 );
2554 }
2555 #endif // HAS_ARGBBLENDROW_NEON
2556
2557 // Attenuate 8 pixels at a time.
2558 #ifdef HAS_ARGBATTENUATEROW_NEON
// Premultiplies the colour channels of each ARGB pixel by its own alpha.
// For each of B, G, R: out = (c * a + 128) >> 8 (rounding, saturating
// narrow); the alpha byte is stored unchanged. Each iteration handles
// 8 pixels (32 bytes). width drops by 8 per iteration and the loop repeats
// while it is still positive, so the body runs at least once.
ARGBAttenuateRow_NEON(const uint8 * src_argb,uint8 * dst_argb,int width)2559 void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
2560 asm volatile (
2561 // Attenuate 8 pixels.
2562 "1: \n"
2563 MEMACCESS(0)
2564 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels
2565 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2566 "umull v4.8h, v0.8b, v3.8b \n" // b * a
2567 "umull v5.8h, v1.8b, v3.8b \n" // g * a
2568 "umull v6.8h, v2.8b, v3.8b \n" // r * a
2569 "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8
2570 "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8
2571 "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
2572 MEMACCESS(1)
2573 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
2574 "b.gt 1b \n"
2575 : "+r"(src_argb), // %0
2576 "+r"(dst_argb), // %1
2577 "+r"(width) // %2
2578 :
2579 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
2580 );
2581 }
2582 #endif // HAS_ARGBATTENUATEROW_NEON
2583
2584 // Quantize 8 ARGB pixels (32 bytes).
2585 // dst = (dst * scale >> 16) * interval_size + interval_offset;
2586 #ifdef HAS_ARGBQUANTIZEROW_NEON
// Quantizes the B, G, R channels of an ARGB row in place. scale is halved
// once up front and the per-channel product uses a doubling multiply-high
// (sqdmulh), which together give the (c * scale) >> 16 step of the formula
// above the #ifdef; the result is then multiplied by interval_size, offset by
// interval_offset, and narrowed with unsigned saturation. The alpha byte is
// written back unchanged. Only the low 16 bits of scale, interval_size and
// interval_offset are used, because each is duplicated into 16-bit lanes.
// Each iteration handles 8 pixels (32 bytes). width drops by 8 per iteration
// and the loop repeats while it is still positive, so the body runs at least
// once.
ARGBQuantizeRow_NEON(uint8 * dst_argb,int scale,int interval_size,int interval_offset,int width)2587 void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
2588 int interval_offset, int width) {
2589 asm volatile (
2590 "dup v4.8h, %w2 \n"
2591 "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1
2592 "dup v5.8h, %w3 \n" // interval multiply.
2593 "dup v6.8h, %w4 \n" // interval add
2594
2595 // 8 pixel loop.
2596 "1: \n"
2597 MEMACCESS(0)
// The load does not advance %0; the store below advances it, which is what
// makes the update in place.
2598 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of ARGB.
2599 "subs %w1, %w1, #8 \n" // 8 processed per loop.
2600 "uxtl v0.8h, v0.8b \n" // b (0 .. 255)
2601 "uxtl v1.8h, v1.8b \n"
2602 "uxtl v2.8h, v2.8b \n"
2603 "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale
2604 "sqdmulh v1.8h, v1.8h, v4.8h \n" // g
2605 "sqdmulh v2.8h, v2.8h, v4.8h \n" // r
2606 "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size
2607 "mul v1.8h, v1.8h, v5.8h \n" // g
2608 "mul v2.8h, v2.8h, v5.8h \n" // r
2609 "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset
2610 "add v1.8h, v1.8h, v6.8h \n" // g
2611 "add v2.8h, v2.8h, v6.8h \n" // r
2612 "uqxtn v0.8b, v0.8h \n"
2613 "uqxtn v1.8b, v1.8h \n"
2614 "uqxtn v2.8b, v2.8h \n"
2615 MEMACCESS(0)
2616 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB pixels
2617 "b.gt 1b \n"
2618 : "+r"(dst_argb), // %0
2619 "+r"(width) // %1
2620 : "r"(scale), // %2
2621 "r"(interval_size), // %3
2622 "r"(interval_offset) // %4
2623 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
2624 );
2625 }
2626 #endif // HAS_ARGBQUANTIZEROW_NEON
2627
2628 // Shade 8 pixels at a time by specified value.
2629 // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8.
2630 // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
2631 #ifdef HAS_ARGBSHADEROW_NEON
// Scales every channel of an ARGB row by the matching byte of value.
// The four bytes of value are each duplicated into a 16-bit lane (byte n
// becomes lane n as byte * 0x0101), halved, and then applied to B, G, R, A
// respectively with a rounding doubling multiply-high (sqrdmulh). Results are
// narrowed back to bytes with unsigned saturation. Each iteration handles
// 8 pixels (32 bytes). width drops by 8 per iteration and the loop repeats
// while it is still positive, so the body runs at least once.
ARGBShadeRow_NEON(const uint8 * src_argb,uint8 * dst_argb,int width,uint32 value)2632 void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
2633 uint32 value) {
2634 asm volatile (
2635 "dup v0.4s, %w3 \n" // duplicate scale value.
2636 "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb.
2637 "ushr v0.8h, v0.8h, #1 \n" // scale / 2.
2638
2639 // 8 pixel loop.
2640 "1: \n"
2641 MEMACCESS(0)
2642 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB pixels.
2643 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2644 "uxtl v4.8h, v4.8b \n" // b (0 .. 255)
2645 "uxtl v5.8h, v5.8b \n"
2646 "uxtl v6.8h, v6.8b \n"
2647 "uxtl v7.8h, v7.8b \n"
2648 "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2
2649 "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g
2650 "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r
2651 "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a
2652 "uqxtn v4.8b, v4.8h \n"
2653 "uqxtn v5.8b, v5.8h \n"
2654 "uqxtn v6.8b, v6.8h \n"
2655 "uqxtn v7.8b, v7.8h \n"
2656 MEMACCESS(1)
2657 "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB pixels
2658 "b.gt 1b \n"
2659 : "+r"(src_argb), // %0
2660 "+r"(dst_argb), // %1
2661 "+r"(width) // %2
2662 : "r"(value) // %3
2663 : "cc", "memory", "v0", "v4", "v5", "v6", "v7"
2664 );
2665 }
2666 #endif // HAS_ARGBSHADEROW_NEON
2667
2668 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
2669 // Similar to ARGBToYJ but stores ARGB.
2670 // C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
2671 #ifdef HAS_ARGBGRAYROW_NEON
// Converts an ARGB row to gray while keeping it in ARGB layout. For each
// pixel gray = (15 * b + 75 * g + 38 * r + 64) >> 7, narrowed with
// saturation; that value is written to the B, G and R bytes and the alpha
// byte is stored unchanged. Each iteration handles 8 pixels (32 bytes).
// width drops by 8 per iteration and the loop repeats while it is still
// positive, so the body runs at least once.
ARGBGrayRow_NEON(const uint8 * src_argb,uint8 * dst_argb,int width)2672 void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
2673 asm volatile (
2674 "movi v24.8b, #15 \n" // B * 0.11400 coefficient
2675 "movi v25.8b, #75 \n" // G * 0.58700 coefficient
2676 "movi v26.8b, #38 \n" // R * 0.29900 coefficient
2677 "1: \n"
2678 MEMACCESS(0)
2679 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
2680 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2681 "umull v4.8h, v0.8b, v24.8b \n" // B
2682 "umlal v4.8h, v1.8b, v25.8b \n" // G
2683 "umlal v4.8h, v2.8b, v26.8b \n" // R
2684 "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B
2685 "orr v1.8b, v0.8b, v0.8b \n" // G (register copy)
2686 "orr v2.8b, v0.8b, v0.8b \n" // R (register copy)
2687 MEMACCESS(1)
2688 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.
2689 "b.gt 1b \n"
2690 : "+r"(src_argb), // %0
2691 "+r"(dst_argb), // %1
2692 "+r"(width) // %2
2693 :
2694 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"
2695 );
2696 }
2697 #endif // HAS_ARGBGRAYROW_NEON
2698
2699 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
2700 // b = (r * 35 + g * 68 + b * 17) >> 7
2701 // g = (r * 45 + g * 88 + b * 22) >> 7
2702 // r = (r * 50 + g * 98 + b * 24) >> 7
2703
2704 #ifdef HAS_ARGBSEPIAROW_NEON
// Applies a sepia tone to an ARGB row in place. Per pixel:
//   b' = (17 * b + 68 * g + 35 * r) >> 7
//   g' = (22 * b + 88 * g + 45 * r) >> 7
//   r' = (24 * b + 98 * g + 50 * r) >> 7
// each narrowed with unsigned saturation and no rounding; the alpha byte is
// written back unchanged. Each iteration handles 8 pixels (32 bytes). width
// drops by 8 per iteration and the loop repeats while it is still positive,
// so the body runs at least once.
ARGBSepiaRow_NEON(uint8 * dst_argb,int width)2705 void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
2706 asm volatile (
2707 "movi v20.8b, #17 \n" // BB coefficient
2708 "movi v21.8b, #68 \n" // BG coefficient
2709 "movi v22.8b, #35 \n" // BR coefficient
2710 "movi v24.8b, #22 \n" // GB coefficient
2711 "movi v25.8b, #88 \n" // GG coefficient
2712 "movi v26.8b, #45 \n" // GR coefficient
2713 "movi v28.8b, #24 \n" // RB coefficient
2714 "movi v29.8b, #98 \n" // RG coefficient
2715 "movi v30.8b, #50 \n" // RR coefficient
2716 "1: \n"
2717 MEMACCESS(0)
// The load does not advance %0; the store below advances it, which is what
// makes the update in place.
2718 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
2719 "subs %w1, %w1, #8 \n" // 8 processed per loop.
2720 "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
2721 "umlal v4.8h, v1.8b, v21.8b \n" // G
2722 "umlal v4.8h, v2.8b, v22.8b \n" // R
2723 "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G
2724 "umlal v5.8h, v1.8b, v25.8b \n" // G
2725 "umlal v5.8h, v2.8b, v26.8b \n" // R
2726 "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R
2727 "umlal v6.8h, v1.8b, v29.8b \n" // G
2728 "umlal v6.8h, v2.8b, v30.8b \n" // R
2729 "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B
2730 "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
2731 "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
2732 MEMACCESS(0)
2733 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels.
2734 "b.gt 1b \n"
2735 : "+r"(dst_argb), // %0
2736 "+r"(width) // %1
2737 :
2738 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2739 "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"
2740 );
2741 }
2742 #endif // HAS_ARGBSEPIAROW_NEON
2743
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// TODO(fbarchard): Was same as Sepia except matrix is provided.  This function
// needs to saturate.  Consider doing a non-saturating version.
//
// src_argb:    source ARGB row.
// dst_argb:    destination ARGB row.
// matrix_argb: 16 signed 8 bit coefficients, read once before the loop.
//              Entries 0..3 weight the source b, g, r, a for output B,
//              entries 4..7 for output G, 8..11 for output R and 12..15 for
//              output A.
// width:       pixel count. The loop subtracts 8 per pass and repeats while
//              the remainder is > 0, so pixels are handled in groups of 8.
//
// Every output channel is the signed-saturating 16 bit sum of the four
// products, shifted right by 6 and saturated to 0..255.
#ifdef HAS_ARGBCOLORMATRIXROW_NEON
void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
                             const int8* matrix_argb, int width) {
  asm volatile (
    MEMACCESS(3)
    "ld1 {v2.16b}, [%3] \n"                             // load 16 coefficients.
    "sxtl v0.8h, v2.8b \n"                              // B,G coefficients s16.
    "sxtl2 v1.8h, v2.16b \n"                            // R,A coefficients s16.

  "1: \n"
    MEMACCESS(0)
    "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n"   // load 8 pixels.
    "subs %w2, %w2, #8 \n"                              // 8 processed per loop.
    "uxtl v16.8h, v16.8b \n"                            // b (0 .. 255) 16 bit
    "uxtl v17.8h, v17.8b \n"                            // g
    "uxtl v18.8h, v18.8b \n"                            // r
    "uxtl v19.8h, v19.8b \n"                            // a
    "mul v22.8h, v16.8h, v0.h[0] \n"                    // B = B * Matrix B
    "mul v23.8h, v16.8h, v0.h[4] \n"                    // G = B * Matrix G
    "mul v24.8h, v16.8h, v1.h[0] \n"                    // R = B * Matrix R
    "mul v25.8h, v16.8h, v1.h[4] \n"                    // A = B * Matrix A
    // The green, red and alpha products go to temporaries v4..v7 and are
    // folded into the running sums with saturating adds.
    "mul v4.8h, v17.8h, v0.h[1] \n"                     // G * Matrix B
    "mul v5.8h, v17.8h, v0.h[5] \n"                     // G * Matrix G
    "mul v6.8h, v17.8h, v1.h[1] \n"                     // G * Matrix R
    "mul v7.8h, v17.8h, v1.h[5] \n"                     // G * Matrix A
    "sqadd v22.8h, v22.8h, v4.8h \n"                    // Accumulate B
    "sqadd v23.8h, v23.8h, v5.8h \n"                    // Accumulate G
    "sqadd v24.8h, v24.8h, v6.8h \n"                    // Accumulate R
    "sqadd v25.8h, v25.8h, v7.8h \n"                    // Accumulate A
    "mul v4.8h, v18.8h, v0.h[2] \n"                     // R * Matrix B
    "mul v5.8h, v18.8h, v0.h[6] \n"                     // R * Matrix G
    "mul v6.8h, v18.8h, v1.h[2] \n"                     // R * Matrix R
    "mul v7.8h, v18.8h, v1.h[6] \n"                     // R * Matrix A
    "sqadd v22.8h, v22.8h, v4.8h \n"                    // Accumulate B
    "sqadd v23.8h, v23.8h, v5.8h \n"                    // Accumulate G
    "sqadd v24.8h, v24.8h, v6.8h \n"                    // Accumulate R
    "sqadd v25.8h, v25.8h, v7.8h \n"                    // Accumulate A
    "mul v4.8h, v19.8h, v0.h[3] \n"                     // A * Matrix B
    "mul v5.8h, v19.8h, v0.h[7] \n"                     // A * Matrix G
    "mul v6.8h, v19.8h, v1.h[3] \n"                     // A * Matrix R
    "mul v7.8h, v19.8h, v1.h[7] \n"                     // A * Matrix A
    "sqadd v22.8h, v22.8h, v4.8h \n"                    // Accumulate B
    "sqadd v23.8h, v23.8h, v5.8h \n"                    // Accumulate G
    "sqadd v24.8h, v24.8h, v6.8h \n"                    // Accumulate R
    "sqadd v25.8h, v25.8h, v7.8h \n"                    // Accumulate A
    // Signed to unsigned saturating narrow: negative sums become 0.
    "sqshrun v16.8b, v22.8h, #6 \n"                     // 16 bit to 8 bit B
    "sqshrun v17.8b, v23.8h, #6 \n"                     // 16 bit to 8 bit G
    "sqshrun v18.8b, v24.8h, #6 \n"                     // 16 bit to 8 bit R
    "sqshrun v19.8b, v25.8h, #6 \n"                     // 16 bit to 8 bit A
    MEMACCESS(1)
    "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n"   // store 8 pixels.
    "b.gt 1b \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(width)       // %2
  : "r"(matrix_argb)  // %3
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
    "v18", "v19", "v22", "v23", "v24", "v25"
  );
}
#endif  // HAS_ARGBCOLORMATRIXROW_NEON
2808
// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
//
// Each output byte is (a * b + 128) >> 8 for the corresponding bytes of the
// two sources: a widening multiply followed by a rounding narrow.
//
// src_argb0, src_argb1: source ARGB rows.
// dst_argb:             destination ARGB row.
// width:                pixel count. The loop subtracts 8 per pass and repeats
//                       while the remainder is > 0, so pixels are handled in
//                       groups of 8.
#ifdef HAS_ARGBMULTIPLYROW_NEON
void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    // 8 pixel loop.
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"       // load 8 ARGB pixels.
    MEMACCESS(1)
    "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"       // load 8 more pixels.
    "subs %w3, %w3, #8 \n"                              // 8 processed per loop.
    "umull v0.8h, v0.8b, v4.8b \n"                      // multiply B
    "umull v1.8h, v1.8b, v5.8b \n"                      // multiply G
    "umull v2.8h, v2.8b, v6.8b \n"                      // multiply R
    "umull v3.8h, v3.8b, v7.8b \n"                      // multiply A
    "rshrn v0.8b, v0.8h, #8 \n"                         // 16 bit to 8 bit B
    "rshrn v1.8b, v1.8h, #8 \n"                         // 16 bit to 8 bit G
    "rshrn v2.8b, v2.8h, #8 \n"                         // 16 bit to 8 bit R
    "rshrn v3.8b, v3.8h, #8 \n"                         // 16 bit to 8 bit A
    MEMACCESS(2)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"       // store 8 ARGB pixels
    "b.gt 1b \n"

  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
  );
}
#endif  // HAS_ARGBMULTIPLYROW_NEON
2843
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
//
// Every byte of every pixel is added with unsigned saturation, so sums above
// 255 are stored as 255.
//
// src_argb0, src_argb1: source ARGB rows.
// dst_argb:             destination ARGB row.
// width:                pixel count. The loop subtracts 8 per pass and repeats
//                       while the remainder is > 0, so pixels are handled in
//                       groups of 8.
#ifdef HAS_ARGBADDROW_NEON
void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
  asm volatile (
    // 8 pixel loop.
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"       // load 8 ARGB pixels.
    MEMACCESS(1)
    "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"       // load 8 more pixels.
    "subs %w3, %w3, #8 \n"                              // 8 processed per loop.
    "uqadd v0.8b, v0.8b, v4.8b \n"                      // saturating add, byte 0
    "uqadd v1.8b, v1.8b, v5.8b \n"                      // byte 1
    "uqadd v2.8b, v2.8b, v6.8b \n"                      // byte 2
    "uqadd v3.8b, v3.8b, v7.8b \n"                      // byte 3
    MEMACCESS(2)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"       // store 8 ARGB pixels
    "b.gt 1b \n"

  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
  );
}
#endif  // HAS_ARGBADDROW_NEON
2873
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
//
// Every byte of src_argb1 is subtracted from the matching byte of src_argb0
// with unsigned saturation, so results below 0 are stored as 0.
//
// src_argb0: minuend ARGB row.
// src_argb1: subtrahend ARGB row.
// dst_argb:  destination ARGB row.
// width:     pixel count. The loop subtracts 8 per pass and repeats while the
//            remainder is > 0, so pixels are handled in groups of 8.
#ifdef HAS_ARGBSUBTRACTROW_NEON
void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    // 8 pixel loop.
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"       // load 8 ARGB pixels.
    MEMACCESS(1)
    "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"       // load 8 more pixels.
    "subs %w3, %w3, #8 \n"                              // 8 processed per loop.
    "uqsub v0.8b, v0.8b, v4.8b \n"                      // saturating sub, byte 0
    "uqsub v1.8b, v1.8b, v5.8b \n"                      // byte 1
    "uqsub v2.8b, v2.8b, v6.8b \n"                      // byte 2
    "uqsub v3.8b, v3.8b, v7.8b \n"                      // byte 3
    MEMACCESS(2)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"       // store 8 ARGB pixels
    "b.gt 1b \n"

  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
  );
}
#endif  // HAS_ARGBSUBTRACTROW_NEON
2903
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
//
// Sobel is the unsigned saturating sum of the two inputs (clamped at 255).
//
// src_sobelx, src_sobely: source rows, one byte per pixel.
// dst_argb:               destination row, four bytes per pixel.
// width:                  pixel count. The loop subtracts 8 per pass and
//                         repeats while the remainder is > 0, so pixels are
//                         handled in groups of 8.
#ifdef HAS_SOBELROW_NEON
void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
                   uint8* dst_argb, int width) {
  asm volatile (
    "movi v3.8b, #255 \n"                               // alpha
    // 8 pixel loop.
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.8b}, [%0], #8 \n"                          // load 8 sobelx.
    MEMACCESS(1)
    "ld1 {v1.8b}, [%1], #8 \n"                          // load 8 sobely.
    "subs %w3, %w3, #8 \n"                              // 8 processed per loop.
    "uqadd v0.8b, v0.8b, v1.8b \n"                      // add
    // orr of a register with itself is a copy: replicate the sum into the
    // second and third channels.
    "orr v1.8b, v0.8b, v0.8b \n"
    "orr v2.8b, v0.8b, v0.8b \n"
    MEMACCESS(2)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"       // store 8 ARGB pixels
    "b.gt 1b \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_argb),    // %2
    "+r"(width)        // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3"
  );
}
#endif  // HAS_SOBELROW_NEON
2936
// Adds Sobel X and Sobel Y and stores Sobel into plane.
//
// Each output byte is the unsigned saturating sum of the two input bytes
// (clamped at 255).
//
// src_sobelx, src_sobely: source rows, one byte per pixel.
// dst_y:                  destination row, one byte per pixel.
// width:                  pixel count. The loop subtracts 16 per pass and
//                         repeats while the remainder is > 0, so pixels are
//                         handled in groups of 16.
#ifdef HAS_SOBELTOPLANEROW_NEON
void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
                          uint8* dst_y, int width) {
  asm volatile (
    // 16 pixel loop.
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n"                        // load 16 sobelx.
    MEMACCESS(1)
    "ld1 {v1.16b}, [%1], #16 \n"                        // load 16 sobely.
    "subs %w3, %w3, #16 \n"                             // 16 processed per loop.
    "uqadd v0.16b, v0.16b, v1.16b \n"                   // add
    MEMACCESS(2)
    "st1 {v0.16b}, [%2], #16 \n"                        // store 16 pixels.
    "b.gt 1b \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_y),       // %2
    "+r"(width)        // %3
  :
  : "cc", "memory", "v0", "v1"
  );
}
#endif  // HAS_SOBELTOPLANEROW_NEON
2962
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
//
// Sobel is the unsigned saturating sum of Sobel X and Sobel Y (clamped at
// 255).
//
// src_sobelx, src_sobely: source rows, one byte per pixel.
// dst_argb:               destination row, four bytes per pixel.
// width:                  pixel count. The loop subtracts 8 per pass and
//                         repeats while the remainder is > 0, so pixels are
//                         handled in groups of 8.
#ifdef HAS_SOBELXYROW_NEON
void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
                     uint8* dst_argb, int width) {
  asm volatile (
    "movi v3.8b, #255 \n"                               // alpha
    // 8 pixel loop.
  "1: \n"
    // The inputs are loaded straight into the registers st4 stores from:
    // sobelx into v2 (third byte, R) and sobely into v0 (first byte, B).
    MEMACCESS(0)
    "ld1 {v2.8b}, [%0], #8 \n"                          // load 8 sobelx.
    MEMACCESS(1)
    "ld1 {v0.8b}, [%1], #8 \n"                          // load 8 sobely.
    "subs %w3, %w3, #8 \n"                              // 8 processed per loop.
    "uqadd v1.8b, v0.8b, v2.8b \n"                      // add
    MEMACCESS(2)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"       // store 8 ARGB pixels
    "b.gt 1b \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_argb),    // %2
    "+r"(width)        // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3"
  );
}
#endif  // HAS_SOBELXYROW_NEON
2993
// SobelX as a matrix is
// -1  0  1
// -2  0  2
// -1  0  1
//
// For each output pixel i the loop computes
//   |(y0[i] - y0[i+2]) + 2 * (y1[i] - y1[i+2]) + (y2[i] - y2[i+2])|
// and saturates the result to 255. The differences have the opposite sign to
// the matrix above; taking the absolute value makes the result the same.
//
// src_y0, src_y1, src_y2: the three source rows (top, center, bottom).
// dst_sobelx:             destination row, one byte per pixel.
// width:                  pixel count. The loop subtracts 8 per pass and
//                         repeats while the remainder is > 0, so pixels are
//                         handled in groups of 8.
//
// Note: the second load of each row starts 2 bytes in and reads 8 bytes, so
// each pass reads 2 bytes past the current group of 8 in every source row.
#ifdef HAS_SOBELXROW_NEON
void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
                    const uint8* src_y2, uint8* dst_sobelx, int width) {
  asm volatile (
  "1: \n"
    // Each row is loaded twice: at offset 0, then at offset 2. The post-index
    // steps of 2 (%5) and 6 (%6) add up to a net advance of 8 bytes per pass.
    MEMACCESS(0)
    "ld1 {v0.8b}, [%0],%5 \n"                           // top
    MEMACCESS(0)
    "ld1 {v1.8b}, [%0],%6 \n"
    "usubl v0.8h, v0.8b, v1.8b \n"
    MEMACCESS(1)
    "ld1 {v2.8b}, [%1],%5 \n"                           // center * 2
    MEMACCESS(1)
    "ld1 {v3.8b}, [%1],%6 \n"
    "usubl v1.8h, v2.8b, v3.8b \n"
    "add v0.8h, v0.8h, v1.8h \n"                        // added twice for the
    "add v0.8h, v0.8h, v1.8h \n"                        // weight of 2
    MEMACCESS(2)
    "ld1 {v2.8b}, [%2],%5 \n"                           // bottom
    MEMACCESS(2)
    "ld1 {v3.8b}, [%2],%6 \n"
    "subs %w4, %w4, #8 \n"                              // 8 pixels
    "usubl v1.8h, v2.8b, v3.8b \n"
    "add v0.8h, v0.8h, v1.8h \n"
    "abs v0.8h, v0.8h \n"
    "uqxtn v0.8b, v0.8h \n"                             // saturate to 8 bit
    MEMACCESS(3)
    "st1 {v0.8b}, [%3], #8 \n"                          // store 8 sobelx
    "b.gt 1b \n"
  : "+r"(src_y0),      // %0
    "+r"(src_y1),      // %1
    "+r"(src_y2),      // %2
    "+r"(dst_sobelx),  // %3
    "+r"(width)        // %4
  : "r"(2LL),          // %5
    "r"(6LL)           // %6
  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
#endif  // HAS_SOBELXROW_NEON
3038
// SobelY as a matrix is
// -1 -2 -1
//  0  0  0
//  1  2  1
//
// For each output pixel i the loop computes
//   |(y0[i] - y1[i]) + 2 * (y0[i+1] - y1[i+1]) + (y0[i+2] - y1[i+2])|
// and saturates the result to 255. The differences have the opposite sign to
// the matrix above; taking the absolute value makes the result the same.
//
// src_y0, src_y1: the two source rows that are differenced.
// dst_sobely:     destination row, one byte per pixel.
// width:          pixel count. The loop subtracts 8 per pass and repeats while
//                 the remainder is > 0, so pixels are handled in groups of 8.
//
// Note: the third load of each row starts 2 bytes in and reads 8 bytes, so
// each pass reads 2 bytes past the current group of 8 in both source rows.
#ifdef HAS_SOBELYROW_NEON
void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
                    uint8* dst_sobely, int width) {
  asm volatile (
  "1: \n"
    // Both rows are loaded at offsets 0, 1 and 2. The post-index steps of
    // 1 (%4), 1 (%4) and 6 (%5) add up to a net advance of 8 bytes per pass.
    MEMACCESS(0)
    "ld1 {v0.8b}, [%0],%4 \n"                           // left
    MEMACCESS(1)
    "ld1 {v1.8b}, [%1],%4 \n"
    "usubl v0.8h, v0.8b, v1.8b \n"
    MEMACCESS(0)
    "ld1 {v2.8b}, [%0],%4 \n"                           // center * 2
    MEMACCESS(1)
    "ld1 {v3.8b}, [%1],%4 \n"
    "usubl v1.8h, v2.8b, v3.8b \n"
    "add v0.8h, v0.8h, v1.8h \n"                        // added twice for the
    "add v0.8h, v0.8h, v1.8h \n"                        // weight of 2
    MEMACCESS(0)
    "ld1 {v2.8b}, [%0],%5 \n"                           // right
    MEMACCESS(1)
    "ld1 {v3.8b}, [%1],%5 \n"
    "subs %w3, %w3, #8 \n"                              // 8 pixels
    "usubl v1.8h, v2.8b, v3.8b \n"
    "add v0.8h, v0.8h, v1.8h \n"
    "abs v0.8h, v0.8h \n"
    "uqxtn v0.8b, v0.8h \n"                             // saturate to 8 bit
    MEMACCESS(2)
    "st1 {v0.8b}, [%2], #8 \n"                          // store 8 sobely
    "b.gt 1b \n"
  : "+r"(src_y0),      // %0
    "+r"(src_y1),      // %1
    "+r"(dst_sobely),  // %2
    "+r"(width)        // %3
  : "r"(1LL),          // %4
    "r"(6LL)           // %5
  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
#endif  // HAS_SOBELYROW_NEON
3082 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
3083
3084 #ifdef __cplusplus
3085 } // extern "C"
3086 } // namespace libyuv
3087 #endif
3088