/*
 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC Neon.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
    !defined(__aarch64__)

// NEON downscalers with interpolation.
// Provided by Fritz Koenig

// Read 32x1, throw away even pixels, and write 16x1.
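// For reference, a scalar sketch of the same operation (illustrative only,
// not libyuv's C fallback verbatim): keep the odd pixel of each pair.
//   for (int i = 0; i < dst_width; ++i) {
//     dst[i] = src_ptr[i * 2 + 1];
//   }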
void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst, int dst_width) {
  asm volatile (
    ".p2align 2 \n"
    "1: \n"
    // load even pixels into q0, odd into q1
    MEMACCESS(0)
    "vld2.8 {q0, q1}, [%0]! \n"
    "subs %2, %2, #16 \n"  // 16 processed per loop
    MEMACCESS(1)
    "vst1.8 {q1}, [%1]! \n"  // store odd pixels
    "bgt 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst),       // %1
    "+r"(dst_width)  // %2
  :
  : "q0", "q1"       // Clobber List
  );
}

// Read 32x1 average down and write 16x1.
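// Scalar sketch (illustrative): average each horizontal pair with rounding.
//   for (int i = 0; i < dst_width; ++i) {
//     dst[i] = (src_ptr[i * 2] + src_ptr[i * 2 + 1] + 1) >> 1;
//   }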
void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint8* dst, int dst_width) {
  asm volatile (
    ".p2align 2 \n"
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {q0, q1}, [%0]! \n"  // load pixels and post inc
    "subs %2, %2, #16 \n"  // 16 processed per loop
    "vpaddl.u8 q0, q0 \n"  // add adjacent
    "vpaddl.u8 q1, q1 \n"
    "vrshrn.u16 d0, q0, #1 \n"  // downshift, round and pack
    "vrshrn.u16 d1, q1, #1 \n"
    MEMACCESS(1)
    "vst1.8 {q0}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst),       // %1
    "+r"(dst_width)  // %2
  :
  : "q0", "q1"       // Clobber List
  );
}

// Read 32x2 average down and write 16x1.
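// Scalar sketch (illustrative): 2x2 box average with rounding, where
// t = src_ptr + src_stride points at the second row.
//   for (int i = 0; i < dst_width; ++i) {
//     dst[i] = (src_ptr[i * 2] + src_ptr[i * 2 + 1] +
//               t[i * 2] + t[i * 2 + 1] + 2) >> 2;
//   }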
void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst, int dst_width) {
  asm volatile (
    // change the stride into a pointer to row 2
    "add %1, %0 \n"
    ".p2align 2 \n"
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {q0, q1}, [%0]! \n"  // load row 1 and post inc
    MEMACCESS(1)
    "vld1.8 {q2, q3}, [%1]! \n"  // load row 2 and post inc
    "subs %3, %3, #16 \n"  // 16 processed per loop
    "vpaddl.u8 q0, q0 \n"  // row 1 add adjacent
    "vpaddl.u8 q1, q1 \n"
    "vpadal.u8 q0, q2 \n"  // row 2 add adjacent + row 1
    "vpadal.u8 q1, q3 \n"
    "vrshrn.u16 d0, q0, #2 \n"  // downshift, round and pack
    "vrshrn.u16 d1, q1, #2 \n"
    MEMACCESS(2)
    "vst1.8 {q0}, [%2]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),     // %0
    "+r"(src_stride),  // %1
    "+r"(dst),         // %2
    "+r"(dst_width)    // %3
  :
  : "q0", "q1", "q2", "q3"  // Clobber List
  );
}

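// Read 32x1, keep every 4th pixel, and write 8x1.
// Scalar sketch (illustrative): dst_ptr[i] = src_ptr[i * 4 + 2];
// matching the vst1.8 {d2} below, which stores lane 2 of each group of 4.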
void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  asm volatile (
    ".p2align 2 \n"
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // src line 0
    "subs %2, %2, #8 \n"  // 8 processed per loop
    MEMACCESS(1)
    "vst1.8 {d2}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :
  : "q0", "q1", "memory", "cc"
  );
}

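// Read 16x4, box average each 4x4 block, and write 4x1.
// Scalar sketch (illustrative): sum the 16 bytes of each block, then
// divide with rounding: dst_ptr[i] = (block_sum + 8) >> 4;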
void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  const uint8* src_ptr1 = src_ptr + src_stride;
  const uint8* src_ptr2 = src_ptr + src_stride * 2;
  const uint8* src_ptr3 = src_ptr + src_stride * 3;
  asm volatile (
    ".p2align 2 \n"
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {q0}, [%0]! \n"  // load up 16x4
    MEMACCESS(3)
    "vld1.8 {q1}, [%3]! \n"
    MEMACCESS(4)
    "vld1.8 {q2}, [%4]! \n"
    MEMACCESS(5)
    "vld1.8 {q3}, [%5]! \n"
    "subs %2, %2, #4 \n"
    "vpaddl.u8 q0, q0 \n"
    "vpadal.u8 q0, q1 \n"
    "vpadal.u8 q0, q2 \n"
    "vpadal.u8 q0, q3 \n"
    "vpaddl.u16 q0, q0 \n"
    "vrshrn.u32 d0, q0, #4 \n"  // divide by 16 w/rounding
    "vmovn.u16 d0, q0 \n"
    MEMACCESS(1)
    "vst1.32 {d0[0]}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width),  // %2
    "+r"(src_ptr1),   // %3
    "+r"(src_ptr2),   // %4
    "+r"(src_ptr3)    // %5
  :
  : "q0", "q1", "q2", "q3", "memory", "cc"
  );
}

// Down scale from 4 to 3 pixels. Use the NEON multilane read/write
// to load up every 4th pixel into 4 different registers.
// Point samples 32 pixels to 24 pixels.
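// Scalar sketch (illustrative): of each group of 4 source pixels, keep
// pixels 0, 1 and 3 (the vmov below overwrites d2 with d3 before the store).
//   dst_ptr[0] = src_ptr[0]; dst_ptr[1] = src_ptr[1]; dst_ptr[2] = src_ptr[3];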
void ScaleRowDown34_NEON(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  asm volatile (
    ".p2align 2 \n"
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // src line 0
    "subs %2, %2, #24 \n"
    "vmov d2, d3 \n"  // order d0, d1, d2
    MEMACCESS(1)
    "vst3.8 {d0, d1, d2}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :
  : "d0", "d1", "d2", "d3", "memory", "cc"
  );
}

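// Down scale from 4 to 3 pixels, box filtering two source rows with a
// 3:1 weighting toward row 0. Sketch of the per-pixel math (illustrative):
//   p[i]   = (3 * row0[i] + row1[i] + 2) >> 2   // blend the rows first
//   dst[0] = (3 * p[0] + p[1] + 2) >> 2         // then filter columns
//   dst[1] = (p[1] + p[2] + 1) >> 1
//   dst[2] = (p[2] + 3 * p[3] + 2) >> 2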
void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vmov.u8 d24, #3 \n"
    "add %3, %0 \n"
    ".p2align 2 \n"
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // src line 0
    MEMACCESS(3)
    "vld4.8 {d4, d5, d6, d7}, [%3]! \n"  // src line 1
    "subs %2, %2, #24 \n"

    // filter src line 0 with src line 1
    // expand chars to shorts to allow for room
    // when adding lines together
    "vmovl.u8 q8, d4 \n"
    "vmovl.u8 q9, d5 \n"
    "vmovl.u8 q10, d6 \n"
    "vmovl.u8 q11, d7 \n"

    // 3 * line_0 + line_1
    "vmlal.u8 q8, d0, d24 \n"
    "vmlal.u8 q9, d1, d24 \n"
    "vmlal.u8 q10, d2, d24 \n"
    "vmlal.u8 q11, d3, d24 \n"

    // (3 * line_0 + line_1) >> 2
    "vqrshrn.u16 d0, q8, #2 \n"
    "vqrshrn.u16 d1, q9, #2 \n"
    "vqrshrn.u16 d2, q10, #2 \n"
    "vqrshrn.u16 d3, q11, #2 \n"

    // a0 = (src[0] * 3 + src[1] * 1) >> 2
    "vmovl.u8 q8, d1 \n"
    "vmlal.u8 q8, d0, d24 \n"
    "vqrshrn.u16 d0, q8, #2 \n"

    // a1 = (src[1] * 1 + src[2] * 1) >> 1
    "vrhadd.u8 d1, d1, d2 \n"

    // a2 = (src[2] * 1 + src[3] * 3) >> 2
    "vmovl.u8 q8, d2 \n"
    "vmlal.u8 q8, d3, d24 \n"
    "vqrshrn.u16 d2, q8, #2 \n"

    MEMACCESS(1)
    "vst3.8 {d0, d1, d2}, [%1]! \n"

    "bgt 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width),  // %2
    "+r"(src_stride)  // %3
  :
  : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
  );
}

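// Same 4-to-3 down scale, but the two source rows are averaged 1:1 before
// the column filter (illustrative): p[i] = (row0[i] + row1[i] + 1) >> 1.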
void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vmov.u8 d24, #3 \n"
    "add %3, %0 \n"
    ".p2align 2 \n"
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // src line 0
    MEMACCESS(3)
    "vld4.8 {d4, d5, d6, d7}, [%3]! \n"  // src line 1
    "subs %2, %2, #24 \n"
    // average src line 0 with src line 1
    "vrhadd.u8 q0, q0, q2 \n"
    "vrhadd.u8 q1, q1, q3 \n"

    // a0 = (src[0] * 3 + src[1] * 1) >> 2
    "vmovl.u8 q3, d1 \n"
    "vmlal.u8 q3, d0, d24 \n"
    "vqrshrn.u16 d0, q3, #2 \n"

    // a1 = (src[1] * 1 + src[2] * 1) >> 1
    "vrhadd.u8 d1, d1, d2 \n"

    // a2 = (src[2] * 1 + src[3] * 3) >> 2
    "vmovl.u8 q3, d2 \n"
    "vmlal.u8 q3, d3, d24 \n"
    "vqrshrn.u16 d2, q3, #2 \n"

    MEMACCESS(1)
    "vst3.8 {d0, d1, d2}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width),  // %2
    "+r"(src_stride)  // %3
  :
  : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
  );
}

#define HAS_SCALEROWDOWN38_NEON
static uvec8 kShuf38 =
  { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
static uvec8 kShuf38_2 =
  { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
static vec16 kMult38_Div6 =
  { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
    65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
static vec16 kMult38_Div9 =
  { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
    65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
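// Note: vqrdmulh returns the rounded high half of 2 * a * b, so a multiplier
// of 65536 / 12 effectively divides by 6, and 65536 / 18 divides by 9.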

// 32 -> 12
void ScaleRowDown38_NEON(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  asm volatile (
    MEMACCESS(3)
    "vld1.8 {q3}, [%3] \n"
    ".p2align 2 \n"
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
    "subs %2, %2, #12 \n"
    "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
    "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
    MEMACCESS(1)
    "vst1.8 {d4}, [%1]! \n"
    MEMACCESS(1)
    "vst1.32 {d5[0]}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"(&kShuf38)    // %3
  : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
  );
}

// 32x3 -> 12x1
void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst_ptr, int dst_width) {
  const uint8* src_ptr1 = src_ptr + src_stride * 2;

  asm volatile (
    MEMACCESS(5)
    "vld1.16 {q13}, [%5] \n"
    MEMACCESS(6)
    "vld1.8 {q14}, [%6] \n"
    MEMACCESS(7)
    "vld1.8 {q15}, [%7] \n"
    "add %3, %0 \n"
    ".p2align 2 \n"
    "1: \n"

    // d0 = 00 40 01 41 02 42 03 43
    // d1 = 10 50 11 51 12 52 13 53
    // d2 = 20 60 21 61 22 62 23 63
    // d3 = 30 70 31 71 32 72 33 73
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
    MEMACCESS(3)
    "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
    MEMACCESS(4)
    "vld4.8 {d16, d17, d18, d19}, [%4]! \n"
    "subs %2, %2, #12 \n"

    // Shuffle the input data around to align the data
    // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
    // d0 = 00 10 01 11 02 12 03 13
    // d1 = 40 50 41 51 42 52 43 53
    "vtrn.u8 d0, d1 \n"
    "vtrn.u8 d4, d5 \n"
    "vtrn.u8 d16, d17 \n"

    // d2 = 20 30 21 31 22 32 23 33
    // d3 = 60 70 61 71 62 72 63 73
    "vtrn.u8 d2, d3 \n"
    "vtrn.u8 d6, d7 \n"
    "vtrn.u8 d18, d19 \n"

    // d0 = 00+10 01+11 02+12 03+13
    // d2 = 40+50 41+51 42+52 43+53
    "vpaddl.u8 q0, q0 \n"
    "vpaddl.u8 q2, q2 \n"
    "vpaddl.u8 q8, q8 \n"

    // d3 = 60+70 61+71 62+72 63+73
    "vpaddl.u8 d3, d3 \n"
    "vpaddl.u8 d7, d7 \n"
    "vpaddl.u8 d19, d19 \n"

    // combine source lines
    "vadd.u16 q0, q2 \n"
    "vadd.u16 q0, q8 \n"
    "vadd.u16 d4, d3, d7 \n"
    "vadd.u16 d4, d19 \n"

    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
    //             + s[6 + st * 1] + s[7 + st * 1]
    //             + s[6 + st * 2] + s[7 + st * 2]) / 6
    "vqrdmulh.s16 q2, q2, q13 \n"
    "vmovn.u16 d4, q2 \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    // 0,1 reg and 3 can be added to the 4,5 reg. This
    // requires expanding from u8 to u16 as the 0,1 and 4,5
    // registers are already expanded. Then do transposes
    // to get aligned.
    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
    "vmovl.u8 q1, d2 \n"
    "vmovl.u8 q3, d6 \n"
    "vmovl.u8 q9, d18 \n"

    // combine source lines
    "vadd.u16 q1, q3 \n"
    "vadd.u16 q1, q9 \n"

    // d4 = xx 20 xx 30 xx 22 xx 32
    // d5 = xx 21 xx 31 xx 23 xx 33
    "vtrn.u32 d2, d3 \n"

    // d4 = xx 20 xx 21 xx 22 xx 23
    // d5 = xx 30 xx 31 xx 32 xx 33
    "vtrn.u16 d2, d3 \n"

    // 0+1+2, 3+4+5
    "vadd.u16 q0, q1 \n"

    // Need to divide, but can't downshift as the value
    // isn't a power of 2. So multiply by 65536 / n
    // and take the upper 16 bits.
    "vqrdmulh.s16 q0, q0, q15 \n"

    // Align for table lookup, vtbl requires registers to
    // be adjacent
    "vmov.u8 d2, d4 \n"

    "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
    "vtbl.u8 d4, {d0, d1, d2}, d29 \n"

    MEMACCESS(1)
    "vst1.8 {d3}, [%1]! \n"
    MEMACCESS(1)
    "vst1.32 {d4[0]}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),       // %0
    "+r"(dst_ptr),       // %1
    "+r"(dst_width),     // %2
    "+r"(src_stride),    // %3
    "+r"(src_ptr1)       // %4
  : "r"(&kMult38_Div6),  // %5
    "r"(&kShuf38_2),     // %6
    "r"(&kMult38_Div9)   // %7
  : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc"
  );
}

// 32x2 -> 12x1
void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    MEMACCESS(4)
    "vld1.16 {q13}, [%4] \n"
    MEMACCESS(5)
    "vld1.8 {q14}, [%5] \n"
    "add %3, %0 \n"
    ".p2align 2 \n"
    "1: \n"

    // d0 = 00 40 01 41 02 42 03 43
    // d1 = 10 50 11 51 12 52 13 53
    // d2 = 20 60 21 61 22 62 23 63
    // d3 = 30 70 31 71 32 72 33 73
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
    MEMACCESS(3)
    "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
    "subs %2, %2, #12 \n"

    // Shuffle the input data around to align the data
    // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
    // d0 = 00 10 01 11 02 12 03 13
    // d1 = 40 50 41 51 42 52 43 53
    "vtrn.u8 d0, d1 \n"
    "vtrn.u8 d4, d5 \n"

    // d2 = 20 30 21 31 22 32 23 33
    // d3 = 60 70 61 71 62 72 63 73
    "vtrn.u8 d2, d3 \n"
    "vtrn.u8 d6, d7 \n"

    // d0 = 00+10 01+11 02+12 03+13
    // d2 = 40+50 41+51 42+52 43+53
    "vpaddl.u8 q0, q0 \n"
    "vpaddl.u8 q2, q2 \n"

    // d3 = 60+70 61+71 62+72 63+73
    "vpaddl.u8 d3, d3 \n"
    "vpaddl.u8 d7, d7 \n"

    // combine source lines
    "vadd.u16 q0, q2 \n"
    "vadd.u16 d4, d3, d7 \n"

    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
    "vqrshrn.u16 d4, q2, #2 \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    // 0,1 reg and 3 can be added to the 4,5 reg. This
    // requires expanding from u8 to u16 as the 0,1 and 4,5
    // registers are already expanded. Then do transposes
    // to get aligned.
    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
    "vmovl.u8 q1, d2 \n"
    "vmovl.u8 q3, d6 \n"

    // combine source lines
    "vadd.u16 q1, q3 \n"

    // d4 = xx 20 xx 30 xx 22 xx 32
    // d5 = xx 21 xx 31 xx 23 xx 33
    "vtrn.u32 d2, d3 \n"

    // d4 = xx 20 xx 21 xx 22 xx 23
    // d5 = xx 30 xx 31 xx 32 xx 33
    "vtrn.u16 d2, d3 \n"

    // 0+1+2, 3+4+5
    "vadd.u16 q0, q1 \n"

    // Need to divide, but can't downshift as the value
    // isn't a power of 2. So multiply by 65536 / n
    // and take the upper 16 bits.
    "vqrdmulh.s16 q0, q0, q13 \n"

    // Align for table lookup, vtbl requires registers to
    // be adjacent
    "vmov.u8 d2, d4 \n"

    "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
    "vtbl.u8 d4, {d0, d1, d2}, d29 \n"

    MEMACCESS(1)
    "vst1.8 {d3}, [%1]! \n"
    MEMACCESS(1)
    "vst1.32 {d4[0]}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),       // %0
    "+r"(dst_ptr),       // %1
    "+r"(dst_width),     // %2
    "+r"(src_stride)     // %3
  : "r"(&kMult38_Div6),  // %4
    "r"(&kShuf38_2)      // %5
  : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
  );
}

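// Adds src_height rows of src_ptr column-wise into 16-bit sums in dst_ptr.
// Scalar sketch (illustrative):
//   for (int x = 0; x < src_width; ++x) {
//     uint16 sum = 0;
//     for (int y = 0; y < src_height; ++y) sum += src_ptr[x + y * src_stride];
//     dst_ptr[x] = sum;
//   }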
void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                       uint16* dst_ptr, int src_width, int src_height) {
  const uint8* src_tmp = NULL;
  asm volatile (
    ".p2align 2 \n"
    "1: \n"
    "mov %0, %1 \n"
    "mov r12, %5 \n"
    "veor q2, q2, q2 \n"
    "veor q3, q3, q3 \n"
    "2: \n"
    // load 16 pixels into q0
    MEMACCESS(0)
    "vld1.8 {q0}, [%0], %3 \n"
    "vaddw.u8 q3, q3, d1 \n"
    "vaddw.u8 q2, q2, d0 \n"
    "subs r12, r12, #1 \n"
    "bgt 2b \n"
    MEMACCESS(2)
    "vst1.16 {q2, q3}, [%2]! \n"  // store pixels
    "add %1, %1, #16 \n"
    "subs %4, %4, #16 \n"  // 16 processed per loop
    "bgt 1b \n"
  : "+r"(src_tmp),     // %0
    "+r"(src_ptr),     // %1
    "+r"(dst_ptr),     // %2
    "+r"(src_stride),  // %3
    "+r"(src_width),   // %4
    "+r"(src_height)   // %5
  :
  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"  // Clobber List
  );
}

// TODO(Yang Zhang): Investigate fewer load instructions for
// the x/dx stepping
#define LOAD2_DATA8_LANE(n) \
    "lsr %5, %3, #16 \n" \
    "add %6, %1, %5 \n" \
    "add %3, %3, %4 \n" \
    MEMACCESS(6) \
    "vld2.8 {d6["#n"], d7["#n"]}, [%6] \n"

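// Bilinear horizontal filter using 16.16 fixed point x/dx stepping.
// Scalar sketch (illustrative):
//   for (int i = 0; i < dst_width; ++i) {
//     int xi = x >> 16, f = x & 0xffff;
//     int a = src_ptr[xi], b = src_ptr[xi + 1];
//     dst_ptr[i] = a + (((b - a) * f) >> 16);
//     x += dx;
//   }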
void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
                          int dst_width, int x, int dx) {
  int dx_offset[4] = {0, 1, 2, 3};
  int* tmp = dx_offset;
  const uint8* src_tmp = src_ptr;
  asm volatile (
    ".p2align 2 \n"
    "vdup.32 q0, %3 \n"  // x
    "vdup.32 q1, %4 \n"  // dx
    "vld1.32 {q2}, [%5] \n"  // 0 1 2 3
    "vshl.i32 q3, q1, #2 \n"  // 4 * dx
    "vmul.s32 q1, q1, q2 \n"
    // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
    "vadd.s32 q1, q1, q0 \n"
    // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
    "vadd.s32 q2, q1, q3 \n"
    "vshl.i32 q0, q3, #1 \n"  // 8 * dx
    "1: \n"
    LOAD2_DATA8_LANE(0)
    LOAD2_DATA8_LANE(1)
    LOAD2_DATA8_LANE(2)
    LOAD2_DATA8_LANE(3)
    LOAD2_DATA8_LANE(4)
    LOAD2_DATA8_LANE(5)
    LOAD2_DATA8_LANE(6)
    LOAD2_DATA8_LANE(7)
    "vmov q10, q1 \n"
    "vmov q11, q2 \n"
    "vuzp.16 q10, q11 \n"
    "vmovl.u8 q8, d6 \n"
    "vmovl.u8 q9, d7 \n"
    "vsubl.s16 q11, d18, d16 \n"
    "vsubl.s16 q12, d19, d17 \n"
    "vmovl.u16 q13, d20 \n"
    "vmovl.u16 q10, d21 \n"
    "vmul.s32 q11, q11, q13 \n"
    "vmul.s32 q12, q12, q10 \n"
    "vshrn.s32 d18, q11, #16 \n"
    "vshrn.s32 d19, q12, #16 \n"
    "vadd.s16 q8, q8, q9 \n"
    "vmovn.s16 d6, q8 \n"

    MEMACCESS(0)
    "vst1.8 {d6}, [%0]! \n"  // store pixels
    "vadd.s32 q1, q1, q0 \n"
    "vadd.s32 q2, q2, q0 \n"
    "subs %2, %2, #8 \n"  // 8 processed per loop
    "bgt 1b \n"
  : "+r"(dst_ptr),    // %0
    "+r"(src_ptr),    // %1
    "+r"(dst_width),  // %2
    "+r"(x),          // %3
    "+r"(dx),         // %4
    "+r"(tmp),        // %5
    "+r"(src_tmp)     // %6
  :
  : "memory", "cc", "q0", "q1", "q2", "q3",
    "q8", "q9", "q10", "q11", "q12", "q13"
  );
}

#undef LOAD2_DATA8_LANE

// 16x2 -> 16x1
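// Blends two rows with an 8-bit fraction. Scalar sketch (illustrative),
// with f = source_y_fraction (0 selects row 0 entirely):
//   dst_ptr[i] =
//       (src_ptr[i] * (256 - f) + src_ptr[i + src_stride] * f + 128) >> 8;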
void ScaleFilterRows_NEON(uint8* dst_ptr,
                          const uint8* src_ptr, ptrdiff_t src_stride,
                          int dst_width, int source_y_fraction) {
  asm volatile (
    "cmp %4, #0 \n"
    "beq 100f \n"
    "add %2, %1 \n"
    "cmp %4, #64 \n"
    "beq 75f \n"
    "cmp %4, #128 \n"
    "beq 50f \n"
    "cmp %4, #192 \n"
    "beq 25f \n"

    "vdup.8 d5, %4 \n"
    "rsb %4, #256 \n"
    "vdup.8 d4, %4 \n"
    // General purpose row blend.
    "1: \n"
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n"
    MEMACCESS(2)
    "vld1.8 {q1}, [%2]! \n"
    "subs %3, %3, #16 \n"
    "vmull.u8 q13, d0, d4 \n"
    "vmull.u8 q14, d1, d4 \n"
    "vmlal.u8 q13, d2, d5 \n"
    "vmlal.u8 q14, d3, d5 \n"
    "vrshrn.u16 d0, q13, #8 \n"
    "vrshrn.u16 d1, q14, #8 \n"
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n"
    "bgt 1b \n"
    "b 99f \n"

    // Blend 25 / 75.
    "25: \n"
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n"
    MEMACCESS(2)
    "vld1.8 {q1}, [%2]! \n"
    "subs %3, %3, #16 \n"
    "vrhadd.u8 q0, q1 \n"
    "vrhadd.u8 q0, q1 \n"
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n"
    "bgt 25b \n"
    "b 99f \n"

    // Blend 50 / 50.
    "50: \n"
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n"
    MEMACCESS(2)
    "vld1.8 {q1}, [%2]! \n"
    "subs %3, %3, #16 \n"
    "vrhadd.u8 q0, q1 \n"
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n"
    "bgt 50b \n"
    "b 99f \n"

    // Blend 75 / 25.
    "75: \n"
    MEMACCESS(1)
    "vld1.8 {q1}, [%1]! \n"
    MEMACCESS(2)
    "vld1.8 {q0}, [%2]! \n"
    "subs %3, %3, #16 \n"
    "vrhadd.u8 q0, q1 \n"
    "vrhadd.u8 q0, q1 \n"
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n"
    "bgt 75b \n"
    "b 99f \n"

    // Blend 100 / 0 - Copy row unchanged.
    "100: \n"
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n"
    "subs %3, %3, #16 \n"
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n"
    "bgt 100b \n"

    "99: \n"
    MEMACCESS(0)
    "vst1.8 {d1[7]}, [%0] \n"
  : "+r"(dst_ptr),           // %0
    "+r"(src_ptr),           // %1
    "+r"(src_stride),        // %2
    "+r"(dst_width),         // %3
    "+r"(source_y_fraction)  // %4
  :
  : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
  );
}

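// Down scales ARGB 2x by point sampling. Scalar sketch (illustrative),
// treating each ARGB pixel as one uint32: dst32[i] = src32[i * 2 + 1];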
void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst, int dst_width) {
  asm volatile (
    ".p2align 2 \n"
    "1: \n"
    // load even pixels into q0, odd into q1
    MEMACCESS(0)
    "vld2.32 {q0, q1}, [%0]! \n"
    MEMACCESS(0)
    "vld2.32 {q2, q3}, [%0]! \n"
    "subs %2, %2, #8 \n"  // 8 processed per loop
    MEMACCESS(1)
    "vst1.8 {q1}, [%1]! \n"  // store odd pixels
    MEMACCESS(1)
    "vst1.8 {q3}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst),       // %1
    "+r"(dst_width)  // %2
  :
  : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
  );
}

void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
                                  uint8* dst_argb, int dst_width) {
  asm volatile (
    ".p2align 2 \n"
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 ARGB pixels.
    MEMACCESS(0)
    "vld4.8 {d1, d3, d5, d7}, [%0]! \n"  // load next 8 ARGB pixels.
    "subs %2, %2, #8 \n"  // 8 processed per loop
    "vpaddl.u8 q0, q0 \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8 q1, q1 \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8 q2, q2 \n"  // R 16 bytes -> 8 shorts.
    "vpaddl.u8 q3, q3 \n"  // A 16 bytes -> 8 shorts.
    "vrshrn.u16 d0, q0, #1 \n"  // downshift, round and pack
    "vrshrn.u16 d1, q1, #1 \n"
    "vrshrn.u16 d2, q2, #1 \n"
    "vrshrn.u16 d3, q3, #1 \n"
    MEMACCESS(1)
    "vst4.8 {d0, d1, d2, d3}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(dst_width)  // %2
  :
  : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
  );
}

void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst, int dst_width) {
  asm volatile (
    // change the stride into a pointer to row 2
    "add %1, %1, %0 \n"
    ".p2align 2 \n"
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 ARGB pixels.
    MEMACCESS(0)
    "vld4.8 {d1, d3, d5, d7}, [%0]! \n"  // load next 8 ARGB pixels.
    "subs %3, %3, #8 \n"  // 8 processed per loop.
    "vpaddl.u8 q0, q0 \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8 q1, q1 \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8 q2, q2 \n"  // R 16 bytes -> 8 shorts.
    "vpaddl.u8 q3, q3 \n"  // A 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "vld4.8 {d16, d18, d20, d22}, [%1]! \n"  // load 8 more ARGB pixels.
    MEMACCESS(1)
    "vld4.8 {d17, d19, d21, d23}, [%1]! \n"  // load last 8 ARGB pixels.
    "vpadal.u8 q0, q8 \n"  // B 16 bytes -> 8 shorts.
    "vpadal.u8 q1, q9 \n"  // G 16 bytes -> 8 shorts.
    "vpadal.u8 q2, q10 \n"  // R 16 bytes -> 8 shorts.
    "vpadal.u8 q3, q11 \n"  // A 16 bytes -> 8 shorts.
    "vrshrn.u16 d0, q0, #2 \n"  // downshift, round and pack
    "vrshrn.u16 d1, q1, #2 \n"
    "vrshrn.u16 d2, q2, #2 \n"
    "vrshrn.u16 d3, q3, #2 \n"
    MEMACCESS(2)
    "vst4.8 {d0, d1, d2, d3}, [%2]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),     // %0
    "+r"(src_stride),  // %1
    "+r"(dst),         // %2
    "+r"(dst_width)    // %3
  :
  : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
  );
}

// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
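// Scalar sketch (illustrative), treating each ARGB pixel as one uint32 and
// stepping src_stepx pixels at a time: dst32[i] = src32[i * src_stepx];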
void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx, uint8* dst_argb, int dst_width) {
  asm volatile (
    "mov r12, %3, lsl #2 \n"
    ".p2align 2 \n"
    "1: \n"
    MEMACCESS(0)
    "vld1.32 {d0[0]}, [%0], r12 \n"
    MEMACCESS(0)
    "vld1.32 {d0[1]}, [%0], r12 \n"
    MEMACCESS(0)
    "vld1.32 {d1[0]}, [%0], r12 \n"
    MEMACCESS(0)
    "vld1.32 {d1[1]}, [%0], r12 \n"
    "subs %2, %2, #4 \n"  // 4 pixels per loop.
    MEMACCESS(1)
    "vst1.8 {q0}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(dst_width)   // %2
  : "r"(src_stepx)    // %3
  : "memory", "cc", "r12", "q0"
  );
}

// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
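// Scalar sketch (illustrative): for each stepped position, average the 2x2
// ARGB block per byte channel c with rounding, where r0/r1 point at the two
// source rows at pixel i * src_stepx:
//   dst[c] = (r0[c] + r0[c + 4] + r1[c] + r1[c + 4] + 2) >> 2;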
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  asm volatile (
    "mov r12, %4, lsl #2 \n"
    "add %1, %1, %0 \n"
    ".p2align 2 \n"
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {d0}, [%0], r12 \n"  // Read 4 2x2 blocks -> 2x1
    MEMACCESS(1)
    "vld1.8 {d1}, [%1], r12 \n"
    MEMACCESS(0)
    "vld1.8 {d2}, [%0], r12 \n"
    MEMACCESS(1)
    "vld1.8 {d3}, [%1], r12 \n"
    MEMACCESS(0)
    "vld1.8 {d4}, [%0], r12 \n"
    MEMACCESS(1)
    "vld1.8 {d5}, [%1], r12 \n"
    MEMACCESS(0)
    "vld1.8 {d6}, [%0], r12 \n"
    MEMACCESS(1)
    "vld1.8 {d7}, [%1], r12 \n"
    "vaddl.u8 q0, d0, d1 \n"
    "vaddl.u8 q1, d2, d3 \n"
    "vaddl.u8 q2, d4, d5 \n"
    "vaddl.u8 q3, d6, d7 \n"
    "vswp.8 d1, d2 \n"  // ab_cd -> ac_bd
    "vswp.8 d5, d6 \n"  // ef_gh -> eg_fh
    "vadd.u16 q0, q0, q1 \n"  // (a+b)_(c+d)
    "vadd.u16 q2, q2, q3 \n"  // (e+f)_(g+h)
    "vrshrn.u16 d0, q0, #2 \n"  // first 2 pixels.
    "vrshrn.u16 d1, q2, #2 \n"  // next 2 pixels.
    "subs %3, %3, #4 \n"  // 4 pixels per loop.
    MEMACCESS(2)
    "vst1.8 {q0}, [%2]! \n"
    "bgt 1b \n"
  : "+r"(src_argb),    // %0
    "+r"(src_stride),  // %1
    "+r"(dst_argb),    // %2
    "+r"(dst_width)    // %3
  : "r"(src_stepx)     // %4
  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
  );
}

// TODO(Yang Zhang): Investigate fewer load instructions for
// the x/dx stepping
#define LOAD1_DATA32_LANE(dn, n) \
    "lsr %5, %3, #16 \n" \
    "add %6, %1, %5, lsl #2 \n" \
    "add %3, %3, %4 \n" \
    MEMACCESS(6) \
    "vld1.32 {"#dn"["#n"]}, [%6] \n"

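// Point samples ARGB columns with 16.16 fixed point x/dx stepping.
// Scalar sketch (illustrative), treating each ARGB pixel as one uint32:
//   dst32[i] = src32[x >> 16]; x += dx;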
void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
                        int dst_width, int x, int dx) {
  int tmp = 0;
  const uint8* src_tmp = src_argb;
  asm volatile (
    ".p2align 2 \n"
    "1: \n"
    LOAD1_DATA32_LANE(d0, 0)
    LOAD1_DATA32_LANE(d0, 1)
    LOAD1_DATA32_LANE(d1, 0)
    LOAD1_DATA32_LANE(d1, 1)
    LOAD1_DATA32_LANE(d2, 0)
    LOAD1_DATA32_LANE(d2, 1)
    LOAD1_DATA32_LANE(d3, 0)
    LOAD1_DATA32_LANE(d3, 1)

    MEMACCESS(0)
    "vst1.32 {q0, q1}, [%0]! \n"  // store pixels
    "subs %2, %2, #8 \n"  // 8 processed per loop
    "bgt 1b \n"
  : "+r"(dst_argb),   // %0
    "+r"(src_argb),   // %1
    "+r"(dst_width),  // %2
    "+r"(x),          // %3
    "+r"(dx),         // %4
    "+r"(tmp),        // %5
    "+r"(src_tmp)     // %6
  :
  : "memory", "cc", "q0", "q1"
  );
}

#undef LOAD1_DATA32_LANE

// TODO(Yang Zhang): Investigate fewer load instructions for
// the x/dx stepping
#define LOAD2_DATA32_LANE(dn1, dn2, n) \
    "lsr %5, %3, #16 \n" \
    "add %6, %1, %5, lsl #2 \n" \
    "add %3, %3, %4 \n" \
    MEMACCESS(6) \
    "vld2.32 {"#dn1"["#n"], "#dn2"["#n"]}, [%6] \n"

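// Bilinear ARGB column filter. The 16.16 fraction is reduced to 7 bits;
// scalar sketch (illustrative), per byte channel c of a = src32[x >> 16]
// and b = src32[(x >> 16) + 1], where f ^ 0x7f below computes 127 - f:
//   int f = (x >> 9) & 0x7f;
//   dst[c] = (a[c] * (127 - f) + b[c] * f) >> 7;
//   x += dx;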
void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
                              int dst_width, int x, int dx) {
  int dx_offset[4] = {0, 1, 2, 3};
  int* tmp = dx_offset;
  const uint8* src_tmp = src_argb;
  asm volatile (
    ".p2align 2 \n"
    "vdup.32 q0, %3 \n"  // x
    "vdup.32 q1, %4 \n"  // dx
    "vld1.32 {q2}, [%5] \n"  // 0 1 2 3
    "vshl.i32 q9, q1, #2 \n"  // 4 * dx
    "vmul.s32 q1, q1, q2 \n"
    "vmov.i8 q3, #0x7f \n"  // 0x7F
    "vmov.i16 q15, #0x7f \n"  // 0x7F
    // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
    "vadd.s32 q8, q1, q0 \n"
    "1: \n"
    // d0, d1: a
    // d2, d3: b
    LOAD2_DATA32_LANE(d0, d2, 0)
    LOAD2_DATA32_LANE(d0, d2, 1)
    LOAD2_DATA32_LANE(d1, d3, 0)
    LOAD2_DATA32_LANE(d1, d3, 1)
    "vshrn.i32 d22, q8, #9 \n"
    "vand.16 d22, d22, d30 \n"
    "vdup.8 d24, d22[0] \n"
    "vdup.8 d25, d22[2] \n"
    "vdup.8 d26, d22[4] \n"
    "vdup.8 d27, d22[6] \n"
    "vext.8 d4, d24, d25, #4 \n"
    "vext.8 d5, d26, d27, #4 \n"  // f
    "veor.8 q10, q2, q3 \n"  // 0x7f ^ f
    "vmull.u8 q11, d0, d20 \n"
    "vmull.u8 q12, d1, d21 \n"
    "vmull.u8 q13, d2, d4 \n"
    "vmull.u8 q14, d3, d5 \n"
    "vadd.i16 q11, q11, q13 \n"
    "vadd.i16 q12, q12, q14 \n"
    "vshrn.i16 d0, q11, #7 \n"
    "vshrn.i16 d1, q12, #7 \n"

    MEMACCESS(0)
    "vst1.32 {d0, d1}, [%0]! \n"  // store pixels
    "vadd.s32 q8, q8, q9 \n"
    "subs %2, %2, #4 \n"  // 4 processed per loop
    "bgt 1b \n"
  : "+r"(dst_argb),   // %0
    "+r"(src_argb),   // %1
    "+r"(dst_width),  // %2
    "+r"(x),          // %3
    "+r"(dx),         // %4
    "+r"(tmp),        // %5
    "+r"(src_tmp)     // %6
  :
  : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9",
    "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

#undef LOAD2_DATA32_LANE

#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif