1 /*
2  *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS. All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "libyuv/row.h"
12 #include "libyuv/scale.h"
13 #include "libyuv/scale_row.h"
14 
15 #ifdef __cplusplus
16 namespace libyuv {
17 extern "C" {
18 #endif
19 
20 // This module is for GCC Neon armv8 64 bit.
21 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
22 
23 // Read 32x1 throw away even pixels, and write 16x1.
ScaleRowDown2_NEON(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst,int dst_width)24 void ScaleRowDown2_NEON(const uint8_t* src_ptr,
25                         ptrdiff_t src_stride,
26                         uint8_t* dst,
27                         int dst_width) {
28   (void)src_stride;
29   asm volatile(
30       "1:                                        \n"
31       // load even pixels into v0, odd into v1
32       "ld2        {v0.16b,v1.16b}, [%0], #32     \n"
33       "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
34       "st1        {v1.16b}, [%1], #16            \n"  // store odd pixels
35       "b.gt       1b                             \n"
36       : "+r"(src_ptr),   // %0
37         "+r"(dst),       // %1
38         "+r"(dst_width)  // %2
39       :
40       : "v0", "v1"  // Clobber List
41       );
42 }
43 
44 // Read 32x1 average down and write 16x1.
ScaleRowDown2Linear_NEON(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst,int dst_width)45 void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
46                               ptrdiff_t src_stride,
47                               uint8_t* dst,
48                               int dst_width) {
49   (void)src_stride;
50   asm volatile(
51       "1:                                        \n"
52       // load even pixels into v0, odd into v1
53       "ld2        {v0.16b,v1.16b}, [%0], #32     \n"
54       "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
55       "urhadd     v0.16b, v0.16b, v1.16b         \n"  // rounding half add
56       "st1        {v0.16b}, [%1], #16            \n"
57       "b.gt       1b                             \n"
58       : "+r"(src_ptr),   // %0
59         "+r"(dst),       // %1
60         "+r"(dst_width)  // %2
61       :
62       : "v0", "v1"  // Clobber List
63       );
64 }
65 
66 // Read 32x2 average down and write 16x1.
ScaleRowDown2Box_NEON(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst,int dst_width)67 void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
68                            ptrdiff_t src_stride,
69                            uint8_t* dst,
70                            int dst_width) {
71   asm volatile(
72       // change the stride to row 2 pointer
73       "add        %1, %1, %0                     \n"
74       "1:                                        \n"
75       "ld1        {v0.16b, v1.16b}, [%0], #32    \n"  // load row 1 and post inc
76       "ld1        {v2.16b, v3.16b}, [%1], #32    \n"  // load row 2 and post inc
77       "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
78       "uaddlp     v0.8h, v0.16b                  \n"  // row 1 add adjacent
79       "uaddlp     v1.8h, v1.16b                  \n"
80       "uadalp     v0.8h, v2.16b                  \n"  // += row 2 add adjacent
81       "uadalp     v1.8h, v3.16b                  \n"
82       "rshrn      v0.8b, v0.8h, #2               \n"  // round and pack
83       "rshrn2     v0.16b, v1.8h, #2              \n"
84       "st1        {v0.16b}, [%2], #16            \n"
85       "b.gt       1b                             \n"
86       : "+r"(src_ptr),     // %0
87         "+r"(src_stride),  // %1
88         "+r"(dst),         // %2
89         "+r"(dst_width)    // %3
90       :
91       : "v0", "v1", "v2", "v3"  // Clobber List
92       );
93 }
94 
// Point-sample down by 4 horizontally: read 32 source pixels, keep the
// pixel at offset 2 of each group of 4, and write 8 output pixels.
// src_stride is unused: this variant reads a single row.
// NOTE(review): appears to assume dst_width is a multiple of 8 — confirm
// against callers.
void ScaleRowDown4_NEON(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:                                        \n"
      // De-interleave 32 bytes: v0..v3 receive pixels 0..3 of each group.
      "ld4     {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32  \n"  // src line 0
      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
      "st1     {v2.8b}, [%1], #8                 \n"  // keep pixel 2 of each 4
      "b.gt       1b                             \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      :
      : "v0", "v1", "v2", "v3", "memory", "cc");
}
112 
// Box-filter down by 4 in both directions: each output pixel is the rounded
// average of a 4x4 block of source pixels. Reads 16 bytes from each of 4
// source rows per iteration and writes 4 output pixels (stored as a single
// 32-bit lane).
void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
                           ptrdiff_t src_stride,
                           uint8_t* dst_ptr,
                           int dst_width) {
  // Pointers to source rows 1..3; row 0 is src_ptr itself.
  const uint8_t* src_ptr1 = src_ptr + src_stride;
  const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
  const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
  asm volatile(
      "1:                                        \n"
      "ld1     {v0.16b}, [%0], #16               \n"  // load up 16x4
      "ld1     {v1.16b}, [%2], #16               \n"
      "ld1     {v2.16b}, [%3], #16               \n"
      "ld1     {v3.16b}, [%4], #16               \n"
      "subs    %w5, %w5, #4                      \n"  // 4 outputs per loop
      "uaddlp  v0.8h, v0.16b                     \n"  // row 0: add adjacent pairs
      "uadalp  v0.8h, v1.16b                     \n"  // accumulate rows 1..3
      "uadalp  v0.8h, v2.16b                     \n"
      "uadalp  v0.8h, v3.16b                     \n"
      "addp    v0.8h, v0.8h, v0.8h               \n"  // pair-add -> 4x4 sums
      "rshrn   v0.8b, v0.8h, #4                  \n"  // divide by 16 w/rounding
      "st1    {v0.s}[0], [%1], #4                \n"
      "b.gt       1b                             \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(src_ptr1),  // %2
        "+r"(src_ptr2),  // %3
        "+r"(src_ptr3),  // %4
        "+r"(dst_width)  // %5
      :
      : "v0", "v1", "v2", "v3", "memory", "cc");
}
144 
// Down scale from 4 to 3 pixels. Uses the NEON multi-lane read/write to
// de-interleave every 4th pixel into 4 different registers.
// Point samples 32 pixels to 24 pixels: of each group of 4 source pixels,
// pixels 0, 1 and 3 are kept and pixel 2 is dropped.
void ScaleRowDown34_NEON(const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint8_t* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:                                                \n"
      "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32    \n"  // src line 0
      "subs      %w2, %w2, #24                           \n"
      // Replace v2 (pixel 2) with v3 (pixel 3) so st3 emits pixels 0,1,3.
      "orr       v2.16b, v3.16b, v3.16b                  \n"  // order v0,v1,v2
      "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24          \n"
      "b.gt      1b                                      \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      :
      : "v0", "v1", "v2", "v3", "memory", "cc");
}
166 
// Down scale 4 -> 3 pixels with filtering, weighted 3:1 toward source
// line 0: vertically each pixel becomes (3 * line0 + line1) >> 2 (rounded),
// then 4 pixels are filtered down to 3 horizontally. Consumes 32 source
// pixels and produces 24 output pixels per iteration. src_stride (%3) is
// turned into a pointer to the second source row.
void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               int dst_width) {
  asm volatile(
      "movi      v20.8b, #3                              \n"
      "add       %3, %3, %0                              \n"
      "1:                                                \n"
      "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32    \n"  // src line 0
      "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32    \n"  // src line 1
      "subs         %w2, %w2, #24                        \n"

      // filter src line 0 with src line 1
      // expand chars to shorts to allow for room
      // when adding lines together
      "ushll     v16.8h, v4.8b, #0                       \n"
      "ushll     v17.8h, v5.8b, #0                       \n"
      "ushll     v18.8h, v6.8b, #0                       \n"
      "ushll     v19.8h, v7.8b, #0                       \n"

      // 3 * line_0 + line_1
      "umlal     v16.8h, v0.8b, v20.8b                   \n"
      "umlal     v17.8h, v1.8b, v20.8b                   \n"
      "umlal     v18.8h, v2.8b, v20.8b                   \n"
      "umlal     v19.8h, v3.8b, v20.8b                   \n"

      // (3 * line_0 + line_1) >> 2
      "uqrshrn   v0.8b, v16.8h, #2                       \n"
      "uqrshrn   v1.8b, v17.8h, #2                       \n"
      "uqrshrn   v2.8b, v18.8h, #2                       \n"
      "uqrshrn   v3.8b, v19.8h, #2                       \n"

      // a0 = (src[0] * 3 + s[1] * 1) >> 2
      "ushll     v16.8h, v1.8b, #0                       \n"
      "umlal     v16.8h, v0.8b, v20.8b                   \n"
      "uqrshrn   v0.8b, v16.8h, #2                       \n"

      // a1 = (src[1] * 1 + s[2] * 1) >> 1
      "urhadd    v1.8b, v1.8b, v2.8b                     \n"

      // a2 = (src[2] * 1 + s[3] * 3) >> 2
      "ushll     v16.8h, v2.8b, #0                       \n"
      "umlal     v16.8h, v3.8b, v20.8b                   \n"
      "uqrshrn   v2.8b, v16.8h, #2                       \n"

      "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24          \n"

      "b.gt      1b                                      \n"
      : "+r"(src_ptr),    // %0
        "+r"(dst_ptr),    // %1
        "+r"(dst_width),  // %2
        "+r"(src_stride)  // %3
      :
      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
        "v19", "v20", "memory", "cc");
}
223 
// Down scale 4 -> 3 pixels with filtering, the two source rows weighted
// equally: vertically each pixel is the rounding average of line 0 and
// line 1, then 4 pixels are filtered down to 3 horizontally. Consumes 32
// source pixels and produces 24 output pixels per iteration. src_stride
// (%3) is turned into a pointer to the second source row.
void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               int dst_width) {
  asm volatile(
      "movi      v20.8b, #3                              \n"
      "add       %3, %3, %0                              \n"
      "1:                                                \n"
      "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32    \n"  // src line 0
      "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32    \n"  // src line 1
      "subs         %w2, %w2, #24                        \n"
      // average src line 0 with src line 1
      "urhadd    v0.8b, v0.8b, v4.8b                     \n"
      "urhadd    v1.8b, v1.8b, v5.8b                     \n"
      "urhadd    v2.8b, v2.8b, v6.8b                     \n"
      "urhadd    v3.8b, v3.8b, v7.8b                     \n"

      // a0 = (src[0] * 3 + s[1] * 1) >> 2
      "ushll     v4.8h, v1.8b, #0                        \n"
      "umlal     v4.8h, v0.8b, v20.8b                    \n"
      "uqrshrn   v0.8b, v4.8h, #2                        \n"

      // a1 = (src[1] * 1 + s[2] * 1) >> 1
      "urhadd    v1.8b, v1.8b, v2.8b                     \n"

      // a2 = (src[2] * 1 + s[3] * 3) >> 2
      "ushll     v4.8h, v2.8b, #0                        \n"
      "umlal     v4.8h, v3.8b, v20.8b                    \n"
      "uqrshrn   v2.8b, v4.8h, #2                        \n"

      "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24          \n"
      "b.gt      1b                                      \n"
      : "+r"(src_ptr),    // %0
        "+r"(dst_ptr),    // %1
        "+r"(dst_width),  // %2
        "+r"(src_stride)  // %3
      :
      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc");
}
263 
// tbl index table: picks the 12 surviving bytes out of 32 for the 3/8
// point sample (ScaleRowDown38_NEON); the trailing four entries are padding.
static const uvec8 kShuf38 = {0,  3,  6,  8,  11, 14, 16, 19,
                              22, 24, 27, 30, 0,  0,  0,  0};
// tbl index table: interleaves the 12 output bytes out of three source
// registers (indices 0-15 = v0, 16-31 = v1, 32+ = v2) for the 38 box
// filters; the trailing four entries are padding.
static const uvec8 kShuf38_2 = {0,  16, 32, 2,  18, 33, 4, 20,
                                34, 6,  22, 35, 0,  0,  0, 0};
// Used with sqrdmulh, which doubles the product and keeps the high 16 bits,
// so multiplying by 65536/12 approximates a divide by 6 (sums of 6 pixels).
static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12,
                                   65536 / 12, 65536 / 12, 65536 / 12,
                                   65536 / 12, 65536 / 12};
// As above: 65536/18 with sqrdmulh approximates a divide by 9 (sums of 9).
static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18,
                                   65536 / 18, 65536 / 18, 65536 / 18,
                                   65536 / 18, 65536 / 18};
274 
// 32 -> 12 point sample (3/8 scale): selects 12 of every 32 source pixels
// with a tbl shuffle driven by the kShuf38 index table.
void ScaleRowDown38_NEON(const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint8_t* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile(
      "ld1       {v3.16b}, [%3]                          \n"  // kShuf38 indices
      "1:                                                \n"
      "ld1       {v0.16b,v1.16b}, [%0], #32              \n"
      "subs      %w2, %w2, #12                           \n"
      "tbl       v2.16b, {v0.16b,v1.16b}, v3.16b         \n"
      "st1       {v2.8b}, [%1], #8                       \n"  // first 8 outputs
      "st1       {v2.s}[2], [%1], #4                     \n"  // last 4 outputs
      "b.gt      1b                                      \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      : "r"(&kShuf38)    // %3
      : "v0", "v1", "v2", "v3", "memory", "cc");
}
296 
// 32x3 -> 12x1: box filter three source rows down to 3/8 width.
// Each group of 8 source columns yields 3 output pixels: two average
// 3 columns x 3 rows (divide by 9 via kMult38_Div9) and one averages
// 2 columns x 3 rows (divide by 6 via kMult38_Div6).
void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8_t* dst_ptr,
                                      int dst_width) {
  const uint8_t* src_ptr1 = src_ptr + src_stride * 2;  // third source row
  ptrdiff_t tmp_src_stride = src_stride;

  asm volatile(
      "ld1       {v29.8h}, [%5]                          \n"  // kMult38_Div6
      "ld1       {v30.16b}, [%6]                         \n"  // kShuf38_2
      "ld1       {v31.8h}, [%7]                          \n"  // kMult38_Div9
      "add       %2, %2, %0                              \n"  // second row ptr
      "1:                                                \n"

      // 00 40 01 41 02 42 03 43
      // 10 50 11 51 12 52 13 53
      // 20 60 21 61 22 62 23 63
      // 30 70 31 71 32 72 33 73
      "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32    \n"
      "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32    \n"
      "ld4       {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32  \n"
      "subs      %w4, %w4, #12                           \n"

      // Shuffle the input data around to get align the data
      //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
      // 00 10 01 11 02 12 03 13
      // 40 50 41 51 42 52 43 53
      "trn1      v20.8b, v0.8b, v1.8b                    \n"
      "trn2      v21.8b, v0.8b, v1.8b                    \n"
      "trn1      v22.8b, v4.8b, v5.8b                    \n"
      "trn2      v23.8b, v4.8b, v5.8b                    \n"
      "trn1      v24.8b, v16.8b, v17.8b                  \n"
      "trn2      v25.8b, v16.8b, v17.8b                  \n"

      // 20 30 21 31 22 32 23 33
      // 60 70 61 71 62 72 63 73
      "trn1      v0.8b, v2.8b, v3.8b                     \n"
      "trn2      v1.8b, v2.8b, v3.8b                     \n"
      "trn1      v4.8b, v6.8b, v7.8b                     \n"
      "trn2      v5.8b, v6.8b, v7.8b                     \n"
      "trn1      v16.8b, v18.8b, v19.8b                  \n"
      "trn2      v17.8b, v18.8b, v19.8b                  \n"

      // 00+10 01+11 02+12 03+13
      // 40+50 41+51 42+52 43+53
      "uaddlp    v20.4h, v20.8b                          \n"
      "uaddlp    v21.4h, v21.8b                          \n"
      "uaddlp    v22.4h, v22.8b                          \n"
      "uaddlp    v23.4h, v23.8b                          \n"
      "uaddlp    v24.4h, v24.8b                          \n"
      "uaddlp    v25.4h, v25.8b                          \n"

      // 60+70 61+71 62+72 63+73
      "uaddlp    v1.4h, v1.8b                            \n"
      "uaddlp    v5.4h, v5.8b                            \n"
      "uaddlp    v17.4h, v17.8b                          \n"

      // combine source lines
      "add       v20.4h, v20.4h, v22.4h                  \n"
      "add       v21.4h, v21.4h, v23.4h                  \n"
      "add       v20.4h, v20.4h, v24.4h                  \n"
      "add       v21.4h, v21.4h, v25.4h                  \n"
      "add       v2.4h, v1.4h, v5.4h                     \n"
      "add       v2.4h, v2.4h, v17.4h                    \n"

      // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
      //             + s[6 + st * 1] + s[7 + st * 1]
      //             + s[6 + st * 2] + s[7 + st * 2]) / 6
      "sqrdmulh  v2.8h, v2.8h, v29.8h                    \n"
      "xtn       v2.8b,  v2.8h                           \n"

      // Shuffle 2,3 reg around so that 2 can be added to the
      //  0,1 reg and 3 can be added to the 4,5 reg. This
      //  requires expanding from u8 to u16 as the 0,1 and 4,5
      //  registers are already expanded. Then do transposes
      //  to get aligned.
      // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
      "ushll     v16.8h, v16.8b, #0                      \n"
      "uaddl     v0.8h, v0.8b, v4.8b                     \n"

      // combine source lines
      "add       v0.8h, v0.8h, v16.8h                    \n"

      // xx 20 xx 21 xx 22 xx 23
      // xx 30 xx 31 xx 32 xx 33
      "trn1      v1.8h, v0.8h, v0.8h                     \n"
      "trn2      v4.8h, v0.8h, v0.8h                     \n"
      "xtn       v0.4h, v1.4s                            \n"
      "xtn       v4.4h, v4.4s                            \n"

      // 0+1+2, 3+4+5
      "add       v20.8h, v20.8h, v0.8h                   \n"
      "add       v21.8h, v21.8h, v4.8h                   \n"

      // Need to divide, but can't downshift as the the value
      //  isn't a power of 2. So multiply by 65536 / n
      //  and take the upper 16 bits.
      "sqrdmulh  v0.8h, v20.8h, v31.8h                   \n"
      "sqrdmulh  v1.8h, v21.8h, v31.8h                   \n"

      // Align for table lookup, vtbl requires registers to be adjacent
      "tbl       v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"

      "st1       {v3.8b}, [%1], #8                       \n"
      "st1       {v3.s}[2], [%1], #4                     \n"
      "b.gt      1b                                      \n"
      : "+r"(src_ptr),         // %0
        "+r"(dst_ptr),         // %1
        "+r"(tmp_src_stride),  // %2
        "+r"(src_ptr1),        // %3
        "+r"(dst_width)        // %4
      : "r"(&kMult38_Div6),    // %5
        "r"(&kShuf38_2),       // %6
        "r"(&kMult38_Div9)     // %7
      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
        "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", "v30", "v31",
        "memory", "cc");
}
416 
// 32x2 -> 12x1: box filter two source rows down to 3/8 width.
// Each group of 8 source columns yields 3 output pixels: two average
// 3 columns x 2 rows (divide by 6 via kMult38_Div6) and one averages
// 2 columns x 2 rows (divide by 4 via a rounding shift).
void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               int dst_width) {
  // TODO(fbarchard): use src_stride directly for clang 3.5+.
  ptrdiff_t tmp_src_stride = src_stride;
  asm volatile(
      "ld1       {v30.8h}, [%4]                          \n"  // kMult38_Div6
      "ld1       {v31.16b}, [%5]                         \n"  // kShuf38_2
      "add       %2, %2, %0                              \n"  // second row ptr
      "1:                                                \n"

      // 00 40 01 41 02 42 03 43
      // 10 50 11 51 12 52 13 53
      // 20 60 21 61 22 62 23 63
      // 30 70 31 71 32 72 33 73
      "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32    \n"
      "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32    \n"
      "subs      %w3, %w3, #12                           \n"

      // Shuffle the input data around to get align the data
      //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
      // 00 10 01 11 02 12 03 13
      // 40 50 41 51 42 52 43 53
      "trn1      v16.8b, v0.8b, v1.8b                    \n"
      "trn2      v17.8b, v0.8b, v1.8b                    \n"
      "trn1      v18.8b, v4.8b, v5.8b                    \n"
      "trn2      v19.8b, v4.8b, v5.8b                    \n"

      // 20 30 21 31 22 32 23 33
      // 60 70 61 71 62 72 63 73
      "trn1      v0.8b, v2.8b, v3.8b                     \n"
      "trn2      v1.8b, v2.8b, v3.8b                     \n"
      "trn1      v4.8b, v6.8b, v7.8b                     \n"
      "trn2      v5.8b, v6.8b, v7.8b                     \n"

      // 00+10 01+11 02+12 03+13
      // 40+50 41+51 42+52 43+53
      "uaddlp    v16.4h, v16.8b                          \n"
      "uaddlp    v17.4h, v17.8b                          \n"
      "uaddlp    v18.4h, v18.8b                          \n"
      "uaddlp    v19.4h, v19.8b                          \n"

      // 60+70 61+71 62+72 63+73
      "uaddlp    v1.4h, v1.8b                            \n"
      "uaddlp    v5.4h, v5.8b                            \n"

      // combine source lines
      "add       v16.4h, v16.4h, v18.4h                  \n"
      "add       v17.4h, v17.4h, v19.4h                  \n"
      "add       v2.4h, v1.4h, v5.4h                     \n"

      // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
      "uqrshrn   v2.8b, v2.8h, #2                        \n"

      // Shuffle 2,3 reg around so that 2 can be added to the
      //  0,1 reg and 3 can be added to the 4,5 reg. This
      //  requires expanding from u8 to u16 as the 0,1 and 4,5
      //  registers are already expanded. Then do transposes
      //  to get aligned.
      // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33

      // combine source lines
      "uaddl     v0.8h, v0.8b, v4.8b                     \n"

      // xx 20 xx 21 xx 22 xx 23
      // xx 30 xx 31 xx 32 xx 33
      "trn1      v1.8h, v0.8h, v0.8h                     \n"
      "trn2      v4.8h, v0.8h, v0.8h                     \n"
      "xtn       v0.4h, v1.4s                            \n"
      "xtn       v4.4h, v4.4s                            \n"

      // 0+1+2, 3+4+5
      "add       v16.8h, v16.8h, v0.8h                   \n"
      "add       v17.8h, v17.8h, v4.8h                   \n"

      // Need to divide, but can't downshift as the the value
      //  isn't a power of 2. So multiply by 65536 / n
      //  and take the upper 16 bits.
      "sqrdmulh  v0.8h, v16.8h, v30.8h                   \n"
      "sqrdmulh  v1.8h, v17.8h, v30.8h                   \n"

      // Align for table lookup, vtbl requires registers to
      //  be adjacent

      "tbl       v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"

      "st1       {v3.8b}, [%1], #8                       \n"
      "st1       {v3.s}[2], [%1], #4                     \n"
      "b.gt      1b                                      \n"
      : "+r"(src_ptr),         // %0
        "+r"(dst_ptr),         // %1
        "+r"(tmp_src_stride),  // %2
        "+r"(dst_width)        // %3
      : "r"(&kMult38_Div6),    // %4
        "r"(&kShuf38_2)        // %5
      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
        "v19", "v30", "v31", "memory", "cc");
}
517 
// Column-wise vertical sum: for each of src_width columns, adds the uint8
// pixels from src_height consecutive rows (separated by src_stride bytes)
// into a row of uint16 accumulators at dst_ptr. Processes 16 columns per
// outer iteration; w12 counts rows in the inner loop.
void ScaleAddRows_NEON(const uint8_t* src_ptr,
                       ptrdiff_t src_stride,
                       uint16_t* dst_ptr,
                       int src_width,
                       int src_height) {
  const uint8_t* src_tmp;
  asm volatile(
      "1:                                        \n"
      "mov       %0, %1                          \n"  // restart at current column
      "mov       w12, %w5                        \n"  // row counter
      "eor       v2.16b, v2.16b, v2.16b          \n"  // zero accumulators
      "eor       v3.16b, v3.16b, v3.16b          \n"
      "2:                                        \n"
      // load 16 pixels into q0
      "ld1       {v0.16b}, [%0], %3              \n"
      "uaddw2    v3.8h, v3.8h, v0.16b            \n"  // widen-add high 8 bytes
      "uaddw     v2.8h, v2.8h, v0.8b             \n"  // widen-add low 8 bytes
      "subs      w12, w12, #1                    \n"
      "b.gt      2b                              \n"
      "st1      {v2.8h, v3.8h}, [%2], #32        \n"  // store pixels
      "add      %1, %1, #16                      \n"  // next 16 columns
      "subs     %w4, %w4, #16                    \n"  // 16 processed per loop
      "b.gt     1b                               \n"
      : "=&r"(src_tmp),    // %0
        "+r"(src_ptr),     // %1
        "+r"(dst_ptr),     // %2
        "+r"(src_stride),  // %3
        "+r"(src_width),   // %4
        "+r"(src_height)   // %5
      :
      : "memory", "cc", "w12", "v0", "v1", "v2", "v3"  // Clobber List
      );
}
551 
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
//
// Loads one adjacent source-byte pair into lane n of v4/v5 for the
// horizontal filter: %5 = x >> 16 (integer pixel index), %6 = src (%1) + %5,
// then advances x (%3) by dx (%4). Used only by ScaleFilterCols_NEON below.
#define LOAD2_DATA8_LANE(n)                      \
  "lsr        %5, %3, #16                    \n" \
  "add        %6, %1, %5                     \n" \
  "add        %3, %3, %4                     \n" \
  "ld2        {v4.b, v5.b}[" #n "], [%6]     \n"
559 
560 // The NEON version mimics this formula (from row_common.cc):
561 // #define BLENDER(a, b, f) (uint8_t)((int)(a) +
562 //    ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
563 
// Horizontal linear filtering of one row with 16.16 fixed-point stepping:
// for each output pixel, loads the source pair at x >> 16 and blends them
// by the low 16 bits of x (the BLENDER() formula above), advancing x by dx.
// Produces 8 output pixels per iteration.
void ScaleFilterCols_NEON(uint8_t* dst_ptr,
                          const uint8_t* src_ptr,
                          int dst_width,
                          int x,
                          int dx) {
  int dx_offset[4] = {0, 1, 2, 3};
  int* tmp = dx_offset;
  const uint8_t* src_tmp = src_ptr;
  // x and dx widened to 64 bits so they can live in general registers used
  // for address arithmetic inside LOAD2_DATA8_LANE.
  int64_t x64 = (int64_t)x;    // NOLINT
  int64_t dx64 = (int64_t)dx;  // NOLINT
  asm volatile (
    "dup        v0.4s, %w3                     \n"  // x
    "dup        v1.4s, %w4                     \n"  // dx
    "ld1        {v2.4s}, [%5]                  \n"  // 0 1 2 3
    "shl        v3.4s, v1.4s, #2               \n"  // 4 * dx
    "mul        v1.4s, v1.4s, v2.4s            \n"
    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
    "add        v1.4s, v1.4s, v0.4s            \n"
    // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
    "add        v2.4s, v1.4s, v3.4s            \n"
    "shl        v0.4s, v3.4s, #1               \n"  // 8 * dx
  "1:                                          \n"
    LOAD2_DATA8_LANE(0)
    LOAD2_DATA8_LANE(1)
    LOAD2_DATA8_LANE(2)
    LOAD2_DATA8_LANE(3)
    LOAD2_DATA8_LANE(4)
    LOAD2_DATA8_LANE(5)
    LOAD2_DATA8_LANE(6)
    LOAD2_DATA8_LANE(7)
    "mov       v6.16b, v1.16b                  \n"  // x for lanes 0..3
    "mov       v7.16b, v2.16b                  \n"  // x for lanes 4..7
    "uzp1      v6.8h, v6.8h, v7.8h             \n"  // keep low halves: fractions
    "ushll     v4.8h, v4.8b, #0                \n"  // a = src[i], widened
    "ushll     v5.8h, v5.8b, #0                \n"  // b = src[i+1], widened
    "ssubl     v16.4s, v5.4h, v4.4h            \n"  // b - a, low lanes
    "ssubl2    v17.4s, v5.8h, v4.8h            \n"  // b - a, high lanes
    "ushll     v7.4s, v6.4h, #0                \n"  // fractions, widened
    "ushll2    v6.4s, v6.8h, #0                \n"
    "mul       v16.4s, v16.4s, v7.4s           \n"  // f * (b - a)
    "mul       v17.4s, v17.4s, v6.4s           \n"
    "rshrn     v6.4h, v16.4s, #16              \n"  // (+ 0x8000) >> 16
    "rshrn2    v6.8h, v17.4s, #16              \n"
    "add       v4.8h, v4.8h, v6.8h             \n"  // a + rounded blend
    "xtn       v4.8b, v4.8h                    \n"  // narrow back to bytes

    "st1       {v4.8b}, [%0], #8               \n"  // store pixels
    "add       v1.4s, v1.4s, v0.4s             \n"  // advance x vectors by 8*dx
    "add       v2.4s, v2.4s, v0.4s             \n"
    "subs      %w2, %w2, #8                    \n"  // 8 processed per loop
    "b.gt      1b                              \n"
  : "+r"(dst_ptr),          // %0
    "+r"(src_ptr),          // %1
    "+r"(dst_width),        // %2
    "+r"(x64),              // %3
    "+r"(dx64),             // %4
    "+r"(tmp),              // %5
    "+r"(src_tmp)           // %6
  :
  : "memory", "cc", "v0", "v1", "v2", "v3",
    "v4", "v5", "v6", "v7", "v16", "v17"
  );
}
627 
628 #undef LOAD2_DATA8_LANE
629 
630 // 16x2 -> 16x1
// Vertically blend two rows into one, 16 pixels per loop iteration:
//   dst = (src * (256 - source_y_fraction) + src[src_stride] * fraction) / 256
// source_y_fraction is a 0..255 fixed-point weight for the second row.
// Fractions 0, 64, 128 and 192 take dedicated fast paths (copy, 75/25,
// 50/50 and 25/75 blends built from urhadd) instead of the multiply path.
// Stores one extra byte at the end: the last pixel is duplicated so
// horizontal filtering that follows can read one pixel past the row.
void ScaleFilterRows_NEON(uint8_t* dst_ptr,
                          const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          int dst_width,
                          int source_y_fraction) {
  int y_fraction = 256 - source_y_fraction;  // weight of the first row
  asm volatile(
      "cmp          %w4, #0                      \n"
      "b.eq         100f                         \n"  // 0 -> copy row 0
      "add          %2, %2, %1                   \n"  // %2 = second row ptr
      "cmp          %w4, #64                     \n"
      "b.eq         75f                          \n"  // 64 -> 75% row 0
      "cmp          %w4, #128                    \n"
      "b.eq         50f                          \n"  // 128 -> 50/50
      "cmp          %w4, #192                    \n"
      "b.eq         25f                          \n"  // 192 -> 25% row 0

      // Duplicate the weights across all 16 byte lanes.  A .8b dup would
      // zero lanes 8-15 (any sub-128-bit SIMD write zeroes the upper bits),
      // and the umull2/umlal2 below read v4.16b/v5.16b, so the upper 8
      // output pixels would be blended with weight 0.
      "dup          v5.16b, %w4                  \n"  // weight of row 1
      "dup          v4.16b, %w5                  \n"  // weight of row 0
      // General purpose row blend.
      "1:                                        \n"
      "ld1          {v0.16b}, [%1], #16          \n"
      "ld1          {v1.16b}, [%2], #16          \n"
      "subs         %w3, %w3, #16                \n"
      "umull        v6.8h, v0.8b, v4.8b          \n"
      "umull2       v7.8h, v0.16b, v4.16b        \n"
      "umlal        v6.8h, v1.8b, v5.8b          \n"
      "umlal2       v7.8h, v1.16b, v5.16b        \n"
      "rshrn        v0.8b, v6.8h, #8             \n"  // round, /256
      "rshrn2       v0.16b, v7.8h, #8            \n"
      "st1          {v0.16b}, [%0], #16          \n"
      "b.gt         1b                           \n"
      "b            99f                          \n"

      // Blend 25 / 75: avg(avg(row0, row1), row1).
      "25:                                       \n"
      "ld1          {v0.16b}, [%1], #16          \n"
      "ld1          {v1.16b}, [%2], #16          \n"
      "subs         %w3, %w3, #16                \n"
      "urhadd       v0.16b, v0.16b, v1.16b       \n"
      "urhadd       v0.16b, v0.16b, v1.16b       \n"
      "st1          {v0.16b}, [%0], #16          \n"
      "b.gt         25b                          \n"
      "b            99f                          \n"

      // Blend 50 / 50: rounding halving add of the two rows.
      "50:                                       \n"
      "ld1          {v0.16b}, [%1], #16          \n"
      "ld1          {v1.16b}, [%2], #16          \n"
      "subs         %w3, %w3, #16                \n"
      "urhadd       v0.16b, v0.16b, v1.16b       \n"
      "st1          {v0.16b}, [%0], #16          \n"
      "b.gt         50b                          \n"
      "b            99f                          \n"

      // Blend 75 / 25: rows swapped, then avg(avg(row1, row0), row0).
      "75:                                       \n"
      "ld1          {v1.16b}, [%1], #16          \n"
      "ld1          {v0.16b}, [%2], #16          \n"
      "subs         %w3, %w3, #16                \n"
      "urhadd       v0.16b, v0.16b, v1.16b       \n"
      "urhadd       v0.16b, v0.16b, v1.16b       \n"
      "st1          {v0.16b}, [%0], #16          \n"
      "b.gt         75b                          \n"
      "b            99f                          \n"

      // Blend 100 / 0 - Copy row unchanged.
      "100:                                      \n"
      "ld1          {v0.16b}, [%1], #16          \n"
      "subs         %w3, %w3, #16                \n"
      "st1          {v0.16b}, [%0], #16          \n"
      "b.gt         100b                         \n"

      "99:                                       \n"
      "st1          {v0.b}[15], [%0]             \n"  // duplicate last pixel
      : "+r"(dst_ptr),            // %0
        "+r"(src_ptr),            // %1
        "+r"(src_stride),         // %2
        "+r"(dst_width),          // %3
        "+r"(source_y_fraction),  // %4
        "+r"(y_fraction)          // %5
      :
      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc");
}
715 
// 1/2 ARGB downsample by dropping even pixels: reads 16 ARGB pixels,
// writes the 8 odd ones.  ld4 with .4s deinterleaves by 32-bit word, so
// v0 = pixels {0,4,8,12}, v1 = {1,5,9,13}, v2 = {2,6,10,14}, v3 = {3,7,11,15}.
void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
                            ptrdiff_t src_stride,
                            uint8_t* dst,
                            int dst_width) {
  (void)src_stride;  // unfiltered path; second row unused
  asm volatile(
      "1:                                        \n"
      // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
      "ld4        {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
      "mov        v2.16b, v3.16b                 \n"  // so st2 interleaves v1/v3
      "st2        {v1.4s,v2.4s}, [%1], #32       \n"  // store 8 odd pixels
      "b.gt       1b                             \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst),       // %1
        "+r"(dst_width)  // %2
      :
      : "memory", "cc", "v0", "v1", "v2", "v3"  // Clobber List
      );
}
736 
// 1/2 ARGB downsample with horizontal filtering: reads 16 ARGB pixels,
// writes 8, each the rounded byte-wise average of a horizontal pair.
// After the word-deinterleaving ld4, urhadd(v0, v1) averages pixel pairs
// (0,1),(4,5),(8,9),(12,13) and urhadd(v2, v3) the pairs (2,3),(6,7),...;
// st2 re-interleaves them into sequential output order.
void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8_t* dst_argb,
                                  int dst_width) {
  (void)src_stride;  // linear (1-row) filter; second row unused
  asm volatile(
      "1:                                        \n"
      // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
      "ld4        {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop

      "urhadd     v0.16b, v0.16b, v1.16b         \n"  // rounding half add
      "urhadd     v1.16b, v2.16b, v3.16b         \n"
      "st2        {v0.4s,v1.4s}, [%1], #32       \n"  // store 8 pixels
      "b.gt       1b                             \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(dst_width)  // %2
      :
      : "memory", "cc", "v0", "v1", "v2", "v3"  // Clobber List
      );
}
759 
// 1/2 ARGB downsample with a 2x2 box filter: each output pixel is the
// rounded average of a 2x2 block.  ld4 with .16b deinterleaves the 16
// pixels of each row into per-channel planes (v0=B, v1=G, v2=R, v3=A),
// uaddlp sums horizontal pairs, uadalp accumulates the second row, and
// rshrn divides by 4 with rounding.
void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst,
                               int dst_width) {
  asm volatile(
      // change the stride to row 2 pointer
      "add        %1, %1, %0                     \n"
      "1:                                        \n"
      "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 8 ARGB
      "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
      "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
      "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
      "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
      "uaddlp     v3.8h, v3.16b                  \n"  // A 16 bytes -> 8 shorts.
      "ld4        {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n"  // load 8
      "uadalp     v0.8h, v16.16b                 \n"  // B 16 bytes -> 8 shorts.
      "uadalp     v1.8h, v17.16b                 \n"  // G 16 bytes -> 8 shorts.
      "uadalp     v2.8h, v18.16b                 \n"  // R 16 bytes -> 8 shorts.
      "uadalp     v3.8h, v19.16b                 \n"  // A 16 bytes -> 8 shorts.
      "rshrn      v0.8b, v0.8h, #2               \n"  // round and pack
      "rshrn      v1.8b, v1.8h, #2               \n"
      "rshrn      v2.8b, v2.8h, #2               \n"
      "rshrn      v3.8b, v3.8h, #2               \n"
      "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32     \n"
      "b.gt       1b                             \n"
      : "+r"(src_ptr),     // %0
        "+r"(src_stride),  // %1
        "+r"(dst),         // %2
        "+r"(dst_width)    // %3
      :
      : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
}
792 
// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
// ARGB downsample at an arbitrary integer step: picks every src_stepx-th
// pixel.  %3 is the byte step (src_stepx * 4); each ld1 loads one 32-bit
// pixel into a lane of v0 and advances the source pointer by the step.
void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
                               ptrdiff_t src_stride,
                               int src_stepx,
                               uint8_t* dst_argb,
                               int dst_width) {
  (void)src_stride;  // unfiltered path; second row unused
  asm volatile(
      "1:                                        \n"
      "ld1        {v0.s}[0], [%0], %3            \n"
      "ld1        {v0.s}[1], [%0], %3            \n"
      "ld1        {v0.s}[2], [%0], %3            \n"
      "ld1        {v0.s}[3], [%0], %3            \n"
      "subs       %w2, %w2, #4                   \n"  // 4 pixels per loop.
      "st1        {v0.16b}, [%1], #16            \n"
      "b.gt       1b                             \n"
      : "+r"(src_argb),                // %0
        "+r"(dst_argb),                // %1
        "+r"(dst_width)                // %2
      : "r"((int64_t)(src_stepx * 4))  // %3
      : "memory", "cc", "v0");
}
816 
// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
// TODO(Yang Zhang): Might be worth another optimization pass in future.
// It could be upgraded to 8 pixels at a time to start with.
// ARGB downsample at an arbitrary integer step with a 2x2 box filter.
// For each output pixel, two adjacent pixels are read from each of two
// rows (%4 = src_stepx * 4 bytes advances to the next 2x2 block), summed
// vertically with uaddl, shuffled via v16.d[1] as scratch so horizontal
// partners share a register, summed, then divided by 4 with rounding.
void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8_t* dst_argb,
                                  int dst_width) {
  asm volatile(
      "add        %1, %1, %0                     \n"  // %1 = second row ptr
      "1:                                        \n"
      "ld1        {v0.8b}, [%0], %4              \n"  // Read 4 2x2 -> 2x1
      "ld1        {v1.8b}, [%1], %4              \n"
      "ld1        {v2.8b}, [%0], %4              \n"
      "ld1        {v3.8b}, [%1], %4              \n"
      "ld1        {v4.8b}, [%0], %4              \n"
      "ld1        {v5.8b}, [%1], %4              \n"
      "ld1        {v6.8b}, [%0], %4              \n"
      "ld1        {v7.8b}, [%1], %4              \n"
      "uaddl      v0.8h, v0.8b, v1.8b            \n"  // vertical sums
      "uaddl      v2.8h, v2.8b, v3.8b            \n"
      "uaddl      v4.8h, v4.8b, v5.8b            \n"
      "uaddl      v6.8h, v6.8b, v7.8b            \n"
      "mov        v16.d[1], v0.d[1]              \n"  // ab_cd -> ac_bd
      "mov        v0.d[1], v2.d[0]               \n"
      "mov        v2.d[0], v16.d[1]              \n"
      "mov        v16.d[1], v4.d[1]              \n"  // ef_gh -> eg_fh
      "mov        v4.d[1], v6.d[0]               \n"
      "mov        v6.d[0], v16.d[1]              \n"
      "add        v0.8h, v0.8h, v2.8h            \n"  // (a+b)_(c+d)
      "add        v4.8h, v4.8h, v6.8h            \n"  // (e+f)_(g+h)
      "rshrn      v0.8b, v0.8h, #2               \n"  // first 2 pixels.
      "rshrn2     v0.16b, v4.8h, #2              \n"  // next 2 pixels.
      "subs       %w3, %w3, #4                   \n"  // 4 pixels per loop.
      "st1     {v0.16b}, [%2], #16               \n"
      "b.gt       1b                             \n"
      : "+r"(src_argb),                // %0
        "+r"(src_stride),              // %1
        "+r"(dst_argb),                // %2
        "+r"(dst_width)                // %3
      : "r"((int64_t)(src_stepx * 4))  // %4
      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
}
861 
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
// Loads one 32-bit ARGB pixel into lane n of vector vn:
// %5 (scratch) = x >> 16 (integer source pixel index), %6 = src (%1) plus
// that index * 4 bytes; x (%3) is then advanced by dx (%4) before the load.
#define LOAD1_DATA32_LANE(vn, n)                 \
  "lsr        %5, %3, #16                    \n" \
  "add        %6, %1, %5, lsl #2             \n" \
  "add        %3, %3, %4                     \n" \
  "ld1        {" #vn ".s}[" #n "], [%6]      \n"
869 
// Nearest-neighbor horizontal ARGB scaling, 8 pixels per loop.
// x and dx are 16.16 fixed point; each LOAD1_DATA32_LANE picks the source
// pixel at x >> 16 and steps x by dx.  x/dx are widened to 64 bit so they
// can live in general registers used for addressing.
void ScaleARGBCols_NEON(uint8_t* dst_argb,
                        const uint8_t* src_argb,
                        int dst_width,
                        int x,
                        int dx) {
  const uint8_t* src_tmp = src_argb;
  int64_t x64 = (int64_t)x;    // NOLINT
  int64_t dx64 = (int64_t)dx;  // NOLINT
  int64_t tmp64;               // scratch for the macro's index computation
  asm volatile(
      "1:                                        \n"
      // clang-format off
      LOAD1_DATA32_LANE(v0, 0)
      LOAD1_DATA32_LANE(v0, 1)
      LOAD1_DATA32_LANE(v0, 2)
      LOAD1_DATA32_LANE(v0, 3)
      LOAD1_DATA32_LANE(v1, 0)
      LOAD1_DATA32_LANE(v1, 1)
      LOAD1_DATA32_LANE(v1, 2)
      LOAD1_DATA32_LANE(v1, 3)
      // clang-format on
      "st1        {v0.4s, v1.4s}, [%0], #32      \n"  // store pixels
      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
      "b.gt       1b                             \n"
      : "+r"(dst_argb),   // %0
        "+r"(src_argb),   // %1
        "+r"(dst_width),  // %2
        "+r"(x64),        // %3
        "+r"(dx64),       // %4
        "=&r"(tmp64),     // %5
        "+r"(src_tmp)     // %6
      :
      : "memory", "cc", "v0", "v1");
}
904 
905 #undef LOAD1_DATA32_LANE
906 
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
// Loads two adjacent 32-bit ARGB pixels (for bilinear filtering) into
// lane n of vn1 and vn2: %5 (scratch) = x >> 16, %6 = src (%1) plus that
// index * 4 bytes; x (%3) is then advanced by dx (%4) before the load.
#define LOAD2_DATA32_LANE(vn1, vn2, n)                  \
  "lsr        %5, %3, #16                           \n" \
  "add        %6, %1, %5, lsl #2                    \n" \
  "add        %3, %3, %4                            \n" \
  "ld2        {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6]  \n"
914 
// Bilinear horizontal ARGB scaling, 4 pixels per loop.
// x and dx are 16.16 fixed point.  v5 tracks the four lane x values
// (x, x+dx, x+2dx, x+3dx) and is advanced by 4*dx (v6) per iteration.
// The fraction f is reduced to 7 bits (shrn #9, mask 0x7f); each output
// byte is (a * (128 - f) + b * f) >> 7 where a/b are the two neighboring
// source pixels loaded by LOAD2_DATA32_LANE.
void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
                              const uint8_t* src_argb,
                              int dst_width,
                              int x,
                              int dx) {
  int dx_offset[4] = {0, 1, 2, 3};  // per-lane dx multipliers
  int* tmp = dx_offset;
  const uint8_t* src_tmp = src_argb;
  int64_t x64 = (int64_t)x;    // NOLINT
  int64_t dx64 = (int64_t)dx;  // NOLINT
  asm volatile (
    "dup        v0.4s, %w3                     \n"  // x
    "dup        v1.4s, %w4                     \n"  // dx
    "ld1        {v2.4s}, [%5]                  \n"  // 0 1 2 3
    "shl        v6.4s, v1.4s, #2               \n"  // 4 * dx
    "mul        v1.4s, v1.4s, v2.4s            \n"
    "movi       v3.16b, #0x7f                  \n"  // 0x7F
    "movi       v4.8h, #0x7f                   \n"  // 0x7F
    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
    "add        v5.4s, v1.4s, v0.4s            \n"
  "1:                                          \n"
    // d0, d1: a
    // d2, d3: b
    LOAD2_DATA32_LANE(v0, v1, 0)
    LOAD2_DATA32_LANE(v0, v1, 1)
    LOAD2_DATA32_LANE(v0, v1, 2)
    LOAD2_DATA32_LANE(v0, v1, 3)
    "shrn       v2.4h, v5.4s, #9               \n"  // 7-bit fraction per lane
    "and        v2.8b, v2.8b, v4.8b            \n"
    "dup        v16.8b, v2.b[0]                \n"  // splat fraction across
    "dup        v17.8b, v2.b[2]                \n"  // the 4 bytes of each
    "dup        v18.8b, v2.b[4]                \n"  // ARGB pixel
    "dup        v19.8b, v2.b[6]                \n"
    "ext        v2.8b, v16.8b, v17.8b, #4      \n"
    "ext        v17.8b, v18.8b, v19.8b, #4     \n"
    "ins        v2.d[1], v17.d[0]              \n"  // f
    "eor        v7.16b, v2.16b, v3.16b         \n"  // 0x7f ^ f
    "umull      v16.8h, v0.8b, v7.8b           \n"  // a * (128 - f)
    "umull2     v17.8h, v0.16b, v7.16b         \n"
    "umull      v18.8h, v1.8b, v2.8b           \n"  // b * f
    "umull2     v19.8h, v1.16b, v2.16b         \n"
    "add        v16.8h, v16.8h, v18.8h         \n"
    "add        v17.8h, v17.8h, v19.8h         \n"
    "shrn       v0.8b, v16.8h, #7              \n"  // >> 7 back to bytes
    "shrn2      v0.16b, v17.8h, #7             \n"

    "st1     {v0.4s}, [%0], #16                \n"  // store pixels
    "add     v5.4s, v5.4s, v6.4s               \n"
    "subs    %w2, %w2, #4                      \n"  // 4 processed per loop
    "b.gt    1b                                \n"
  : "+r"(dst_argb),         // %0
    "+r"(src_argb),         // %1
    "+r"(dst_width),        // %2
    "+r"(x64),              // %3
    "+r"(dx64),             // %4
    "+r"(tmp),              // %5
    "+r"(src_tmp)           // %6
  :
  : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
    "v6", "v7", "v16", "v17", "v18", "v19"
  );
}
977 
978 #undef LOAD2_DATA32_LANE
979 
// Read 16x2 average down and write 8x1.
// 16-bit variant of the 2x2 box filter: each output uint16 is the rounded
// average of a 2x2 block.  uaddlp sums horizontal pairs into 32 bits,
// uadalp accumulates the second row, rshrn divides by 4 with rounding.
void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint16_t* dst,
                              int dst_width) {
  asm volatile(
      // change the stride to row 2 pointer
      "add        %1, %0, %1, lsl #1             \n"  // ptr + stride * 2
      "1:                                        \n"
      "ld1        {v0.8h, v1.8h}, [%0], #32      \n"  // load row 1 and post inc
      "ld1        {v2.8h, v3.8h}, [%1], #32      \n"  // load row 2 and post inc
      "subs       %w3, %w3, #8                   \n"  // 8 processed per loop
      "uaddlp     v0.4s, v0.8h                   \n"  // row 1 add adjacent
      "uaddlp     v1.4s, v1.8h                   \n"
      "uadalp     v0.4s, v2.8h                   \n"  // +row 2 add adjacent
      "uadalp     v1.4s, v3.8h                   \n"
      "rshrn      v0.4h, v0.4s, #2               \n"  // round and pack
      "rshrn2     v0.8h, v1.4s, #2               \n"
      "st1        {v0.8h}, [%2], #16             \n"
      "b.gt       1b                             \n"
      : "+r"(src_ptr),     // %0
        "+r"(src_stride),  // %1
        "+r"(dst),         // %2
        "+r"(dst_width)    // %3
      :
      // "memory" and "cc" are required: the asm stores through dst and
      // the subs instruction modifies the condition flags.  Without them
      // the compiler may cache or reorder memory accesses across the asm.
      : "memory", "cc", "v0", "v1", "v2", "v3"  // Clobber List
      );
}
1008 
// Read 8x2 upsample with filtering and write 16x1.
// Actually reads an extra pixel, so 9x2.
// 2x bilinear upsample for 16-bit rows: output pixels are weighted
// 9/3/3/1 combinations of the surrounding 2x2 source pixels, packed
// with uqrshrn (>> 4 with rounding and saturation).  The immediate
// operands %4 = 2 and %5 = 14 are byte post-increments: advance one
// pixel between the TL/TR (and BL/BR) loads, then seven pixels to the
// next block, for a net 8 source pixels -> 16 dst pixels per iteration.
void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint16_t* dst,
                         int dst_width) {
  asm volatile(
      "add        %1, %0, %1, lsl #1             \n"  // ptr + stride * 2
      "movi       v0.8h, #9                      \n"  // constants
      "movi       v1.4s, #3                      \n"

      "1:                                        \n"
      "ld1        {v3.8h}, [%0], %4              \n"  // TL read first 8
      "ld1        {v4.8h}, [%0], %5              \n"  // TR read 8 offset by 1
      "ld1        {v5.8h}, [%1], %4              \n"  // BL read 8 from next row
      "ld1        {v6.8h}, [%1], %5              \n"  // BR offset by 1
      "subs       %w3, %w3, #16                  \n"  // 16 dst pixels per loop
      "umull      v16.4s, v3.4h, v0.4h           \n"  // 9 * nearest corner
      "umull2     v7.4s, v3.8h, v0.8h            \n"
      "umull      v18.4s, v4.4h, v0.4h           \n"
      "umull2     v17.4s, v4.8h, v0.8h           \n"
      "uaddw      v16.4s, v16.4s, v6.4h          \n"  // + 1 * far corner
      "uaddl2     v19.4s, v6.8h, v3.8h           \n"
      "uaddl      v3.4s, v6.4h, v3.4h            \n"
      "uaddw2     v6.4s, v7.4s, v6.8h            \n"
      "uaddl2     v7.4s, v5.8h, v4.8h            \n"
      "uaddl      v4.4s, v5.4h, v4.4h            \n"
      "uaddw      v18.4s, v18.4s, v5.4h          \n"
      "mla        v16.4s, v4.4s, v1.4s           \n"  // + 3 * adjacent pair
      "mla        v18.4s, v3.4s, v1.4s           \n"
      "mla        v6.4s, v7.4s, v1.4s            \n"
      "uaddw2     v4.4s, v17.4s, v5.8h           \n"
      "uqrshrn    v16.4h,  v16.4s, #4            \n"  // /16, round, saturate
      "mla        v4.4s, v19.4s, v1.4s           \n"
      "uqrshrn2   v16.8h, v6.4s, #4              \n"
      "uqrshrn    v17.4h, v18.4s, #4             \n"
      "uqrshrn2   v17.8h, v4.4s, #4              \n"
      "st2        {v16.8h-v17.8h}, [%2], #32     \n"  // interleave even/odd
      "b.gt       1b                             \n"
      : "+r"(src_ptr),     // %0
        "+r"(src_stride),  // %1
        "+r"(dst),         // %2
        "+r"(dst_width)    // %3
      : "r"(2LL),          // %4
        "r"(14LL)          // %5
      // "memory" and "cc" are required: the asm stores through dst and
      // the subs instruction modifies the condition flags.  Without them
      // the compiler may cache or reorder memory accesses across the asm.
      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
        "v16", "v17", "v18", "v19"  // Clobber List
      );
}
1058 
1059 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
1060 
1061 #ifdef __cplusplus
1062 }  // extern "C"
1063 }  // namespace libyuv
1064 #endif
1065