/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Janne Grunau
 * Copyright © 2018, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

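// avg: plain bidirectional average of the two 16-bit intermediate
// buffers at x2/x3. As a rough C sketch (not the normative reference):
//   dst[x] = clip_u8((tmp1[x] + tmp2[x] + 16) >> 5)
// i.e. the sum is narrowed with a rounding, saturating shift by
// intermediate_bits + 1 = 5 for 8 bpc.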
.macro avg dst, t0, t1, t2, t3
        ld1             {\t0\().8h,\t1\().8h},   [x2],  32
        ld1             {\t2\().8h,\t3\().8h},   [x3],  32
        add             \t0\().8h,   \t0\().8h,   \t2\().8h
        add             \t1\().8h,   \t1\().8h,   \t3\().8h
        sqrshrun        \dst\().8b,  \t0\().8h,   #5
        sqrshrun2       \dst\().16b, \t1\().8h,   #5
.endm

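// w_avg: weighted average. v30 is preloaded with -weight << 11 by
// bidir_fn below, so the sqdmulh computes roughly
// (tmp2 - tmp1) * -weight / 16; adding tmp2 back in gives, as a sketch:
//   dst[x] = clip_u8((tmp1[x]*w + tmp2[x]*(16 - w) + 128) >> 8)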
.macro w_avg dst, t0, t1, t2, t3
        ld1             {\t0\().8h,\t1\().8h},   [x2],  32
        ld1             {\t2\().8h,\t3\().8h},   [x3],  32
        sub             \t0\().8h,   \t2\().8h,   \t0\().8h
        sub             \t1\().8h,   \t3\().8h,   \t1\().8h
        sqdmulh         \t0\().8h,   \t0\().8h,   v30.8h
        sqdmulh         \t1\().8h,   \t1\().8h,   v30.8h
        add             \t0\().8h,   \t2\().8h,   \t0\().8h
        add             \t1\().8h,   \t3\().8h,   \t1\().8h
        sqrshrun        \dst\().8b,  \t0\().8h,   #4
        sqrshrun2       \dst\().16b, \t1\().8h,   #4
.endm

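// mask: like w_avg, but with a per-pixel 6-bit weight loaded from x6.
// v31 holds -2 (#256-2 as a signed byte), so the mul + shll pair forms
// -mask << 9, matching the -weight << 11 layout of w_avg at the 6-bit
// mask precision. As a sketch:
//   dst[x] = clip_u8((tmp1[x]*m[x] + tmp2[x]*(64 - m[x]) + 512) >> 10)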
.macro mask dst, t0, t1, t2, t3
        ld1             {v30.16b}, [x6],  16
        ld1             {\t0\().8h,\t1\().8h},   [x2],  32
        mul             v30.16b, v30.16b, v31.16b
        ld1             {\t2\().8h,\t3\().8h},   [x3],  32
        shll            v28.8h, v30.8b,  #8
        shll2           v29.8h, v30.16b, #8
        sub             \t0\().8h,   \t2\().8h,   \t0\().8h
        sub             \t1\().8h,   \t3\().8h,   \t1\().8h
        sqdmulh         \t0\().8h,   \t0\().8h,   v28.8h
        sqdmulh         \t1\().8h,   \t1\().8h,   v29.8h
        add             \t0\().8h,   \t2\().8h,   \t0\().8h
        add             \t1\().8h,   \t3\().8h,   \t1\().8h
        sqrshrun        \dst\().8b,  \t0\().8h,   #4
        sqrshrun2       \dst\().16b, \t1\().8h,   #4
.endm

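// bidir_fn: common wrapper for avg/w_avg/mask (x0 = dst, x1 = stride,
// x2/x3 = tmp1/tmp2, w4 = w, w5 = h). clz(w4) - 24 indexes the .hword
// jump table at the end of the function, dispatching to a block
// specialized for each width; w5 counts the remaining rows.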
.macro bidir_fn type
function \type\()_8bpc_neon, export=1
        clz             w4,  w4
.ifc \type, w_avg
        dup             v30.8h, w6
        neg             v30.8h, v30.8h
        shl             v30.8h, v30.8h, #11
.endif
.ifc \type, mask
        movi            v31.16b, #256-2
.endif
        adr             x7,  L(\type\()_tbl)
        sub             w4,  w4,  #24
        ldrh            w4,  [x7, x4, lsl #1]
        \type           v4,  v0,  v1,  v2,  v3
        sub             x7,  x7,  w4, uxtw
        br              x7
40:
        AARCH64_VALID_JUMP_TARGET
        add             x7,  x0,  x1
        lsl             x1,  x1,  #1
4:
        cmp             w5,  #4
        st1             {v4.s}[0],  [x0], x1
        st1             {v4.s}[1],  [x7], x1
        st1             {v4.s}[2],  [x0], x1
        st1             {v4.s}[3],  [x7], x1
        b.eq            0f
        \type           v5,  v0,  v1,  v2,  v3
        cmp             w5,  #8
        st1             {v5.s}[0],  [x0], x1
        st1             {v5.s}[1],  [x7], x1
        st1             {v5.s}[2],  [x0], x1
        st1             {v5.s}[3],  [x7], x1
        b.eq            0f
        \type           v4,  v0,  v1,  v2,  v3
        st1             {v4.s}[0],  [x0], x1
        st1             {v4.s}[1],  [x7], x1
        \type           v5,  v0,  v1,  v2,  v3
        st1             {v4.s}[2],  [x0], x1
        st1             {v4.s}[3],  [x7], x1
        st1             {v5.s}[0],  [x0], x1
        st1             {v5.s}[1],  [x7], x1
        st1             {v5.s}[2],  [x0], x1
        st1             {v5.s}[3],  [x7], x1
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        add             x7,  x0,  x1
        lsl             x1,  x1,  #1
8:
        st1             {v4.d}[0],  [x0], x1
        \type           v5,  v0,  v1,  v2,  v3
        st1             {v4.d}[1],  [x7], x1
        st1             {v5.d}[0],  [x0], x1
        subs            w5,  w5,  #4
        st1             {v5.d}[1],  [x7], x1
        b.le            0f
        \type           v4,  v0,  v1,  v2,  v3
        b               8b
16:
        AARCH64_VALID_JUMP_TARGET
        \type           v5,  v0,  v1,  v2,  v3
        st1             {v4.16b}, [x0], x1
        \type           v6,  v0,  v1,  v2,  v3
        st1             {v5.16b}, [x0], x1
        \type           v7,  v0,  v1,  v2,  v3
        st1             {v6.16b}, [x0], x1
        subs            w5,  w5,  #4
        st1             {v7.16b}, [x0], x1
        b.le            0f
        \type           v4,  v0,  v1,  v2,  v3
        b               16b
320:
        AARCH64_VALID_JUMP_TARGET
        add             x7,  x0,  x1
        lsl             x1,  x1,  #1
32:
        \type           v5,  v0,  v1,  v2,  v3
        \type           v6,  v0,  v1,  v2,  v3
        st1             {v4.16b,v5.16b}, [x0], x1
        \type           v7,  v0,  v1,  v2,  v3
        subs            w5,  w5,  #2
        st1             {v6.16b,v7.16b}, [x7], x1
        b.le            0f
        \type           v4,  v0,  v1,  v2,  v3
        b               32b
640:
        AARCH64_VALID_JUMP_TARGET
        add             x7,  x0,  x1
        lsl             x1,  x1,  #1
64:
        \type           v5,  v0,  v1,  v2,  v3
        \type           v6,  v0,  v1,  v2,  v3
        \type           v7,  v0,  v1,  v2,  v3
        \type           v16, v0,  v1,  v2,  v3
        \type           v17, v0,  v1,  v2,  v3
        st1             {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
        \type           v18, v0,  v1,  v2,  v3
        \type           v19, v0,  v1,  v2,  v3
        subs            w5,  w5,  #2
        st1             {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
        b.le            0f
        \type           v4, v0,  v1,  v2,  v3
        b               64b
1280:
        AARCH64_VALID_JUMP_TARGET
        add             x7,  x0,  #64
128:
        \type           v5,  v0,  v1,  v2,  v3
        \type           v6,  v0,  v1,  v2,  v3
        \type           v7,  v0,  v1,  v2,  v3
        \type           v16, v0,  v1,  v2,  v3
        \type           v17, v0,  v1,  v2,  v3
        st1             {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
        \type           v18, v0,  v1,  v2,  v3
        \type           v19, v0,  v1,  v2,  v3
        subs            w5,  w5,  #1
        st1             {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
        b.le            0f
        \type           v4, v0,  v1,  v2,  v3
        b               128b
0:
        ret
L(\type\()_tbl):
        .hword L(\type\()_tbl) - 1280b
        .hword L(\type\()_tbl) -  640b
        .hword L(\type\()_tbl) -  320b
        .hword L(\type\()_tbl) -   16b
        .hword L(\type\()_tbl) -   80b
        .hword L(\type\()_tbl) -   40b
endfunc
.endm

bidir_fn avg
bidir_fn w_avg
bidir_fn mask


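// w_mask: blends tmp1 and tmp2 with a mask derived from their absolute
// difference, m = 64 - (sat(6903 - |tmp1 - tmp2|) >> 8), ranging from
// 38 (identical inputs) to 64 (large difference). The mask itself is
// also written out, downsampled per the chroma layout: 444 as-is,
// 422 halved horizontally, 420 halved in both dimensions (with rounding).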
.macro w_mask_fn type
function w_mask_\type\()_8bpc_neon, export=1
        clz             w8,  w4
        adr             x9,  L(w_mask_\type\()_tbl)
        sub             w8,  w8,  #24
        ldrh            w8,  [x9,  x8,  lsl #1]
        sub             x9,  x9,  w8,  uxtw
        mov             w10, #6903
        dup             v0.8h,   w10
.if \type == 444
        movi            v1.16b,  #64
.elseif \type == 422
        dup             v2.8b,   w7
        movi            v3.8b,   #129
        sub             v3.8b,   v3.8b,   v2.8b
.elseif \type == 420
        dup             v2.8h,   w7
        movi            v3.8h,   #1, lsl #8
        sub             v3.8h,   v3.8h,   v2.8h
.endif
        add             x12,  x0,  x1
        lsl             x1,   x1,  #1
        br              x9
4:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v4.8h,   v5.8h},   [x2],  #32  // tmp1 (four rows at once)
        ld1             {v6.8h,   v7.8h},   [x3],  #32  // tmp2 (four rows at once)
        subs            w5,  w5,  #4
        sub             v16.8h,  v6.8h,   v4.8h
        sub             v17.8h,  v7.8h,   v5.8h
        sabd            v18.8h,  v4.8h,   v6.8h
        sabd            v19.8h,  v5.8h,   v7.8h
        uqsub           v18.8h,  v0.8h,   v18.8h
        uqsub           v19.8h,  v0.8h,   v19.8h
        ushr            v18.8h,  v18.8h,  #8
        ushr            v19.8h,  v19.8h,  #8
        shl             v20.8h,  v18.8h,  #9
        shl             v21.8h,  v19.8h,  #9
        sqdmulh         v20.8h,  v20.8h,  v16.8h
        sqdmulh         v21.8h,  v21.8h,  v17.8h
        add             v20.8h,  v20.8h,  v4.8h
        add             v21.8h,  v21.8h,  v5.8h
        sqrshrun        v22.8b,  v20.8h,  #4
        sqrshrun        v23.8b,  v21.8h,  #4
.if \type == 444
        uzp1            v18.16b,  v18.16b, v19.16b      // Same as xtn, xtn2
        sub             v18.16b,  v1.16b,  v18.16b
        st1             {v18.16b}, [x6],  #16
.elseif \type == 422
        addp            v18.8h,   v18.8h,  v19.8h
        xtn             v18.8b,   v18.8h
        uhsub           v18.8b,   v3.8b,   v18.8b
        st1             {v18.8b},  [x6],  #8
.elseif \type == 420
        trn1            v24.2d,   v18.2d,  v19.2d
        trn2            v25.2d,   v18.2d,  v19.2d
        add             v24.8h,   v24.8h,  v25.8h
        addp            v18.8h,   v24.8h,  v24.8h
        sub             v18.4h,   v3.4h,   v18.4h
        rshrn           v18.8b,   v18.8h,  #2
        st1             {v18.s}[0],  [x6],  #4
.endif
        st1             {v22.s}[0],  [x0],  x1
        st1             {v22.s}[1],  [x12], x1
        st1             {v23.s}[0],  [x0],  x1
        st1             {v23.s}[1],  [x12], x1
        b.gt            4b
        ret
8:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v4.8h,   v5.8h},   [x2],  #32
        ld1             {v6.8h,   v7.8h},   [x3],  #32
        subs            w5,  w5,  #2
        sub             v16.8h,  v6.8h,   v4.8h
        sub             v17.8h,  v7.8h,   v5.8h
        sabd            v18.8h,  v4.8h,   v6.8h
        sabd            v19.8h,  v5.8h,   v7.8h
        uqsub           v18.8h,  v0.8h,   v18.8h
        uqsub           v19.8h,  v0.8h,   v19.8h
        ushr            v18.8h,  v18.8h,  #8
        ushr            v19.8h,  v19.8h,  #8
        shl             v20.8h,  v18.8h,  #9
        shl             v21.8h,  v19.8h,  #9
        sqdmulh         v20.8h,  v20.8h,  v16.8h
        sqdmulh         v21.8h,  v21.8h,  v17.8h
        add             v20.8h,  v20.8h,  v4.8h
        add             v21.8h,  v21.8h,  v5.8h
        sqrshrun        v22.8b,  v20.8h,  #4
        sqrshrun        v23.8b,  v21.8h,  #4
.if \type == 444
        uzp1            v18.16b, v18.16b, v19.16b       // Same as xtn, xtn2
        sub             v18.16b, v1.16b,  v18.16b
        st1             {v18.16b}, [x6],  #16
.elseif \type == 422
        addp            v18.8h,  v18.8h,  v19.8h
        xtn             v18.8b,  v18.8h
        uhsub           v18.8b,  v3.8b,   v18.8b
        st1             {v18.8b},  [x6],  #8
.elseif \type == 420
        add             v18.8h,  v18.8h,  v19.8h
        addp            v18.8h,  v18.8h,  v18.8h
        sub             v18.4h,  v3.4h,   v18.4h
        rshrn           v18.8b,  v18.8h,  #2
        st1             {v18.s}[0],  [x6],  #4
.endif
        st1             {v22.8b},  [x0],  x1
        st1             {v23.8b},  [x12], x1
        b.gt            8b
        ret
1280:
640:
320:
160:
        AARCH64_VALID_JUMP_TARGET
        mov             w11, w4
        sub             x1,  x1,  w4,  uxtw
.if \type == 444
        add             x10, x6,  w4,  uxtw
.elseif \type == 422
        add             x10, x6,  x11, lsr #1
.endif
        add             x9,  x3,  w4,  uxtw #1
        add             x7,  x2,  w4,  uxtw #1
161:
        mov             w8,  w4
16:
        ld1             {v4.8h,   v5.8h},   [x2],  #32
        ld1             {v6.8h,   v7.8h},   [x3],  #32
        ld1             {v16.8h,  v17.8h},  [x7],  #32
        ld1             {v18.8h,  v19.8h},  [x9],  #32
        subs            w8,  w8,  #16
        sub             v6.8h,   v6.8h,   v4.8h
        sub             v7.8h,   v7.8h,   v5.8h
        sub             v18.8h,  v18.8h,  v16.8h
        sub             v19.8h,  v19.8h,  v17.8h
        abs             v20.8h,  v6.8h
        abs             v21.8h,  v7.8h
        abs             v22.8h,  v18.8h
        abs             v23.8h,  v19.8h
        uqsub           v20.8h,  v0.8h,   v20.8h
        uqsub           v21.8h,  v0.8h,   v21.8h
        uqsub           v22.8h,  v0.8h,   v22.8h
        uqsub           v23.8h,  v0.8h,   v23.8h
        ushr            v20.8h,  v20.8h,  #8
        ushr            v21.8h,  v21.8h,  #8
        ushr            v22.8h,  v22.8h,  #8
        ushr            v23.8h,  v23.8h,  #8
        shl             v24.8h,  v20.8h,  #9
        shl             v25.8h,  v21.8h,  #9
        shl             v26.8h,  v22.8h,  #9
        shl             v27.8h,  v23.8h,  #9
        sqdmulh         v24.8h,  v24.8h,  v6.8h
        sqdmulh         v25.8h,  v25.8h,  v7.8h
        sqdmulh         v26.8h,  v26.8h,  v18.8h
        sqdmulh         v27.8h,  v27.8h,  v19.8h
        add             v24.8h,  v24.8h,  v4.8h
        add             v25.8h,  v25.8h,  v5.8h
        add             v26.8h,  v26.8h,  v16.8h
        add             v27.8h,  v27.8h,  v17.8h
        sqrshrun        v24.8b,  v24.8h,  #4
        sqrshrun        v25.8b,  v25.8h,  #4
        sqrshrun        v26.8b,  v26.8h,  #4
        sqrshrun        v27.8b,  v27.8h,  #4
.if \type == 444
        uzp1            v20.16b, v20.16b, v21.16b       // Same as xtn, xtn2
        uzp1            v21.16b, v22.16b, v23.16b       // Ditto
        sub             v20.16b, v1.16b,  v20.16b
        sub             v21.16b, v1.16b,  v21.16b
        st1             {v20.16b}, [x6],  #16
        st1             {v21.16b}, [x10], #16
.elseif \type == 422
        addp            v20.8h,  v20.8h,  v21.8h
        addp            v21.8h,  v22.8h,  v23.8h
        xtn             v20.8b,  v20.8h
        xtn             v21.8b,  v21.8h
        uhsub           v20.8b,  v3.8b,   v20.8b
        uhsub           v21.8b,  v3.8b,   v21.8b
        st1             {v20.8b},  [x6],  #8
        st1             {v21.8b},  [x10], #8
.elseif \type == 420
        add             v20.8h,  v20.8h,  v22.8h
        add             v21.8h,  v21.8h,  v23.8h
        addp            v20.8h,  v20.8h,  v21.8h
        sub             v20.8h,  v3.8h,   v20.8h
        rshrn           v20.8b,  v20.8h,  #2
        st1             {v20.8b},  [x6],  #8
.endif
        st1             {v24.8b,  v25.8b},  [x0],  #16
        st1             {v26.8b,  v27.8b},  [x12], #16
        b.gt            16b
        subs            w5,  w5,  #2
        add             x2,  x2,  w4,  uxtw #1
        add             x3,  x3,  w4,  uxtw #1
        add             x7,  x7,  w4,  uxtw #1
        add             x9,  x9,  w4,  uxtw #1
.if \type == 444
        add             x6,  x6,  w4,  uxtw
        add             x10, x10, w4,  uxtw
.elseif \type == 422
        add             x6,  x6,  x11, lsr #1
        add             x10, x10, x11, lsr #1
.endif
        add             x0,  x0,  x1
        add             x12, x12, x1
        b.gt            161b
        ret
L(w_mask_\type\()_tbl):
        .hword L(w_mask_\type\()_tbl) - 1280b
        .hword L(w_mask_\type\()_tbl) -  640b
        .hword L(w_mask_\type\()_tbl) -  320b
        .hword L(w_mask_\type\()_tbl) -  160b
        .hword L(w_mask_\type\()_tbl) -    8b
        .hword L(w_mask_\type\()_tbl) -    4b
endfunc
.endm

w_mask_fn 444
w_mask_fn 422
w_mask_fn 420


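// blend: masked blend of a coefficient block into dst, roughly
//   dst[x] = (src[x]*m[x] + dst[x]*(64 - m[x]) + 32) >> 6
// computed with umull/umlal against the complement mask (v4 holds the
// constant 64). blend_h and blend_v below do the same with obmc_masks
// applied per row or per column.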
function blend_8bpc_neon, export=1
        adr             x6,  L(blend_tbl)
        clz             w3,  w3
        sub             w3,  w3,  #26
        ldrh            w3,  [x6,  x3,  lsl #1]
        sub             x6,  x6,  w3,  uxtw
        movi            v4.16b,  #64
        add             x8,  x0,  x1
        lsl             x1,  x1,  #1
        br              x6
4:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8b},     [x5],  #8
        ld1             {v1.d}[0],   [x2],  #8
        ld1             {v0.s}[0],   [x0]
        subs            w4,  w4,  #2
        ld1             {v0.s}[1],   [x8]
        sub             v3.8b,   v4.8b,   v2.8b
        umull           v5.8h,   v1.8b,   v2.8b
        umlal           v5.8h,   v0.8b,   v3.8b
        rshrn           v6.8b,   v5.8h,   #6
        st1             {v6.s}[0],   [x0],  x1
        st1             {v6.s}[1],   [x8],  x1
        b.gt            4b
        ret
8:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.16b},  [x5],  #16
        ld1             {v1.16b},  [x2],  #16
        ld1             {v0.d}[0],   [x0]
        ld1             {v0.d}[1],   [x8]
        sub             v3.16b,  v4.16b,  v2.16b
        subs            w4,  w4,  #2
        umull           v5.8h,   v1.8b,   v2.8b
        umlal           v5.8h,   v0.8b,   v3.8b
        umull2          v6.8h,   v1.16b,  v2.16b
        umlal2          v6.8h,   v0.16b,  v3.16b
        rshrn           v7.8b,   v5.8h,   #6
        rshrn2          v7.16b,  v6.8h,   #6
        st1             {v7.d}[0],   [x0],  x1
        st1             {v7.d}[1],   [x8],  x1
        b.gt            8b
        ret
16:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.16b,  v2.16b},  [x5],  #32
        ld1             {v5.16b,  v6.16b},  [x2],  #32
        ld1             {v0.16b},  [x0]
        subs            w4,  w4,  #2
        sub             v7.16b,  v4.16b,  v1.16b
        sub             v20.16b, v4.16b,  v2.16b
        ld1             {v3.16b},  [x8]
        umull           v16.8h,  v5.8b,   v1.8b
        umlal           v16.8h,  v0.8b,   v7.8b
        umull2          v17.8h,  v5.16b,  v1.16b
        umlal2          v17.8h,  v0.16b,  v7.16b
        umull           v21.8h,  v6.8b,   v2.8b
        umlal           v21.8h,  v3.8b,   v20.8b
        umull2          v22.8h,  v6.16b,  v2.16b
        umlal2          v22.8h,  v3.16b,  v20.16b
        rshrn           v18.8b,  v16.8h,  #6
        rshrn2          v18.16b, v17.8h,  #6
        rshrn           v19.8b,  v21.8h,  #6
        rshrn2          v19.16b, v22.8h,  #6
        st1             {v18.16b}, [x0],  x1
        st1             {v19.16b}, [x8],  x1
        b.gt            16b
        ret
32:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b,  v1.16b,  v2.16b,  v3.16b},  [x5],  #64
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2],  #64
        ld1             {v20.16b, v21.16b}, [x0]
        subs            w4,  w4,  #2
        ld1             {v22.16b, v23.16b}, [x8]
        sub             v5.16b,  v4.16b,  v0.16b
        sub             v6.16b,  v4.16b,  v1.16b
        sub             v30.16b, v4.16b,  v2.16b
        sub             v31.16b, v4.16b,  v3.16b
        umull           v24.8h,  v16.8b,  v0.8b
        umlal           v24.8h,  v20.8b,  v5.8b
        umull2          v26.8h,  v16.16b, v0.16b
        umlal2          v26.8h,  v20.16b, v5.16b
        umull           v28.8h,  v17.8b,  v1.8b
        umlal           v28.8h,  v21.8b,  v6.8b
        umull2          v7.8h,   v17.16b, v1.16b
        umlal2          v7.8h,   v21.16b, v6.16b
        umull           v27.8h,  v18.8b,  v2.8b
        umlal           v27.8h,  v22.8b,  v30.8b
        umull2          v1.8h,   v18.16b, v2.16b
        umlal2          v1.8h,   v22.16b, v30.16b
        umull           v29.8h,  v19.8b,  v3.8b
        umlal           v29.8h,  v23.8b,  v31.8b
        umull2          v21.8h,  v19.16b, v3.16b
        umlal2          v21.8h,  v23.16b, v31.16b
        rshrn           v24.8b,  v24.8h,  #6
        rshrn2          v24.16b, v26.8h,  #6
        rshrn           v25.8b,  v28.8h,  #6
        rshrn2          v25.16b, v7.8h,   #6
        rshrn           v27.8b,  v27.8h,  #6
        rshrn2          v27.16b, v1.8h,   #6
        rshrn           v28.8b,  v29.8h,  #6
        rshrn2          v28.16b, v21.8h,  #6
        st1             {v24.16b, v25.16b}, [x0],  x1
        st1             {v27.16b, v28.16b}, [x8],  x1
        b.gt            32b
        ret
L(blend_tbl):
        .hword L(blend_tbl) - 32b
        .hword L(blend_tbl) - 16b
        .hword L(blend_tbl) -  8b
        .hword L(blend_tbl) -  4b
endfunc

function blend_h_8bpc_neon, export=1
        adr             x6,  L(blend_h_tbl)
        movrel          x5,  X(obmc_masks)
        add             x5,  x5,  w4,  uxtw
        sub             w4,  w4,  w4,  lsr #2
        clz             w7,  w3
        movi            v4.16b,  #64
        add             x8,  x0,  x1
        lsl             x1,  x1,  #1
        sub             w7,  w7,  #24
        ldrh            w7,  [x6,  x7,  lsl #1]
        sub             x6,  x6,  w7, uxtw
        br              x6
2:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.h}[0],   [x5],  #2
        ld1             {v1.s}[0],   [x2],  #4
        subs            w4,  w4,  #2
        ld1             {v2.h}[0],   [x0]
        zip1            v0.8b,   v0.8b,   v0.8b
        sub             v3.8b,   v4.8b,   v0.8b
        ld1             {v2.h}[1],   [x8]
        umull           v5.8h,   v1.8b,   v0.8b
        umlal           v5.8h,   v2.8b,   v3.8b
        rshrn           v5.8b,   v5.8h,   #6
        st1             {v5.h}[0],   [x0],  x1
        st1             {v5.h}[1],   [x8],  x1
        b.gt            2b
        ret
4:
        AARCH64_VALID_JUMP_TARGET
        ld2r            {v0.8b,   v1.8b},   [x5],  #2
        ld1             {v2.8b},   [x2],  #8
        subs            w4,  w4,  #2
        ext             v0.8b,   v0.8b,   v1.8b,   #4
        ld1             {v3.s}[0],   [x0]
        sub             v5.8b,   v4.8b,   v0.8b
        ld1             {v3.s}[1],   [x8]
        umull           v6.8h,   v2.8b,   v0.8b
        umlal           v6.8h,   v3.8b,   v5.8b
        rshrn           v6.8b,   v6.8h,   #6
        st1             {v6.s}[0],   [x0],  x1
        st1             {v6.s}[1],   [x8],  x1
        b.gt            4b
        ret
8:
        AARCH64_VALID_JUMP_TARGET
        ld2r            {v0.16b,  v1.16b},  [x5],  #2
        ld1             {v2.16b},  [x2],  #16
        ld1             {v3.d}[0],   [x0]
        ext             v0.16b,  v0.16b,  v1.16b,  #8
        sub             v5.16b,  v4.16b,  v0.16b
        ld1             {v3.d}[1],   [x8]
        subs            w4,  w4,  #2
        umull           v6.8h,   v0.8b,   v2.8b
        umlal           v6.8h,   v3.8b,   v5.8b
        umull2          v7.8h,   v0.16b,  v2.16b
        umlal2          v7.8h,   v3.16b,  v5.16b
        rshrn           v16.8b,  v6.8h,   #6
        rshrn2          v16.16b, v7.8h,   #6
        st1             {v16.d}[0],  [x0],  x1
        st1             {v16.d}[1],  [x8],  x1
        b.gt            8b
        ret
16:
        AARCH64_VALID_JUMP_TARGET
        ld2r            {v0.16b,  v1.16b},  [x5],  #2
        ld1             {v2.16b,  v3.16b},  [x2],  #32
        ld1             {v5.16b},  [x0]
        sub             v7.16b,  v4.16b,  v0.16b
        sub             v16.16b, v4.16b,  v1.16b
        ld1             {v6.16b},  [x8]
        subs            w4,  w4,  #2
        umull           v17.8h,  v0.8b,   v2.8b
        umlal           v17.8h,  v5.8b,   v7.8b
        umull2          v18.8h,  v0.16b,  v2.16b
        umlal2          v18.8h,  v5.16b,  v7.16b
        umull           v19.8h,  v1.8b,   v3.8b
        umlal           v19.8h,  v6.8b,   v16.8b
        umull2          v20.8h,  v1.16b,  v3.16b
        umlal2          v20.8h,  v6.16b,  v16.16b
        rshrn           v21.8b,  v17.8h,  #6
        rshrn2          v21.16b, v18.8h,  #6
        rshrn           v22.8b,  v19.8h,  #6
        rshrn2          v22.16b, v20.8h,  #6
        st1             {v21.16b}, [x0],  x1
        st1             {v22.16b}, [x8],  x1
        b.gt            16b
        ret
1280:
640:
320:
        AARCH64_VALID_JUMP_TARGET
        sub             x1,  x1,  w3,  uxtw
        add             x7,  x2,  w3,  uxtw
321:
        ld2r            {v0.16b,  v1.16b},  [x5],  #2
        mov             w6,  w3
        sub             v20.16b, v4.16b,  v0.16b
        sub             v21.16b, v4.16b,  v1.16b
32:
        ld1             {v16.16b, v17.16b}, [x2],  #32
        ld1             {v2.16b,  v3.16b},  [x0]
        subs            w6,  w6,  #32
        umull           v23.8h,  v0.8b,   v16.8b
        umlal           v23.8h,  v2.8b,   v20.8b
        ld1             {v18.16b, v19.16b}, [x7],  #32
        umull2          v27.8h,  v0.16b,  v16.16b
        umlal2          v27.8h,  v2.16b,  v20.16b
        ld1             {v6.16b,  v7.16b},  [x8]
        umull           v24.8h,  v0.8b,   v17.8b
        umlal           v24.8h,  v3.8b,   v20.8b
        umull2          v28.8h,  v0.16b,  v17.16b
        umlal2          v28.8h,  v3.16b,  v20.16b
        umull           v25.8h,  v1.8b,   v18.8b
        umlal           v25.8h,  v6.8b,   v21.8b
        umull2          v5.8h,   v1.16b,  v18.16b
        umlal2          v5.8h,   v6.16b,  v21.16b
        rshrn           v29.8b,  v23.8h,  #6
        rshrn2          v29.16b, v27.8h,  #6
        umull           v26.8h,  v1.8b,   v19.8b
        umlal           v26.8h,  v7.8b,   v21.8b
        umull2          v31.8h,  v1.16b,  v19.16b
        umlal2          v31.8h,  v7.16b,  v21.16b
        rshrn           v30.8b,  v24.8h,  #6
        rshrn2          v30.16b, v28.8h,  #6
        rshrn           v23.8b,  v25.8h,  #6
        rshrn2          v23.16b, v5.8h,   #6
        rshrn           v24.8b,  v26.8h,  #6
        st1             {v29.16b, v30.16b}, [x0],  #32
        rshrn2          v24.16b, v31.8h,  #6
        st1             {v23.16b, v24.16b}, [x8],  #32
        b.gt            32b
        subs            w4,  w4,  #2
        add             x0,  x0,  x1
        add             x8,  x8,  x1
        add             x2,  x2,  w3,  uxtw
        add             x7,  x7,  w3,  uxtw
        b.gt            321b
        ret
L(blend_h_tbl):
        .hword L(blend_h_tbl) - 1280b
        .hword L(blend_h_tbl) -  640b
        .hword L(blend_h_tbl) -  320b
        .hword L(blend_h_tbl) -   16b
        .hword L(blend_h_tbl) -    8b
        .hword L(blend_h_tbl) -    4b
        .hword L(blend_h_tbl) -    2b
endfunc

function blend_v_8bpc_neon, export=1
        adr             x6,  L(blend_v_tbl)
        movrel          x5,  X(obmc_masks)
        add             x5,  x5,  w3,  uxtw
        clz             w3,  w3
        movi            v4.16b,  #64
        add             x8,  x0,  x1
        lsl             x1,  x1,  #1
        sub             w3,  w3,  #26
        ldrh            w3,  [x6,  x3,  lsl #1]
        sub             x6,  x6,  w3,  uxtw
        br              x6
20:
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v0.8b},   [x5]
        sub             v1.8b,   v4.8b,   v0.8b
2:
        ld1             {v2.h}[0],   [x2],  #2
        ld1             {v3.b}[0],   [x0]
        subs            w4,  w4,  #2
        ld1             {v2.b}[1],   [x2]
        ld1             {v3.b}[1],   [x8]
        umull           v5.8h,   v2.8b,   v0.8b
        umlal           v5.8h,   v3.8b,   v1.8b
        rshrn           v5.8b,   v5.8h,   #6
        add             x2,  x2,  #2
        st1             {v5.b}[0],   [x0],  x1
        st1             {v5.b}[1],   [x8],  x1
        b.gt            2b
        ret
40:
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v0.2s},   [x5]
        sub             x1,  x1,  #2
        sub             v1.8b,   v4.8b,   v0.8b
4:
        ld1             {v2.8b},   [x2],  #8
        ld1             {v3.s}[0],   [x0]
        ld1             {v3.s}[1],   [x8]
        subs            w4,  w4,  #2
        umull           v5.8h,   v2.8b,   v0.8b
        umlal           v5.8h,   v3.8b,   v1.8b
        rshrn           v5.8b,   v5.8h,   #6
        st1             {v5.h}[0],   [x0],  #2
        st1             {v5.h}[2],   [x8],  #2
        st1             {v5.b}[2],   [x0],  x1
        st1             {v5.b}[6],   [x8],  x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v0.2d},   [x5]
        sub             x1,  x1,  #4
        sub             v1.16b,  v4.16b,  v0.16b
8:
        ld1             {v2.16b},  [x2],  #16
        ld1             {v3.d}[0],   [x0]
        ld1             {v3.d}[1],   [x8]
        subs            w4,  w4,  #2
        umull           v5.8h,  v0.8b,  v2.8b
        umlal           v5.8h,  v3.8b,  v1.8b
        umull2          v6.8h,  v0.16b, v2.16b
        umlal2          v6.8h,  v3.16b, v1.16b
        rshrn           v7.8b,  v5.8h,  #6
        rshrn2          v7.16b, v6.8h,  #6
        st1             {v7.s}[0],   [x0],  #4
        st1             {v7.s}[2],   [x8],  #4
        st1             {v7.h}[2],   [x0],  x1
        st1             {v7.h}[6],   [x8],  x1
        b.gt            8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b},  [x5]
        sub             x1,  x1,  #8
        sub             v2.16b,  v4.16b,  v0.16b
16:
        ld1             {v5.16b,  v6.16b},  [x2],  #32
        ld1             {v7.16b},  [x0]
        subs            w4,  w4,  #2
        ld1             {v16.16b}, [x8]
        umull           v17.8h,  v5.8b,   v0.8b
        umlal           v17.8h,  v7.8b,   v2.8b
        umull2          v18.8h,  v5.16b,  v0.16b
        umlal2          v18.8h,  v7.16b,  v2.16b
        umull           v20.8h,  v6.8b,   v0.8b
        umlal           v20.8h,  v16.8b,  v2.8b
        umull2          v21.8h,  v6.16b,  v0.16b
        umlal2          v21.8h,  v16.16b, v2.16b
        rshrn           v19.8b,  v17.8h,  #6
        rshrn2          v19.16b, v18.8h,  #6
        rshrn           v22.8b,  v20.8h,  #6
        rshrn2          v22.16b, v21.8h,  #6
        st1             {v19.8b},  [x0],  #8
        st1             {v22.8b},  [x8],  #8
        st1             {v19.s}[2],  [x0],  x1
        st1             {v22.s}[2],  [x8],  x1
        b.gt            16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b,  v1.16b},  [x5]
        sub             x1,  x1,  #16
        sub             v2.16b,  v4.16b,  v0.16b
        sub             v3.8b,   v4.8b,   v1.8b
32:
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2],  #64
        ld1             {v5.16b,  v6.16b},  [x0]
        subs            w4,  w4,  #2
        ld1             {v20.16b, v21.16b}, [x8]
        umull           v22.8h,  v16.8b,  v0.8b
        umlal           v22.8h,  v5.8b,   v2.8b
        umull2          v23.8h,  v16.16b, v0.16b
        umlal2          v23.8h,  v5.16b,  v2.16b
        umull           v28.8h,  v17.8b,  v1.8b
        umlal           v28.8h,  v6.8b,   v3.8b
        umull           v30.8h,  v18.8b,  v0.8b
        umlal           v30.8h,  v20.8b,  v2.8b
        umull2          v31.8h,  v18.16b, v0.16b
        umlal2          v31.8h,  v20.16b, v2.16b
        umull           v25.8h,  v19.8b,  v1.8b
        umlal           v25.8h,  v21.8b,  v3.8b
        rshrn           v24.8b,  v22.8h,  #6
        rshrn2          v24.16b, v23.8h,  #6
        rshrn           v28.8b,  v28.8h,  #6
        rshrn           v30.8b,  v30.8h,  #6
        rshrn2          v30.16b, v31.8h,  #6
        rshrn           v27.8b,  v25.8h,  #6
        st1             {v24.16b}, [x0],  #16
        st1             {v30.16b}, [x8],  #16
        st1             {v28.8b},  [x0],  x1
        st1             {v27.8b},  [x8],  x1
        b.gt            32b
        ret
L(blend_v_tbl):
        .hword L(blend_v_tbl) - 320b
        .hword L(blend_v_tbl) - 160b
        .hword L(blend_v_tbl) -  80b
        .hword L(blend_v_tbl) -  40b
        .hword L(blend_v_tbl) -  20b
endfunc


// This has the same signature as the put_8tap functions,
// and assumes that x8 is set to (clz(w)-24).
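// In C terms, a sketch of the register assignment (not an exported API):
//   put(uint8_t *dst /* x0 */, ptrdiff_t dst_stride /* x1 */,
//       const uint8_t *src /* x2 */, ptrdiff_t src_stride /* x3 */,
//       int w /* via x8 */, int h /* w5 */)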
function put_neon
        adr             x9,  L(put_tbl)
        ldrh            w8,  [x9, x8, lsl #1]
        sub             x9,  x9,  w8, uxtw
        br              x9

2:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.h}[0], [x2], x3
        ld1             {v1.h}[0], [x2], x3
        subs            w5,  w5,  #2
        st1             {v0.h}[0], [x0], x1
        st1             {v1.h}[0], [x0], x1
        b.gt            2b
        ret
4:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.s}[0], [x2], x3
        ld1             {v1.s}[0], [x2], x3
        subs            w5,  w5,  #2
        st1             {v0.s}[0], [x0], x1
        st1             {v1.s}[0], [x0], x1
        b.gt            4b
        ret
8:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8b}, [x2], x3
        ld1             {v1.8b}, [x2], x3
        subs            w5,  w5,  #2
        st1             {v0.8b}, [x0], x1
        st1             {v1.8b}, [x0], x1
        b.gt            8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        add             x8,  x0,  x1
        lsl             x1,  x1,  #1
        add             x9,  x2,  x3
        lsl             x3,  x3,  #1
16:
        ld1             {v0.16b}, [x2], x3
        ld1             {v1.16b}, [x9], x3
        subs            w5,  w5,  #2
        st1             {v0.16b}, [x0], x1
        st1             {v1.16b}, [x8], x1
        b.gt            16b
        ret
32:
        AARCH64_VALID_JUMP_TARGET
        ldp             x6,  x7,  [x2]
        ldp             x8,  x9,  [x2, #16]
        stp             x6,  x7,  [x0]
        subs            w5,  w5,  #1
        stp             x8,  x9,  [x0, #16]
        add             x2,  x2,  x3
        add             x0,  x0,  x1
        b.gt            32b
        ret
64:
        AARCH64_VALID_JUMP_TARGET
        ldp             x6,  x7,  [x2]
        ldp             x8,  x9,  [x2, #16]
        stp             x6,  x7,  [x0]
        ldp             x10, x11, [x2, #32]
        stp             x8,  x9,  [x0, #16]
        subs            w5,  w5,  #1
        ldp             x12, x13, [x2, #48]
        stp             x10, x11, [x0, #32]
        stp             x12, x13, [x0, #48]
        add             x2,  x2,  x3
        add             x0,  x0,  x1
        b.gt            64b
        ret
128:
        AARCH64_VALID_JUMP_TARGET
        ldp             q0,  q1,  [x2]
        ldp             q2,  q3,  [x2, #32]
        stp             q0,  q1,  [x0]
        ldp             q4,  q5,  [x2, #64]
        stp             q2,  q3,  [x0, #32]
        ldp             q6,  q7,  [x2, #96]
        subs            w5,  w5,  #1
        stp             q4,  q5,  [x0, #64]
        stp             q6,  q7,  [x0, #96]
        add             x2,  x2,  x3
        add             x0,  x0,  x1
        b.gt            128b
        ret

L(put_tbl):
        .hword L(put_tbl) - 128b
        .hword L(put_tbl) -  64b
        .hword L(put_tbl) -  32b
        .hword L(put_tbl) - 160b
        .hword L(put_tbl) -   8b
        .hword L(put_tbl) -   4b
        .hword L(put_tbl) -   2b
endfunc


// This has the same signature as the prep_8tap functions,
// and assumes that x8 is set to (clz(w)-24), and x7 to w*2.
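// In C terms, a sketch of the register assignment (not an exported API):
//   prep(int16_t *tmp /* x0 */, const uint8_t *src /* x1 */,
//        ptrdiff_t src_stride /* x2 */, int w /* w3 */, int h /* w4 */)
// Pixels are widened and shifted left by intermediate_bits (4 for 8 bpc).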
function prep_neon
        adr             x9,  L(prep_tbl)
        ldrh            w8,  [x9, x8, lsl #1]
        sub             x9,  x9,  w8, uxtw
        br              x9

4:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.s}[0], [x1], x2
        ld1             {v1.s}[0], [x1], x2
        subs            w4,  w4,  #2
        ushll           v0.8h, v0.8b, #4
        ushll           v1.8h, v1.8b, #4
        st1             {v0.4h, v1.4h}, [x0], #16
        b.gt            4b
        ret
8:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8b}, [x1], x2
        ld1             {v1.8b}, [x1], x2
        subs            w4,  w4,  #2
        ushll           v0.8h, v0.8b, #4
        ushll           v1.8h, v1.8b, #4
        st1             {v0.8h, v1.8h}, [x0], #32
        b.gt            8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        add             x9,  x1,  x2
        lsl             x2,  x2,  #1
16:
        ld1             {v0.16b}, [x1], x2
        ld1             {v1.16b}, [x9], x2
        subs            w4,  w4,  #2
        ushll           v4.8h, v0.8b,  #4
        ushll2          v5.8h, v0.16b, #4
        ushll           v6.8h, v1.8b,  #4
        ushll2          v7.8h, v1.16b, #4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
        b.gt            16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        add             x8,  x0,  w3, uxtw
32:
        ld1             {v0.16b, v1.16b},  [x1], x2
        subs            w4,  w4,  #2
        ushll           v4.8h,  v0.8b,  #4
        ushll2          v5.8h,  v0.16b, #4
        ld1             {v2.16b, v3.16b},  [x1], x2
        ushll           v6.8h,  v1.8b,  #4
        ushll2          v7.8h,  v1.16b, #4
        ushll           v16.8h, v2.8b,  #4
        st1             {v4.8h,  v5.8h},  [x0], x7
        ushll2          v17.8h, v2.16b, #4
        st1             {v6.8h,  v7.8h},  [x8], x7
        ushll           v18.8h, v3.8b,  #4
        st1             {v16.8h, v17.8h}, [x0], x7
        ushll2          v19.8h, v3.16b, #4
        st1             {v18.8h, v19.8h}, [x8], x7
        b.gt            32b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
        add             x8,  x0,  #32
        mov             x6,  #64
64:
        ldp             q0,  q1,  [x1]
        subs            w4,  w4,  #1
        ushll           v4.8h,  v0.8b,  #4
        ushll2          v5.8h,  v0.16b, #4
        ldp             q2,  q3,  [x1, #32]
        ushll           v6.8h,  v1.8b,  #4
        ushll2          v7.8h,  v1.16b, #4
        add             x1,  x1,  x2
        ushll           v16.8h, v2.8b,  #4
        st1             {v4.8h,  v5.8h},  [x0], x6
        ushll2          v17.8h, v2.16b, #4
        ushll           v18.8h, v3.8b,  #4
        st1             {v6.8h,  v7.8h},  [x8], x6
        ushll2          v19.8h, v3.16b, #4
        st1             {v16.8h, v17.8h}, [x0], x6
        st1             {v18.8h, v19.8h}, [x8], x6
        b.gt            64b
        ret
1280:
        AARCH64_VALID_JUMP_TARGET
        add             x8,  x0,  #64
        mov             x6,  #128
128:
        ldp             q0,  q1,  [x1]
        ldp             q2,  q3,  [x1, #32]
        ushll           v16.8h,  v0.8b,  #4
        ushll2          v17.8h,  v0.16b, #4
        ushll           v18.8h,  v1.8b,  #4
        ushll2          v19.8h,  v1.16b, #4
        ushll           v20.8h,  v2.8b,  #4
        ushll2          v21.8h,  v2.16b, #4
        ldp             q4,  q5,  [x1, #64]
        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x6
        ushll           v22.8h,  v3.8b,  #4
        ushll2          v23.8h,  v3.16b, #4
        ushll           v24.8h,  v4.8b,  #4
        ushll2          v25.8h,  v4.16b, #4
        ushll           v26.8h,  v5.8b,  #4
        ushll2          v27.8h,  v5.16b, #4
        ldp             q6,  q7,  [x1, #96]
        st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x8], x6
        ushll           v28.8h,  v6.8b,  #4
        ushll2          v29.8h,  v6.16b, #4
        ushll           v30.8h,  v7.8b,  #4
        ushll2          v31.8h,  v7.16b, #4
        subs            w4,  w4,  #1
        add             x1,  x1,  x2
        st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], x6
        st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x6
        b.gt            128b
        ret

L(prep_tbl):
        .hword L(prep_tbl) - 1280b
        .hword L(prep_tbl) -  640b
        .hword L(prep_tbl) -  320b
        .hword L(prep_tbl) -  160b
        .hword L(prep_tbl) -    8b
        .hword L(prep_tbl) -    4b
endfunc


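// Helper macros for the 8tap filters below: load rows into lanes or
// whole registers, interleave neighbouring rows for the narrow (2/4 px)
// paths, widen bytes to .8h, run the 4- and 8-tap mul/mla dot products
// against the coefficients in v0, then shift and store for put (pixels)
// or prep (16-bit intermediates).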
.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
        ld1             {\d0\wd}[0], [\s0], \strd
        ld1             {\d1\wd}[0], [\s1], \strd
.ifnb \d2
        ld1             {\d2\wd}[0], [\s0], \strd
        ld1             {\d3\wd}[0], [\s1], \strd
.endif
.ifnb \d4
        ld1             {\d4\wd}[0], [\s0], \strd
.endif
.ifnb \d5
        ld1             {\d5\wd}[0], [\s1], \strd
.endif
.ifnb \d6
        ld1             {\d6\wd}[0], [\s0], \strd
.endif
.endm
.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
        ld1             {\d0\wd}, [\s0], \strd
        ld1             {\d1\wd}, [\s1], \strd
.ifnb \d2
        ld1             {\d2\wd}, [\s0], \strd
        ld1             {\d3\wd}, [\s1], \strd
.endif
.ifnb \d4
        ld1             {\d4\wd}, [\s0], \strd
.endif
.ifnb \d5
        ld1             {\d5\wd}, [\s1], \strd
.endif
.ifnb \d6
        ld1             {\d6\wd}, [\s0], \strd
.endif
.endm
.macro load_h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_slice      \s0, \s1, \strd, .h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_slice      \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
.macro load_8b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_reg        \s0, \s1, \strd, .8b, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
.macro load_16b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_reg        \s0, \s1, \strd, .16b, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
.macro interleave_1 wd, r0, r1, r2, r3, r4
        trn1            \r0\wd, \r0\wd, \r1\wd
        trn1            \r1\wd, \r1\wd, \r2\wd
.ifnb \r3
        trn1            \r2\wd, \r2\wd, \r3\wd
        trn1            \r3\wd, \r3\wd, \r4\wd
.endif
.endm
.macro interleave_1_h r0, r1, r2, r3, r4
        interleave_1    .4h, \r0, \r1, \r2, \r3, \r4
.endm
.macro interleave_1_s r0, r1, r2, r3, r4
        interleave_1    .2s, \r0, \r1, \r2, \r3, \r4
.endm
.macro interleave_2 wd, r0, r1, r2, r3, r4, r5
        trn1            \r0\wd,  \r0\wd, \r2\wd
        trn1            \r1\wd,  \r1\wd, \r3\wd
        trn1            \r2\wd,  \r2\wd, \r4\wd
        trn1            \r3\wd,  \r3\wd, \r5\wd
.endm
.macro interleave_2_s r0, r1, r2, r3, r4, r5
        interleave_2    .2s, \r0, \r1, \r2, \r3, \r4, \r5
.endm
.macro uxtl_b r0, r1, r2, r3, r4, r5, r6
        uxtl            \r0\().8h, \r0\().8b
        uxtl            \r1\().8h, \r1\().8b
.ifnb \r2
        uxtl            \r2\().8h, \r2\().8b
        uxtl            \r3\().8h, \r3\().8b
.endif
.ifnb \r4
        uxtl            \r4\().8h, \r4\().8b
.endif
.ifnb \r5
        uxtl            \r5\().8h, \r5\().8b
.endif
.ifnb \r6
        uxtl            \r6\().8h, \r6\().8b
.endif
.endm
.macro mul_mla_4 d, s0, s1, s2, s3, wd
        mul             \d\wd,  \s0\wd,  v0.h[0]
        mla             \d\wd,  \s1\wd,  v0.h[1]
        mla             \d\wd,  \s2\wd,  v0.h[2]
        mla             \d\wd,  \s3\wd,  v0.h[3]
.endm
// Interleaving the mul/mla chains actually hurts performance
// significantly on Cortex A53, so the mul/mla sequences are kept
// tightly chained like this.
.macro mul_mla_8_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7
        mul             \d0\().4h, \s0\().4h, v0.h[0]
        mla             \d0\().4h, \s1\().4h, v0.h[1]
        mla             \d0\().4h, \s2\().4h, v0.h[2]
        mla             \d0\().4h, \s3\().4h, v0.h[3]
        mla             \d0\().4h, \s4\().4h, v0.h[4]
        mla             \d0\().4h, \s5\().4h, v0.h[5]
        mla             \d0\().4h, \s6\().4h, v0.h[6]
        mla             \d0\().4h, \s7\().4h, v0.h[7]
.endm
.macro mul_mla_8_0 d0, s0, s1, s2, s3, s4, s5, s6, s7
        mul             \d0\().8h, \s0\().8h, v0.h[0]
        mla             \d0\().8h, \s1\().8h, v0.h[1]
        mla             \d0\().8h, \s2\().8h, v0.h[2]
        mla             \d0\().8h, \s3\().8h, v0.h[3]
        mla             \d0\().8h, \s4\().8h, v0.h[4]
        mla             \d0\().8h, \s5\().8h, v0.h[5]
        mla             \d0\().8h, \s6\().8h, v0.h[6]
        mla             \d0\().8h, \s7\().8h, v0.h[7]
.endm
.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
        mul             \d0\().8h, \s0\().8h, v0.h[0]
        mla             \d0\().8h, \s1\().8h, v0.h[1]
        mla             \d0\().8h, \s2\().8h, v0.h[2]
        mla             \d0\().8h, \s3\().8h, v0.h[3]
        mla             \d0\().8h, \s4\().8h, v0.h[4]
        mla             \d0\().8h, \s5\().8h, v0.h[5]
        mla             \d0\().8h, \s6\().8h, v0.h[6]
        mla             \d0\().8h, \s7\().8h, v0.h[7]
        mul             \d1\().8h, \s1\().8h, v0.h[0]
        mla             \d1\().8h, \s2\().8h, v0.h[1]
        mla             \d1\().8h, \s3\().8h, v0.h[2]
        mla             \d1\().8h, \s4\().8h, v0.h[3]
        mla             \d1\().8h, \s5\().8h, v0.h[4]
        mla             \d1\().8h, \s6\().8h, v0.h[5]
        mla             \d1\().8h, \s7\().8h, v0.h[6]
        mla             \d1\().8h, \s8\().8h, v0.h[7]
.endm
.macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
        mul             \d0\().8h, \s0\().8h, v0.h[0]
        mla             \d0\().8h, \s1\().8h, v0.h[1]
        mla             \d0\().8h, \s2\().8h, v0.h[2]
        mla             \d0\().8h, \s3\().8h, v0.h[3]
        mla             \d0\().8h, \s4\().8h, v0.h[4]
        mla             \d0\().8h, \s5\().8h, v0.h[5]
        mla             \d0\().8h, \s6\().8h, v0.h[6]
        mla             \d0\().8h, \s7\().8h, v0.h[7]
        mul             \d1\().8h, \s2\().8h, v0.h[0]
        mla             \d1\().8h, \s3\().8h, v0.h[1]
        mla             \d1\().8h, \s4\().8h, v0.h[2]
        mla             \d1\().8h, \s5\().8h, v0.h[3]
        mla             \d1\().8h, \s6\().8h, v0.h[4]
        mla             \d1\().8h, \s7\().8h, v0.h[5]
        mla             \d1\().8h, \s8\().8h, v0.h[6]
        mla             \d1\().8h, \s9\().8h, v0.h[7]
.endm
.macro sqrshrun_b shift, r0, r1, r2, r3
        sqrshrun        \r0\().8b, \r0\().8h,  #\shift
.ifnb \r1
        sqrshrun        \r1\().8b, \r1\().8h,  #\shift
.endif
.ifnb \r2
        sqrshrun        \r2\().8b, \r2\().8h,  #\shift
        sqrshrun        \r3\().8b, \r3\().8h,  #\shift
.endif
.endm
.macro srshr_h shift, r0, r1, r2, r3
        srshr           \r0\().8h, \r0\().8h,  #\shift
.ifnb \r1
        srshr           \r1\().8h, \r1\().8h,  #\shift
.endif
.ifnb \r2
        srshr           \r2\().8h, \r2\().8h,  #\shift
        srshr           \r3\().8h, \r3\().8h,  #\shift
.endif
.endm
.macro st_h strd, reg, lanes
        st1             {\reg\().h}[0], [x0], \strd
        st1             {\reg\().h}[1], [x8], \strd
.if \lanes > 2
        st1             {\reg\().h}[2], [x0], \strd
        st1             {\reg\().h}[3], [x8], \strd
.endif
.endm
.macro st_s strd, r0, r1
        st1             {\r0\().s}[0], [x0], \strd
        st1             {\r0\().s}[1], [x8], \strd
.ifnb \r1
        st1             {\r1\().s}[0], [x0], \strd
        st1             {\r1\().s}[1], [x8], \strd
.endif
.endm
.macro st_d strd, r0, r1
        st1             {\r0\().d}[0], [x0], \strd
        st1             {\r0\().d}[1], [x8], \strd
.ifnb \r1
        st1             {\r1\().d}[0], [x0], \strd
        st1             {\r1\().d}[1], [x8], \strd
.endif
.endm
.macro shift_store_4 type, strd, r0, r1
.ifc \type, put
        sqrshrun_b      6,     \r0, \r1
        st_s            \strd, \r0, \r1
.else
        srshr_h         2,     \r0, \r1
        st_d            \strd, \r0, \r1
.endif
.endm
.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7
        st1             {\r0\wd}, [x0], \strd
        st1             {\r1\wd}, [x8], \strd
.ifnb \r2
        st1             {\r2\wd}, [x0], \strd
        st1             {\r3\wd}, [x8], \strd
.endif
.ifnb \r4
        st1             {\r4\wd}, [x0], \strd
        st1             {\r5\wd}, [x8], \strd
        st1             {\r6\wd}, [x0], \strd
        st1             {\r7\wd}, [x8], \strd
.endif
.endm
.macro st_8b strd, r0, r1, r2, r3, r4, r5, r6, r7
        st_reg          \strd, .8b,  \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
.endm
.macro st_16b strd, r0, r1, r2, r3, r4, r5, r6, r7
        st_reg          \strd, .16b, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
.endm
.macro shift_store_8 type, strd, r0, r1, r2, r3
.ifc \type, put
        sqrshrun_b      6,     \r0, \r1, \r2, \r3
        st_8b           \strd, \r0, \r1, \r2, \r3
.else
        srshr_h         2,     \r0, \r1, \r2, \r3
        st_16b          \strd, \r0, \r1, \r2, \r3
.endif
.endm
.macro shift_store_16 type, strd, r0, r1, r2, r3
.ifc \type, put
        sqrshrun        \r0\().8b,  \r0\().8h, #6
        sqrshrun2       \r0\().16b, \r1\().8h, #6
        sqrshrun        \r2\().8b,  \r2\().8h, #6
        sqrshrun2       \r2\().16b, \r3\().8h, #6
        st_16b          \strd, \r0, \r2
.else
        srshr_h         2,     \r0, \r1, \r2, \r3
        st1             {\r0\().8h, \r1\().8h}, [x0], \strd
        st1             {\r2\().8h, \r3\().8h}, [x8], \strd
.endif
.endm

.macro make_8tap_fn op, type, type_h, type_v
function \op\()_8tap_\type\()_8bpc_neon, export=1
        mov             x8,  \type_h
        mov             x9,  \type_v
        b               \op\()_8tap_neon
endfunc
.endm

// No spaces in these expressions, due to gas-preprocessor.
#define REGULAR ((0*15<<7)|3*15)
#define SMOOTH  ((1*15<<7)|4*15)
#define SHARP   ((2*15<<7)|3*15)
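// Each constant packs two row offsets into dav1d_mc_subpel_filters
// (multiples of 15, the number of subpel positions): bits 0-6 hold the
// 4-tap variant used for w <= 4, bits 7-13 the 8-tap variant used for
// w > 4. The mul by 0x4081 below replicates mx/my into bits 0, 7 and 14,
// so one add combines the subpel position with both offsets, and bits
// 14 and up keep the raw position for the "any subpel at all?" tests.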
1330
1331.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv
1332make_8tap_fn \type, regular,        REGULAR, REGULAR
1333make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
1334make_8tap_fn \type, regular_sharp,  REGULAR, SHARP
1335make_8tap_fn \type, smooth,         SMOOTH,  SMOOTH
1336make_8tap_fn \type, smooth_regular, SMOOTH,  REGULAR
1337make_8tap_fn \type, smooth_sharp,   SMOOTH,  SHARP
1338make_8tap_fn \type, sharp,          SHARP,   SHARP
1339make_8tap_fn \type, sharp_regular,  SHARP,   REGULAR
1340make_8tap_fn \type, sharp_smooth,   SHARP,   SMOOTH
1341
1342function \type\()_8tap_neon
1343        mov             w10,  #0x4081  // (1 << 14) | (1 << 7) | (1 << 0)
1344        mul             \mx,  \mx, w10
1345        mul             \my,  \my, w10
1346        add             \mx,  \mx, w8 // mx, 8tap_h, 4tap_h
1347        add             \my,  \my, w9 // my, 8tap_v, 4tap_v
1348.ifc \type, prep
1349        uxtw            \d_strd, \w
1350        lsl             \d_strd, \d_strd, #1
1351.endif
1352
1353        clz             w8,  \w
1354        tst             \mx, #(0x7f << 14)
1355        sub             w8,  w8,  #24
1356        movrel          x10, X(mc_subpel_filters), -8
1357        b.ne            L(\type\()_8tap_h)
1358        tst             \my, #(0x7f << 14)
1359        b.ne            L(\type\()_8tap_v)
1360        b               \type\()_neon
1361
1362L(\type\()_8tap_h):
1363        cmp             \w,  #4
1364        ubfx            w9,  \mx, #7, #7
1365        and             \mx, \mx, #0x7f
1366        b.le            4f
1367        mov             \mx,  w9
13684:
1369        tst             \my,  #(0x7f << 14)
1370        add             \xmx, x10, \mx, uxtw #3
1371        b.ne            L(\type\()_8tap_hv)
1372
        adr             x9,  L(\type\()_8tap_h_tbl)
        ldrh            w8,  [x9, x8, lsl #1]
        sub             x9,  x9,  w8, uxtw
        br              x9

20:     // 2xN h
        AARCH64_VALID_JUMP_TARGET
.ifc \type, put
        add             \xmx,  \xmx,  #2
        ld1             {v0.s}[0], [\xmx]
        sub             \src,  \src,  #1
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \d_strd,  \d_strd,  #1
        lsl             \s_strd,  \s_strd,  #1
        sxtl            v0.8h,  v0.8b
2:
        ld1             {v4.8b},  [\src], \s_strd
        ld1             {v6.8b},  [\sr2], \s_strd
        uxtl            v4.8h,  v4.8b
        uxtl            v6.8h,  v6.8b
        ext             v5.16b, v4.16b, v4.16b, #2
        ext             v7.16b, v6.16b, v6.16b, #2
        subs            \h,  \h,  #2
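        // Transpose the two rows into pair lanes so a single 4-tap
        // mul/mla chain filters both rows at once.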
        trn1            v3.2s,  v4.2s,  v6.2s
        trn2            v6.2s,  v4.2s,  v6.2s
        trn1            v4.2s,  v5.2s,  v7.2s
        trn2            v7.2s,  v5.2s,  v7.2s
        mul             v3.4h,  v3.4h,  v0.h[0]
        mla             v3.4h,  v4.4h,  v0.h[1]
        mla             v3.4h,  v6.4h,  v0.h[2]
        mla             v3.4h,  v7.4h,  v0.h[3]
        srshr           v3.4h,  v3.4h,  #2
        sqrshrun        v3.8b,  v3.8h,  #4
        st1             {v3.h}[0], [\dst], \d_strd
        st1             {v3.h}[1], [\ds2], \d_strd
        b.gt            2b
        ret
.endif

40:     // 4xN h
        AARCH64_VALID_JUMP_TARGET
        add             \xmx,  \xmx,  #2
        ld1             {v0.s}[0], [\xmx]
        sub             \src,  \src,  #1
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \d_strd,  \d_strd,  #1
        lsl             \s_strd,  \s_strd,  #1
        sxtl            v0.8h,  v0.8b
4:
        ld1             {v16.8b}, [\src], \s_strd
        ld1             {v20.8b}, [\sr2], \s_strd
        uxtl            v16.8h,  v16.8b
        uxtl            v20.8h,  v20.8b
        ext             v17.16b, v16.16b, v16.16b, #2
        ext             v18.16b, v16.16b, v16.16b, #4
        ext             v19.16b, v16.16b, v16.16b, #6
        ext             v21.16b, v20.16b, v20.16b, #2
        ext             v22.16b, v20.16b, v20.16b, #4
        ext             v23.16b, v20.16b, v20.16b, #6
        subs            \h,  \h,  #2
        mul             v16.4h,  v16.4h,  v0.h[0]
        mla             v16.4h,  v17.4h,  v0.h[1]
        mla             v16.4h,  v18.4h,  v0.h[2]
        mla             v16.4h,  v19.4h,  v0.h[3]
        mul             v20.4h,  v20.4h,  v0.h[0]
        mla             v20.4h,  v21.4h,  v0.h[1]
        mla             v20.4h,  v22.4h,  v0.h[2]
        mla             v20.4h,  v23.4h,  v0.h[3]
        srshr           v16.4h,  v16.4h,  #2
        srshr           v20.4h,  v20.4h,  #2
.ifc \type, put
        sqrshrun        v16.8b,  v16.8h,  #4
        sqrshrun        v20.8b,  v20.8h,  #4
        st1             {v16.s}[0], [\dst], \d_strd
        st1             {v20.s}[0], [\ds2], \d_strd
.else
        st1             {v16.4h}, [\dst], \d_strd
        st1             {v20.4h}, [\ds2], \d_strd
.endif
        b.gt            4b
        ret

80:     // 8xN h
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8b}, [\xmx]
        sub             \src,  \src,  #3
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \d_strd,  \d_strd,  #1
        lsl             \s_strd,  \s_strd,  #1
        sxtl            v0.8h, v0.8b
8:
        ld1             {v16.8b, v17.8b},  [\src], \s_strd
        ld1             {v20.8b, v21.8b},  [\sr2], \s_strd
        uxtl            v16.8h,  v16.8b
        uxtl            v17.8h,  v17.8b
        uxtl            v20.8h,  v20.8b
        uxtl            v21.8h,  v21.8b

        mul             v18.8h,  v16.8h,  v0.h[0]
        mul             v22.8h,  v20.8h,  v0.h[0]
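        // Taps 1-7: ext slides the window by 2*i bytes to pick the
        // i'th neighbouring pixel for each lane.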
.irpc i, 1234567
        ext             v19.16b, v16.16b, v17.16b, #(2*\i)
        ext             v23.16b, v20.16b, v21.16b, #(2*\i)
        mla             v18.8h,  v19.8h,  v0.h[\i]
        mla             v22.8h,  v23.8h,  v0.h[\i]
.endr
        subs            \h,  \h,  #2
        srshr           v18.8h,  v18.8h, #2
        srshr           v22.8h,  v22.8h, #2
.ifc \type, put
        sqrshrun        v18.8b,  v18.8h, #4
        sqrshrun        v22.8b,  v22.8h, #4
        st1             {v18.8b}, [\dst], \d_strd
        st1             {v22.8b}, [\ds2], \d_strd
.else
        st1             {v18.8h}, [\dst], \d_strd
        st1             {v22.8h}, [\ds2], \d_strd
.endif
        b.gt            8b
        ret
160:
320:
640:
1280:   // 16xN, 32xN, ... h
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8b}, [\xmx]
        sub             \src,  \src,  #3
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \s_strd,  \s_strd,  #1
        sxtl            v0.8h, v0.8b

        sub             \s_strd,  \s_strd,  \w, uxtw
        sub             \s_strd,  \s_strd,  #8
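        // Turn the strides into increments from the end of one row pair
        // to the start of the next (the src pointers overshoot the row
        // by 8 bytes because of the 3-register preload below).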
.ifc \type, put
        lsl             \d_strd,  \d_strd,  #1
        sub             \d_strd,  \d_strd,  \w, uxtw
.endif
161:
        ld1             {v16.8b, v17.8b, v18.8b},  [\src], #24
        ld1             {v20.8b, v21.8b, v22.8b},  [\sr2], #24
        mov             \mx, \w
        uxtl            v16.8h,  v16.8b
        uxtl            v17.8h,  v17.8b
        uxtl            v18.8h,  v18.8b
        uxtl            v20.8h,  v20.8b
        uxtl            v21.8h,  v21.8b
        uxtl            v22.8h,  v22.8b

16:
        mul             v24.8h,  v16.8h,  v0.h[0]
        mul             v25.8h,  v17.8h,  v0.h[0]
        mul             v26.8h,  v20.8h,  v0.h[0]
        mul             v27.8h,  v21.8h,  v0.h[0]
.irpc i, 1234567
        ext             v28.16b, v16.16b, v17.16b, #(2*\i)
        ext             v29.16b, v17.16b, v18.16b, #(2*\i)
        ext             v30.16b, v20.16b, v21.16b, #(2*\i)
        ext             v31.16b, v21.16b, v22.16b, #(2*\i)
        mla             v24.8h,  v28.8h,  v0.h[\i]
        mla             v25.8h,  v29.8h,  v0.h[\i]
        mla             v26.8h,  v30.8h,  v0.h[\i]
        mla             v27.8h,  v31.8h,  v0.h[\i]
.endr
        srshr           v24.8h,  v24.8h, #2
        srshr           v25.8h,  v25.8h, #2
        srshr           v26.8h,  v26.8h, #2
        srshr           v27.8h,  v27.8h, #2
        subs            \mx, \mx, #16
.ifc \type, put
        sqrshrun        v24.8b,  v24.8h, #4
        sqrshrun2       v24.16b, v25.8h, #4
        sqrshrun        v26.8b,  v26.8h, #4
        sqrshrun2       v26.16b, v27.8h, #4
        st1             {v24.16b}, [\dst], #16
        st1             {v26.16b}, [\ds2], #16
.else
        st1             {v24.8h, v25.8h}, [\dst], #32
        st1             {v26.8h, v27.8h}, [\ds2], #32
.endif
        b.le            9f

        mov             v16.16b, v18.16b
        mov             v20.16b, v22.16b
        ld1             {v17.8b, v18.8b}, [\src], #16
        ld1             {v21.8b, v22.8b}, [\sr2], #16
        uxtl            v17.8h,  v17.8b
        uxtl            v18.8h,  v18.8b
        uxtl            v21.8h,  v21.8b
        uxtl            v22.8h,  v22.8b
        b               16b

9:
        add             \dst,  \dst,  \d_strd
        add             \ds2,  \ds2,  \d_strd
        add             \src,  \src,  \s_strd
        add             \sr2,  \sr2,  \s_strd

        subs            \h,  \h,  #2
        b.gt            161b
        ret

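// Relative offsets from the table base back to each width-specific entry
// point above, indexed by clz(w)-24.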
L(\type\()_8tap_h_tbl):
        .hword L(\type\()_8tap_h_tbl) - 1280b
        .hword L(\type\()_8tap_h_tbl) -  640b
        .hword L(\type\()_8tap_h_tbl) -  320b
        .hword L(\type\()_8tap_h_tbl) -  160b
        .hword L(\type\()_8tap_h_tbl) -   80b
        .hword L(\type\()_8tap_h_tbl) -   40b
        .hword L(\type\()_8tap_h_tbl) -   20b
        .hword 0


L(\type\()_8tap_v):
        cmp             \h,  #4
        ubfx            w9,  \my, #7, #7
        and             \my, \my, #0x7f
        b.le            4f
        mov             \my, w9
4:
        add             \xmy, x10, \my, uxtw #3

        adr             x9,  L(\type\()_8tap_v_tbl)
        ldrh            w8,  [x9, x8, lsl #1]
        sub             x9,  x9,  w8, uxtw
        br              x9

20:     // 2xN v
        AARCH64_VALID_JUMP_TARGET
.ifc \type, put
        b.gt            28f

        cmp             \h,  #2
        add             \xmy, \xmy, #2
        ld1             {v0.s}[0], [\xmy]
        sub             \src,  \src,  \s_strd
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \s_strd,  \s_strd,  #1
        lsl             \d_strd,  \d_strd,  #1
        sxtl            v0.8h, v0.8b

        // 2x2 v
        load_h          \src, \sr2, \s_strd, v1, v2, v3, v4, v5
        interleave_1_h  v1, v2, v3, v4, v5
        b.gt            24f
        uxtl_b          v1, v2, v3, v4
        mul_mla_4       v6, v1, v2, v3, v4, .4h
        sqrshrun_b      6,  v6
        st_h            \d_strd, v6, 2
        ret

24:     // 2x4 v
        load_h          \sr2, \src, \s_strd, v6, v7
        interleave_1_h  v5, v6, v7
        interleave_2_s  v1, v2, v3, v4, v5, v6
        uxtl_b          v1, v2, v3, v4
        mul_mla_4       v6, v1, v2, v3, v4, .8h
        sqrshrun_b      6,  v6
        st_h            \d_strd, v6, 4
        ret

28:     // 2x6, 2x8, 2x12, 2x16 v
        ld1             {v0.8b}, [\xmy]
        sub             \sr2,  \src,  \s_strd, lsl #1
        add             \ds2,  \dst,  \d_strd
        sub             \src,  \sr2,  \s_strd
        lsl             \d_strd,  \d_strd,  #1
        lsl             \s_strd,  \s_strd,  #1
        sxtl            v0.8h, v0.8b

        load_h          \src, \sr2, \s_strd, v1,  v2,  v3,  v4, v5, v6, v7
        interleave_1_h  v1,  v2,  v3,  v4,  v5
        interleave_1_h  v5,  v6,  v7
        interleave_2_s  v1,  v2,  v3,  v4,  v5,  v6
        uxtl_b          v1,  v2,  v3,  v4
216:
        subs            \h,  \h,  #4
        load_h          \sr2, \src, \s_strd, v16, v17, v18, v19
        interleave_1_h  v7,  v16, v17, v18, v19
        interleave_2_s  v5,  v6,  v7,  v16, v17, v18
        uxtl_b          v5,  v6,  v7,  v16
        mul_mla_8_0     v30, v1,  v2,  v3,  v4,  v5,  v6,  v7,  v16
        sqrshrun_b      6,   v30
        st_h            \d_strd, v30, 4
        b.le            0f
        cmp             \h,  #2
        mov             v1.16b,  v5.16b
        mov             v2.16b,  v6.16b
        mov             v3.16b,  v7.16b
        mov             v4.16b,  v16.16b
        mov             v5.16b,  v17.16b
        mov             v6.16b,  v18.16b
        mov             v7.16b,  v19.16b
        b.eq            26f
        b               216b
26:
        load_h          \sr2, \src, \s_strd, v16, v17
        interleave_1_h  v7,  v16, v17
        uxtl_b          v5,  v6,  v7,  v16
        mul_mla_8_0_4h  v30, v1,  v2,  v3,  v4,  v5,  v6,  v7,  v16
        sqrshrun_b      6,   v30
        st_h            \d_strd, v30, 2
0:
        ret
.endif

40:
        AARCH64_VALID_JUMP_TARGET
        b.gt            480f

        // 4x2, 4x4 v
        cmp             \h,  #2
        add             \xmy, \xmy, #2
        ld1             {v0.s}[0], [\xmy]
        sub             \src, \src, \s_strd
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h, v0.8b

        load_s          \src, \sr2, \s_strd, v1, v2, v3, v4, v5
        interleave_1_s  v1, v2, v3, v4, v5
        uxtl_b          v1, v2, v3, v4
        mul_mla_4       v6, v1, v2, v3, v4, .8h
        shift_store_4   \type, \d_strd, v6
        b.le            0f
        load_s          \sr2, \src, \s_strd, v6, v7
        interleave_1_s  v5, v6, v7
        uxtl_b          v5, v6
        mul_mla_4       v7, v3, v4, v5, v6, .8h
        shift_store_4   \type, \d_strd, v7
0:
        ret

480:    // 4x6, 4x8, 4x12, 4x16 v
        ld1             {v0.8b}, [\xmy]
        sub             \sr2, \src, \s_strd, lsl #1
        add             \ds2, \dst, \d_strd
        sub             \src, \sr2, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h, v0.8b

        load_s          \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
        interleave_1_s  v16, v17, v18
        interleave_1_s  v18, v19, v20, v21, v22
        uxtl_b          v16, v17
        uxtl_b          v18, v19, v20, v21

48:
        subs            \h,  \h,  #4
        load_s          \sr2, \src, \s_strd, v23, v24, v25, v26
        interleave_1_s  v22, v23, v24, v25, v26
        uxtl_b          v22, v23, v24, v25
        mul_mla_8_2     v1,  v2,  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
        shift_store_4   \type, \d_strd, v1, v2
        b.le            0f
        load_s          \sr2,  \src, \s_strd, v27, v16
        subs            \h,  \h,  #2
        interleave_1_s  v26, v27, v16
        uxtl_b          v26, v27
        mul_mla_8_0     v1,  v20, v21, v22, v23, v24, v25, v26, v27
        shift_store_4   \type, \d_strd, v1
        b.le            0f
        load_s          \sr2,  \src, \s_strd, v17, v18
        subs            \h,  \h,  #2
        interleave_1_s  v16, v17, v18
        uxtl_b          v16, v17
        mul_mla_8_0     v2,  v22, v23, v24, v25, v26, v27, v16, v17
        shift_store_4   \type, \d_strd, v2
        b.le            0f
        subs            \h,  \h,  #4
        load_s          \sr2, \src, \s_strd, v19, v20, v21, v22
        interleave_1_s  v18, v19, v20, v21, v22
        uxtl_b          v18, v19, v20, v21
        mul_mla_8_2     v1,  v2,  v24, v25, v26, v27, v16, v17, v18, v19, v20, v21
        shift_store_4   \type, \d_strd, v1, v2
        b.gt            48b
0:
        ret

80:
        AARCH64_VALID_JUMP_TARGET
        b.gt            880f

        // 8x2, 8x4 v
        cmp             \h,  #2
        add             \xmy, \xmy, #2
        ld1             {v0.s}[0], [\xmy]
        sub             \src, \src, \s_strd
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h, v0.8b

        load_8b         \src, \sr2, \s_strd, v1, v2, v3, v4, v5
        uxtl_b          v1, v2, v3, v4, v5
        mul_mla_4       v6, v1, v2, v3, v4, .8h
        mul_mla_4       v7, v2, v3, v4, v5, .8h
        shift_store_8   \type, \d_strd, v6, v7
        b.le            0f
        load_8b         \sr2, \src, \s_strd, v6, v7
        uxtl_b          v6, v7
        mul_mla_4       v1, v3, v4, v5, v6, .8h
        mul_mla_4       v2, v4, v5, v6, v7, .8h
        shift_store_8   \type, \d_strd, v1, v2
0:
        ret

880:    // 8x6, 8x8, 8x16, 8x32 v
1680:   // 16x8, 16x16, ...
320:    // 32x8, 32x16, ...
640:
1280:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8b}, [\xmy]
        sub             \src, \src, \s_strd
        sub             \src, \src, \s_strd, lsl #1
        sxtl            v0.8h, v0.8b
        mov             \my,  \h
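        // Process the area in 8-pixel wide columns; \my backs up the row
        // count so each column restarts with the full height.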
168:
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        load_8b         \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
        uxtl_b          v16, v17, v18, v19, v20, v21, v22

88:
        subs            \h,  \h,  #2
        load_8b         \sr2, \src, \s_strd, v23, v24
        uxtl_b          v23, v24
        mul_mla_8_1     v1,  v2,  v16, v17, v18, v19, v20, v21, v22, v23, v24
        shift_store_8   \type, \d_strd, v1, v2
        b.le            9f
        subs            \h,  \h,  #2
        load_8b         \sr2, \src, \s_strd, v25, v26
        uxtl_b          v25, v26
        mul_mla_8_1     v3,  v4,  v18, v19, v20, v21, v22, v23, v24, v25, v26
        shift_store_8   \type, \d_strd, v3, v4
        b.le            9f
        subs            \h,  \h,  #2
        load_8b         \sr2, \src, \s_strd, v27, v16
        uxtl_b          v27, v16
        mul_mla_8_1     v1,  v2,  v20, v21, v22, v23, v24, v25, v26, v27, v16
        shift_store_8   \type, \d_strd, v1, v2
        b.le            9f
        subs            \h,  \h,  #2
        load_8b         \sr2, \src, \s_strd, v17, v18
        uxtl_b          v17, v18
        mul_mla_8_1     v3,  v4,  v22, v23, v24, v25, v26, v27, v16, v17, v18
        shift_store_8   \type, \d_strd, v3, v4
        b.le            9f
        subs            \h,  \h,  #4
        load_8b         \sr2, \src, \s_strd, v19, v20, v21, v22
        uxtl_b          v19, v20, v21, v22
        mul_mla_8_1     v1,  v2,  v24, v25, v26, v27, v16, v17, v18, v19, v20
        mul_mla_8_1     v3,  v4,  v26, v27, v16, v17, v18, v19, v20, v21, v22
        shift_store_8   \type, \d_strd, v1, v2, v3, v4
        b.gt            88b
9:
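        // Rewind \src/\dst to the top of the column (undo the rows just
        // walked plus the vertical lead-in) and step 8 pixels right for
        // the next column.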
        subs            \w,  \w,  #8
        b.le            0f
        asr             \s_strd, \s_strd, #1
        asr             \d_strd, \d_strd, #1
        msub            \src, \s_strd, \xmy, \src
        msub            \dst, \d_strd, \xmy, \dst
        sub             \src, \src, \s_strd, lsl #3
        mov             \h,  \my
        add             \src, \src, #8
.ifc \type, put
        add             \dst, \dst, #8
.else
        add             \dst, \dst, #16
.endif
        b               168b
0:
        ret

160:
        AARCH64_VALID_JUMP_TARGET
        b.gt            1680b

        // 16x2, 16x4 v
        add             \xmy, \xmy, #2
        ld1             {v0.s}[0], [\xmy]
        sub             \src, \src, \s_strd
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h, v0.8b

        cmp             \h,  #2
        load_16b        \src, \sr2, \s_strd, v1,  v2,  v3,  v4,  v5
        uxtl            v16.8h, v1.8b
        uxtl            v17.8h, v2.8b
        uxtl            v18.8h, v3.8b
        uxtl            v19.8h, v4.8b
        uxtl            v20.8h, v5.8b
        uxtl2           v23.8h, v1.16b
        uxtl2           v24.8h, v2.16b
        uxtl2           v25.8h, v3.16b
        uxtl2           v26.8h, v4.16b
        uxtl2           v27.8h, v5.16b
        mul_mla_4       v1,  v16, v17, v18, v19, .8h
        mul_mla_4       v16, v17, v18, v19, v20, .8h
        mul_mla_4       v2,  v23, v24, v25, v26, .8h
        mul_mla_4       v17, v24, v25, v26, v27, .8h
        shift_store_16  \type, \d_strd, v1, v2, v16, v17
        b.le            0f
        load_16b        \sr2, \src, \s_strd, v6,  v7
        uxtl            v21.8h, v6.8b
        uxtl            v22.8h, v7.8b
        uxtl2           v28.8h, v6.16b
        uxtl2           v29.8h, v7.16b
        mul_mla_4       v1,  v18, v19, v20, v21, .8h
        mul_mla_4       v3,  v19, v20, v21, v22, .8h
        mul_mla_4       v2,  v25, v26, v27, v28, .8h
        mul_mla_4       v4,  v26, v27, v28, v29, .8h
        shift_store_16  \type, \d_strd, v1, v2, v3, v4
0:
        ret

L(\type\()_8tap_v_tbl):
        .hword L(\type\()_8tap_v_tbl) - 1280b
        .hword L(\type\()_8tap_v_tbl) -  640b
        .hword L(\type\()_8tap_v_tbl) -  320b
        .hword L(\type\()_8tap_v_tbl) -  160b
        .hword L(\type\()_8tap_v_tbl) -   80b
        .hword L(\type\()_8tap_v_tbl) -   40b
        .hword L(\type\()_8tap_v_tbl) -   20b
        .hword 0

L(\type\()_8tap_hv):
        cmp             \h,  #4
        ubfx            w9,  \my, #7, #7
        and             \my, \my, #0x7f
        b.le            4f
        mov             \my,  w9
4:
        add             \xmy,  x10, \my, uxtw #3

        adr             x9,  L(\type\()_8tap_hv_tbl)
        ldrh            w8,  [x9, x8, lsl #1]
        sub             x9,  x9,  w8, uxtw
        br              x9

20:
        AARCH64_VALID_JUMP_TARGET
.ifc \type, put
        add             \xmx,  \xmx,  #2
        ld1             {v0.s}[0],  [\xmx]
        b.gt            280f
        add             \xmy,  \xmy,  #2
        ld1             {v1.s}[0],  [\xmy]

        // 2x2, 2x4 hv
        sub             \sr2, \src, #1
        sub             \src, \sr2, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h,  v0.8b
        sxtl            v1.8h,  v1.8b
        mov             x15, x30
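        // x15 preserves the return address across the bl calls into the
        // shared horizontal filter helpers below.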

        ld1             {v28.8b}, [\src], \s_strd
        uxtl            v28.8h,  v28.8b
        ext             v29.16b, v28.16b, v28.16b, #2
        mul             v28.4h,  v28.4h,  v0.4h
        mul             v29.4h,  v29.4h,  v0.4h
        addp            v28.4h,  v28.4h,  v29.4h
        addp            v16.4h,  v28.4h,  v28.4h
        srshr           v16.4h,  v16.4h,  #2
        bl              L(\type\()_8tap_filter_2)

        trn1            v16.2s, v16.2s, v28.2s
        mov             v17.8b, v28.8b

2:
        bl              L(\type\()_8tap_filter_2)

        ext             v18.8b, v17.8b, v28.8b, #4
        smull           v2.4s,  v16.4h, v1.h[0]
        smlal           v2.4s,  v17.4h, v1.h[1]
        smlal           v2.4s,  v18.4h, v1.h[2]
        smlal           v2.4s,  v28.4h, v1.h[3]

        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
        sqxtun          v2.8b,  v2.8h
        subs            \h,  \h,  #2
        st1             {v2.h}[0], [\dst], \d_strd
        st1             {v2.h}[1], [\ds2], \d_strd
        b.le            0f
        mov             v16.8b, v18.8b
        mov             v17.8b, v28.8b
        b               2b

280:    // 2x8, 2x16, 2x32 hv
        ld1             {v1.8b},  [\xmy]
        sub             \src, \src, #1
        sub             \sr2, \src, \s_strd, lsl #1
        sub             \src, \sr2, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h,  v0.8b
        sxtl            v1.8h,  v1.8b
        mov             x15, x30

        ld1             {v28.8b}, [\src], \s_strd
        uxtl            v28.8h,  v28.8b
        ext             v29.16b, v28.16b, v28.16b, #2
        mul             v28.4h,  v28.4h,  v0.4h
        mul             v29.4h,  v29.4h,  v0.4h
        addp            v28.4h,  v28.4h,  v29.4h
        addp            v16.4h,  v28.4h,  v28.4h
        srshr           v16.4h,  v16.4h,  #2

        bl              L(\type\()_8tap_filter_2)
        trn1            v16.2s, v16.2s, v28.2s
        mov             v17.8b, v28.8b
        bl              L(\type\()_8tap_filter_2)
        ext             v18.8b, v17.8b, v28.8b, #4
        mov             v19.8b, v28.8b
        bl              L(\type\()_8tap_filter_2)
        ext             v20.8b, v19.8b, v28.8b, #4
        mov             v21.8b, v28.8b

28:
        bl              L(\type\()_8tap_filter_2)
        ext             v22.8b, v21.8b, v28.8b, #4
        smull           v2.4s,  v16.4h, v1.h[0]
        smlal           v2.4s,  v17.4h, v1.h[1]
        smlal           v2.4s,  v18.4h, v1.h[2]
        smlal           v2.4s,  v19.4h, v1.h[3]
        smlal           v2.4s,  v20.4h, v1.h[4]
        smlal           v2.4s,  v21.4h, v1.h[5]
        smlal           v2.4s,  v22.4h, v1.h[6]
        smlal           v2.4s,  v28.4h, v1.h[7]

        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
        sqxtun          v2.8b,  v2.8h
        subs            \h,  \h,  #2
        st1             {v2.h}[0], [\dst], \d_strd
        st1             {v2.h}[1], [\ds2], \d_strd
        b.le            0f
        mov             v16.8b, v18.8b
        mov             v17.8b, v19.8b
        mov             v18.8b, v20.8b
        mov             v19.8b, v21.8b
        mov             v20.8b, v22.8b
        mov             v21.8b, v28.8b
        b               28b

0:
        ret             x15

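// Horizontal 4-tap filter of the next two rows for the 2xN hv loops;
// returns both filtered rows packed into v28.4h.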
L(\type\()_8tap_filter_2):
        ld1             {v28.8b},  [\sr2], \s_strd
        ld1             {v30.8b},  [\src], \s_strd
        uxtl            v28.8h,  v28.8b
        uxtl            v30.8h,  v30.8b
        ext             v29.16b, v28.16b, v28.16b, #2
        ext             v31.16b, v30.16b, v30.16b, #2
        trn1            v27.2s,  v28.2s,  v30.2s
        trn2            v30.2s,  v28.2s,  v30.2s
        trn1            v28.2s,  v29.2s,  v31.2s
        trn2            v31.2s,  v29.2s,  v31.2s
        mul             v27.4h,  v27.4h,  v0.h[0]
        mla             v27.4h,  v28.4h,  v0.h[1]
        mla             v27.4h,  v30.4h,  v0.h[2]
        mla             v27.4h,  v31.4h,  v0.h[3]
        srshr           v28.4h,  v27.4h,  #2
        ret
.endif

40:
        AARCH64_VALID_JUMP_TARGET
        add             \xmx, \xmx, #2
        ld1             {v0.s}[0],  [\xmx]
        b.gt            480f
        add             \xmy, \xmy,  #2
        ld1             {v1.s}[0],  [\xmy]
        sub             \sr2, \src, #1
        sub             \src, \sr2, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h,  v0.8b
        sxtl            v1.8h,  v1.8b
        mov             x15, x30

        // 4x2, 4x4 hv
        ld1             {v26.8b}, [\src], \s_strd
        uxtl            v26.8h,  v26.8b
        ext             v28.16b, v26.16b, v26.16b, #2
        ext             v29.16b, v26.16b, v26.16b, #4
        ext             v30.16b, v26.16b, v26.16b, #6
        mul             v31.4h,  v26.4h,  v0.h[0]
        mla             v31.4h,  v28.4h,  v0.h[1]
        mla             v31.4h,  v29.4h,  v0.h[2]
        mla             v31.4h,  v30.4h,  v0.h[3]
        srshr           v16.4h,  v31.4h,  #2

        bl              L(\type\()_8tap_filter_4)
        mov             v17.8b, v28.8b
        mov             v18.8b, v29.8b

4:
        bl              L(\type\()_8tap_filter_4)
        // Interleaving the mul/mla chains actually hurts performance
        // significantly on Cortex A53, thus keeping mul/mla tightly
        // chained like this.
        smull           v2.4s,  v16.4h, v1.h[0]
        smlal           v2.4s,  v17.4h, v1.h[1]
        smlal           v2.4s,  v18.4h, v1.h[2]
        smlal           v2.4s,  v28.4h, v1.h[3]
        smull           v3.4s,  v17.4h, v1.h[0]
        smlal           v3.4s,  v18.4h, v1.h[1]
        smlal           v3.4s,  v28.4h, v1.h[2]
        smlal           v3.4s,  v29.4h, v1.h[3]
        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
        sqrshrn         v3.4h,  v3.4s,  #\shift_hv
        subs            \h,  \h,  #2
.ifc \type, put
        sqxtun          v2.8b,  v2.8h
        sqxtun          v3.8b,  v3.8h
        st1             {v2.s}[0], [\dst], \d_strd
        st1             {v3.s}[0], [\ds2], \d_strd
.else
        st1             {v2.4h}, [\dst], \d_strd
        st1             {v3.4h}, [\ds2], \d_strd
.endif
        b.le            0f
        mov             v16.8b,  v18.8b
        mov             v17.8b,  v28.8b
        mov             v18.8b,  v29.8b
        b               4b

480:    // 4x8, 4x16, 4x32 hv
        ld1             {v1.8b},  [\xmy]
        sub             \src, \src, #1
        sub             \sr2, \src, \s_strd, lsl #1
        sub             \src, \sr2, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h,  v0.8b
        sxtl            v1.8h,  v1.8b
        mov             x15, x30

        ld1             {v26.8b}, [\src], \s_strd
        uxtl            v26.8h,  v26.8b
        ext             v28.16b, v26.16b, v26.16b, #2
        ext             v29.16b, v26.16b, v26.16b, #4
        ext             v30.16b, v26.16b, v26.16b, #6
        mul             v31.4h,  v26.4h,  v0.h[0]
        mla             v31.4h,  v28.4h,  v0.h[1]
        mla             v31.4h,  v29.4h,  v0.h[2]
        mla             v31.4h,  v30.4h,  v0.h[3]
        srshr           v16.4h,  v31.4h,  #2

        bl              L(\type\()_8tap_filter_4)
        mov             v17.8b, v28.8b
        mov             v18.8b, v29.8b
        bl              L(\type\()_8tap_filter_4)
        mov             v19.8b, v28.8b
        mov             v20.8b, v29.8b
        bl              L(\type\()_8tap_filter_4)
        mov             v21.8b, v28.8b
        mov             v22.8b, v29.8b

48:
        bl              L(\type\()_8tap_filter_4)
        smull           v2.4s,  v16.4h, v1.h[0]
        smlal           v2.4s,  v17.4h, v1.h[1]
        smlal           v2.4s,  v18.4h, v1.h[2]
        smlal           v2.4s,  v19.4h, v1.h[3]
        smlal           v2.4s,  v20.4h, v1.h[4]
        smlal           v2.4s,  v21.4h, v1.h[5]
        smlal           v2.4s,  v22.4h, v1.h[6]
        smlal           v2.4s,  v28.4h, v1.h[7]
        smull           v3.4s,  v17.4h, v1.h[0]
        smlal           v3.4s,  v18.4h, v1.h[1]
        smlal           v3.4s,  v19.4h, v1.h[2]
        smlal           v3.4s,  v20.4h, v1.h[3]
        smlal           v3.4s,  v21.4h, v1.h[4]
        smlal           v3.4s,  v22.4h, v1.h[5]
        smlal           v3.4s,  v28.4h, v1.h[6]
        smlal           v3.4s,  v29.4h, v1.h[7]
        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
        sqrshrn         v3.4h,  v3.4s,  #\shift_hv
        subs            \h,  \h,  #2
.ifc \type, put
        sqxtun          v2.8b,  v2.8h
        sqxtun          v3.8b,  v3.8h
        st1             {v2.s}[0], [\dst], \d_strd
        st1             {v3.s}[0], [\ds2], \d_strd
.else
        st1             {v2.4h}, [\dst], \d_strd
        st1             {v3.4h}, [\ds2], \d_strd
.endif
        b.le            0f
        mov             v16.8b,  v18.8b
        mov             v17.8b,  v19.8b
        mov             v18.8b,  v20.8b
        mov             v19.8b,  v21.8b
        mov             v20.8b,  v22.8b
        mov             v21.8b,  v28.8b
        mov             v22.8b,  v29.8b
        b               48b
0:
        ret             x15

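// Horizontal 4-tap filter of the next two rows for the 4xN hv loops;
// results are returned in v28.4h and v29.4h.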
L(\type\()_8tap_filter_4):
        ld1             {v26.8b}, [\sr2], \s_strd
        ld1             {v27.8b}, [\src], \s_strd
        uxtl            v26.8h,  v26.8b
        uxtl            v27.8h,  v27.8b
        ext             v28.16b, v26.16b, v26.16b, #2
        ext             v29.16b, v26.16b, v26.16b, #4
        ext             v30.16b, v26.16b, v26.16b, #6
        mul             v31.4h,  v26.4h,  v0.h[0]
        mla             v31.4h,  v28.4h,  v0.h[1]
        mla             v31.4h,  v29.4h,  v0.h[2]
        mla             v31.4h,  v30.4h,  v0.h[3]
        ext             v28.16b, v27.16b, v27.16b, #2
        ext             v29.16b, v27.16b, v27.16b, #4
        ext             v30.16b, v27.16b, v27.16b, #6
        mul             v27.4h,  v27.4h,  v0.h[0]
        mla             v27.4h,  v28.4h,  v0.h[1]
        mla             v27.4h,  v29.4h,  v0.h[2]
        mla             v27.4h,  v30.4h,  v0.h[3]
        srshr           v28.4h,  v31.4h,  #2
        srshr           v29.4h,  v27.4h,  #2
        ret

80:
160:
320:
        AARCH64_VALID_JUMP_TARGET
        b.gt            880f
        add             \xmy,  \xmy,  #2
        ld1             {v0.8b},  [\xmx]
        ld1             {v1.s}[0],  [\xmy]
        sub             \src,  \src,  #3
        sub             \src,  \src,  \s_strd
        sxtl            v0.8h,  v0.8b
        sxtl            v1.8h,  v1.8b
        mov             x15, x30
        mov             \my,  \h

164:    // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \d_strd, \d_strd, #1
        lsl             \s_strd, \s_strd, #1

        bl              L(\type\()_8tap_filter_8_first)
        bl              L(\type\()_8tap_filter_8)
        mov             v17.16b, v24.16b
        mov             v18.16b, v25.16b

8:
        smull           v2.4s,  v16.4h, v1.h[0]
        smull2          v3.4s,  v16.8h, v1.h[0]
        bl              L(\type\()_8tap_filter_8)
        smull           v4.4s,  v17.4h, v1.h[0]
        smull2          v5.4s,  v17.8h, v1.h[0]
        smlal           v2.4s,  v17.4h, v1.h[1]
        smlal2          v3.4s,  v17.8h, v1.h[1]
        smlal           v4.4s,  v18.4h, v1.h[1]
        smlal2          v5.4s,  v18.8h, v1.h[1]
        smlal           v2.4s,  v18.4h, v1.h[2]
        smlal2          v3.4s,  v18.8h, v1.h[2]
        smlal           v4.4s,  v24.4h, v1.h[2]
        smlal2          v5.4s,  v24.8h, v1.h[2]
        smlal           v2.4s,  v24.4h, v1.h[3]
        smlal2          v3.4s,  v24.8h, v1.h[3]
        smlal           v4.4s,  v25.4h, v1.h[3]
        smlal2          v5.4s,  v25.8h, v1.h[3]
        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
        sqrshrn2        v2.8h,  v3.4s,  #\shift_hv
        sqrshrn         v4.4h,  v4.4s,  #\shift_hv
        sqrshrn2        v4.8h,  v5.4s,  #\shift_hv
        subs            \h,  \h,  #2
.ifc \type, put
        sqxtun          v2.8b,  v2.8h
        sqxtun          v4.8b,  v4.8h
        st1             {v2.8b}, [\dst], \d_strd
        st1             {v4.8b}, [\ds2], \d_strd
.else
        st1             {v2.8h}, [\dst], \d_strd
        st1             {v4.8h}, [\ds2], \d_strd
.endif
        b.le            9f
        mov             v16.16b, v18.16b
        mov             v17.16b, v24.16b
        mov             v18.16b, v25.16b
        b               8b
9:
        subs            \w,  \w,  #8
        b.le            0f
        asr             \s_strd,  \s_strd,  #1
        asr             \d_strd,  \d_strd,  #1
        msub            \src,  \s_strd,  \xmy,  \src
        msub            \dst,  \d_strd,  \xmy,  \dst
        sub             \src,  \src,  \s_strd,  lsl #2
        mov             \h,  \my
        add             \src,  \src,  #8
.ifc \type, put
        add             \dst,  \dst,  #8
.else
        add             \dst,  \dst,  #16
.endif
        b               164b

880:    // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
640:
1280:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8b},  [\xmx]
        ld1             {v1.8b},  [\xmy]
        sub             \src,  \src,  #3
        sub             \src,  \src,  \s_strd
        sub             \src,  \src,  \s_strd, lsl #1
        sxtl            v0.8h,  v0.8b
        sxtl            v1.8h,  v1.8b
        mov             x15, x30
        mov             \my,  \h

168:
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \d_strd, \d_strd, #1
        lsl             \s_strd, \s_strd, #1

        bl              L(\type\()_8tap_filter_8_first)
        bl              L(\type\()_8tap_filter_8)
        mov             v17.16b, v24.16b
        mov             v18.16b, v25.16b
        bl              L(\type\()_8tap_filter_8)
        mov             v19.16b, v24.16b
        mov             v20.16b, v25.16b
        bl              L(\type\()_8tap_filter_8)
        mov             v21.16b, v24.16b
        mov             v22.16b, v25.16b

88:
        smull           v2.4s,  v16.4h, v1.h[0]
        smull2          v3.4s,  v16.8h, v1.h[0]
        bl              L(\type\()_8tap_filter_8)
        smull           v4.4s,  v17.4h, v1.h[0]
        smull2          v5.4s,  v17.8h, v1.h[0]
        smlal           v2.4s,  v17.4h, v1.h[1]
        smlal2          v3.4s,  v17.8h, v1.h[1]
        smlal           v4.4s,  v18.4h, v1.h[1]
        smlal2          v5.4s,  v18.8h, v1.h[1]
        smlal           v2.4s,  v18.4h, v1.h[2]
        smlal2          v3.4s,  v18.8h, v1.h[2]
        smlal           v4.4s,  v19.4h, v1.h[2]
        smlal2          v5.4s,  v19.8h, v1.h[2]
        smlal           v2.4s,  v19.4h, v1.h[3]
        smlal2          v3.4s,  v19.8h, v1.h[3]
        smlal           v4.4s,  v20.4h, v1.h[3]
        smlal2          v5.4s,  v20.8h, v1.h[3]
        smlal           v2.4s,  v20.4h, v1.h[4]
        smlal2          v3.4s,  v20.8h, v1.h[4]
        smlal           v4.4s,  v21.4h, v1.h[4]
        smlal2          v5.4s,  v21.8h, v1.h[4]
        smlal           v2.4s,  v21.4h, v1.h[5]
        smlal2          v3.4s,  v21.8h, v1.h[5]
        smlal           v4.4s,  v22.4h, v1.h[5]
        smlal2          v5.4s,  v22.8h, v1.h[5]
        smlal           v2.4s,  v22.4h, v1.h[6]
        smlal2          v3.4s,  v22.8h, v1.h[6]
        smlal           v4.4s,  v24.4h, v1.h[6]
        smlal2          v5.4s,  v24.8h, v1.h[6]
        smlal           v2.4s,  v24.4h, v1.h[7]
        smlal2          v3.4s,  v24.8h, v1.h[7]
        smlal           v4.4s,  v25.4h, v1.h[7]
        smlal2          v5.4s,  v25.8h, v1.h[7]
        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
        sqrshrn2        v2.8h,  v3.4s,  #\shift_hv
        sqrshrn         v4.4h,  v4.4s,  #\shift_hv
        sqrshrn2        v4.8h,  v5.4s,  #\shift_hv
        subs            \h,  \h,  #2
.ifc \type, put
        sqxtun          v2.8b,  v2.8h
        sqxtun          v4.8b,  v4.8h
        st1             {v2.8b}, [\dst], \d_strd
        st1             {v4.8b}, [\ds2], \d_strd
.else
        st1             {v2.8h}, [\dst], \d_strd
        st1             {v4.8h}, [\ds2], \d_strd
.endif
        b.le            9f
        mov             v16.16b, v18.16b
        mov             v17.16b, v19.16b
        mov             v18.16b, v20.16b
        mov             v19.16b, v21.16b
        mov             v20.16b, v22.16b
        mov             v21.16b, v24.16b
        mov             v22.16b, v25.16b
        b               88b
9:
        subs            \w,  \w,  #8
        b.le            0f
        asr             \s_strd,  \s_strd,  #1
        asr             \d_strd,  \d_strd,  #1
        msub            \src,  \s_strd,  \xmy,  \src
        msub            \dst,  \d_strd,  \xmy,  \dst
        sub             \src,  \src,  \s_strd,  lsl #3
        mov             \h,  \my
        add             \src,  \src,  #8
.ifc \type, put
        add             \dst,  \dst,  #8
.else
        add             \dst,  \dst,  #16
.endif
        b               168b
0:
        ret             x15

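// Horizontal 8-tap filter of the single leading row; result in v16.8h.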
L(\type\()_8tap_filter_8_first):
        ld1             {v28.8b, v29.8b},  [\src], \s_strd
        uxtl            v28.8h,  v28.8b
        uxtl            v29.8h,  v29.8b
        mul             v16.8h,  v28.8h,  v0.h[0]
        ext             v24.16b, v28.16b, v29.16b, #(2*1)
        ext             v25.16b, v28.16b, v29.16b, #(2*2)
        ext             v26.16b, v28.16b, v29.16b, #(2*3)
        ext             v27.16b, v28.16b, v29.16b, #(2*4)
        mla             v16.8h,  v24.8h,  v0.h[1]
        mla             v16.8h,  v25.8h,  v0.h[2]
        mla             v16.8h,  v26.8h,  v0.h[3]
        mla             v16.8h,  v27.8h,  v0.h[4]
        ext             v24.16b, v28.16b, v29.16b, #(2*5)
        ext             v25.16b, v28.16b, v29.16b, #(2*6)
        ext             v26.16b, v28.16b, v29.16b, #(2*7)
        mla             v16.8h,  v24.8h,  v0.h[5]
        mla             v16.8h,  v25.8h,  v0.h[6]
        mla             v16.8h,  v26.8h,  v0.h[7]
        srshr           v16.8h,  v16.8h,  #2
        ret

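// Horizontal 8-tap filter of the next two rows; results in v24.8h and
// v25.8h.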
L(\type\()_8tap_filter_8):
        ld1             {v28.8b, v29.8b},  [\sr2], \s_strd
        ld1             {v30.8b, v31.8b},  [\src], \s_strd
        uxtl            v28.8h,  v28.8b
        uxtl            v29.8h,  v29.8b
        uxtl            v30.8h,  v30.8b
        uxtl            v31.8h,  v31.8b
        mul             v24.8h,  v28.8h,  v0.h[0]
        mul             v25.8h,  v30.8h,  v0.h[0]
.irpc i, 1234567
        ext             v26.16b, v28.16b, v29.16b, #(2*\i)
        ext             v27.16b, v30.16b, v31.16b, #(2*\i)
        mla             v24.8h,  v26.8h,  v0.h[\i]
        mla             v25.8h,  v27.8h,  v0.h[\i]
.endr
        srshr           v24.8h,  v24.8h, #2
        srshr           v25.8h,  v25.8h, #2
        ret

L(\type\()_8tap_hv_tbl):
        .hword L(\type\()_8tap_hv_tbl) - 1280b
        .hword L(\type\()_8tap_hv_tbl) -  640b
        .hword L(\type\()_8tap_hv_tbl) -  320b
        .hword L(\type\()_8tap_hv_tbl) -  160b
        .hword L(\type\()_8tap_hv_tbl) -   80b
        .hword L(\type\()_8tap_hv_tbl) -   40b
        .hword L(\type\()_8tap_hv_tbl) -   20b
        .hword 0
endfunc


function \type\()_bilin_8bpc_neon, export=1
        dup             v1.16b, \mx
        dup             v3.16b, \my
        mov             w9,  #16
        sub             w8, w9, \mx
        sub             w9, w9, \my
        dup             v0.16b, w8
        dup             v2.16b, w9
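        // Bilinear weights: (v0, v1) = (16-mx, mx) for the horizontal
        // pass, (v2, v3) = (16-my, my) for the vertical pass.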
.ifc \type, prep
        uxtw            \d_strd, \w
        lsl             \d_strd, \d_strd, #1
.endif

        clz             w8,  \w
        sub             w8,  w8,  #24
        cbnz            \mx, L(\type\()_bilin_h)
        cbnz            \my, L(\type\()_bilin_v)
        b               \type\()_neon

L(\type\()_bilin_h):
        cbnz            \my, L(\type\()_bilin_hv)

        adr             x9,  L(\type\()_bilin_h_tbl)
        ldrh            w8,  [x9, x8, lsl #1]
        sub             x9,  x9,  w8, uxtw
        br              x9

20:     // 2xN h
        AARCH64_VALID_JUMP_TARGET
.ifc \type, put
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \d_strd,  \d_strd,  #1
        lsl             \s_strd,  \s_strd,  #1
2:
        ld1             {v4.s}[0],  [\src], \s_strd
        ld1             {v6.s}[0],  [\sr2], \s_strd
        ext             v5.8b,  v4.8b,  v4.8b, #1
        ext             v7.8b,  v6.8b,  v6.8b, #1
        trn1            v4.4h,  v4.4h,  v6.4h
        trn1            v5.4h,  v5.4h,  v7.4h
        subs            \h,  \h,  #2
        umull           v4.8h,  v4.8b,  v0.8b
        umlal           v4.8h,  v5.8b,  v1.8b
        uqrshrn         v4.8b,  v4.8h,  #4
        st1             {v4.h}[0], [\dst], \d_strd
        st1             {v4.h}[1], [\ds2], \d_strd
        b.gt            2b
        ret
.endif

40:     // 4xN h
        AARCH64_VALID_JUMP_TARGET
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \d_strd,  \d_strd,  #1
        lsl             \s_strd,  \s_strd,  #1
4:
        ld1             {v4.8b}, [\src], \s_strd
        ld1             {v6.8b}, [\sr2], \s_strd
        ext             v5.8b,  v4.8b,  v4.8b, #1
        ext             v7.8b,  v6.8b,  v6.8b, #1
        trn1            v4.2s,  v4.2s,  v6.2s
        trn1            v5.2s,  v5.2s,  v7.2s
        subs            \h,  \h,  #2
        umull           v4.8h,  v4.8b,  v0.8b
        umlal           v4.8h,  v5.8b,  v1.8b
.ifc \type, put
        uqrshrn         v4.8b,  v4.8h,  #4
        st1             {v4.s}[0], [\dst], \d_strd
        st1             {v4.s}[1], [\ds2], \d_strd
.else
        st1             {v4.d}[0], [\dst], \d_strd
        st1             {v4.d}[1], [\ds2], \d_strd
.endif
        b.gt            4b
        ret

80:     // 8xN h
        AARCH64_VALID_JUMP_TARGET
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \d_strd,  \d_strd,  #1
        lsl             \s_strd,  \s_strd,  #1
8:
        ld1             {v4.16b}, [\src], \s_strd
        ld1             {v6.16b}, [\sr2], \s_strd
        ext             v5.16b, v4.16b, v4.16b, #1
        ext             v7.16b, v6.16b, v6.16b, #1
        subs            \h,  \h,  #2
        umull           v4.8h,  v4.8b,  v0.8b
        umull           v6.8h,  v6.8b,  v0.8b
        umlal           v4.8h,  v5.8b,  v1.8b
        umlal           v6.8h,  v7.8b,  v1.8b
.ifc \type, put
        uqrshrn         v4.8b,  v4.8h,  #4
        uqrshrn         v6.8b,  v6.8h,  #4
        st1             {v4.8b}, [\dst], \d_strd
        st1             {v6.8b}, [\ds2], \d_strd
.else
        st1             {v4.8h}, [\dst], \d_strd
        st1             {v6.8h}, [\ds2], \d_strd
.endif
        b.gt            8b
        ret
160:
320:
640:
1280:   // 16xN, 32xN, ... h
        AARCH64_VALID_JUMP_TARGET
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \s_strd,  \s_strd,  #1

        sub             \s_strd,  \s_strd,  \w, uxtw
        sub             \s_strd,  \s_strd,  #8
.ifc \type, put
        lsl             \d_strd,  \d_strd,  #1
        sub             \d_strd,  \d_strd,  \w, uxtw
.endif
161:
        ld1             {v16.d}[1],  [\src], #8
        ld1             {v20.d}[1],  [\sr2], #8
        mov             \mx, \w
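        // v16/v20 keep the previous 8 source bytes in their high halves;
        // ext below then forms the aligned and the +1-shifted 16-byte
        // windows for the two filter taps.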
2583
258416:
2585        ld1             {v18.16b},  [\src], #16
2586        ld1             {v22.16b},  [\sr2], #16
2587        ext             v17.16b, v16.16b, v18.16b, #8
2588        ext             v19.16b, v16.16b, v18.16b, #9
2589        ext             v21.16b, v20.16b, v22.16b, #8
2590        ext             v23.16b, v20.16b, v22.16b, #9
2591        umull           v16.8h,  v17.8b,  v0.8b
2592        umull2          v17.8h,  v17.16b, v0.16b
2593        umull           v20.8h,  v21.8b,  v0.8b
2594        umull2          v21.8h,  v21.16b, v0.16b
2595        umlal           v16.8h,  v19.8b,  v1.8b
2596        umlal2          v17.8h,  v19.16b, v1.16b
2597        umlal           v20.8h,  v23.8b,  v1.8b
2598        umlal2          v21.8h,  v23.16b, v1.16b
2599        subs            \mx, \mx, #16
2600.ifc \type, put
2601        uqrshrn         v16.8b,  v16.8h, #4
2602        uqrshrn2        v16.16b, v17.8h, #4
2603        uqrshrn         v20.8b,  v20.8h, #4
2604        uqrshrn2        v20.16b, v21.8h, #4
2605        st1             {v16.16b}, [\dst], #16
2606        st1             {v20.16b}, [\ds2], #16
2607.else
2608        st1             {v16.8h, v17.8h}, [\dst], #32
2609        st1             {v20.8h, v21.8h}, [\ds2], #32
2610.endif
2611        b.le            9f
2612
2613        mov             v16.16b, v18.16b
2614        mov             v20.16b, v22.16b
2615        b               16b
2616
26179:
2618        add             \dst,  \dst,  \d_strd
2619        add             \ds2,  \ds2,  \d_strd
2620        add             \src,  \src,  \s_strd
2621        add             \sr2,  \sr2,  \s_strd
2622
2623        subs            \h,  \h,  #2
2624        b.gt            161b
2625        ret
2626
2627L(\type\()_bilin_h_tbl):
2628        .hword L(\type\()_bilin_h_tbl) - 1280b
2629        .hword L(\type\()_bilin_h_tbl) -  640b
2630        .hword L(\type\()_bilin_h_tbl) -  320b
2631        .hword L(\type\()_bilin_h_tbl) -  160b
2632        .hword L(\type\()_bilin_h_tbl) -   80b
2633        .hword L(\type\()_bilin_h_tbl) -   40b
2634        .hword L(\type\()_bilin_h_tbl) -   20b
2635        .hword 0
2636
2637
2638L(\type\()_bilin_v):
2639        cmp             \h,  #4
2640        adr             x9,  L(\type\()_bilin_v_tbl)
2641        ldrh            w8,  [x9, x8, lsl #1]
2642        sub             x9,  x9,  w8, uxtw
2643        br              x9
2644
264520:     // 2xN v
2646        AARCH64_VALID_JUMP_TARGET
2647.ifc \type, put
2648        cmp             \h,  #2
2649        add             \ds2,  \dst,  \d_strd
2650        add             \sr2,  \src,  \s_strd
2651        lsl             \s_strd,  \s_strd,  #1
2652        lsl             \d_strd,  \d_strd,  #1
2653
2654        // 2x2 v
2655        ld1             {v16.h}[0], [\src], \s_strd
2656        b.gt            24f
265722:
2658        ld1             {v17.h}[0], [\sr2], \s_strd
2659        ld1             {v18.h}[0], [\src], \s_strd
2660        trn1            v16.4h, v16.4h, v17.4h
2661        trn1            v17.4h, v17.4h, v18.4h
2662        umull           v4.8h,  v16.8b,  v2.8b
2663        umlal           v4.8h,  v17.8b,  v3.8b
2664        uqrshrn         v4.8b,  v4.8h,  #4
2665        st1             {v4.h}[0], [\dst]
2666        st1             {v4.h}[1], [\ds2]
2667        ret
266824:     // 2x4, 2x6, 2x8, ... v
2669        ld1             {v17.h}[0], [\sr2], \s_strd
2670        ld1             {v18.h}[0], [\src], \s_strd
2671        ld1             {v19.h}[0], [\sr2], \s_strd
2672        ld1             {v20.h}[0], [\src], \s_strd
2673        sub             \h,  \h,  #4
2674        trn1            v16.4h, v16.4h, v17.4h
2675        trn1            v17.4h, v17.4h, v18.4h
2676        trn1            v18.4h, v18.4h, v19.4h
2677        trn1            v19.4h, v19.4h, v20.4h
2678        trn1            v16.2s, v16.2s, v18.2s
2679        trn1            v17.2s, v17.2s, v19.2s
2680        umull           v4.8h,  v16.8b,  v2.8b
2681        umlal           v4.8h,  v17.8b,  v3.8b
2682        cmp             \h,  #2
2683        uqrshrn         v4.8b,  v4.8h,  #4
2684        st1             {v4.h}[0], [\dst], \d_strd
2685        st1             {v4.h}[1], [\ds2], \d_strd
2686        st1             {v4.h}[2], [\dst], \d_strd
2687        st1             {v4.h}[3], [\ds2], \d_strd
2688        b.lt            0f
2689        mov             v16.8b, v20.8b
2690        b.eq            22b
2691        b               24b
26920:
2693        ret
2694.endif
2695
269640:     // 4xN v
2697        AARCH64_VALID_JUMP_TARGET
2698        add             \ds2,  \dst,  \d_strd
2699        add             \sr2,  \src,  \s_strd
2700        lsl             \s_strd,  \s_strd,  #1
2701        lsl             \d_strd,  \d_strd,  #1
2702        ld1             {v16.s}[0], [\src], \s_strd
27034:
2704        ld1             {v17.s}[0], [\sr2], \s_strd
2705        ld1             {v18.s}[0], [\src], \s_strd
2706        trn1            v16.2s, v16.2s, v17.2s
2707        trn1            v17.2s, v17.2s, v18.2s
2708        umull           v4.8h,  v16.8b,  v2.8b
2709        umlal           v4.8h,  v17.8b,  v3.8b
2710        subs            \h,  \h,  #2
2711.ifc \type, put
2712        uqrshrn         v4.8b,  v4.8h,  #4
2713        st1             {v4.s}[0], [\dst], \d_strd
2714        st1             {v4.s}[1], [\ds2], \d_strd
2715.else
2716        st1             {v4.d}[0], [\dst], \d_strd
2717        st1             {v4.d}[1], [\ds2], \d_strd
2718.endif
2719        b.le            0f
2720        mov             v16.8b, v18.8b
2721        b               4b
27220:
2723        ret
2724
272580:     // 8xN v
2726        AARCH64_VALID_JUMP_TARGET
2727        add             \ds2,  \dst,  \d_strd
2728        add             \sr2,  \src,  \s_strd
2729        lsl             \s_strd,  \s_strd,  #1
2730        lsl             \d_strd,  \d_strd,  #1
2731        ld1             {v16.8b}, [\src], \s_strd
27328:
2733        ld1             {v17.8b}, [\sr2], \s_strd
2734        ld1             {v18.8b}, [\src], \s_strd
2735        umull           v4.8h,  v16.8b,  v2.8b
2736        umull           v5.8h,  v17.8b,  v2.8b
2737        umlal           v4.8h,  v17.8b,  v3.8b
2738        umlal           v5.8h,  v18.8b,  v3.8b
2739        subs            \h,  \h,  #2
2740.ifc \type, put
2741        uqrshrn         v4.8b,  v4.8h,  #4
2742        uqrshrn         v5.8b,  v5.8h,  #4
2743        st1             {v4.8b}, [\dst], \d_strd
2744        st1             {v5.8b}, [\ds2], \d_strd
2745.else
2746        st1             {v4.8h}, [\dst], \d_strd
2747        st1             {v5.8h}, [\ds2], \d_strd
2748.endif
2749        b.le            0f
2750        mov             v16.8b, v18.8b
2751        b               8b
27520:
2753        ret
2754
160:    // 16xN, 32xN, ...
320:
640:
1280:
        AARCH64_VALID_JUMP_TARGET
        mov             \my,  \h
1:
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        ld1             {v16.16b}, [\src], \s_strd
2:
        ld1             {v17.16b}, [\sr2], \s_strd
        ld1             {v18.16b}, [\src], \s_strd
        umull           v4.8h,  v16.8b,  v2.8b
        umull2          v5.8h,  v16.16b, v2.16b
        umull           v6.8h,  v17.8b,  v2.8b
        umull2          v7.8h,  v17.16b, v2.16b
        umlal           v4.8h,  v17.8b,  v3.8b
        umlal2          v5.8h,  v17.16b, v3.16b
        umlal           v6.8h,  v18.8b,  v3.8b
        umlal2          v7.8h,  v18.16b, v3.16b
        subs            \h,  \h,  #2
.ifc \type, put
        uqrshrn         v4.8b,  v4.8h,  #4
        uqrshrn2        v4.16b, v5.8h,  #4
        uqrshrn         v6.8b,  v6.8h,  #4
        uqrshrn2        v6.16b, v7.8h,  #4
        st1             {v4.16b}, [\dst], \d_strd
        st1             {v6.16b}, [\ds2], \d_strd
.else
        st1             {v4.8h, v5.8h}, [\dst], \d_strd
        st1             {v6.8h, v7.8h}, [\ds2], \d_strd
.endif
        b.le            9f
        mov             v16.16b, v18.16b
        b               2b
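        // Done with one 16-pixel-wide column: rewind \src and \dst to the
        // top of the column (\my still holds the original height), then
        // step 16 pixels right (32 bytes for the 16-bit prep output).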
9:
        subs            \w,  \w,  #16
        b.le            0f
        asr             \s_strd, \s_strd, #1
        asr             \d_strd, \d_strd, #1
        msub            \src, \s_strd, \xmy, \src
        msub            \dst, \d_strd, \xmy, \dst
        sub             \src, \src, \s_strd, lsl #1
        mov             \h,  \my
        add             \src, \src, #16
.ifc \type, put
        add             \dst, \dst, #16
.else
        add             \dst, \dst, #32
.endif
        b               1b
0:
        ret

L(\type\()_bilin_v_tbl):
        .hword L(\type\()_bilin_v_tbl) - 1280b
        .hword L(\type\()_bilin_v_tbl) -  640b
        .hword L(\type\()_bilin_v_tbl) -  320b
        .hword L(\type\()_bilin_v_tbl) -  160b
        .hword L(\type\()_bilin_v_tbl) -   80b
        .hword L(\type\()_bilin_v_tbl) -   40b
        .hword L(\type\()_bilin_v_tbl) -   20b
        .hword 0

L(\type\()_bilin_hv):
        uxtl            v2.8h, v2.8b
        uxtl            v3.8h, v3.8b
        adr             x9,  L(\type\()_bilin_hv_tbl)
        ldrh            w8,  [x9, x8, lsl #1]
        sub             x9,  x9,  w8, uxtw
        br              x9
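        // The .hword tables above/below store each case label's distance
        // back from the table base; subtracting the loaded offset from the
        // table address yields the branch target, keeping the tables
        // position-independent.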

20:     // 2xN hv
        AARCH64_VALID_JUMP_TARGET
.ifc \type, put
        add             \sr2, \src, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        ld1             {v28.s}[0],  [\src], \s_strd
        ext             v29.8b, v28.8b, v28.8b, #1
        umull           v16.8h, v28.8b, v0.8b
        umlal           v16.8h, v29.8b, v1.8b

2:
        ld1             {v28.s}[0],  [\sr2], \s_strd
        ld1             {v30.s}[0],  [\src], \s_strd
        ext             v29.8b, v28.8b, v28.8b, #1
        ext             v31.8b, v30.8b, v30.8b, #1
        trn1            v28.4h, v28.4h, v30.4h
        trn1            v29.4h, v29.4h, v31.4h
        umull           v17.8h, v28.8b, v0.8b
        umlal           v17.8h, v29.8b, v1.8b

        trn1            v16.2s, v16.2s, v17.2s

        mul             v4.4h,  v16.4h, v2.4h
        mla             v4.4h,  v17.4h, v3.4h
        uqrshrn         v4.8b,  v4.8h,  #8
        subs            \h,  \h,  #2
        st1             {v4.h}[0], [\dst], \d_strd
        st1             {v4.h}[1], [\ds2], \d_strd
        b.le            0f
        trn2            v16.2s, v17.2s, v17.2s
        b               2b
0:
        ret
.endif
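// The hv loops below run the horizontal two-tap filter into 16-bit
// intermediates and blend pairs of filtered rows vertically. As a scalar
// sketch of the put path (a hypothetical C reference, assuming v0/v1 hold
// 16 - mx / mx and v2/v3 hold 16 - my / my):
//
//   static void put_bilin_hv_c(uint8_t *dst, ptrdiff_t d_strd,
//                              const uint8_t *src, ptrdiff_t s_strd,
//                              int w, int h, int mx, int my)
//   {
//       int mid[128];                 // previous horizontally filtered row
//       for (int x = 0; x < w; x++)
//           mid[x] = (16 - mx) * src[x] + mx * src[x + 1];
//       for (int y = 0; y < h; y++) {
//           src += s_strd;
//           for (int x = 0; x < w; x++) {
//               const int next = (16 - mx) * src[x] + mx * src[x + 1];
//               dst[x] = ((16 - my) * mid[x] + my * next + 128) >> 8;
//               mid[x] = next;
//           }
//           dst += d_strd;
//       }
//   }
//
// The prep path instead rounds the 16-bit result down by 4 bits (urshr #4)
// and stores it without narrowing.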
40:     // 4xN hv
        AARCH64_VALID_JUMP_TARGET
        add             \sr2, \src, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        ld1             {v28.8b},  [\src], \s_strd
        ext             v29.8b, v28.8b, v28.8b, #1
        umull           v16.8h, v28.8b, v0.8b
        umlal           v16.8h, v29.8b, v1.8b

4:
        ld1             {v28.8b},  [\sr2], \s_strd
        ld1             {v30.8b},  [\src], \s_strd
        ext             v29.8b, v28.8b, v28.8b, #1
        ext             v31.8b, v30.8b, v30.8b, #1
        trn1            v28.2s, v28.2s, v30.2s
        trn1            v29.2s, v29.2s, v31.2s
        umull           v17.8h, v28.8b, v0.8b
        umlal           v17.8h, v29.8b, v1.8b

        trn1            v16.2d, v16.2d, v17.2d

        mul             v4.8h,  v16.8h, v2.8h
        mla             v4.8h,  v17.8h, v3.8h
        subs            \h,  \h,  #2
.ifc \type, put
        uqrshrn         v4.8b,  v4.8h,  #8
        st1             {v4.s}[0], [\dst], \d_strd
        st1             {v4.s}[1], [\ds2], \d_strd
.else
        urshr           v4.8h,  v4.8h,  #4
        st1             {v4.d}[0], [\dst], \d_strd
        st1             {v4.d}[1], [\ds2], \d_strd
.endif
        b.le            0f
        trn2            v16.2d, v17.2d, v17.2d
        b               4b
0:
        ret

80:     // 8xN, 16xN, ... hv
160:
320:
640:
1280:
        AARCH64_VALID_JUMP_TARGET
        mov             \my,  \h

1:
        add             \sr2, \src, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        ld1             {v28.16b},  [\src], \s_strd
        ext             v29.16b, v28.16b, v28.16b, #1
        umull           v16.8h, v28.8b, v0.8b
        umlal           v16.8h, v29.8b, v1.8b

2:
        ld1             {v28.16b},  [\sr2], \s_strd
        ld1             {v30.16b},  [\src], \s_strd
        ext             v29.16b, v28.16b, v28.16b, #1
        ext             v31.16b, v30.16b, v30.16b, #1
        umull           v17.8h, v28.8b, v0.8b
        umlal           v17.8h, v29.8b, v1.8b
        umull           v18.8h, v30.8b, v0.8b
        umlal           v18.8h, v31.8b, v1.8b

        mul             v4.8h,  v16.8h, v2.8h
        mla             v4.8h,  v17.8h, v3.8h
        mul             v5.8h,  v17.8h, v2.8h
        mla             v5.8h,  v18.8h, v3.8h
        subs            \h,  \h,  #2
.ifc \type, put
        uqrshrn         v4.8b,  v4.8h,  #8
        uqrshrn         v5.8b,  v5.8h,  #8
        st1             {v4.8b}, [\dst], \d_strd
        st1             {v5.8b}, [\ds2], \d_strd
.else
        urshr           v4.8h,  v4.8h,  #4
        urshr           v5.8h,  v5.8h,  #4
        st1             {v4.8h}, [\dst], \d_strd
        st1             {v5.8h}, [\ds2], \d_strd
.endif
        b.le            9f
        mov             v16.16b, v18.16b
        b               2b
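        // As in the v path: rewind to the top of the current column, then
        // step 8 pixels right (16 bytes for the 16-bit prep output).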
9:
        subs            \w,  \w,  #8
        b.le            0f
        asr             \s_strd,  \s_strd,  #1
        asr             \d_strd,  \d_strd,  #1
        msub            \src,  \s_strd,  \xmy,  \src
        msub            \dst,  \d_strd,  \xmy,  \dst
        sub             \src,  \src,  \s_strd,  lsl #1
        mov             \h,  \my
        add             \src,  \src,  #8
.ifc \type, put
        add             \dst,  \dst,  #8
.else
        add             \dst,  \dst,  #16
.endif
        b               1b
0:
        ret

L(\type\()_bilin_hv_tbl):
        .hword L(\type\()_bilin_hv_tbl) - 1280b
        .hword L(\type\()_bilin_hv_tbl) -  640b
        .hword L(\type\()_bilin_hv_tbl) -  320b
        .hword L(\type\()_bilin_hv_tbl) -  160b
        .hword L(\type\()_bilin_hv_tbl) -   80b
        .hword L(\type\()_bilin_hv_tbl) -   40b
        .hword L(\type\()_bilin_hv_tbl) -   20b
        .hword 0
endfunc
.endm

filter_fn put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10
filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6
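// put and prep instantiate the same macro over different argument registers
// (prep takes no destination stride). The trailing constant appears to set
// the final rounding shift of the two-pass (hv) filter cases; prep shifts
// less, keeping extra fractional precision in its 16-bit intermediates.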

.macro load_filter_row dst, src, inc
        asr             w13, \src, #10
        add             \src, \src, \inc
        ldr             \dst, [x11, w13, sxtw #3]
.endm
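// load_filter_row fetches one 8-byte row of int8 filter coefficients: the
// caller adds 512 to the position for rounding, asr #10 converts it to a
// (possibly negative) filter index, and x11 is pre-advanced by 64 rows into
// X(mc_warp_filter) so that negative indices stay in bounds.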
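// Filters one row of 8 pixels horizontally: each output pixel gets its own
// 8-tap filter selected from the running x position in w5 (stepped by
// abcd[0] per pixel), the eight smull products are reduced by a tree of
// pairwise addp instructions into v0.8h, and w5 is advanced by abcd[1] for
// the next row.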
function warp_filter_horz_neon
        add             w12, w5,  #512

        ld1             {v16.8b, v17.8b}, [x2], x3

        load_filter_row d0, w12, w7
        load_filter_row d1, w12, w7
        load_filter_row d2, w12, w7
        load_filter_row d3, w12, w7
        load_filter_row d4, w12, w7
        load_filter_row d5, w12, w7
        load_filter_row d6, w12, w7
        // subtract 128 from the pixels to allow using smull
        eor             v16.8b,  v16.8b,  v22.8b
        eor             v17.8b,  v17.8b,  v22.8b
        load_filter_row d7, w12, w7

        ext             v18.8b,  v16.8b,  v17.8b,  #1
        ext             v19.8b,  v16.8b,  v17.8b,  #2
        smull           v0.8h,   v0.8b,   v16.8b
        smull           v1.8h,   v1.8b,   v18.8b
        ext             v18.8b,  v16.8b,  v17.8b,  #3
        ext             v20.8b,  v16.8b,  v17.8b,  #4
        smull           v2.8h,   v2.8b,   v19.8b
        smull           v3.8h,   v3.8b,   v18.8b
        ext             v18.8b,  v16.8b,  v17.8b,  #5
        ext             v19.8b,  v16.8b,  v17.8b,  #6
        smull           v4.8h,   v4.8b,   v20.8b
        smull           v5.8h,   v5.8b,   v18.8b
        ext             v18.8b,  v16.8b,  v17.8b,  #7
        smull           v6.8h,   v6.8b,   v19.8b
        smull           v7.8h,   v7.8b,   v18.8b

        addp            v0.8h,   v0.8h,   v1.8h
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v6.8h,   v6.8h,   v7.8h

        addp            v0.8h,   v0.8h,   v2.8h
        addp            v4.8h,   v4.8h,   v6.8h

        addp            v0.8h,   v0.8h,   v4.8h

        add             w5,  w5,  w8

        ret
endfunc

// void dav1d_warp_affine_8x8_8bpc_neon(
//         pixel *dst, const ptrdiff_t dst_stride,
//         const pixel *src, const ptrdiff_t src_stride,
//         const int16_t *const abcd, int mx, int my)
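// abcd[0]/abcd[1] step the horizontal position mx per output column and per
// row; abcd[2]/abcd[3] step the vertical position my the same way. Each
// position selects an 8-tap filter via ((pos + 512) >> 10) + 64, as in
// load_filter_row above. The horizontal pass covers 15 source rows (8
// output rows plus 7 taps minus 1); a sliding window of eight filtered rows
// is kept in v24-v31 for the vertical pass.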
.macro warp t, shift
function warp_affine_8x8\t\()_8bpc_neon, export=1
        ldr             x4,  [x4]
        sbfx            x7,  x4, #0,  #16
        sbfx            x8,  x4, #16, #16
        sbfx            x9,  x4, #32, #16
        sbfx            x4,  x4, #48, #16
        mov             w10, #8
        sub             x2,  x2,  x3, lsl #1
        sub             x2,  x2,  x3
        sub             x2,  x2,  #3
        movrel          x11, X(mc_warp_filter), 64*8
        mov             x15, x30
.ifnb \t
        lsl             x1,  x1,  #1
.endif

        movi            v22.8b,  #128
.ifb \t
        movi            v23.8h,  #128
.else
        movi            v23.8h,  #8, lsl #8
.endif

        bl              warp_filter_horz_neon
        srshr           v24.8h,  v0.8h,  #3
        bl              warp_filter_horz_neon
        srshr           v25.8h,  v0.8h,  #3
        bl              warp_filter_horz_neon
        srshr           v26.8h,  v0.8h,  #3
        bl              warp_filter_horz_neon
        srshr           v27.8h,  v0.8h,  #3
        bl              warp_filter_horz_neon
        srshr           v28.8h,  v0.8h,  #3
        bl              warp_filter_horz_neon
        srshr           v29.8h,  v0.8h,  #3
        bl              warp_filter_horz_neon
        srshr           v30.8h,  v0.8h,  #3

1:
        add             w14, w6,  #512
        bl              warp_filter_horz_neon
        srshr           v31.8h,  v0.8h,  #3

        load_filter_row d0, w14, w9
        load_filter_row d1, w14, w9
        load_filter_row d2, w14, w9
        load_filter_row d3, w14, w9
        load_filter_row d4, w14, w9
        load_filter_row d5, w14, w9
        load_filter_row d6, w14, w9
        load_filter_row d7, w14, w9
        transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl

        // This ordering of smull/smlal/smull2/smlal2 is highly
        // beneficial for Cortex A53 here.
        smull           v16.4s,  v24.4h,  v0.4h
        smlal           v16.4s,  v25.4h,  v1.4h
        smlal           v16.4s,  v26.4h,  v2.4h
        smlal           v16.4s,  v27.4h,  v3.4h
        smlal           v16.4s,  v28.4h,  v4.4h
        smlal           v16.4s,  v29.4h,  v5.4h
        smlal           v16.4s,  v30.4h,  v6.4h
        smlal           v16.4s,  v31.4h,  v7.4h
        smull2          v17.4s,  v24.8h,  v0.8h
        smlal2          v17.4s,  v25.8h,  v1.8h
        smlal2          v17.4s,  v26.8h,  v2.8h
        smlal2          v17.4s,  v27.8h,  v3.8h
        smlal2          v17.4s,  v28.8h,  v4.8h
        smlal2          v17.4s,  v29.8h,  v5.8h
        smlal2          v17.4s,  v30.8h,  v6.8h
        smlal2          v17.4s,  v31.8h,  v7.8h

        mov             v24.16b, v25.16b
        mov             v25.16b, v26.16b
        sqrshrn         v16.4h,  v16.4s,  #\shift
        mov             v26.16b, v27.16b
        sqrshrn2        v16.8h,  v17.4s,  #\shift
        mov             v27.16b, v28.16b
        mov             v28.16b, v29.16b
        add             v16.8h,  v16.8h,  v23.8h
.ifb \t
        sqxtun          v16.8b,  v16.8h
.endif
        mov             v29.16b, v30.16b
        mov             v30.16b, v31.16b
        subs            w10, w10, #1
.ifnb \t
        st1             {v16.8h}, [x0], x1
.else
        st1             {v16.8b}, [x0], x1
.endif

        add             w6,  w6,  w4
        b.gt            1b

        ret             x15
endfunc
.endm

warp  , 11
warp t, 7
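// The plain variant rounds the vertical accumulator by 11 bits and narrows
// to pixels; the "t" (intermediate/prep) variant rounds by only 7 bits and
// stores 16-bit values. In both, v23 re-adds the DC offset introduced by
// the eor with 128 in warp_filter_horz_neon (128 after the 11-bit shift,
// 2048 after the 7-bit one).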

// void dav1d_emu_edge_8bpc_neon(
//         const intptr_t bw, const intptr_t bh,
//         const intptr_t iw, const intptr_t ih,
//         const intptr_t x, const intptr_t y,
//         pixel *dst, const ptrdiff_t dst_stride,
//         const pixel *ref, const ptrdiff_t ref_stride)
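// In scalar terms the whole operation is coordinate clamping (a sketch,
// not dav1d's C code verbatim):
//
//   for (int j = 0; j < bh; j++)
//       for (int i = 0; i < bw; i++)
//           dst[j * dst_stride + i] =
//               ref[iclip(y + j, 0, ih - 1) * ref_stride +
//                   iclip(x + i, 0, iw - 1)];
//
// The code below instead copies a clipped center region and then replicates
// its edges into the left/right/top/bottom extensions.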
function emu_edge_8bpc_neon, export=1
        ldp             x8,  x9,  [sp]

        // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
        // ref += iclip(x, 0, iw - 1)
        sub             x12, x3,  #1           // ih - 1
        cmp             x5,  x3
        sub             x13, x2,  #1           // iw - 1
        csel            x12, x12, x5,  ge      // min(y, ih - 1)
        cmp             x4,  x2
        bic             x12, x12, x12, asr #63 // max(min(y, ih - 1), 0)
        csel            x13, x13, x4,  ge      // min(x, iw - 1)
        bic             x13, x13, x13, asr #63 // max(min(x, iw - 1), 0)
        madd            x8,  x12, x9,  x8      // ref += iclip() * stride
        add             x8,  x8,  x13          // ref += iclip()
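        // (The "bic x, x, x, asr #63" pattern is a branchless max(x, 0):
        // the asr yields an all-ones mask only for negative values, which
        // bic then clears to zero.)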

        // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
        // top_ext = iclip(-y, 0, bh - 1)
        add             x10, x5,  x1           // y + bh
        neg             x5,  x5                // -y
        sub             x10, x10, x3           // y + bh - ih
        sub             x12, x1,  #1           // bh - 1
        cmp             x10, x1
        bic             x5,  x5,  x5,  asr #63 // max(-y, 0)
        csel            x10, x10, x12, lt      // min(y + bh - ih, bh - 1)
        cmp             x5,  x1
        bic             x10, x10, x10, asr #63 // max(min(y + bh - ih, bh - 1), 0)
        csel            x5,  x5,  x12, lt      // min(max(-y, 0), bh - 1)

        // right_ext = iclip(x + bw - iw, 0, bw - 1)
        // left_ext = iclip(-x, 0, bw - 1)
        add             x11, x4,  x0           // x + bw
        neg             x4,  x4                // -x
        sub             x11, x11, x2           // x + bw - iw
        sub             x13, x0,  #1           // bw - 1
        cmp             x11, x0
        bic             x4,  x4,  x4,  asr #63 // max(-x, 0)
        csel            x11, x11, x13, lt      // min(x + bw - iw, bw - 1)
        cmp             x4,  x0
        bic             x11, x11, x11, asr #63 // max(min(x + bw - iw, bw - 1), 0)
        csel            x4,  x4,  x13, lt      // min(max(-x, 0), bw - 1)

        // center_h = bh - top_ext - bottom_ext
        // dst += top_ext * PXSTRIDE(dst_stride)
        // center_w = bw - left_ext - right_ext
        sub             x1,  x1,  x5           // bh - top_ext
        madd            x6,  x5,  x7,  x6
        sub             x2,  x0,  x4           // bw - left_ext
        sub             x1,  x1,  x10          // center_h = bh - top_ext - bottom_ext
        sub             x2,  x2,  x11          // center_w = bw - left_ext - right_ext

        mov             x14, x6                // backup of dst

.macro v_loop need_left, need_right
0:
.if \need_left
        ld1r            {v0.16b}, [x8]
        mov             x12, x6                // out = dst
        mov             x3,  x4
1:
        subs            x3,  x3,  #16
        st1             {v0.16b}, [x12], #16
        b.gt            1b
.endif
        mov             x13, x8
        add             x12, x6,  x4           // out = dst + left_ext
        mov             x3,  x2
1:
        ld1             {v0.16b, v1.16b}, [x13], #32
        subs            x3,  x3,  #32
        st1             {v0.16b, v1.16b}, [x12], #32
        b.gt            1b
.if \need_right
        add             x3,  x8,  x2           // in + center_w
        sub             x3,  x3,  #1           // in + center_w - 1
        add             x12, x6,  x4           // dst + left_ext
        ld1r            {v0.16b}, [x3]
        add             x12, x12, x2           // out = dst + left_ext + center_w
        mov             x3,  x11
1:
        subs            x3,  x3,  #16
        st1             {v0.16b}, [x12], #16
        b.gt            1b
.endif

        subs            x1,  x1,  #1           // center_h--
        add             x6,  x6,  x7
        add             x8,  x8,  x9
        b.gt            0b
.endm
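// The center rows run through one of four v_loop instantiations, selected
// below by whether left_ext (x4) and right_ext (x11) are non-zero.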

        cbz             x4,  2f
        // need_left
        cbz             x11, 3f
        // need_left + need_right
        v_loop          1,   1
        b               5f

2:
        // !need_left
        cbz             x11, 4f
        // !need_left + need_right
        v_loop          0,   1
        b               5f

3:
        // need_left + !need_right
        v_loop          1,   0
        b               5f

4:
        // !need_left + !need_right
        v_loop          0,   0

5:

        cbz             x10, 3f
        // need_bottom
        sub             x8,  x6,  x7           // ref = dst - stride
        mov             x4,  x0
1:
        ld1             {v0.16b, v1.16b}, [x8], #32
        mov             x3,  x10
2:
        subs            x3,  x3,  #1
        st1             {v0.16b, v1.16b}, [x6], x7
        b.gt            2b
        msub            x6,  x7,  x10,  x6     // dst -= bottom_ext * stride
        subs            x4,  x4,  #32          // bw -= 32
        add             x6,  x6,  #32          // dst += 32
        b.gt            1b

3:
        cbz             x5,  3f
        // need_top
        msub            x6,  x7,  x5,  x14     // dst = stored_dst - top_ext * stride
1:
        ld1             {v0.16b, v1.16b}, [x14], #32
        mov             x3,  x5
2:
        subs            x3,  x3,  #1
        st1             {v0.16b, v1.16b}, [x6], x7
        b.gt            2b
        msub            x6,  x7,  x5,  x6      // dst -= top_ext * stride
        subs            x0,  x0,  #32          // bw -= 32
        add             x6,  x6,  #32          // dst += 32
        b.gt            1b

3:
        ret
endfunc