1/*
2 * Copyright © 2018, VideoLAN and dav1d authors
3 * Copyright © 2018, Janne Grunau
4 * Copyright © 2018, Martin Storsjo
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice, this
11 *    list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 *    this list of conditions and the following disclaimer in the documentation
15 *    and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
21 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#include "src/arm/asm.S"
30#include "util.S"
31
32.macro avg dst, t0, t1, t2, t3
33        ld1             {\t0\().8h,\t1\().8h},   [x2],  32
34        ld1             {\t2\().8h,\t3\().8h},   [x3],  32
35        add             \t0\().8h,   \t0\().8h,   \t2\().8h
36        add             \t1\().8h,   \t1\().8h,   \t3\().8h
37        sqrshrun        \dst\().8b,  \t0\().8h,   #5
38        sqrshrun2       \dst\().16b, \t1\().8h,   #5
39.endm
40
41.macro w_avg dst, t0, t1, t2, t3
42        ld1             {\t0\().8h,\t1\().8h},   [x2],  32
43        ld1             {\t2\().8h,\t3\().8h},   [x3],  32
44        sub             \t0\().8h,   \t2\().8h,   \t0\().8h
45        sub             \t1\().8h,   \t3\().8h,   \t1\().8h
46        sqdmulh         \t0\().8h,   \t0\().8h,   v30.8h
47        sqdmulh         \t1\().8h,   \t1\().8h,   v30.8h
48        add             \t0\().8h,   \t2\().8h,   \t0\().8h
49        add             \t1\().8h,   \t3\().8h,   \t1\().8h
50        sqrshrun        \dst\().8b,  \t0\().8h,   #4
51        sqrshrun2       \dst\().16b, \t1\().8h,   #4
52.endm
53
54.macro mask dst, t0, t1, t2, t3
55        ld1             {v30.16b}, [x6],  16
56        ld1             {\t0\().8h,\t1\().8h},   [x2],  32
57        mul             v30.16b, v30.16b, v31.16b
58        ld1             {\t2\().8h,\t3\().8h},   [x3],  32
59        shll            v28.8h, v30.8b,  #8
60        shll2           v29.8h, v30.16b, #8
61        sub             \t0\().8h,   \t2\().8h,   \t0\().8h
62        sub             \t1\().8h,   \t3\().8h,   \t1\().8h
63        sqdmulh         \t0\().8h,   \t0\().8h,   v28.8h
64        sqdmulh         \t1\().8h,   \t1\().8h,   v29.8h
65        add             \t0\().8h,   \t2\().8h,   \t0\().8h
66        add             \t1\().8h,   \t3\().8h,   \t1\().8h
67        sqrshrun        \dst\().8b,  \t0\().8h,   #4
68        sqrshrun2       \dst\().16b, \t1\().8h,   #4
69.endm
70
71.macro bidir_fn type
72function \type\()_8bpc_neon, export=1
73        clz             w4,  w4
74.ifc \type, w_avg
75        dup             v30.8h, w6
76        neg             v30.8h, v30.8h
77        shl             v30.8h, v30.8h, #11
78.endif
79.ifc \type, mask
80        movi            v31.16b, #256-2
81.endif
82        adr             x7,  L(\type\()_tbl)
83        sub             w4,  w4,  #24
84        ldrh            w4,  [x7, x4, lsl #1]
85        \type           v4,  v0,  v1,  v2,  v3
86        sub             x7,  x7,  w4, uxtw
87        br              x7
8840:
89        add             x7,  x0,  x1
90        lsl             x1,  x1,  #1
914:
92        cmp             w5,  #4
93        st1             {v4.s}[0],  [x0], x1
94        st1             {v4.s}[1],  [x7], x1
95        st1             {v4.s}[2],  [x0], x1
96        st1             {v4.s}[3],  [x7], x1
97        b.eq            0f
98        \type           v5,  v0,  v1,  v2,  v3
99        cmp             w5,  #8
100        st1             {v5.s}[0],  [x0], x1
101        st1             {v5.s}[1],  [x7], x1
102        st1             {v5.s}[2],  [x0], x1
103        st1             {v5.s}[3],  [x7], x1
104        b.eq            0f
105        \type           v4,  v0,  v1,  v2,  v3
106        st1             {v4.s}[0],  [x0], x1
107        st1             {v4.s}[1],  [x7], x1
108        \type           v5,  v0,  v1,  v2,  v3
109        st1             {v4.s}[2],  [x0], x1
110        st1             {v4.s}[3],  [x7], x1
111        st1             {v5.s}[0],  [x0], x1
112        st1             {v5.s}[1],  [x7], x1
113        st1             {v5.s}[2],  [x0], x1
114        st1             {v5.s}[3],  [x7], x1
115        ret
11680:
117        add             x7,  x0,  x1
118        lsl             x1,  x1,  #1
1198:
120        st1             {v4.d}[0],  [x0], x1
121        \type           v5,  v0,  v1,  v2,  v3
122        st1             {v4.d}[1],  [x7], x1
123        st1             {v5.d}[0],  [x0], x1
124        subs            w5,  w5,  #4
125        st1             {v5.d}[1],  [x7], x1
126        b.le            0f
127        \type           v4,  v0,  v1,  v2,  v3
128        b               8b
12916:
130        \type           v5,  v0,  v1,  v2,  v3
131        st1             {v4.16b}, [x0], x1
132        \type           v6,  v0,  v1,  v2,  v3
133        st1             {v5.16b}, [x0], x1
134        \type           v7,  v0,  v1,  v2,  v3
135        st1             {v6.16b}, [x0], x1
136        subs            w5,  w5,  #4
137        st1             {v7.16b}, [x0], x1
138        b.le            0f
139        \type           v4,  v0,  v1,  v2,  v3
140        b               16b
141320:
142        add             x7,  x0,  x1
143        lsl             x1,  x1,  #1
14432:
145        \type           v5,  v0,  v1,  v2,  v3
146        \type           v6,  v0,  v1,  v2,  v3
147        st1             {v4.16b,v5.16b}, [x0], x1
148        \type           v7,  v0,  v1,  v2,  v3
149        subs            w5,  w5,  #2
150        st1             {v6.16b,v7.16b}, [x7], x1
151        b.le            0f
152        \type           v4,  v0,  v1,  v2,  v3
153        b               32b
154640:
155        add             x7,  x0,  x1
156        lsl             x1,  x1,  #1
15764:
158        \type           v5,  v0,  v1,  v2,  v3
159        \type           v6,  v0,  v1,  v2,  v3
160        \type           v7,  v0,  v1,  v2,  v3
161        \type           v16, v0,  v1,  v2,  v3
162        \type           v17, v0,  v1,  v2,  v3
163        st1             {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
164        \type           v18, v0,  v1,  v2,  v3
165        \type           v19, v0,  v1,  v2,  v3
166        subs            w5,  w5,  #2
167        st1             {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
168        b.le            0f
169        \type           v4, v0,  v1,  v2,  v3
170        b               64b
1711280:
172        add             x7,  x0,  #64
173128:
174        \type           v5,  v0,  v1,  v2,  v3
175        \type           v6,  v0,  v1,  v2,  v3
176        \type           v7,  v0,  v1,  v2,  v3
177        \type           v16, v0,  v1,  v2,  v3
178        \type           v17, v0,  v1,  v2,  v3
179        st1             {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
180        \type           v18, v0,  v1,  v2,  v3
181        \type           v19, v0,  v1,  v2,  v3
182        subs            w5,  w5,  #1
183        st1             {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
184        b.le            0f
185        \type           v4, v0,  v1,  v2,  v3
186        b               128b
1870:
188        ret
189L(\type\()_tbl):
190        .hword L(\type\()_tbl) - 1280b
191        .hword L(\type\()_tbl) -  640b
192        .hword L(\type\()_tbl) -  320b
193        .hword L(\type\()_tbl) -   16b
194        .hword L(\type\()_tbl) -   80b
195        .hword L(\type\()_tbl) -   40b
196endfunc
197.endm
198
199bidir_fn avg
200bidir_fn w_avg
201bidir_fn mask
202
203
204.macro w_mask_fn type
205function w_mask_\type\()_8bpc_neon, export=1
206        clz             w8,  w4
207        adr             x9,  L(w_mask_\type\()_tbl)
208        sub             w8,  w8,  #24
209        ldrh            w8,  [x9,  x8,  lsl #1]
210        sub             x9,  x9,  w8,  uxtw
211        mov             w10, #6903
212        dup             v0.8h,   w10
213.if \type == 444
214        movi            v1.16b,  #64
215.elseif \type == 422
216        dup             v2.8b,   w7
217        movi            v3.8b,   #129
218        sub             v3.8b,   v3.8b,   v2.8b
219.elseif \type == 420
220        dup             v2.8h,   w7
221        movi            v3.8h,   #1, lsl #8
222        sub             v3.8h,   v3.8h,   v2.8h
223.endif
224        add             x12,  x0,  x1
225        lsl             x1,   x1,  #1
226        br              x9
2274:
228        ld1             {v4.8h,   v5.8h},   [x2],  #32  // tmp1 (four rows at once)
229        ld1             {v6.8h,   v7.8h},   [x3],  #32  // tmp2 (four rows at once)
230        subs            w5,  w5,  #4
231        sub             v16.8h,  v6.8h,   v4.8h
232        sub             v17.8h,  v7.8h,   v5.8h
233        sabd            v18.8h,  v4.8h,   v6.8h
234        sabd            v19.8h,  v5.8h,   v7.8h
235        uqsub           v18.8h,  v0.8h,   v18.8h
236        uqsub           v19.8h,  v0.8h,   v19.8h
237        ushr            v18.8h,  v18.8h,  #8
238        ushr            v19.8h,  v19.8h,  #8
239        shl             v20.8h,  v18.8h,  #9
240        shl             v21.8h,  v19.8h,  #9
241        sqdmulh         v20.8h,  v20.8h,  v16.8h
242        sqdmulh         v21.8h,  v21.8h,  v17.8h
243        add             v20.8h,  v20.8h,  v4.8h
244        add             v21.8h,  v21.8h,  v5.8h
245        sqrshrun        v22.8b,  v20.8h,  #4
246        sqrshrun        v23.8b,  v21.8h,  #4
247.if \type == 444
248        xtn             v18.8b,   v18.8h
249        xtn2            v18.16b,  v19.8h
250        sub             v18.16b,  v1.16b,  v18.16b
251        st1             {v18.16b}, [x6],  #16
252.elseif \type == 422
253        addp            v18.8h,   v18.8h,  v19.8h
254        xtn             v18.8b,   v18.8h
255        uhsub           v18.8b,   v3.8b,   v18.8b
256        st1             {v18.8b},  [x6],  #8
257.elseif \type == 420
258        trn1            v24.2d,   v18.2d,  v19.2d
259        trn2            v25.2d,   v18.2d,  v19.2d
260        add             v24.8h,   v24.8h,  v25.8h
261        addp            v18.8h,   v24.8h,  v24.8h
262        sub             v18.4h,   v3.4h,   v18.4h
263        rshrn           v18.8b,   v18.8h,  #2
264        st1             {v18.s}[0],  [x6],  #4
265.endif
266        st1             {v22.s}[0],  [x0],  x1
267        st1             {v22.s}[1],  [x12], x1
268        st1             {v23.s}[0],  [x0],  x1
269        st1             {v23.s}[1],  [x12], x1
270        b.gt            4b
271        ret
2728:
273        ld1             {v4.8h,   v5.8h},   [x2],  #32
274        ld1             {v6.8h,   v7.8h},   [x3],  #32
275        subs            w5,  w5,  #2
276        sub             v16.8h,  v6.8h,   v4.8h
277        sub             v17.8h,  v7.8h,   v5.8h
278        sabd            v18.8h,  v4.8h,   v6.8h
279        sabd            v19.8h,  v5.8h,   v7.8h
280        uqsub           v18.8h,  v0.8h,   v18.8h
281        uqsub           v19.8h,  v0.8h,   v19.8h
282        ushr            v18.8h,  v18.8h,  #8
283        ushr            v19.8h,  v19.8h,  #8
284        shl             v20.8h,  v18.8h,  #9
285        shl             v21.8h,  v19.8h,  #9
286        sqdmulh         v20.8h,  v20.8h,  v16.8h
287        sqdmulh         v21.8h,  v21.8h,  v17.8h
288        add             v20.8h,  v20.8h,  v4.8h
289        add             v21.8h,  v21.8h,  v5.8h
290        sqrshrun        v22.8b,  v20.8h,  #4
291        sqrshrun        v23.8b,  v21.8h,  #4
292.if \type == 444
293        xtn             v18.8b,  v18.8h
294        xtn2            v18.16b, v19.8h
295        sub             v18.16b, v1.16b,  v18.16b
296        st1             {v18.16b}, [x6],  #16
297.elseif \type == 422
298        addp            v18.8h,  v18.8h,  v19.8h
299        xtn             v18.8b,  v18.8h
300        uhsub           v18.8b,  v3.8b,   v18.8b
301        st1             {v18.8b},  [x6],  #8
302.elseif \type == 420
303        add             v18.8h,  v18.8h,  v19.8h
304        addp            v18.8h,  v18.8h,  v18.8h
305        sub             v18.4h,  v3.4h,   v18.4h
306        rshrn           v18.8b,  v18.8h,  #2
307        st1             {v18.s}[0],  [x6],  #4
308.endif
309        st1             {v22.8b},  [x0],  x1
310        st1             {v23.8b},  [x12], x1
311        b.gt            8b
312        ret
3131280:
314640:
315320:
316160:
317        mov             w11, w4
318        sub             x1,  x1,  w4,  uxtw
319.if \type == 444
320        add             x10, x6,  w4,  uxtw
321.elseif \type == 422
322        add             x10, x6,  x11, lsr #1
323.endif
324        add             x9,  x3,  w4,  uxtw #1
325        add             x7,  x2,  w4,  uxtw #1
326161:
327        mov             w8,  w4
32816:
329        ld1             {v4.8h,   v5.8h},   [x2],  #32
330        ld1             {v6.8h,   v7.8h},   [x3],  #32
331        ld1             {v16.8h,  v17.8h},  [x7],  #32
332        ld1             {v18.8h,  v19.8h},  [x9],  #32
333        subs            w8,  w8,  #16
334        sub             v6.8h,   v6.8h,   v4.8h
335        sub             v7.8h,   v7.8h,   v5.8h
336        sub             v18.8h,  v18.8h,  v16.8h
337        sub             v19.8h,  v19.8h,  v17.8h
338        abs             v20.8h,  v6.8h
339        abs             v21.8h,  v7.8h
340        abs             v22.8h,  v18.8h
341        abs             v23.8h,  v19.8h
342        uqsub           v20.8h,  v0.8h,   v20.8h
343        uqsub           v21.8h,  v0.8h,   v21.8h
344        uqsub           v22.8h,  v0.8h,   v22.8h
345        uqsub           v23.8h,  v0.8h,   v23.8h
346        ushr            v20.8h,  v20.8h,  #8
347        ushr            v21.8h,  v21.8h,  #8
348        ushr            v22.8h,  v22.8h,  #8
349        ushr            v23.8h,  v23.8h,  #8
350        shl             v24.8h,  v20.8h,  #9
351        shl             v25.8h,  v21.8h,  #9
352        shl             v26.8h,  v22.8h,  #9
353        shl             v27.8h,  v23.8h,  #9
354        sqdmulh         v24.8h,  v24.8h,  v6.8h
355        sqdmulh         v25.8h,  v25.8h,  v7.8h
356        sqdmulh         v26.8h,  v26.8h,  v18.8h
357        sqdmulh         v27.8h,  v27.8h,  v19.8h
358        add             v24.8h,  v24.8h,  v4.8h
359        add             v25.8h,  v25.8h,  v5.8h
360        add             v26.8h,  v26.8h,  v16.8h
361        add             v27.8h,  v27.8h,  v17.8h
362        sqrshrun        v24.8b,  v24.8h,  #4
363        sqrshrun        v25.8b,  v25.8h,  #4
364        sqrshrun        v26.8b,  v26.8h,  #4
365        sqrshrun        v27.8b,  v27.8h,  #4
366.if \type == 444
367        xtn             v20.8b,  v20.8h
368        xtn2            v20.16b, v21.8h
369        xtn             v21.8b,  v22.8h
370        xtn2            v21.16b, v23.8h
371        sub             v20.16b, v1.16b,  v20.16b
372        sub             v21.16b, v1.16b,  v21.16b
373        st1             {v20.16b}, [x6],  #16
374        st1             {v21.16b}, [x10], #16
375.elseif \type == 422
376        addp            v20.8h,  v20.8h,  v21.8h
377        addp            v21.8h,  v22.8h,  v23.8h
378        xtn             v20.8b,  v20.8h
379        xtn             v21.8b,  v21.8h
380        uhsub           v20.8b,  v3.8b,   v20.8b
381        uhsub           v21.8b,  v3.8b,   v21.8b
382        st1             {v20.8b},  [x6],  #8
383        st1             {v21.8b},  [x10], #8
384.elseif \type == 420
385        add             v20.8h,  v20.8h,  v22.8h
386        add             v21.8h,  v21.8h,  v23.8h
387        addp            v20.8h,  v20.8h,  v21.8h
388        sub             v20.8h,  v3.8h,   v20.8h
389        rshrn           v20.8b,  v20.8h,  #2
390        st1             {v20.8b},  [x6],  #8
391.endif
392        st1             {v24.8b,  v25.8b},  [x0],  #16
393        st1             {v26.8b,  v27.8b},  [x12], #16
394        b.gt            16b
395        subs            w5,  w5,  #2
396        add             x2,  x2,  w4,  uxtw #1
397        add             x3,  x3,  w4,  uxtw #1
398        add             x7,  x7,  w4,  uxtw #1
399        add             x9,  x9,  w4,  uxtw #1
400.if \type == 444
401        add             x6,  x6,  w4,  uxtw
402        add             x10, x10, w4,  uxtw
403.elseif \type == 422
404        add             x6,  x6,  x11, lsr #1
405        add             x10, x10, x11, lsr #1
406.endif
407        add             x0,  x0,  x1
408        add             x12, x12, x1
409        b.gt            161b
410        ret
411L(w_mask_\type\()_tbl):
412        .hword L(w_mask_\type\()_tbl) - 1280b
413        .hword L(w_mask_\type\()_tbl) -  640b
414        .hword L(w_mask_\type\()_tbl) -  320b
415        .hword L(w_mask_\type\()_tbl) -  160b
416        .hword L(w_mask_\type\()_tbl) -    8b
417        .hword L(w_mask_\type\()_tbl) -    4b
418endfunc
419.endm
420
421w_mask_fn 444
422w_mask_fn 422
423w_mask_fn 420
424
425
426function blend_8bpc_neon, export=1
427        adr             x6,  L(blend_tbl)
428        clz             w3,  w3
429        sub             w3,  w3,  #26
430        ldrh            w3,  [x6,  x3,  lsl #1]
431        sub             x6,  x6,  w3,  uxtw
432        movi            v4.16b,  #64
433        add             x8,  x0,  x1
434        lsl             x1,  x1,  #1
435        br              x6
4364:
437        ld1             {v2.8b},     [x5],  #8
438        ld1             {v1.d}[0],   [x2],  #8
439        ld1             {v0.s}[0],   [x0]
440        subs            w4,  w4,  #2
441        ld1             {v0.s}[1],   [x8]
442        sub             v3.8b,   v4.8b,   v2.8b
443        umull           v5.8h,   v1.8b,   v2.8b
444        umlal           v5.8h,   v0.8b,   v3.8b
445        rshrn           v6.8b,   v5.8h,   #6
446        st1             {v6.s}[0],   [x0],  x1
447        st1             {v6.s}[1],   [x8],  x1
448        b.gt            4b
449        ret
4508:
451        ld1             {v2.16b},  [x5],  #16
452        ld1             {v1.16b},  [x2],  #16
453        ld1             {v0.d}[0],   [x0]
454        ld1             {v0.d}[1],   [x8]
455        sub             v3.16b,  v4.16b,  v2.16b
456        subs            w4,  w4,  #2
457        umull           v5.8h,   v1.8b,   v2.8b
458        umlal           v5.8h,   v0.8b,   v3.8b
459        umull2          v6.8h,   v1.16b,  v2.16b
460        umlal2          v6.8h,   v0.16b,  v3.16b
461        rshrn           v7.8b,   v5.8h,   #6
462        rshrn2          v7.16b,  v6.8h,   #6
463        st1             {v7.d}[0],   [x0],  x1
464        st1             {v7.d}[1],   [x8],  x1
465        b.gt            8b
466        ret
46716:
468        ld1             {v1.16b,  v2.16b},  [x5],  #32
469        ld1             {v5.16b,  v6.16b},  [x2],  #32
470        ld1             {v0.16b},  [x0]
471        subs            w4,  w4,  #2
472        sub             v7.16b,  v4.16b,  v1.16b
473        sub             v20.16b, v4.16b,  v2.16b
474        ld1             {v3.16b},  [x8]
475        umull           v16.8h,  v5.8b,   v1.8b
476        umlal           v16.8h,  v0.8b,   v7.8b
477        umull2          v17.8h,  v5.16b,  v1.16b
478        umlal2          v17.8h,  v0.16b,  v7.16b
479        umull           v21.8h,  v6.8b,   v2.8b
480        umlal           v21.8h,  v3.8b,   v20.8b
481        umull2          v22.8h,  v6.16b,  v2.16b
482        umlal2          v22.8h,  v3.16b,  v20.16b
483        rshrn           v18.8b,  v16.8h,  #6
484        rshrn2          v18.16b, v17.8h,  #6
485        rshrn           v19.8b,  v21.8h,  #6
486        rshrn2          v19.16b, v22.8h,  #6
487        st1             {v18.16b}, [x0],  x1
488        st1             {v19.16b}, [x8],  x1
489        b.gt            16b
490        ret
49132:
492        ld1             {v0.16b,  v1.16b,  v2.16b,  v3.16b},  [x5],  #64
493        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2],  #64
494        ld1             {v20.16b, v21.16b}, [x0]
495        subs            w4,  w4,  #2
496        ld1             {v22.16b, v23.16b}, [x8]
497        sub             v5.16b,  v4.16b,  v0.16b
498        sub             v6.16b,  v4.16b,  v1.16b
499        sub             v30.16b, v4.16b,  v2.16b
500        sub             v31.16b, v4.16b,  v3.16b
501        umull           v24.8h,  v16.8b,  v0.8b
502        umlal           v24.8h,  v20.8b,  v5.8b
503        umull2          v26.8h,  v16.16b, v0.16b
504        umlal2          v26.8h,  v20.16b, v5.16b
505        umull           v28.8h,  v17.8b,  v1.8b
506        umlal           v28.8h,  v21.8b,  v6.8b
507        umull2          v7.8h,   v17.16b, v1.16b
508        umlal2          v7.8h,   v21.16b, v6.16b
509        umull           v27.8h,  v18.8b,  v2.8b
510        umlal           v27.8h,  v22.8b,  v30.8b
511        umull2          v1.8h,   v18.16b, v2.16b
512        umlal2          v1.8h,   v22.16b, v30.16b
513        umull           v29.8h,  v19.8b,  v3.8b
514        umlal           v29.8h,  v23.8b,  v31.8b
515        umull2          v21.8h,  v19.16b, v3.16b
516        umlal2          v21.8h,  v23.16b, v31.16b
517        rshrn           v24.8b,  v24.8h,  #6
518        rshrn2          v24.16b, v26.8h,  #6
519        rshrn           v25.8b,  v28.8h,  #6
520        rshrn2          v25.16b, v7.8h,   #6
521        rshrn           v27.8b,  v27.8h,  #6
522        rshrn2          v27.16b, v1.8h,   #6
523        rshrn           v28.8b,  v29.8h,  #6
524        rshrn2          v28.16b, v21.8h,  #6
525        st1             {v24.16b, v25.16b}, [x0],  x1
526        st1             {v27.16b, v28.16b}, [x8],  x1
527        b.gt            32b
528        ret
529L(blend_tbl):
530        .hword L(blend_tbl) - 32b
531        .hword L(blend_tbl) - 16b
532        .hword L(blend_tbl) -  8b
533        .hword L(blend_tbl) -  4b
534endfunc
535
536function blend_h_8bpc_neon, export=1
537        adr             x6,  L(blend_h_tbl)
538        movrel          x5,  X(obmc_masks)
539        add             x5,  x5,  w4,  uxtw
540        sub             w4,  w4,  w4,  lsr #2
541        clz             w7,  w3
542        movi            v4.16b,  #64
543        add             x8,  x0,  x1
544        lsl             x1,  x1,  #1
545        sub             w7,  w7,  #24
546        ldrh            w7,  [x6,  x7,  lsl #1]
547        sub             x6,  x6,  w7, uxtw
548        br              x6
5492:
550        ld1             {v0.h}[0],   [x5],  #2
551        ld1             {v1.s}[0],   [x2],  #4
552        subs            w4,  w4,  #2
553        ld1             {v2.h}[0],   [x0]
554        zip1            v0.8b,   v0.8b,   v0.8b
555        sub             v3.8b,   v4.8b,   v0.8b
556        ld1             {v2.h}[1],   [x8]
557        umull           v5.8h,   v1.8b,   v0.8b
558        umlal           v5.8h,   v2.8b,   v3.8b
559        rshrn           v5.8b,   v5.8h,   #6
560        st1             {v5.h}[0],   [x0],  x1
561        st1             {v5.h}[1],   [x8],  x1
562        b.gt            2b
563        ret
5644:
565        ld2r            {v0.8b,   v1.8b},   [x5],  #2
566        ld1             {v2.8b},   [x2],  #8
567        subs            w4,  w4,  #2
568        ext             v0.8b,   v0.8b,   v1.8b,   #4
569        ld1             {v3.s}[0],   [x0]
570        sub             v5.8b,   v4.8b,   v0.8b
571        ld1             {v3.s}[1],   [x8]
572        umull           v6.8h,   v2.8b,   v0.8b
573        umlal           v6.8h,   v3.8b,   v5.8b
574        rshrn           v6.8b,   v6.8h,   #6
575        st1             {v6.s}[0],   [x0],  x1
576        st1             {v6.s}[1],   [x8],  x1
577        b.gt            4b
578        ret
5798:
580        ld2r            {v0.16b,  v1.16b},  [x5],  #2
581        ld1             {v2.16b},  [x2],  #16
582        ld1             {v3.d}[0],   [x0]
583        ext             v0.16b,  v0.16b,  v1.16b,  #8
584        sub             v5.16b,  v4.16b,  v0.16b
585        ld1             {v3.d}[1],   [x8]
586        subs            w4,  w4,  #2
587        umull           v6.8h,   v0.8b,   v2.8b
588        umlal           v6.8h,   v3.8b,   v5.8b
589        umull2          v7.8h,   v0.16b,  v2.16b
590        umlal2          v7.8h,   v3.16b,  v5.16b
591        rshrn           v16.8b,  v6.8h,   #6
592        rshrn2          v16.16b, v7.8h,   #6
593        st1             {v16.d}[0],  [x0],  x1
594        st1             {v16.d}[1],  [x8],  x1
595        b.gt            8b
596        ret
59716:
598        ld2r            {v0.16b,  v1.16b},  [x5],  #2
599        ld1             {v2.16b,  v3.16b},  [x2],  #32
600        ld1             {v5.16b},  [x0]
601        sub             v7.16b,  v4.16b,  v0.16b
602        sub             v16.16b, v4.16b,  v1.16b
603        ld1             {v6.16b},  [x8]
604        subs            w4,  w4,  #2
605        umull           v17.8h,  v0.8b,   v2.8b
606        umlal           v17.8h,  v5.8b,   v7.8b
607        umull2          v18.8h,  v0.16b,  v2.16b
608        umlal2          v18.8h,  v5.16b,  v7.16b
609        umull           v19.8h,  v1.8b,   v3.8b
610        umlal           v19.8h,  v6.8b,   v16.8b
611        umull2          v20.8h,  v1.16b,  v3.16b
612        umlal2          v20.8h,  v6.16b,  v16.16b
613        rshrn           v21.8b,  v17.8h,  #6
614        rshrn2          v21.16b, v18.8h,  #6
615        rshrn           v22.8b,  v19.8h,  #6
616        rshrn2          v22.16b, v20.8h,  #6
617        st1             {v21.16b}, [x0],  x1
618        st1             {v22.16b}, [x8],  x1
619        b.gt            16b
620        ret
6211280:
622640:
623320:
624        sub             x1,  x1,  w3,  uxtw
625        add             x7,  x2,  w3,  uxtw
626321:
627        ld2r            {v0.16b,  v1.16b},  [x5],  #2
628        mov             w6,  w3
629        sub             v20.16b, v4.16b,  v0.16b
630        sub             v21.16b, v4.16b,  v1.16b
63132:
632        ld1             {v16.16b, v17.16b}, [x2],  #32
633        ld1             {v2.16b,  v3.16b},  [x0]
634        subs            w6,  w6,  #32
635        umull           v23.8h,  v0.8b,   v16.8b
636        umlal           v23.8h,  v2.8b,   v20.8b
637        ld1             {v18.16b, v19.16b}, [x7],  #32
638        umull2          v27.8h,  v0.16b,  v16.16b
639        umlal2          v27.8h,  v2.16b,  v20.16b
640        ld1             {v6.16b,  v7.16b},  [x8]
641        umull           v24.8h,  v0.8b,   v17.8b
642        umlal           v24.8h,  v3.8b,   v20.8b
643        umull2          v28.8h,  v0.16b,  v17.16b
644        umlal2          v28.8h,  v3.16b,  v20.16b
645        umull           v25.8h,  v1.8b,   v18.8b
646        umlal           v25.8h,  v6.8b,   v21.8b
647        umull2          v5.8h,   v1.16b,  v18.16b
648        umlal2          v5.8h,   v6.16b,  v21.16b
649        rshrn           v29.8b,  v23.8h,  #6
650        rshrn2          v29.16b, v27.8h,  #6
651        umull           v26.8h,  v1.8b,   v19.8b
652        umlal           v26.8h,  v7.8b,   v21.8b
653        umull2          v31.8h,  v1.16b,  v19.16b
654        umlal2          v31.8h,  v7.16b,  v21.16b
655        rshrn           v30.8b,  v24.8h,  #6
656        rshrn2          v30.16b, v28.8h,  #6
657        rshrn           v23.8b,  v25.8h,  #6
658        rshrn2          v23.16b, v5.8h,   #6
659        rshrn           v24.8b,  v26.8h,  #6
660        st1             {v29.16b, v30.16b}, [x0],  #32
661        rshrn2          v24.16b, v31.8h,  #6
662        st1             {v23.16b, v24.16b}, [x8],  #32
663        b.gt            32b
664        subs            w4,  w4,  #2
665        add             x0,  x0,  x1
666        add             x8,  x8,  x1
667        add             x2,  x2,  w3,  uxtw
668        add             x7,  x7,  w3,  uxtw
669        b.gt            321b
670        ret
671L(blend_h_tbl):
672        .hword L(blend_h_tbl) - 1280b
673        .hword L(blend_h_tbl) -  640b
674        .hword L(blend_h_tbl) -  320b
675        .hword L(blend_h_tbl) -   16b
676        .hword L(blend_h_tbl) -    8b
677        .hword L(blend_h_tbl) -    4b
678        .hword L(blend_h_tbl) -    2b
679endfunc
680
681function blend_v_8bpc_neon, export=1
682        adr             x6,  L(blend_v_tbl)
683        movrel          x5,  X(obmc_masks)
684        add             x5,  x5,  w3,  uxtw
685        clz             w3,  w3
686        movi            v4.16b,  #64
687        add             x8,  x0,  x1
688        lsl             x1,  x1,  #1
689        sub             w3,  w3,  #26
690        ldrh            w3,  [x6,  x3,  lsl #1]
691        sub             x6,  x6,  w3,  uxtw
692        br              x6
69320:
694        ld1r            {v0.8b},   [x5]
695        sub             v1.8b,   v4.8b,   v0.8b
6962:
697        ld1             {v2.h}[0],   [x2],  #2
698        ld1             {v3.b}[0],   [x0]
699        subs            w4,  w4,  #2
700        ld1             {v2.b}[1],   [x2]
701        ld1             {v3.b}[1],   [x8]
702        umull           v5.8h,   v2.8b,   v0.8b
703        umlal           v5.8h,   v3.8b,   v1.8b
704        rshrn           v5.8b,   v5.8h,   #6
705        add             x2,  x2,  #2
706        st1             {v5.b}[0],   [x0],  x1
707        st1             {v5.b}[1],   [x8],  x1
708        b.gt            2b
709        ret
71040:
711        ld1r            {v0.2s},   [x5]
712        sub             x1,  x1,  #2
713        sub             v1.8b,   v4.8b,   v0.8b
7144:
715        ld1             {v2.8b},   [x2],  #8
716        ld1             {v3.s}[0],   [x0]
717        ld1             {v3.s}[1],   [x8]
718        subs            w4,  w4,  #2
719        umull           v5.8h,   v2.8b,   v0.8b
720        umlal           v5.8h,   v3.8b,   v1.8b
721        rshrn           v5.8b,   v5.8h,   #6
722        st1             {v5.h}[0],   [x0],  #2
723        st1             {v5.h}[2],   [x8],  #2
724        st1             {v5.b}[2],   [x0],  x1
725        st1             {v5.b}[6],   [x8],  x1
726        b.gt            4b
727        ret
72880:
729        ld1r            {v0.2d},   [x5]
730        sub             x1,  x1,  #4
731        sub             v1.16b,  v4.16b,  v0.16b
7328:
733        ld1             {v2.16b},  [x2],  #16
734        ld1             {v3.d}[0],   [x0]
735        ld1             {v3.d}[1],   [x8]
736        subs            w4,  w4,  #2
737        umull           v5.8h,  v0.8b,  v2.8b
738        umlal           v5.8h,  v3.8b,  v1.8b
739        umull2          v6.8h,  v0.16b, v2.16b
740        umlal2          v6.8h,  v3.16b, v1.16b
741        rshrn           v7.8b,  v5.8h,  #6
742        rshrn2          v7.16b, v6.8h,  #6
743        st1             {v7.s}[0],   [x0],  #4
744        st1             {v7.s}[2],   [x8],  #4
745        st1             {v7.h}[2],   [x0],  x1
746        st1             {v7.h}[6],   [x8],  x1
747        b.gt            8b
748        ret
749160:
750        ld1             {v0.16b},  [x5]
751        sub             x1,  x1,  #8
752        sub             v2.16b,  v4.16b,  v0.16b
75316:
754        ld1             {v5.16b,  v6.16b},  [x2],  #32
755        ld1             {v7.16b},  [x0]
756        subs            w4,  w4,  #2
757        ld1             {v16.16b}, [x8]
758        umull           v17.8h,  v5.8b,   v0.8b
759        umlal           v17.8h,  v7.8b,   v2.8b
760        umull2          v18.8h,  v5.16b,  v0.16b
761        umlal2          v18.8h,  v7.16b,  v2.16b
762        umull           v20.8h,  v6.8b,   v0.8b
763        umlal           v20.8h,  v16.8b,  v2.8b
764        umull2          v21.8h,  v6.16b,  v0.16b
765        umlal2          v21.8h,  v16.16b, v2.16b
766        rshrn           v19.8b,  v17.8h,  #6
767        rshrn2          v19.16b, v18.8h,  #6
768        rshrn           v22.8b,  v20.8h,  #6
769        rshrn2          v22.16b, v21.8h,  #6
770        st1             {v19.8b},  [x0],  #8
771        st1             {v22.8b},  [x8],  #8
772        st1             {v19.s}[2],  [x0],  x1
773        st1             {v22.s}[2],  [x8],  x1
774        b.gt            16b
775        ret
776320:
777        ld1             {v0.16b,  v1.16b},  [x5]
778        sub             x1,  x1,  #16
779        sub             v2.16b,  v4.16b,  v0.16b
780        sub             v3.8b,   v4.8b,   v1.8b
78132:
782        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2],  #64
783        ld1             {v5.16b,  v6.16b},  [x0]
784        subs            w4,  w4,  #2
785        ld1             {v20.16b, v21.16b}, [x8]
786        umull           v22.8h,  v16.8b,  v0.8b
787        umlal           v22.8h,  v5.8b,   v2.8b
788        umull2          v23.8h,  v16.16b, v0.16b
789        umlal2          v23.8h,  v5.16b,  v2.16b
790        umull           v28.8h,  v17.8b,  v1.8b
791        umlal           v28.8h,  v6.8b,   v3.8b
792        umull           v30.8h,  v18.8b,  v0.8b
793        umlal           v30.8h,  v20.8b,  v2.8b
794        umull2          v31.8h,  v18.16b, v0.16b
795        umlal2          v31.8h,  v20.16b, v2.16b
796        umull           v25.8h,  v19.8b,  v1.8b
797        umlal           v25.8h,  v21.8b,  v3.8b
798        rshrn           v24.8b,  v22.8h,  #6
799        rshrn2          v24.16b, v23.8h,  #6
800        rshrn           v28.8b,  v28.8h,  #6
801        rshrn           v30.8b,  v30.8h,  #6
802        rshrn2          v30.16b, v31.8h,  #6
803        rshrn           v27.8b,  v25.8h,  #6
804        st1             {v24.16b}, [x0],  #16
805        st1             {v30.16b}, [x8],  #16
806        st1             {v28.8b},  [x0],  x1
807        st1             {v27.8b},  [x8],  x1
808        b.gt            32b
809        ret
810L(blend_v_tbl):
811        .hword L(blend_v_tbl) - 320b
812        .hword L(blend_v_tbl) - 160b
813        .hword L(blend_v_tbl) -  80b
814        .hword L(blend_v_tbl) -  40b
815        .hword L(blend_v_tbl) -  20b
816endfunc
817
818
819// This has got the same signature as the put_8tap functions,
820// and assumes that x8 is set to (clz(w)-24).
821function put_neon
822        adr             x9,  L(put_tbl)
823        ldrh            w8,  [x9, x8, lsl #1]
824        sub             x9,  x9,  w8, uxtw
825        br              x9
826
8272:
828        ld1             {v0.h}[0], [x2], x3
829        ld1             {v1.h}[0], [x2], x3
830        subs            w5,  w5,  #2
831        st1             {v0.h}[0], [x0], x1
832        st1             {v1.h}[0], [x0], x1
833        b.gt            2b
834        ret
8354:
836        ld1             {v0.s}[0], [x2], x3
837        ld1             {v1.s}[0], [x2], x3
838        subs            w5,  w5,  #2
839        st1             {v0.s}[0], [x0], x1
840        st1             {v1.s}[0], [x0], x1
841        b.gt            4b
842        ret
8438:
844        ld1             {v0.8b}, [x2], x3
845        ld1             {v1.8b}, [x2], x3
846        subs            w5,  w5,  #2
847        st1             {v0.8b}, [x0], x1
848        st1             {v1.8b}, [x0], x1
849        b.gt            8b
850        ret
851160:
852        add             x8,  x0,  x1
853        lsl             x1,  x1,  #1
854        add             x9,  x2,  x3
855        lsl             x3,  x3,  #1
85616:
857        ld1             {v0.16b}, [x2], x3
858        ld1             {v1.16b}, [x9], x3
859        subs            w5,  w5,  #2
860        st1             {v0.16b}, [x0], x1
861        st1             {v1.16b}, [x8], x1
862        b.gt            16b
863        ret
86432:
865        ldp             x6,  x7,  [x2]
866        ldp             x8,  x9,  [x2, #16]
867        stp             x6,  x7,  [x0]
868        subs            w5,  w5,  #1
869        stp             x8,  x9,  [x0, #16]
870        add             x2,  x2,  x3
871        add             x0,  x0,  x1
872        b.gt            32b
873        ret
87464:
875        ldp             x6,  x7,  [x2]
876        ldp             x8,  x9,  [x2, #16]
877        stp             x6,  x7,  [x0]
878        ldp             x10, x11, [x2, #32]
879        stp             x8,  x9,  [x0, #16]
880        subs            w5,  w5,  #1
881        ldp             x12, x13, [x2, #48]
882        stp             x10, x11, [x0, #32]
883        stp             x12, x13, [x0, #48]
884        add             x2,  x2,  x3
885        add             x0,  x0,  x1
886        b.gt            64b
887        ret
888128:
889        ldp             q0,  q1,  [x2]
890        ldp             q2,  q3,  [x2, #32]
891        stp             q0,  q1,  [x0]
892        ldp             q4,  q5,  [x2, #64]
893        stp             q2,  q3,  [x0, #32]
894        ldp             q6,  q7,  [x2, #96]
895        subs            w5,  w5,  #1
896        stp             q4,  q5,  [x0, #64]
897        stp             q6,  q7,  [x0, #96]
898        add             x2,  x2,  x3
899        add             x0,  x0,  x1
900        b.gt            128b
901        ret
902
903L(put_tbl):
904        .hword L(put_tbl) - 128b
905        .hword L(put_tbl) -  64b
906        .hword L(put_tbl) -  32b
907        .hword L(put_tbl) - 160b
908        .hword L(put_tbl) -   8b
909        .hword L(put_tbl) -   4b
910        .hword L(put_tbl) -   2b
911endfunc
912
913
914// This has got the same signature as the prep_8tap functions,
915// and assumes that x8 is set to (clz(w)-24), and x7 to w*2.
916function prep_neon
917        adr             x9,  L(prep_tbl)
918        ldrh            w8,  [x9, x8, lsl #1]
919        sub             x9,  x9,  w8, uxtw
920        br              x9
921
9224:
923        ld1             {v0.s}[0], [x1], x2
924        ld1             {v1.s}[0], [x1], x2
925        subs            w4,  w4,  #2
926        ushll           v0.8h, v0.8b, #4
927        ushll           v1.8h, v1.8b, #4
928        st1             {v0.4h, v1.4h}, [x0], #16
929        b.gt            4b
930        ret
9318:
932        ld1             {v0.8b}, [x1], x2
933        ld1             {v1.8b}, [x1], x2
934        subs            w4,  w4,  #2
935        ushll           v0.8h, v0.8b, #4
936        ushll           v1.8h, v1.8b, #4
937        st1             {v0.8h, v1.8h}, [x0], #32
938        b.gt            8b
939        ret
940160:
941        add             x9,  x1,  x2
942        lsl             x2,  x2,  #1
94316:
944        ld1             {v0.16b}, [x1], x2
945        ld1             {v1.16b}, [x9], x2
946        subs            w4,  w4,  #2
947        ushll           v4.8h, v0.8b,  #4
948        ushll2          v5.8h, v0.16b, #4
949        ushll           v6.8h, v1.8b,  #4
950        ushll2          v7.8h, v1.16b, #4
951        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
952        b.gt            16b
953        ret
954320:
955        add             x8,  x0,  w3, uxtw
95632:
957        ld1             {v0.16b, v1.16b},  [x1], x2
958        subs            w4,  w4,  #2
959        ushll           v4.8h,  v0.8b,  #4
960        ushll2          v5.8h,  v0.16b, #4
961        ld1             {v2.16b, v3.16b},  [x1], x2
962        ushll           v6.8h,  v1.8b,  #4
963        ushll2          v7.8h,  v1.16b, #4
964        ushll           v16.8h, v2.8b,  #4
965        st1             {v4.8h,  v5.8h},  [x0], x7
966        ushll2          v17.8h, v2.16b, #4
967        st1             {v6.8h,  v7.8h},  [x8], x7
968        ushll           v18.8h, v3.8b,  #4
969        st1             {v16.8h, v17.8h}, [x0], x7
970        ushll2          v19.8h, v3.16b, #4
971        st1             {v18.8h, v19.8h}, [x8], x7
972        b.gt            32b
973        ret
974640:
975        add             x8,  x0,  #32
976        mov             x6,  #64
97764:
978        ldp             q0,  q1,  [x1]
979        subs            w4,  w4,  #1
980        ushll           v4.8h,  v0.8b,  #4
981        ushll2          v5.8h,  v0.16b, #4
982        ldp             q2,  q3,  [x1, #32]
983        ushll           v6.8h,  v1.8b,  #4
984        ushll2          v7.8h,  v1.16b, #4
985        add             x1,  x1,  x2
986        ushll           v16.8h, v2.8b,  #4
987        st1             {v4.8h,  v5.8h},  [x0], x6
988        ushll2          v17.8h, v2.16b, #4
989        ushll           v18.8h, v3.8b,  #4
990        st1             {v6.8h,  v7.8h},  [x8], x6
991        ushll2          v19.8h, v3.16b, #4
992        st1             {v16.8h, v17.8h}, [x0], x6
993        st1             {v18.8h, v19.8h}, [x8], x6
994        b.gt            64b
995        ret
9961280:
997        add             x8,  x0,  #64
998        mov             x6,  #128
999128:
1000        ldp             q0,  q1,  [x1]
1001        ldp             q2,  q3,  [x1, #32]
1002        ushll           v16.8h,  v0.8b,  #4
1003        ushll2          v17.8h,  v0.16b, #4
1004        ushll           v18.8h,  v1.8b,  #4
1005        ushll2          v19.8h,  v1.16b, #4
1006        ushll           v20.8h,  v2.8b,  #4
1007        ushll2          v21.8h,  v2.16b, #4
1008        ldp             q4,  q5,  [x1, #64]
1009        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x6
1010        ushll           v22.8h,  v3.8b,  #4
1011        ushll2          v23.8h,  v3.16b, #4
1012        ushll           v24.8h,  v4.8b,  #4
1013        ushll2          v25.8h,  v4.16b, #4
1014        ushll           v26.8h,  v5.8b,  #4
1015        ushll2          v27.8h,  v5.16b, #4
1016        ldp             q6,  q7,  [x1, #96]
1017        st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x8], x6
1018        ushll           v28.8h,  v6.8b,  #4
1019        ushll2          v29.8h,  v6.16b, #4
1020        ushll           v30.8h,  v7.8b,  #4
1021        ushll2          v31.8h,  v7.16b, #4
1022        subs            w4,  w4,  #1
1023        add             x1,  x1,  x2
1024        st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], x6
1025        st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x6
1026        b.gt            128b
1027        ret
1028
1029L(prep_tbl):
1030        .hword L(prep_tbl) - 1280b
1031        .hword L(prep_tbl) -  640b
1032        .hword L(prep_tbl) -  320b
1033        .hword L(prep_tbl) -  160b
1034        .hword L(prep_tbl) -    8b
1035        .hword L(prep_tbl) -    4b
1036endfunc
1037
1038
1039.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
1040        ld1             {\d0\wd}[0], [\s0], \strd
1041        ld1             {\d1\wd}[0], [\s1], \strd
1042.ifnb \d2
1043        ld1             {\d2\wd}[0], [\s0], \strd
1044        ld1             {\d3\wd}[0], [\s1], \strd
1045.endif
1046.ifnb \d4
1047        ld1             {\d4\wd}[0], [\s0], \strd
1048.endif
1049.ifnb \d5
1050        ld1             {\d5\wd}[0], [\s1], \strd
1051.endif
1052.ifnb \d6
1053        ld1             {\d6\wd}[0], [\s0], \strd
1054.endif
1055.endm
1056.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
1057        ld1             {\d0\wd}, [\s0], \strd
1058        ld1             {\d1\wd}, [\s1], \strd
1059.ifnb \d2
1060        ld1             {\d2\wd}, [\s0], \strd
1061        ld1             {\d3\wd}, [\s1], \strd
1062.endif
1063.ifnb \d4
1064        ld1             {\d4\wd}, [\s0], \strd
1065.endif
1066.ifnb \d5
1067        ld1             {\d5\wd}, [\s1], \strd
1068.endif
1069.ifnb \d6
1070        ld1             {\d6\wd}, [\s0], \strd
1071.endif
1072.endm
1073.macro load_h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
1074        load_slice      \s0, \s1, \strd, .h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
1075.endm
1076.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
1077        load_slice      \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6
1078.endm
1079.macro load_8b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
1080        load_reg        \s0, \s1, \strd, .8b, \d0, \d1, \d2, \d3, \d4, \d5, \d6
1081.endm
1082.macro load_16b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
1083        load_reg        \s0, \s1, \strd, .16b, \d0, \d1, \d2, \d3, \d4, \d5, \d6
1084.endm
1085.macro interleave_1 wd, r0, r1, r2, r3, r4
1086        trn1            \r0\wd, \r0\wd, \r1\wd
1087        trn1            \r1\wd, \r1\wd, \r2\wd
1088.ifnb \r3
1089        trn1            \r2\wd, \r2\wd, \r3\wd
1090        trn1            \r3\wd, \r3\wd, \r4\wd
1091.endif
1092.endm
1093.macro interleave_1_h r0, r1, r2, r3, r4
1094        interleave_1    .4h, \r0, \r1, \r2, \r3, \r4
1095.endm
1096.macro interleave_1_s r0, r1, r2, r3, r4
1097        interleave_1    .2s, \r0, \r1, \r2, \r3, \r4
1098.endm
1099.macro interleave_2 wd, r0, r1, r2, r3, r4, r5
1100        trn1            \r0\wd,  \r0\wd, \r2\wd
1101        trn1            \r1\wd,  \r1\wd, \r3\wd
1102        trn1            \r2\wd,  \r2\wd, \r4\wd
1103        trn1            \r3\wd,  \r3\wd, \r5\wd
1104.endm
1105.macro interleave_2_s r0, r1, r2, r3, r4, r5
1106        interleave_2    .2s, \r0, \r1, \r2, \r3, \r4, \r5
1107.endm
1108.macro uxtl_b r0, r1, r2, r3, r4, r5, r6
1109        uxtl            \r0\().8h, \r0\().8b
1110        uxtl            \r1\().8h, \r1\().8b
1111.ifnb \r2
1112        uxtl            \r2\().8h, \r2\().8b
1113        uxtl            \r3\().8h, \r3\().8b
1114.endif
1115.ifnb \r4
1116        uxtl            \r4\().8h, \r4\().8b
1117.endif
1118.ifnb \r5
1119        uxtl            \r5\().8h, \r5\().8b
1120.endif
1121.ifnb \r6
1122        uxtl            \r6\().8h, \r6\().8b
1123.endif
1124.endm
1125.macro mul_mla_4 d, s0, s1, s2, s3, wd
1126        mul             \d\wd,  \s0\wd,  v0.h[0]
1127        mla             \d\wd,  \s1\wd,  v0.h[1]
1128        mla             \d\wd,  \s2\wd,  v0.h[2]
1129        mla             \d\wd,  \s3\wd,  v0.h[3]
1130.endm
1131// Interleaving the mul/mla chains actually hurts performance
1132// significantly on Cortex A53, thus keeping mul/mla tightly
1133// chained like this.
1134.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
1135        mul             \d0\().8h, \s0\().8h, v0.h[0]
1136        mla             \d0\().8h, \s1\().8h, v0.h[1]
1137        mla             \d0\().8h, \s2\().8h, v0.h[2]
1138        mla             \d0\().8h, \s3\().8h, v0.h[3]
1139        mla             \d0\().8h, \s4\().8h, v0.h[4]
1140        mla             \d0\().8h, \s5\().8h, v0.h[5]
1141        mla             \d0\().8h, \s6\().8h, v0.h[6]
1142        mla             \d0\().8h, \s7\().8h, v0.h[7]
1143        mul             \d1\().8h, \s1\().8h, v0.h[0]
1144        mla             \d1\().8h, \s2\().8h, v0.h[1]
1145        mla             \d1\().8h, \s3\().8h, v0.h[2]
1146        mla             \d1\().8h, \s4\().8h, v0.h[3]
1147        mla             \d1\().8h, \s5\().8h, v0.h[4]
1148        mla             \d1\().8h, \s6\().8h, v0.h[5]
1149        mla             \d1\().8h, \s7\().8h, v0.h[6]
1150        mla             \d1\().8h, \s8\().8h, v0.h[7]
1151.endm
1152.macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
1153        mul             \d0\().8h, \s0\().8h, v0.h[0]
1154        mla             \d0\().8h, \s1\().8h, v0.h[1]
1155        mla             \d0\().8h, \s2\().8h, v0.h[2]
1156        mla             \d0\().8h, \s3\().8h, v0.h[3]
1157        mla             \d0\().8h, \s4\().8h, v0.h[4]
1158        mla             \d0\().8h, \s5\().8h, v0.h[5]
1159        mla             \d0\().8h, \s6\().8h, v0.h[6]
1160        mla             \d0\().8h, \s7\().8h, v0.h[7]
1161        mul             \d1\().8h, \s2\().8h, v0.h[0]
1162        mla             \d1\().8h, \s3\().8h, v0.h[1]
1163        mla             \d1\().8h, \s4\().8h, v0.h[2]
1164        mla             \d1\().8h, \s5\().8h, v0.h[3]
1165        mla             \d1\().8h, \s6\().8h, v0.h[4]
1166        mla             \d1\().8h, \s7\().8h, v0.h[5]
1167        mla             \d1\().8h, \s8\().8h, v0.h[6]
1168        mla             \d1\().8h, \s9\().8h, v0.h[7]
1169.endm
1170.macro mul_mla_8_4 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11
1171        mul             \d0\().8h, \s0\().8h,  v0.h[0]
1172        mla             \d0\().8h, \s1\().8h,  v0.h[1]
1173        mla             \d0\().8h, \s2\().8h,  v0.h[2]
1174        mla             \d0\().8h, \s3\().8h,  v0.h[3]
1175        mla             \d0\().8h, \s4\().8h,  v0.h[4]
1176        mla             \d0\().8h, \s5\().8h,  v0.h[5]
1177        mla             \d0\().8h, \s6\().8h,  v0.h[6]
1178        mla             \d0\().8h, \s7\().8h,  v0.h[7]
1179        mul             \d1\().8h, \s4\().8h,  v0.h[0]
1180        mla             \d1\().8h, \s5\().8h,  v0.h[1]
1181        mla             \d1\().8h, \s6\().8h,  v0.h[2]
1182        mla             \d1\().8h, \s7\().8h,  v0.h[3]
1183        mla             \d1\().8h, \s8\().8h,  v0.h[4]
1184        mla             \d1\().8h, \s9\().8h,  v0.h[5]
1185        mla             \d1\().8h, \s10\().8h, v0.h[6]
1186        mla             \d1\().8h, \s11\().8h, v0.h[7]
1187.endm
1188.macro sqrshrun_b shift, r0, r1, r2, r3
1189        sqrshrun        \r0\().8b, \r0\().8h,  #\shift
1190.ifnb \r1
1191        sqrshrun        \r1\().8b, \r1\().8h,  #\shift
1192.endif
1193.ifnb \r2
1194        sqrshrun        \r2\().8b, \r2\().8h,  #\shift
1195        sqrshrun        \r3\().8b, \r3\().8h,  #\shift
1196.endif
1197.endm
1198.macro srshr_h shift, r0, r1, r2, r3
1199        srshr           \r0\().8h, \r0\().8h,  #\shift
1200.ifnb \r1
1201        srshr           \r1\().8h, \r1\().8h,  #\shift
1202.endif
1203.ifnb \r2
1204        srshr           \r2\().8h, \r2\().8h,  #\shift
1205        srshr           \r3\().8h, \r3\().8h,  #\shift
1206.endif
1207.endm
1208.macro st_h strd, reg, lanes
1209        st1             {\reg\().h}[0], [x0], \strd
1210        st1             {\reg\().h}[1], [x8], \strd
1211.if \lanes > 2
1212        st1             {\reg\().h}[2], [x0], \strd
1213        st1             {\reg\().h}[3], [x8], \strd
1214.endif
1215.endm
1216.macro st_s strd, r0, r1
1217        st1             {\r0\().s}[0], [x0], \strd
1218        st1             {\r0\().s}[1], [x8], \strd
1219.ifnb \r1
1220        st1             {\r1\().s}[0], [x0], \strd
1221        st1             {\r1\().s}[1], [x8], \strd
1222.endif
1223.endm
1224.macro st_d strd, r0, r1
1225        st1             {\r0\().d}[0], [x0], \strd
1226        st1             {\r0\().d}[1], [x8], \strd
1227.ifnb \r1
1228        st1             {\r1\().d}[0], [x0], \strd
1229        st1             {\r1\().d}[1], [x8], \strd
1230.endif
1231.endm
1232.macro shift_store_4 type, strd, r0, r1
1233.ifc \type, put
1234        sqrshrun_b      6,     \r0, \r1
1235        st_s            \strd, \r0, \r1
1236.else
1237        srshr_h         2,     \r0, \r1
1238        st_d            \strd, \r0, \r1
1239.endif
1240.endm
1241.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7
1242        st1             {\r0\wd}, [x0], \strd
1243        st1             {\r1\wd}, [x8], \strd
1244.ifnb \r2
1245        st1             {\r2\wd}, [x0], \strd
1246        st1             {\r3\wd}, [x8], \strd
1247.endif
1248.ifnb \r4
1249        st1             {\r4\wd}, [x0], \strd
1250        st1             {\r5\wd}, [x8], \strd
1251        st1             {\r6\wd}, [x0], \strd
1252        st1             {\r7\wd}, [x8], \strd
1253.endif
1254.endm
1255.macro st_8b strd, r0, r1, r2, r3, r4, r5, r6, r7
1256        st_reg          \strd, .8b,  \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
1257.endm
1258.macro st_16b strd, r0, r1, r2, r3, r4, r5, r6, r7
1259        st_reg          \strd, .16b, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
1260.endm
1261.macro shift_store_8 type, strd, r0, r1, r2, r3
1262.ifc \type, put
1263        sqrshrun_b      6,     \r0, \r1, \r2, \r3
1264        st_8b           \strd, \r0, \r1, \r2, \r3
1265.else
1266        srshr_h         2,     \r0, \r1, \r2, \r3
1267        st_16b          \strd, \r0, \r1, \r2, \r3
1268.endif
1269.endm
1270.macro shift_store_16 type, strd, r0, r1, r2, r3
1271.ifc \type, put
1272        sqrshrun        \r0\().8b,  \r0\().8h, #6
1273        sqrshrun2       \r0\().16b, \r1\().8h, #6
1274        sqrshrun        \r2\().8b,  \r2\().8h, #6
1275        sqrshrun2       \r2\().16b, \r3\().8h, #6
1276        st_16b          \strd, \r0, \r2
1277.else
1278        srshr_h         2,     \r0, \r1, \r2, \r3
1279        st1             {\r0\().8h, \r1\().8h}, [x0], \strd
1280        st1             {\r2\().8h, \r3\().8h}, [x8], \strd
1281.endif
1282.endm
1283
1284.macro make_8tap_fn op, type, type_h, type_v
1285function \op\()_8tap_\type\()_8bpc_neon, export=1
1286        mov             x8,  \type_h
1287        mov             x9,  \type_v
1288        b               \op\()_8tap_neon
1289endfunc
1290.endm
1291
1292// No spaces in these expressions, due to gas-preprocessor.
1293#define REGULAR ((0*15<<7)|3*15)
1294#define SMOOTH  ((1*15<<7)|4*15)
1295#define SHARP   ((2*15<<7)|3*15)
1296
1297.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv
1298make_8tap_fn \type, regular,        REGULAR, REGULAR
1299make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
1300make_8tap_fn \type, regular_sharp,  REGULAR, SHARP
1301make_8tap_fn \type, smooth,         SMOOTH,  SMOOTH
1302make_8tap_fn \type, smooth_regular, SMOOTH,  REGULAR
1303make_8tap_fn \type, smooth_sharp,   SMOOTH,  SHARP
1304make_8tap_fn \type, sharp,          SHARP,   SHARP
1305make_8tap_fn \type, sharp_regular,  SHARP,   REGULAR
1306make_8tap_fn \type, sharp_smooth,   SHARP,   SMOOTH
1307
1308function \type\()_8tap_neon
1309        mov             w10,  #0x4081  // (1 << 14) | (1 << 7) | (1 << 0)
1310        mul             \mx,  \mx, w10
1311        mul             \my,  \my, w10
1312        add             \mx,  \mx, w8 // mx, 8tap_h, 4tap_h
1313        add             \my,  \my, w9 // my, 8tap_v, 4tap_v
1314.ifc \type, prep
1315        uxtw            \d_strd, \w
1316        lsl             \d_strd, \d_strd, #1
1317.endif
1318
1319        clz             w8,  \w
1320        tst             \mx, #(0x7f << 14)
1321        sub             w8,  w8,  #24
1322        movrel          x10, X(mc_subpel_filters), -8
1323        b.ne            L(\type\()_8tap_h)
1324        tst             \my, #(0x7f << 14)
1325        b.ne            L(\type\()_8tap_v)
1326        b               \type\()_neon
1327
1328L(\type\()_8tap_h):
1329        cmp             \w,  #4
1330        ubfx            w9,  \mx, #7, #7
1331        and             \mx, \mx, #0x7f
1332        b.le            4f
1333        mov             \mx,  w9
13344:
1335        tst             \my,  #(0x7f << 14)
1336        add             \xmx, x10, \mx, uxtw #3
1337        b.ne            L(\type\()_8tap_hv)
1338
1339        adr             x9,  L(\type\()_8tap_h_tbl)
1340        ldrh            w8,  [x9, x8, lsl #1]
1341        sub             x9,  x9,  w8, uxtw
1342        br              x9
1343
134420:     // 2xN h
1345.ifc \type, put
1346        add             \xmx,  \xmx,  #2
1347        ld1             {v0.s}[0], [\xmx]
1348        sub             \src,  \src,  #1
1349        add             \ds2,  \dst,  \d_strd
1350        add             \sr2,  \src,  \s_strd
1351        lsl             \d_strd,  \d_strd,  #1
1352        lsl             \s_strd,  \s_strd,  #1
1353        sxtl            v0.8h,  v0.8b
13542:
1355        ld1             {v4.8b},  [\src], \s_strd
1356        ld1             {v6.8b},  [\sr2], \s_strd
1357        uxtl            v4.8h,  v4.8b
1358        uxtl            v6.8h,  v6.8b
1359        ext             v5.16b, v4.16b, v4.16b, #2
1360        ext             v7.16b, v6.16b, v6.16b, #2
1361        subs            \h,  \h,  #2
1362        trn1            v3.2s,  v4.2s,  v6.2s
1363        trn2            v6.2s,  v4.2s,  v6.2s
1364        trn1            v4.2s,  v5.2s,  v7.2s
1365        trn2            v7.2s,  v5.2s,  v7.2s
1366        mul             v3.4h,  v3.4h,  v0.h[0]
1367        mla             v3.4h,  v4.4h,  v0.h[1]
1368        mla             v3.4h,  v6.4h,  v0.h[2]
1369        mla             v3.4h,  v7.4h,  v0.h[3]
1370        srshr           v3.4h,  v3.4h,  #2
1371        sqrshrun        v3.8b,  v3.8h,  #4
1372        st1             {v3.h}[0], [\dst], \d_strd
1373        st1             {v3.h}[1], [\ds2], \d_strd
1374        b.gt            2b
1375        ret
1376.endif
1377
137840:     // 4xN h
1379        add             \xmx,  \xmx,  #2
1380        ld1             {v0.s}[0], [\xmx]
1381        sub             \src,  \src,  #1
1382        add             \ds2,  \dst,  \d_strd
1383        add             \sr2,  \src,  \s_strd
1384        lsl             \d_strd,  \d_strd,  #1
1385        lsl             \s_strd,  \s_strd,  #1
1386        sxtl            v0.8h,  v0.8b
13874:
1388        ld1             {v16.8b}, [\src], \s_strd
1389        ld1             {v20.8b}, [\sr2], \s_strd
1390        uxtl            v16.8h,  v16.8b
1391        uxtl            v20.8h,  v20.8b
1392        ext             v17.16b, v16.16b, v16.16b, #2
1393        ext             v18.16b, v16.16b, v16.16b, #4
1394        ext             v19.16b, v16.16b, v16.16b, #6
1395        ext             v21.16b, v20.16b, v20.16b, #2
1396        ext             v22.16b, v20.16b, v20.16b, #4
1397        ext             v23.16b, v20.16b, v20.16b, #6
1398        subs            \h,  \h,  #2
1399        mul             v16.4h,  v16.4h,  v0.h[0]
1400        mla             v16.4h,  v17.4h,  v0.h[1]
1401        mla             v16.4h,  v18.4h,  v0.h[2]
1402        mla             v16.4h,  v19.4h,  v0.h[3]
1403        mul             v20.4h,  v20.4h,  v0.h[0]
1404        mla             v20.4h,  v21.4h,  v0.h[1]
1405        mla             v20.4h,  v22.4h,  v0.h[2]
1406        mla             v20.4h,  v23.4h,  v0.h[3]
1407        srshr           v16.4h,  v16.4h,  #2
1408        srshr           v20.4h,  v20.4h,  #2
1409.ifc \type, put
1410        sqrshrun        v16.8b,  v16.8h,  #4
1411        sqrshrun        v20.8b,  v20.8h,  #4
1412        st1             {v16.s}[0], [\dst], \d_strd
1413        st1             {v20.s}[0], [\ds2], \d_strd
1414.else
1415        st1             {v16.4h}, [\dst], \d_strd
1416        st1             {v20.4h}, [\ds2], \d_strd
1417.endif
1418        b.gt            4b
1419        ret
1420
142180:     // 8xN h
1422        ld1             {v0.8b}, [\xmx]
1423        sub             \src,  \src,  #3
1424        add             \ds2,  \dst,  \d_strd
1425        add             \sr2,  \src,  \s_strd
1426        lsl             \d_strd,  \d_strd,  #1
1427        lsl             \s_strd,  \s_strd,  #1
1428        sxtl            v0.8h, v0.8b
14298:
1430        ld1             {v16.8b, v17.8b},  [\src], \s_strd
1431        ld1             {v20.8b, v21.8b},  [\sr2], \s_strd
1432        uxtl            v16.8h,  v16.8b
1433        uxtl            v17.8h,  v17.8b
1434        uxtl            v20.8h,  v20.8b
1435        uxtl            v21.8h,  v21.8b
1436
1437        mul             v18.8h,  v16.8h,  v0.h[0]
1438        mul             v22.8h,  v20.8h,  v0.h[0]
1439.irpc i, 1234567
1440        ext             v19.16b, v16.16b, v17.16b, #(2*\i)
1441        ext             v23.16b, v20.16b, v21.16b, #(2*\i)
1442        mla             v18.8h,  v19.8h,  v0.h[\i]
1443        mla             v22.8h,  v23.8h,  v0.h[\i]
1444.endr
1445        subs            \h,  \h,  #2
1446        srshr           v18.8h,  v18.8h, #2
1447        srshr           v22.8h,  v22.8h, #2
1448.ifc \type, put
1449        sqrshrun        v18.8b,  v18.8h, #4
1450        sqrshrun        v22.8b,  v22.8h, #4
1451        st1             {v18.8b}, [\dst], \d_strd
1452        st1             {v22.8b}, [\ds2], \d_strd
1453.else
1454        st1             {v18.8h}, [\dst], \d_strd
1455        st1             {v22.8h}, [\ds2], \d_strd
1456.endif
1457        b.gt            8b
1458        ret
1459160:
1460320:
1461640:
14621280:   // 16xN, 32xN, ... h
1463        ld1             {v0.8b}, [\xmx]
1464        sub             \src,  \src,  #3
1465        add             \ds2,  \dst,  \d_strd
1466        add             \sr2,  \src,  \s_strd
1467        lsl             \s_strd,  \s_strd,  #1
1468        sxtl            v0.8h, v0.8b
1469
1470        sub             \s_strd,  \s_strd,  \w, uxtw
1471        sub             \s_strd,  \s_strd,  #8
1472.ifc \type, put
1473        lsl             \d_strd,  \d_strd,  #1
1474        sub             \d_strd,  \d_strd,  \w, uxtw
1475.endif
1476161:
1477        ld1             {v16.8b, v17.8b, v18.8b},  [\src], #24
1478        ld1             {v20.8b, v21.8b, v22.8b},  [\sr2], #24
1479        mov             \mx, \w
1480        uxtl            v16.8h,  v16.8b
1481        uxtl            v17.8h,  v17.8b
1482        uxtl            v18.8h,  v18.8b
1483        uxtl            v20.8h,  v20.8b
1484        uxtl            v21.8h,  v21.8b
1485        uxtl            v22.8h,  v22.8b
1486
148716:
1488        mul             v24.8h,  v16.8h,  v0.h[0]
1489        mul             v25.8h,  v17.8h,  v0.h[0]
1490        mul             v26.8h,  v20.8h,  v0.h[0]
1491        mul             v27.8h,  v21.8h,  v0.h[0]
1492.irpc i, 1234567
1493        ext             v28.16b, v16.16b, v17.16b, #(2*\i)
1494        ext             v29.16b, v17.16b, v18.16b, #(2*\i)
1495        ext             v30.16b, v20.16b, v21.16b, #(2*\i)
1496        ext             v31.16b, v21.16b, v22.16b, #(2*\i)
1497        mla             v24.8h,  v28.8h,  v0.h[\i]
1498        mla             v25.8h,  v29.8h,  v0.h[\i]
1499        mla             v26.8h,  v30.8h,  v0.h[\i]
1500        mla             v27.8h,  v31.8h,  v0.h[\i]
1501.endr
1502        srshr           v24.8h,  v24.8h, #2
1503        srshr           v25.8h,  v25.8h, #2
1504        srshr           v26.8h,  v26.8h, #2
1505        srshr           v27.8h,  v27.8h, #2
1506        subs            \mx, \mx, #16
1507.ifc \type, put
1508        sqrshrun        v24.8b,  v24.8h, #4
1509        sqrshrun2       v24.16b, v25.8h, #4
1510        sqrshrun        v26.8b,  v26.8h, #4
1511        sqrshrun2       v26.16b, v27.8h, #4
1512        st1             {v24.16b}, [\dst], #16
1513        st1             {v26.16b}, [\ds2], #16
1514.else
1515        st1             {v24.8h, v25.8h}, [\dst], #32
1516        st1             {v26.8h, v27.8h}, [\ds2], #32
1517.endif
1518        b.le            9f
1519
1520        mov             v16.16b, v18.16b
1521        mov             v20.16b, v22.16b
1522        ld1             {v17.8b, v18.8b}, [\src], #16
1523        ld1             {v21.8b, v22.8b}, [\sr2], #16
1524        uxtl            v17.8h,  v17.8b
1525        uxtl            v18.8h,  v18.8b
1526        uxtl            v21.8h,  v21.8b
1527        uxtl            v22.8h,  v22.8b
1528        b               16b
1529
15309:
1531        add             \dst,  \dst,  \d_strd
1532        add             \ds2,  \ds2,  \d_strd
1533        add             \src,  \src,  \s_strd
1534        add             \sr2,  \sr2,  \s_strd
1535
1536        subs            \h,  \h,  #2
1537        b.gt            161b
1538        ret
1539
1540L(\type\()_8tap_h_tbl):
1541        .hword L(\type\()_8tap_h_tbl) - 1280b
1542        .hword L(\type\()_8tap_h_tbl) -  640b
1543        .hword L(\type\()_8tap_h_tbl) -  320b
1544        .hword L(\type\()_8tap_h_tbl) -  160b
1545        .hword L(\type\()_8tap_h_tbl) -   80b
1546        .hword L(\type\()_8tap_h_tbl) -   40b
1547        .hword L(\type\()_8tap_h_tbl) -   20b
1548        .hword 0
1549
1550
1551L(\type\()_8tap_v):
1552        cmp             \h,  #4
1553        ubfx            w9,  \my, #7, #7
1554        and             \my, \my, #0x7f
1555        b.le            4f
1556        mov             \my, w9
15574:
1558        add             \xmy, x10, \my, uxtw #3
1559
1560        adr             x9,  L(\type\()_8tap_v_tbl)
1561        ldrh            w8,  [x9, x8, lsl #1]
1562        sub             x9,  x9,  w8, uxtw
1563        br              x9
1564
156520:     // 2xN v
1566.ifc \type, put
1567        b.gt            28f
1568
1569        cmp             \h,  #2
1570        add             \xmy, \xmy, #2
1571        ld1             {v0.s}[0], [\xmy]
1572        sub             \src,  \src,  \s_strd
1573        add             \ds2,  \dst,  \d_strd
1574        add             \sr2,  \src,  \s_strd
1575        lsl             \s_strd,  \s_strd,  #1
1576        lsl             \d_strd,  \d_strd,  #1
1577        sxtl            v0.8h, v0.8b
1578
1579        // 2x2 v
1580        load_h          \src, \sr2, \s_strd, v1, v2, v3, v4, v5
1581        interleave_1_h  v1, v2, v3, v4, v5
1582        b.gt            24f
1583        uxtl_b          v1, v2, v3, v4
1584        mul_mla_4       v6, v1, v2, v3, v4, .4h
1585        sqrshrun_b      6,  v6
1586        st_h            \d_strd, v6, 2
1587        ret
1588
158924:     // 2x4 v
1590        load_h          \sr2, \src, \s_strd, v6, v7
1591        interleave_1_h  v5, v6, v7
1592        interleave_2_s  v1, v2, v3, v4, v5, v6
1593        uxtl_b          v1, v2, v3, v4
1594        mul_mla_4       v6, v1, v2, v3, v4, .8h
1595        sqrshrun_b      6,  v6
1596        st_h            \d_strd, v6, 4
1597        ret
1598
159928:     // 2x8, 2x16 v
1600        ld1             {v0.8b}, [\xmy]
1601        sub             \sr2,  \src,  \s_strd, lsl #1
1602        add             \ds2,  \dst,  \d_strd
1603        sub             \src,  \sr2,  \s_strd
1604        lsl             \d_strd,  \d_strd,  #1
1605        lsl             \s_strd,  \s_strd,  #1
1606        sxtl            v0.8h, v0.8b
1607
1608        load_h          \src, \sr2, \s_strd, v1,  v2,  v3,  v4, v5, v6, v7
1609        interleave_1_h  v1,  v2,  v3,  v4,  v5
1610        interleave_1_h  v5,  v6,  v7
1611        interleave_2_s  v1,  v2,  v3,  v4,  v5,  v6
1612        uxtl_b          v1,  v2,  v3,  v4
1613216:
1614        subs            \h,  \h,  #8
1615        load_h          \sr2, \src, \s_strd, v16, v17, v18, v19
1616        load_h          \sr2, \src, \s_strd, v20, v21, v22, v23
1617        interleave_1_h  v7,  v16, v17, v18, v19
1618        interleave_1_h  v19, v20, v21, v22, v23
1619        interleave_2_s  v5,  v6,  v7,  v16, v17, v18
1620        interleave_2_s  v17, v18, v19, v20, v21, v22
1621        uxtl_b          v5,  v6,  v7,  v16
1622        uxtl_b          v17, v18, v19, v20
1623        mul_mla_8_4     v30, v31, v1,  v2,  v3,  v4,  v5,  v6,  v7,  v16, v17, v18, v19, v20
1624        sqrshrun_b      6,   v30, v31
1625        st_h            \d_strd, v30, 4
1626        st_h            \d_strd, v31, 4
1627        b.le            0f
1628        mov             v1.16b,  v17.16b
1629        mov             v2.16b,  v18.16b
1630        mov             v3.16b,  v19.16b
1631        mov             v4.16b,  v20.16b
1632        mov             v5.16b,  v21.16b
1633        mov             v6.16b,  v22.16b
1634        mov             v7.16b,  v23.16b
1635        b               216b
16360:
1637        ret
1638.endif
1639
164040:
1641        b.gt            480f
1642
1643        // 4x2, 4x4 v
1644        cmp             \h,  #2
1645        add             \xmy, \xmy, #2
1646        ld1             {v0.s}[0], [\xmy]
1647        sub             \src, \src, \s_strd
1648        add             \ds2, \dst, \d_strd
1649        add             \sr2, \src, \s_strd
1650        lsl             \s_strd, \s_strd, #1
1651        lsl             \d_strd, \d_strd, #1
1652        sxtl            v0.8h, v0.8b
1653
1654        load_s          \src, \sr2, \s_strd, v1, v2, v3, v4, v5
1655        interleave_1_s  v1, v2, v3, v4, v5
1656        uxtl_b          v1, v2, v3, v4
1657        mul_mla_4       v6, v1, v2, v3, v4, .8h
1658        shift_store_4   \type, \d_strd, v6
1659        b.le            0f
1660        load_s          \sr2, \src, \s_strd, v6, v7
1661        interleave_1_s  v5, v6, v7
1662        uxtl_b          v5, v6
1663        mul_mla_4       v7, v3, v4, v5, v6, .8h
1664        shift_store_4   \type, \d_strd, v7
16650:
1666        ret
1667
1668480:    // 4x8, 4x16 v
1669        ld1             {v0.8b}, [\xmy]
1670        sub             \sr2, \src, \s_strd, lsl #1
1671        add             \ds2, \dst, \d_strd
1672        sub             \src, \sr2, \s_strd
1673        lsl             \s_strd, \s_strd, #1
1674        lsl             \d_strd, \d_strd, #1
1675        sxtl            v0.8h, v0.8b
1676
1677        load_s          \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
1678        interleave_1_s  v16, v17, v18
1679        interleave_1_s  v18, v19, v20, v21, v22
1680        uxtl_b          v16, v17
1681        uxtl_b          v18, v19, v20, v21
1682
168348:
1684        subs            \h,  \h,  #4
1685        load_s          \sr2, \src, \s_strd, v23, v24, v25, v26
1686        interleave_1_s  v22, v23, v24, v25, v26
1687        uxtl_b          v22, v23, v24, v25
1688        mul_mla_8_2     v1,  v2,  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
1689        shift_store_4   \type, \d_strd, v1, v2
1690        b.le            0f
1691        subs            \h,  \h,  #4
1692        load_s          \sr2,  \src, \s_strd, v27, v16, v17, v18
1693        interleave_1_s  v26, v27, v16, v17, v18
1694        uxtl_b          v26, v27, v16, v17
1695        mul_mla_8_2     v1,  v2,  v20, v21, v22, v23, v24, v25, v26, v27, v16, v17
1696        shift_store_4   \type, \d_strd, v1, v2
1697        b.le            0f
1698        subs            \h,  \h,  #4
1699        load_s          \sr2, \src, \s_strd, v19, v20, v21, v22
1700        interleave_1_s  v18, v19, v20, v21, v22
1701        uxtl_b          v18, v19, v20, v21
1702        mul_mla_8_2     v1,  v2,  v24, v25, v26, v27, v16, v17, v18, v19, v20, v21
1703        shift_store_4   \type, \d_strd, v1, v2
1704        b.gt            48b
17050:
1706        ret
1707
170880:
1709        b.gt            880f
1710
1711        // 8x2, 8x4 v
1712        cmp             \h,  #2
1713        add             \xmy, \xmy, #2
1714        ld1             {v0.s}[0], [\xmy]
1715        sub             \src, \src, \s_strd
1716        add             \ds2, \dst, \d_strd
1717        add             \sr2, \src, \s_strd
1718        lsl             \s_strd, \s_strd, #1
1719        lsl             \d_strd, \d_strd, #1
1720        sxtl            v0.8h, v0.8b
1721
1722        load_8b         \src, \sr2, \s_strd, v1, v2, v3, v4, v5
1723        uxtl_b          v1, v2, v3, v4, v5
1724        mul_mla_4       v6, v1, v2, v3, v4, .8h
1725        mul_mla_4       v7, v2, v3, v4, v5, .8h
1726        shift_store_8   \type, \d_strd, v6, v7
1727        b.le            0f
1728        load_8b         \sr2, \src, \s_strd, v6, v7
1729        uxtl_b          v6, v7
1730        mul_mla_4       v1, v3, v4, v5, v6, .8h
1731        mul_mla_4       v2, v4, v5, v6, v7, .8h
1732        shift_store_8   \type, \d_strd, v1, v2
17330:
1734        ret
1735
1736880:    // 8x6, 8x8, 8x16, 8x32 v
17371680:   // 16x8, 16x16, ...
1738320:    // 32x8, 32x16, ...
1739640:
17401280:
1741        ld1             {v0.8b}, [\xmy]
1742        sub             \src, \src, \s_strd
1743        sub             \src, \src, \s_strd, lsl #1
1744        sxtl            v0.8h, v0.8b
1745        mov             \my,  \h
1746168:
1747        add             \ds2, \dst, \d_strd
1748        add             \sr2, \src, \s_strd
1749        lsl             \s_strd, \s_strd, #1
1750        lsl             \d_strd, \d_strd, #1
1751
1752        load_8b         \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
1753        uxtl_b          v16, v17, v18, v19, v20, v21, v22
1754
175588:
1756        subs            \h,  \h,  #2
1757        load_8b         \sr2, \src, \s_strd, v23, v24
1758        uxtl_b          v23, v24
1759        mul_mla_8_1     v1,  v2,  v16, v17, v18, v19, v20, v21, v22, v23, v24
1760        shift_store_8   \type, \d_strd, v1, v2
1761        b.le            9f
1762        subs            \h,  \h,  #2
1763        load_8b         \sr2, \src, \s_strd, v25, v26
1764        uxtl_b          v25, v26
1765        mul_mla_8_1     v3,  v4,  v18, v19, v20, v21, v22, v23, v24, v25, v26
1766        shift_store_8   \type, \d_strd, v3, v4
1767        b.le            9f
1768        subs            \h,  \h,  #2
1769        load_8b         \sr2, \src, \s_strd, v27, v16
1770        uxtl_b          v27, v16
1771        mul_mla_8_1     v1,  v2,  v20, v21, v22, v23, v24, v25, v26, v27, v16
1772        shift_store_8   \type, \d_strd, v1, v2
1773        b.le            9f
1774        subs            \h,  \h,  #2
1775        load_8b         \sr2, \src, \s_strd, v17, v18
1776        uxtl_b          v17, v18
1777        mul_mla_8_1     v3,  v4,  v22, v23, v24, v25, v26, v27, v16, v17, v18
1778        shift_store_8   \type, \d_strd, v3, v4
1779        b.le            9f
1780        subs            \h,  \h,  #4
1781        load_8b         \sr2, \src, \s_strd, v19, v20, v21, v22
1782        uxtl_b          v19, v20, v21, v22
1783        mul_mla_8_1     v1,  v2,  v24, v25, v26, v27, v16, v17, v18, v19, v20
1784        mul_mla_8_1     v3,  v4,  v26, v27, v16, v17, v18, v19, v20, v21, v22
1785        shift_store_8   \type, \d_strd, v1, v2, v3, v4
1786        b.gt            88b
17879:
1788        subs            \w,  \w,  #8
1789        b.le            0f
1790        asr             \s_strd, \s_strd, #1
1791        asr             \d_strd, \d_strd, #1
1792        msub            \src, \s_strd, \xmy, \src
1793        msub            \dst, \d_strd, \xmy, \dst
1794        sub             \src, \src, \s_strd, lsl #3
1795        mov             \h,  \my
1796        add             \src, \src, #8
1797.ifc \type, put
1798        add             \dst, \dst, #8
1799.else
1800        add             \dst, \dst, #16
1801.endif
1802        b               168b
18030:
1804        ret
1805
1806160:
1807        b.gt            1680b
1808
1809        // 16x2, 16x4 v
1810        add             \xmy, \xmy, #2
1811        ld1             {v0.s}[0], [\xmy]
1812        sub             \src, \src, \s_strd
1813        add             \ds2, \dst, \d_strd
1814        add             \sr2, \src, \s_strd
1815        lsl             \s_strd, \s_strd, #1
1816        lsl             \d_strd, \d_strd, #1
1817        sxtl            v0.8h, v0.8b
1818
1819        cmp             \h,  #2
1820        load_16b        \src, \sr2, \s_strd, v1,  v2,  v3,  v4,  v5
1821        uxtl            v16.8h, v1.8b
1822        uxtl            v17.8h, v2.8b
1823        uxtl            v18.8h, v3.8b
1824        uxtl            v19.8h, v4.8b
1825        uxtl            v20.8h, v5.8b
1826        uxtl2           v23.8h, v1.16b
1827        uxtl2           v24.8h, v2.16b
1828        uxtl2           v25.8h, v3.16b
1829        uxtl2           v26.8h, v4.16b
1830        uxtl2           v27.8h, v5.16b
1831        mul_mla_4       v1,  v16, v17, v18, v19, .8h
1832        mul_mla_4       v16, v17, v18, v19, v20, .8h
1833        mul_mla_4       v2,  v23, v24, v25, v26, .8h
1834        mul_mla_4       v17, v24, v25, v26, v27, .8h
1835        shift_store_16  \type, \d_strd, v1, v2, v16, v17
1836        b.le            0f
1837        load_16b        \sr2, \src, \s_strd, v6,  v7
1838        uxtl            v21.8h, v6.8b
1839        uxtl            v22.8h, v7.8b
1840        uxtl2           v28.8h, v6.16b
1841        uxtl2           v29.8h, v7.16b
1842        mul_mla_4       v1,  v18, v19, v20, v21, .8h
1843        mul_mla_4       v3,  v19, v20, v21, v22, .8h
1844        mul_mla_4       v2,  v25, v26, v27, v28, .8h
1845        mul_mla_4       v4,  v26, v27, v28, v29, .8h
1846        shift_store_16  \type, \d_strd, v1, v2, v3, v4
18470:
1848        ret
1849
1850L(\type\()_8tap_v_tbl):
1851        .hword L(\type\()_8tap_v_tbl) - 1280b
1852        .hword L(\type\()_8tap_v_tbl) -  640b
1853        .hword L(\type\()_8tap_v_tbl) -  320b
1854        .hword L(\type\()_8tap_v_tbl) -  160b
1855        .hword L(\type\()_8tap_v_tbl) -   80b
1856        .hword L(\type\()_8tap_v_tbl) -   40b
1857        .hword L(\type\()_8tap_v_tbl) -   20b
1858        .hword 0
1859
1860L(\type\()_8tap_hv):
1861        cmp             \h,  #4
1862        ubfx            w9,  \my, #7, #7
1863        and             \my, \my, #0x7f
1864        b.le            4f
1865        mov             \my,  w9
18664:
1867        add             \xmy,  x10, \my, uxtw #3
1868
1869        adr             x9,  L(\type\()_8tap_hv_tbl)
1870        ldrh            w8,  [x9, x8, lsl #1]
1871        sub             x9,  x9,  w8, uxtw
1872        br              x9
1873
187420:
1875.ifc \type, put
1876        add             \xmx,  \xmx,  #2
1877        ld1             {v0.s}[0],  [\xmx]
1878        b.gt            280f
1879        add             \xmy,  \xmy,  #2
1880        ld1             {v1.s}[0],  [\xmy]
1881
1882        // 2x2, 2x4 hv
1883        sub             \sr2, \src, #1
1884        sub             \src, \sr2, \s_strd
1885        add             \ds2, \dst, \d_strd
1886        lsl             \s_strd, \s_strd, #1
1887        lsl             \d_strd, \d_strd, #1
1888        sxtl            v0.8h,  v0.8b
1889        sxtl            v1.8h,  v1.8b
1890        mov             x15, x30
1891
1892        ld1             {v28.8b}, [\src], \s_strd
1893        uxtl            v28.8h,  v28.8b
1894        ext             v29.16b, v28.16b, v28.16b, #2
1895        mul             v28.4h,  v28.4h,  v0.4h
1896        mul             v29.4h,  v29.4h,  v0.4h
1897        addp            v28.4h,  v28.4h,  v29.4h
1898        addp            v16.4h,  v28.4h,  v28.4h
1899        srshr           v16.4h,  v16.4h,  #2
1900        bl              L(\type\()_8tap_filter_2)
1901
1902        trn1            v16.2s, v16.2s, v28.2s
1903        mov             v17.8b, v28.8b
1904
19052:
1906        bl              L(\type\()_8tap_filter_2)
1907
1908        ext             v18.8b, v17.8b, v28.8b, #4
1909        mov             v19.8b, v28.8b
1910        smull           v2.4s,  v16.4h, v1.h[0]
1911        smlal           v2.4s,  v17.4h, v1.h[1]
1912        smlal           v2.4s,  v18.4h, v1.h[2]
1913        smlal           v2.4s,  v19.4h, v1.h[3]
1914
1915        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
1916        sqxtun          v2.8b,  v2.8h
1917        subs            \h,  \h,  #2
1918        st1             {v2.h}[0], [\dst], \d_strd
1919        st1             {v2.h}[1], [\ds2], \d_strd
1920        b.le            0f
1921        mov             v16.8b, v18.8b
1922        mov             v17.8b, v19.8b
1923        b               2b
1924
1925280:    // 2x8, 2x16, 2x32 hv
1926        ld1             {v1.8b},  [\xmy]
1927        sub             \src, \src, #1
1928        sub             \sr2, \src, \s_strd, lsl #1
1929        sub             \src, \sr2, \s_strd
1930        add             \ds2, \dst, \d_strd
1931        lsl             \s_strd, \s_strd, #1
1932        lsl             \d_strd, \d_strd, #1
1933        sxtl            v0.8h,  v0.8b
1934        sxtl            v1.8h,  v1.8b
1935        mov             x15, x30
1936
1937        ld1             {v28.8b}, [\src], \s_strd
1938        uxtl            v28.8h,  v28.8b
1939        ext             v29.16b, v28.16b, v28.16b, #2
1940        mul             v28.4h,  v28.4h,  v0.4h
1941        mul             v29.4h,  v29.4h,  v0.4h
1942        addp            v28.4h,  v28.4h,  v29.4h
1943        addp            v16.4h,  v28.4h,  v28.4h
1944        srshr           v16.4h,  v16.4h,  #2
1945
1946        bl              L(\type\()_8tap_filter_2)
1947        trn1            v16.2s, v16.2s, v28.2s
1948        mov             v17.8b, v28.8b
1949        bl              L(\type\()_8tap_filter_2)
1950        ext             v18.8b, v17.8b, v28.8b, #4
1951        mov             v19.8b, v28.8b
1952        bl              L(\type\()_8tap_filter_2)
1953        ext             v20.8b, v19.8b, v28.8b, #4
1954        mov             v21.8b, v28.8b
1955
195628:
1957        bl              L(\type\()_8tap_filter_2)
1958        ext             v22.8b, v21.8b, v28.8b, #4
1959        mov             v23.8b, v28.8b
1960        smull           v2.4s,  v16.4h, v1.h[0]
1961        smlal           v2.4s,  v17.4h, v1.h[1]
1962        smlal           v2.4s,  v18.4h, v1.h[2]
1963        smlal           v2.4s,  v19.4h, v1.h[3]
1964        smlal           v2.4s,  v20.4h, v1.h[4]
1965        smlal           v2.4s,  v21.4h, v1.h[5]
1966        smlal           v2.4s,  v22.4h, v1.h[6]
1967        smlal           v2.4s,  v23.4h, v1.h[7]
1968
1969        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
1970        sqxtun          v2.8b,  v2.8h
1971        subs            \h,  \h,  #2
1972        st1             {v2.h}[0], [\dst], \d_strd
1973        st1             {v2.h}[1], [\ds2], \d_strd
1974        b.le            0f
1975        mov             v16.8b, v18.8b
1976        mov             v17.8b, v19.8b
1977        mov             v18.8b, v20.8b
1978        mov             v19.8b, v21.8b
1979        mov             v20.8b, v22.8b
1980        mov             v21.8b, v23.8b
1981        b               28b
1982
19830:
1984        br              x15
1985
1986L(\type\()_8tap_filter_2):
1987        ld1             {v28.8b},  [\sr2], \s_strd
1988        ld1             {v30.8b},  [\src], \s_strd
1989        uxtl            v28.8h,  v28.8b
1990        uxtl            v30.8h,  v30.8b
1991        ext             v29.16b, v28.16b, v28.16b, #2
1992        ext             v31.16b, v30.16b, v30.16b, #2
1993        trn1            v27.2s,  v28.2s,  v30.2s
1994        trn2            v30.2s,  v28.2s,  v30.2s
1995        trn1            v28.2s,  v29.2s,  v31.2s
1996        trn2            v31.2s,  v29.2s,  v31.2s
1997        mul             v27.4h,  v27.4h,  v0.h[0]
1998        mla             v27.4h,  v28.4h,  v0.h[1]
1999        mla             v27.4h,  v30.4h,  v0.h[2]
2000        mla             v27.4h,  v31.4h,  v0.h[3]
2001        srshr           v28.4h,  v27.4h,  #2
2002        ret
2003.endif
2004
200540:
2006        add             \xmx, \xmx, #2
2007        ld1             {v0.s}[0],  [\xmx]
2008        b.gt            480f
2009        add             \xmy, \xmy,  #2
2010        ld1             {v1.s}[0],  [\xmy]
2011        sub             \sr2, \src, #1
2012        sub             \src, \sr2, \s_strd
2013        add             \ds2, \dst, \d_strd
2014        lsl             \s_strd, \s_strd, #1
2015        lsl             \d_strd, \d_strd, #1
2016        sxtl            v0.8h,  v0.8b
2017        sxtl            v1.8h,  v1.8b
2018        mov             x15, x30
2019
2020        // 4x2, 4x4 hv
2021        ld1             {v26.8b}, [\src], \s_strd
2022        uxtl            v26.8h,  v26.8b
2023        ext             v28.16b, v26.16b, v26.16b, #2
2024        ext             v29.16b, v26.16b, v26.16b, #4
2025        ext             v30.16b, v26.16b, v26.16b, #6
2026        mul             v31.4h,  v26.4h,  v0.h[0]
2027        mla             v31.4h,  v28.4h,  v0.h[1]
2028        mla             v31.4h,  v29.4h,  v0.h[2]
2029        mla             v31.4h,  v30.4h,  v0.h[3]
2030        srshr           v16.4h,  v31.4h,  #2
2031
2032        bl              L(\type\()_8tap_filter_4)
2033        mov             v17.8b, v28.8b
2034        mov             v18.8b, v29.8b
2035
20364:
2037        bl              L(\type\()_8tap_filter_4)
2038        // Interleaving the mul/mla chains actually hurts performance
2039        // significantly on Cortex A53, thus keeping mul/mla tightly
2040        // chained like this.
2041        smull           v2.4s,  v16.4h, v1.h[0]
2042        smlal           v2.4s,  v17.4h, v1.h[1]
2043        smlal           v2.4s,  v18.4h, v1.h[2]
2044        smlal           v2.4s,  v28.4h, v1.h[3]
2045        smull           v3.4s,  v17.4h, v1.h[0]
2046        smlal           v3.4s,  v18.4h, v1.h[1]
2047        smlal           v3.4s,  v28.4h, v1.h[2]
2048        smlal           v3.4s,  v29.4h, v1.h[3]
2049        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
2050        sqrshrn         v3.4h,  v3.4s,  #\shift_hv
2051        subs            \h,  \h,  #2
2052.ifc \type, put
2053        sqxtun          v2.8b,  v2.8h
2054        sqxtun          v3.8b,  v3.8h
2055        st1             {v2.s}[0], [\dst], \d_strd
2056        st1             {v3.s}[0], [\ds2], \d_strd
2057.else
2058        st1             {v2.4h}, [\dst], \d_strd
2059        st1             {v3.4h}, [\ds2], \d_strd
2060.endif
2061        b.le            0f
2062        mov             v16.8b,  v18.8b
2063        mov             v17.8b,  v28.8b
2064        mov             v18.8b,  v29.8b
2065        b               4b
2066
2067480:    // 4x8, 4x16, 4x32 hv
2068        ld1             {v1.8b},  [\xmy]
2069        sub             \src, \src, #1
2070        sub             \sr2, \src, \s_strd, lsl #1
2071        sub             \src, \sr2, \s_strd
2072        add             \ds2, \dst, \d_strd
2073        lsl             \s_strd, \s_strd, #1
2074        lsl             \d_strd, \d_strd, #1
2075        sxtl            v0.8h,  v0.8b
2076        sxtl            v1.8h,  v1.8b
2077        mov             x15, x30
2078
2079        ld1             {v26.8b}, [\src], \s_strd
2080        uxtl            v26.8h,  v26.8b
2081        ext             v28.16b, v26.16b, v26.16b, #2
2082        ext             v29.16b, v26.16b, v26.16b, #4
2083        ext             v30.16b, v26.16b, v26.16b, #6
2084        mul             v31.4h,  v26.4h,  v0.h[0]
2085        mla             v31.4h,  v28.4h,  v0.h[1]
2086        mla             v31.4h,  v29.4h,  v0.h[2]
2087        mla             v31.4h,  v30.4h,  v0.h[3]
2088        srshr           v16.4h,  v31.4h,  #2
2089
2090        bl              L(\type\()_8tap_filter_4)
2091        mov             v17.8b, v28.8b
2092        mov             v18.8b, v29.8b
2093        bl              L(\type\()_8tap_filter_4)
2094        mov             v19.8b, v28.8b
2095        mov             v20.8b, v29.8b
2096        bl              L(\type\()_8tap_filter_4)
2097        mov             v21.8b, v28.8b
2098        mov             v22.8b, v29.8b
2099
210048:
2101        bl              L(\type\()_8tap_filter_4)
2102        smull           v2.4s,  v16.4h, v1.h[0]
2103        smlal           v2.4s,  v17.4h, v1.h[1]
2104        smlal           v2.4s,  v18.4h, v1.h[2]
2105        smlal           v2.4s,  v19.4h, v1.h[3]
2106        smlal           v2.4s,  v20.4h, v1.h[4]
2107        smlal           v2.4s,  v21.4h, v1.h[5]
2108        smlal           v2.4s,  v22.4h, v1.h[6]
2109        smlal           v2.4s,  v28.4h, v1.h[7]
2110        smull           v3.4s,  v17.4h, v1.h[0]
2111        smlal           v3.4s,  v18.4h, v1.h[1]
2112        smlal           v3.4s,  v19.4h, v1.h[2]
2113        smlal           v3.4s,  v20.4h, v1.h[3]
2114        smlal           v3.4s,  v21.4h, v1.h[4]
2115        smlal           v3.4s,  v22.4h, v1.h[5]
2116        smlal           v3.4s,  v28.4h, v1.h[6]
2117        smlal           v3.4s,  v29.4h, v1.h[7]
2118        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
2119        sqrshrn         v3.4h,  v3.4s,  #\shift_hv
2120        subs            \h,  \h,  #2
2121.ifc \type, put
2122        sqxtun          v2.8b,  v2.8h
2123        sqxtun          v3.8b,  v3.8h
2124        st1             {v2.s}[0], [\dst], \d_strd
2125        st1             {v3.s}[0], [\ds2], \d_strd
2126.else
2127        st1             {v2.4h}, [\dst], \d_strd
2128        st1             {v3.4h}, [\ds2], \d_strd
2129.endif
2130        b.le            0f
2131        mov             v16.8b,  v18.8b
2132        mov             v17.8b,  v19.8b
2133        mov             v18.8b,  v20.8b
2134        mov             v19.8b,  v21.8b
2135        mov             v20.8b,  v22.8b
2136        mov             v21.8b,  v28.8b
2137        mov             v22.8b,  v29.8b
2138        b               48b
21390:
2140        br              x15
2141
2142L(\type\()_8tap_filter_4):
2143        ld1             {v26.8b}, [\sr2], \s_strd
2144        ld1             {v27.8b}, [\src], \s_strd
2145        uxtl            v26.8h,  v26.8b
2146        uxtl            v27.8h,  v27.8b
2147        ext             v28.16b, v26.16b, v26.16b, #2
2148        ext             v29.16b, v26.16b, v26.16b, #4
2149        ext             v30.16b, v26.16b, v26.16b, #6
2150        mul             v31.4h,  v26.4h,  v0.h[0]
2151        mla             v31.4h,  v28.4h,  v0.h[1]
2152        mla             v31.4h,  v29.4h,  v0.h[2]
2153        mla             v31.4h,  v30.4h,  v0.h[3]
2154        ext             v28.16b, v27.16b, v27.16b, #2
2155        ext             v29.16b, v27.16b, v27.16b, #4
2156        ext             v30.16b, v27.16b, v27.16b, #6
2157        mul             v27.4h,  v27.4h,  v0.h[0]
2158        mla             v27.4h,  v28.4h,  v0.h[1]
2159        mla             v27.4h,  v29.4h,  v0.h[2]
2160        mla             v27.4h,  v30.4h,  v0.h[3]
2161        srshr           v28.4h,  v31.4h,  #2
2162        srshr           v29.4h,  v27.4h,  #2
2163        ret
2164
216580:
2166160:
2167320:
2168        b.gt            880f
2169        add             \xmy,  \xmy,  #2
2170        ld1             {v0.8b},  [\xmx]
2171        ld1             {v1.s}[0],  [\xmy]
2172        sub             \src,  \src,  #3
2173        sub             \src,  \src,  \s_strd
2174        sxtl            v0.8h,  v0.8b
2175        sxtl            v1.8h,  v1.8b
2176        mov             x15, x30
2177        mov             \my,  \h
2178
2179164:    // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
2180        add             \ds2,  \dst,  \d_strd
2181        add             \sr2,  \src,  \s_strd
2182        lsl             \d_strd, \d_strd, #1
2183        lsl             \s_strd, \s_strd, #1
2184
2185        ld1             {v28.8b, v29.8b},  [\src], \s_strd
2186        uxtl            v28.8h,  v28.8b
2187        uxtl            v29.8h,  v29.8b
2188        mul             v24.8h,  v28.8h,  v0.h[0]
2189.irpc i, 1234567
2190        ext             v26.16b, v28.16b, v29.16b, #(2*\i)
2191        mla             v24.8h,  v26.8h,  v0.h[\i]
2192.endr
2193        srshr           v16.8h,  v24.8h, #2
2194
2195        bl              L(\type\()_8tap_filter_8)
2196        mov             v17.16b, v24.16b
2197        mov             v18.16b, v25.16b
2198
21998:
2200        smull           v2.4s,  v16.4h, v1.h[0]
2201        smull2          v3.4s,  v16.8h, v1.h[0]
2202        bl              L(\type\()_8tap_filter_8)
2203        smull           v4.4s,  v17.4h, v1.h[0]
2204        smull2          v5.4s,  v17.8h, v1.h[0]
2205        smlal           v2.4s,  v17.4h, v1.h[1]
2206        smlal2          v3.4s,  v17.8h, v1.h[1]
2207        smlal           v4.4s,  v18.4h, v1.h[1]
2208        smlal2          v5.4s,  v18.8h, v1.h[1]
2209        smlal           v2.4s,  v18.4h, v1.h[2]
2210        smlal2          v3.4s,  v18.8h, v1.h[2]
2211        smlal           v4.4s,  v24.4h, v1.h[2]
2212        smlal2          v5.4s,  v24.8h, v1.h[2]
2213        smlal           v2.4s,  v24.4h, v1.h[3]
2214        smlal2          v3.4s,  v24.8h, v1.h[3]
2215        smlal           v4.4s,  v25.4h, v1.h[3]
2216        smlal2          v5.4s,  v25.8h, v1.h[3]
2217        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
2218        sqrshrn2        v2.8h,  v3.4s,  #\shift_hv
2219        sqrshrn         v4.4h,  v4.4s,  #\shift_hv
2220        sqrshrn2        v4.8h,  v5.4s,  #\shift_hv
2221        subs            \h,  \h,  #2
2222.ifc \type, put
2223        sqxtun          v2.8b,  v2.8h
2224        sqxtun          v4.8b,  v4.8h
2225        st1             {v2.8b}, [\dst], \d_strd
2226        st1             {v4.8b}, [\ds2], \d_strd
2227.else
2228        st1             {v2.8h}, [\dst], \d_strd
2229        st1             {v4.8h}, [\ds2], \d_strd
2230.endif
2231        b.le            9f
2232        mov             v16.16b, v18.16b
2233        mov             v17.16b, v24.16b
2234        mov             v18.16b, v25.16b
2235        b               8b
22369:
2237        subs            \w,  \w,  #8
2238        b.le            0f
2239        asr             \s_strd,  \s_strd,  #1
2240        asr             \d_strd,  \d_strd,  #1
2241        msub            \src,  \s_strd,  \xmy,  \src
2242        msub            \dst,  \d_strd,  \xmy,  \dst
2243        sub             \src,  \src,  \s_strd,  lsl #2
2244        mov             \h,  \my
2245        add             \src,  \src,  #8
2246.ifc \type, put
2247        add             \dst,  \dst,  #8
2248.else
2249        add             \dst,  \dst,  #16
2250.endif
2251        b               164b
2252
2253880:    // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
2254640:
22551280:
2256        ld1             {v0.8b},  [\xmx]
2257        ld1             {v1.8b},  [\xmy]
2258        sub             \src,  \src,  #3
2259        sub             \src,  \src,  \s_strd
2260        sub             \src,  \src,  \s_strd, lsl #1
2261        sxtl            v0.8h,  v0.8b
2262        sxtl            v1.8h,  v1.8b
2263        mov             x15, x30
2264        mov             \my,  \h
2265
2266168:
2267        add             \ds2,  \dst,  \d_strd
2268        add             \sr2,  \src,  \s_strd
2269        lsl             \d_strd, \d_strd, #1
2270        lsl             \s_strd, \s_strd, #1
2271
2272        ld1             {v28.8b, v29.8b},  [\src], \s_strd
2273        uxtl            v28.8h,  v28.8b
2274        uxtl            v29.8h,  v29.8b
2275        mul             v24.8h,  v28.8h,  v0.h[0]
2276.irpc i, 1234567
2277        ext             v26.16b, v28.16b, v29.16b, #(2*\i)
2278        mla             v24.8h,  v26.8h,  v0.h[\i]
2279.endr
2280        srshr           v16.8h,  v24.8h, #2
2281
2282        bl              L(\type\()_8tap_filter_8)
2283        mov             v17.16b, v24.16b
2284        mov             v18.16b, v25.16b
2285        bl              L(\type\()_8tap_filter_8)
2286        mov             v19.16b, v24.16b
2287        mov             v20.16b, v25.16b
2288        bl              L(\type\()_8tap_filter_8)
2289        mov             v21.16b, v24.16b
2290        mov             v22.16b, v25.16b
2291
229288:
2293        smull           v2.4s,  v16.4h, v1.h[0]
2294        smull2          v3.4s,  v16.8h, v1.h[0]
2295        bl              L(\type\()_8tap_filter_8)
2296        smull           v4.4s,  v17.4h, v1.h[0]
2297        smull2          v5.4s,  v17.8h, v1.h[0]
2298        smlal           v2.4s,  v17.4h, v1.h[1]
2299        smlal2          v3.4s,  v17.8h, v1.h[1]
2300        smlal           v4.4s,  v18.4h, v1.h[1]
2301        smlal2          v5.4s,  v18.8h, v1.h[1]
2302        smlal           v2.4s,  v18.4h, v1.h[2]
2303        smlal2          v3.4s,  v18.8h, v1.h[2]
2304        smlal           v4.4s,  v19.4h, v1.h[2]
2305        smlal2          v5.4s,  v19.8h, v1.h[2]
2306        smlal           v2.4s,  v19.4h, v1.h[3]
2307        smlal2          v3.4s,  v19.8h, v1.h[3]
2308        smlal           v4.4s,  v20.4h, v1.h[3]
2309        smlal2          v5.4s,  v20.8h, v1.h[3]
2310        smlal           v2.4s,  v20.4h, v1.h[4]
2311        smlal2          v3.4s,  v20.8h, v1.h[4]
2312        smlal           v4.4s,  v21.4h, v1.h[4]
2313        smlal2          v5.4s,  v21.8h, v1.h[4]
2314        smlal           v2.4s,  v21.4h, v1.h[5]
2315        smlal2          v3.4s,  v21.8h, v1.h[5]
2316        smlal           v4.4s,  v22.4h, v1.h[5]
2317        smlal2          v5.4s,  v22.8h, v1.h[5]
2318        smlal           v2.4s,  v22.4h, v1.h[6]
2319        smlal2          v3.4s,  v22.8h, v1.h[6]
2320        smlal           v4.4s,  v24.4h, v1.h[6]
2321        smlal2          v5.4s,  v24.8h, v1.h[6]
2322        smlal           v2.4s,  v24.4h, v1.h[7]
2323        smlal2          v3.4s,  v24.8h, v1.h[7]
2324        smlal           v4.4s,  v25.4h, v1.h[7]
2325        smlal2          v5.4s,  v25.8h, v1.h[7]
2326        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
2327        sqrshrn2        v2.8h,  v3.4s,  #\shift_hv
2328        sqrshrn         v4.4h,  v4.4s,  #\shift_hv
2329        sqrshrn2        v4.8h,  v5.4s,  #\shift_hv
2330        subs            \h,  \h,  #2
2331.ifc \type, put
2332        sqxtun          v2.8b,  v2.8h
2333        sqxtun          v4.8b,  v4.8h
2334        st1             {v2.8b}, [\dst], \d_strd
2335        st1             {v4.8b}, [\ds2], \d_strd
2336.else
2337        st1             {v2.8h}, [\dst], \d_strd
2338        st1             {v4.8h}, [\ds2], \d_strd
2339.endif
2340        b.le            9f
2341        mov             v16.16b, v18.16b
2342        mov             v17.16b, v19.16b
2343        mov             v18.16b, v20.16b
2344        mov             v19.16b, v21.16b
2345        mov             v20.16b, v22.16b
2346        mov             v21.16b, v24.16b
2347        mov             v22.16b, v25.16b
2348        b               88b
23499:
2350        subs            \w,  \w,  #8
2351        b.le            0f
2352        asr             \s_strd,  \s_strd,  #1
2353        asr             \d_strd,  \d_strd,  #1
2354        msub            \src,  \s_strd,  \xmy,  \src
2355        msub            \dst,  \d_strd,  \xmy,  \dst
2356        sub             \src,  \src,  \s_strd,  lsl #3
2357        mov             \h,  \my
2358        add             \src,  \src,  #8
2359.ifc \type, put
2360        add             \dst,  \dst,  #8
2361.else
2362        add             \dst,  \dst,  #16
2363.endif
2364        b               168b
23650:
2366        br              x15
2367
2368L(\type\()_8tap_filter_8):
2369        ld1             {v28.8b, v29.8b},  [\sr2], \s_strd
2370        ld1             {v30.8b, v31.8b},  [\src], \s_strd
2371        uxtl            v28.8h,  v28.8b
2372        uxtl            v29.8h,  v29.8b
2373        uxtl            v30.8h,  v30.8b
2374        uxtl            v31.8h,  v31.8b
2375        mul             v24.8h,  v28.8h,  v0.h[0]
2376        mul             v25.8h,  v30.8h,  v0.h[0]
2377.irpc i, 1234567
2378        ext             v26.16b, v28.16b, v29.16b, #(2*\i)
2379        ext             v27.16b, v30.16b, v31.16b, #(2*\i)
2380        mla             v24.8h,  v26.8h,  v0.h[\i]
2381        mla             v25.8h,  v27.8h,  v0.h[\i]
2382.endr
2383        srshr           v24.8h,  v24.8h, #2
2384        srshr           v25.8h,  v25.8h, #2
2385        ret
2386
2387L(\type\()_8tap_hv_tbl):
2388        .hword L(\type\()_8tap_hv_tbl) - 1280b
2389        .hword L(\type\()_8tap_hv_tbl) -  640b
2390        .hword L(\type\()_8tap_hv_tbl) -  320b
2391        .hword L(\type\()_8tap_hv_tbl) -  160b
2392        .hword L(\type\()_8tap_hv_tbl) -   80b
2393        .hword L(\type\()_8tap_hv_tbl) -   40b
2394        .hword L(\type\()_8tap_hv_tbl) -   20b
2395        .hword 0
2396endfunc
2397
2398
2399function \type\()_bilin_8bpc_neon, export=1
2400        dup             v1.16b, \mx
2401        dup             v3.16b, \my
2402        mov             w9,  #16
2403        sub             w8, w9, \mx
2404        sub             w9, w9, \my
2405        dup             v0.16b, w8
2406        dup             v2.16b, w9
2407.ifc \type, prep
2408        uxtw            \d_strd, \w
2409        lsl             \d_strd, \d_strd, #1
2410.endif
2411
2412        clz             w8,  \w
2413        sub             w8,  w8,  #24
2414        cbnz            \mx, L(\type\()_bilin_h)
2415        cbnz            \my, L(\type\()_bilin_v)
2416        b               \type\()_neon
2417
2418L(\type\()_bilin_h):
2419        cbnz            \my, L(\type\()_bilin_hv)
2420
2421        adr             x9,  L(\type\()_bilin_h_tbl)
2422        ldrh            w8,  [x9, x8, lsl #1]
2423        sub             x9,  x9,  w8, uxtw
2424        br              x9
2425
242620:     // 2xN h
2427.ifc \type, put
2428        add             \ds2,  \dst,  \d_strd
2429        add             \sr2,  \src,  \s_strd
2430        lsl             \d_strd,  \d_strd,  #1
2431        lsl             \s_strd,  \s_strd,  #1
24322:
2433        ld1             {v4.s}[0],  [\src], \s_strd
2434        ld1             {v6.s}[0],  [\sr2], \s_strd
2435        ext             v5.8b,  v4.8b,  v4.8b, #1
2436        ext             v7.8b,  v6.8b,  v6.8b, #1
2437        trn1            v4.4h,  v4.4h,  v6.4h
2438        trn1            v5.4h,  v5.4h,  v7.4h
2439        subs            \h,  \h,  #2
2440        umull           v4.8h,  v4.8b,  v0.8b
2441        umlal           v4.8h,  v5.8b,  v1.8b
2442        uqrshrn         v4.8b,  v4.8h,  #4
2443        st1             {v4.h}[0], [\dst], \d_strd
2444        st1             {v4.h}[1], [\ds2], \d_strd
2445        b.gt            2b
2446        ret
2447.endif
2448
244940:     // 4xN h
2450        add             \ds2,  \dst,  \d_strd
2451        add             \sr2,  \src,  \s_strd
2452        lsl             \d_strd,  \d_strd,  #1
2453        lsl             \s_strd,  \s_strd,  #1
24544:
2455        ld1             {v4.8b}, [\src], \s_strd
2456        ld1             {v6.8b}, [\sr2], \s_strd
2457        ext             v5.8b,  v4.8b,  v4.8b, #1
2458        ext             v7.8b,  v6.8b,  v6.8b, #1
2459        trn1            v4.2s,  v4.2s,  v6.2s
2460        trn1            v5.2s,  v5.2s,  v7.2s
2461        subs            \h,  \h,  #2
2462        umull           v4.8h,  v4.8b,  v0.8b
2463        umlal           v4.8h,  v5.8b,  v1.8b
2464.ifc \type, put
2465        uqrshrn         v4.8b,  v4.8h,  #4
2466        st1             {v4.s}[0], [\dst], \d_strd
2467        st1             {v4.s}[1], [\ds2], \d_strd
2468.else
2469        st1             {v4.d}[0], [\dst], \d_strd
2470        st1             {v4.d}[1], [\ds2], \d_strd
2471.endif
2472        b.gt            4b
2473        ret
2474
247580:     // 8xN h
2476        add             \ds2,  \dst,  \d_strd
2477        add             \sr2,  \src,  \s_strd
2478        lsl             \d_strd,  \d_strd,  #1
2479        lsl             \s_strd,  \s_strd,  #1
24808:
2481        ld1             {v4.16b}, [\src], \s_strd
2482        ld1             {v6.16b}, [\sr2], \s_strd
2483        ext             v5.16b, v4.16b, v4.16b, #1
2484        ext             v7.16b, v6.16b, v6.16b, #1
2485        subs            \h,  \h,  #2
2486        umull           v4.8h,  v4.8b,  v0.8b
2487        umull           v6.8h,  v6.8b,  v0.8b
2488        umlal           v4.8h,  v5.8b,  v1.8b
2489        umlal           v6.8h,  v7.8b,  v1.8b
2490.ifc \type, put
2491        uqrshrn         v4.8b,  v4.8h,  #4
2492        uqrshrn         v6.8b,  v6.8h,  #4
2493        st1             {v4.8b}, [\dst], \d_strd
2494        st1             {v6.8b}, [\ds2], \d_strd
2495.else
2496        st1             {v4.8h}, [\dst], \d_strd
2497        st1             {v6.8h}, [\ds2], \d_strd
2498.endif
2499        b.gt            8b
2500        ret
2501160:
2502320:
2503640:
25041280:   // 16xN, 32xN, ... h
2505        add             \ds2,  \dst,  \d_strd
2506        add             \sr2,  \src,  \s_strd
2507        lsl             \s_strd,  \s_strd,  #1
2508
2509        sub             \s_strd,  \s_strd,  \w, uxtw
2510        sub             \s_strd,  \s_strd,  #8
2511.ifc \type, put
2512        lsl             \d_strd,  \d_strd,  #1
2513        sub             \d_strd,  \d_strd,  \w, uxtw
2514.endif
2515161:
2516        ld1             {v16.d}[1],  [\src], #8
2517        ld1             {v20.d}[1],  [\sr2], #8
2518        mov             \mx, \w
2519
252016:
2521        ld1             {v18.16b},  [\src], #16
2522        ld1             {v22.16b},  [\sr2], #16
2523        ext             v17.16b, v16.16b, v18.16b, #8
2524        ext             v19.16b, v16.16b, v18.16b, #9
2525        ext             v21.16b, v20.16b, v22.16b, #8
2526        ext             v23.16b, v20.16b, v22.16b, #9
2527        umull           v16.8h,  v17.8b,  v0.8b
2528        umull2          v17.8h,  v17.16b, v0.16b
2529        umull           v20.8h,  v21.8b,  v0.8b
2530        umull2          v21.8h,  v21.16b, v0.16b
2531        umlal           v16.8h,  v19.8b,  v1.8b
2532        umlal2          v17.8h,  v19.16b, v1.16b
2533        umlal           v20.8h,  v23.8b,  v1.8b
2534        umlal2          v21.8h,  v23.16b, v1.16b
2535        subs            \mx, \mx, #16
2536.ifc \type, put
2537        uqrshrn         v16.8b,  v16.8h, #4
2538        uqrshrn2        v16.16b, v17.8h, #4
2539        uqrshrn         v20.8b,  v20.8h, #4
2540        uqrshrn2        v20.16b, v21.8h, #4
2541        st1             {v16.16b}, [\dst], #16
2542        st1             {v20.16b}, [\ds2], #16
2543.else
2544        st1             {v16.8h, v17.8h}, [\dst], #32
2545        st1             {v20.8h, v21.8h}, [\ds2], #32
2546.endif
2547        b.le            9f
2548
2549        mov             v16.16b, v18.16b
2550        mov             v20.16b, v22.16b
2551        b               16b
2552
25539:
2554        add             \dst,  \dst,  \d_strd
2555        add             \ds2,  \ds2,  \d_strd
2556        add             \src,  \src,  \s_strd
2557        add             \sr2,  \sr2,  \s_strd
2558
2559        subs            \h,  \h,  #2
2560        b.gt            161b
2561        ret
2562
2563L(\type\()_bilin_h_tbl):
2564        .hword L(\type\()_bilin_h_tbl) - 1280b
2565        .hword L(\type\()_bilin_h_tbl) -  640b
2566        .hword L(\type\()_bilin_h_tbl) -  320b
2567        .hword L(\type\()_bilin_h_tbl) -  160b
2568        .hword L(\type\()_bilin_h_tbl) -   80b
2569        .hword L(\type\()_bilin_h_tbl) -   40b
2570        .hword L(\type\()_bilin_h_tbl) -   20b
2571        .hword 0
2572
2573
2574L(\type\()_bilin_v):
2575        cmp             \h,  #4
2576        adr             x9,  L(\type\()_bilin_v_tbl)
2577        ldrh            w8,  [x9, x8, lsl #1]
2578        sub             x9,  x9,  w8, uxtw
2579        br              x9
2580
258120:     // 2xN v
2582.ifc \type, put
2583        cmp             \h,  #2
2584        add             \ds2,  \dst,  \d_strd
2585        add             \sr2,  \src,  \s_strd
2586        lsl             \s_strd,  \s_strd,  #1
2587        lsl             \d_strd,  \d_strd,  #1
2588
2589        // 2x2 v
2590        ld1             {v16.h}[0], [\src], \s_strd
2591        b.gt            24f
2592        ld1             {v17.h}[0], [\sr2], \s_strd
2593        ld1             {v18.h}[0], [\src], \s_strd
2594        trn1            v16.4h, v16.4h, v17.4h
2595        trn1            v17.4h, v17.4h, v18.4h
2596        umull           v4.8h,  v16.8b,  v2.8b
2597        umlal           v4.8h,  v17.8b,  v3.8b
2598        uqrshrn         v4.8b,  v4.8h,  #4
2599        st1             {v4.h}[0], [\dst]
2600        st1             {v4.h}[1], [\ds2]
2601        ret
260224:     // 2x4, 2x8, ... v
2603        ld1             {v17.h}[0], [\sr2], \s_strd
2604        ld1             {v18.h}[0], [\src], \s_strd
2605        ld1             {v19.h}[0], [\sr2], \s_strd
2606        ld1             {v20.h}[0], [\src], \s_strd
2607        trn1            v16.4h, v16.4h, v17.4h
2608        trn1            v17.4h, v17.4h, v18.4h
2609        trn1            v18.4h, v18.4h, v19.4h
2610        trn1            v19.4h, v19.4h, v20.4h
2611        trn1            v16.2s, v16.2s, v18.2s
2612        trn1            v17.2s, v17.2s, v19.2s
2613        umull           v4.8h,  v16.8b,  v2.8b
2614        umlal           v4.8h,  v17.8b,  v3.8b
2615        subs            \h,  \h,  #4
2616        uqrshrn         v4.8b,  v4.8h,  #4
2617        st1             {v4.h}[0], [\dst], \d_strd
2618        st1             {v4.h}[1], [\ds2], \d_strd
2619        st1             {v4.h}[2], [\dst], \d_strd
2620        st1             {v4.h}[3], [\ds2], \d_strd
2621        b.le            0f
2622        mov             v16.8b, v20.8b
2623        b               24b
26240:
2625        ret
2626.endif
2627
262840:     // 4xN v
2629        add             \ds2,  \dst,  \d_strd
2630        add             \sr2,  \src,  \s_strd
2631        lsl             \s_strd,  \s_strd,  #1
2632        lsl             \d_strd,  \d_strd,  #1
2633        ld1             {v16.s}[0], [\src], \s_strd
26344:
2635        ld1             {v17.s}[0], [\sr2], \s_strd
2636        ld1             {v18.s}[0], [\src], \s_strd
2637        trn1            v16.2s, v16.2s, v17.2s
2638        trn1            v17.2s, v17.2s, v18.2s
2639        umull           v4.8h,  v16.8b,  v2.8b
2640        umlal           v4.8h,  v17.8b,  v3.8b
2641        subs            \h,  \h,  #2
2642.ifc \type, put
2643        uqrshrn         v4.8b,  v4.8h,  #4
2644        st1             {v4.s}[0], [\dst], \d_strd
2645        st1             {v4.s}[1], [\ds2], \d_strd
2646.else
2647        st1             {v4.d}[0], [\dst], \d_strd
2648        st1             {v4.d}[1], [\ds2], \d_strd
2649.endif
2650        b.le            0f
2651        mov             v16.8b, v18.8b
2652        b               4b
26530:
2654        ret
2655
265680:     // 8xN v
2657        add             \ds2,  \dst,  \d_strd
2658        add             \sr2,  \src,  \s_strd
2659        lsl             \s_strd,  \s_strd,  #1
2660        lsl             \d_strd,  \d_strd,  #1
2661        ld1             {v16.8b}, [\src], \s_strd
26628:
2663        ld1             {v17.8b}, [\sr2], \s_strd
2664        ld1             {v18.8b}, [\src], \s_strd
2665        umull           v4.8h,  v16.8b,  v2.8b
2666        umull           v5.8h,  v17.8b,  v2.8b
2667        umlal           v4.8h,  v17.8b,  v3.8b
2668        umlal           v5.8h,  v18.8b,  v3.8b
2669        subs            \h,  \h,  #2
2670.ifc \type, put
2671        uqrshrn         v4.8b,  v4.8h,  #4
2672        uqrshrn         v5.8b,  v5.8h,  #4
2673        st1             {v4.8b}, [\dst], \d_strd
2674        st1             {v5.8b}, [\ds2], \d_strd
2675.else
2676        st1             {v4.8h}, [\dst], \d_strd
2677        st1             {v5.8h}, [\ds2], \d_strd
2678.endif
2679        b.le            0f
2680        mov             v16.8b, v18.8b
2681        b               8b
26820:
2683        ret
2684
2685160:    // 16xN, 32xN, ...
2686320:
2687640:
26881280:
2689        mov             \my,  \h
26901:
2691        add             \ds2, \dst, \d_strd
2692        add             \sr2, \src, \s_strd
2693        lsl             \s_strd, \s_strd, #1
2694        lsl             \d_strd, \d_strd, #1
2695
2696        ld1             {v16.16b}, [\src], \s_strd
26972:
2698        ld1             {v17.16b}, [\sr2], \s_strd
2699        ld1             {v18.16b}, [\src], \s_strd
2700        umull           v4.8h,  v16.8b,  v2.8b
2701        umull2          v5.8h,  v16.16b, v2.16b
2702        umull           v6.8h,  v17.8b,  v2.8b
2703        umull2          v7.8h,  v17.16b, v2.16b
2704        umlal           v4.8h,  v17.8b,  v3.8b
2705        umlal2          v5.8h,  v17.16b, v3.16b
2706        umlal           v6.8h,  v18.8b,  v3.8b
2707        umlal2          v7.8h,  v18.16b, v3.16b
2708        subs            \h,  \h,  #2
2709.ifc \type, put
2710        uqrshrn         v4.8b,  v4.8h,  #4
2711        uqrshrn2        v4.16b, v5.8h,  #4
2712        uqrshrn         v6.8b,  v6.8h,  #4
2713        uqrshrn2        v6.16b, v7.8h,  #4
2714        st1             {v4.16b}, [\dst], \d_strd
2715        st1             {v6.16b}, [\ds2], \d_strd
2716.else
2717        st1             {v4.8h, v5.8h}, [\dst], \d_strd
2718        st1             {v6.8h, v7.8h}, [\ds2], \d_strd
2719.endif
2720        b.le            9f
2721        mov             v16.16b, v18.16b
2722        b               2b
27239:
2724        subs            \w,  \w,  #16
2725        b.le            0f
2726        asr             \s_strd, \s_strd, #1
2727        asr             \d_strd, \d_strd, #1
2728        msub            \src, \s_strd, \xmy, \src
2729        msub            \dst, \d_strd, \xmy, \dst
2730        sub             \src, \src, \s_strd, lsl #1
2731        mov             \h,  \my
2732        add             \src, \src, #16
2733.ifc \type, put
2734        add             \dst, \dst, #16
2735.else
2736        add             \dst, \dst, #32
2737.endif
2738        b               1b
27390:
2740        ret
2741
2742L(\type\()_bilin_v_tbl):
2743        .hword L(\type\()_bilin_v_tbl) - 1280b
2744        .hword L(\type\()_bilin_v_tbl) -  640b
2745        .hword L(\type\()_bilin_v_tbl) -  320b
2746        .hword L(\type\()_bilin_v_tbl) -  160b
2747        .hword L(\type\()_bilin_v_tbl) -   80b
2748        .hword L(\type\()_bilin_v_tbl) -   40b
2749        .hword L(\type\()_bilin_v_tbl) -   20b
2750        .hword 0
2751
2752L(\type\()_bilin_hv):
2753        uxtl            v2.8h, v2.8b
2754        uxtl            v3.8h, v3.8b
2755        adr             x9,  L(\type\()_bilin_hv_tbl)
2756        ldrh            w8,  [x9, x8, lsl #1]
2757        sub             x9,  x9,  w8, uxtw
2758        br              x9
2759
276020:     // 2xN hv
2761.ifc \type, put
2762        add             \sr2, \src, \s_strd
2763        add             \ds2, \dst, \d_strd
2764        lsl             \s_strd, \s_strd, #1
2765        lsl             \d_strd, \d_strd, #1
2766
2767        ld1             {v28.s}[0],  [\src], \s_strd
2768        ext             v29.8b, v28.8b, v28.8b, #1
2769        umull           v16.8h, v28.8b, v0.8b
2770        umlal           v16.8h, v29.8b, v1.8b
2771
27722:
2773        ld1             {v28.s}[0],  [\sr2], \s_strd
2774        ld1             {v30.s}[0],  [\src], \s_strd
2775        ext             v29.8b, v28.8b, v28.8b, #1
2776        ext             v31.8b, v30.8b, v30.8b, #1
2777        trn1            v28.4h, v28.4h, v30.4h
2778        trn1            v29.4h, v29.4h, v31.4h
2779        umull           v17.8h, v28.8b, v0.8b
2780        umlal           v17.8h, v29.8b, v1.8b
2781
2782        trn1            v16.2s, v16.2s, v17.2s
2783
2784        mul             v4.4h,  v16.4h, v2.4h
2785        mla             v4.4h,  v17.4h, v3.4h
2786        uqrshrn         v4.8b,  v4.8h,  #8
2787        subs            \h,  \h,  #2
2788        st1             {v4.h}[0], [\dst], \d_strd
2789        st1             {v4.h}[1], [\ds2], \d_strd
2790        b.le            0f
2791        trn2            v16.2s, v17.2s, v17.2s
2792        b               2b
27930:
2794        ret
2795.endif
2796
279740:     // 4xN hv
2798        add             \sr2, \src, \s_strd
2799        add             \ds2, \dst, \d_strd
2800        lsl             \s_strd, \s_strd, #1
2801        lsl             \d_strd, \d_strd, #1
2802
2803        ld1             {v28.8b},  [\src], \s_strd
2804        ext             v29.8b, v28.8b, v28.8b, #1
2805        umull           v16.8h, v28.8b, v0.8b
2806        umlal           v16.8h, v29.8b, v1.8b
2807
28084:
2809        ld1             {v28.8b},  [\sr2], \s_strd
2810        ld1             {v30.8b},  [\src], \s_strd
2811        ext             v29.8b, v28.8b, v28.8b, #1
2812        ext             v31.8b, v30.8b, v30.8b, #1
2813        trn1            v28.2s, v28.2s, v30.2s
2814        trn1            v29.2s, v29.2s, v31.2s
2815        umull           v17.8h, v28.8b, v0.8b
2816        umlal           v17.8h, v29.8b, v1.8b
2817
2818        trn1            v16.2d, v16.2d, v17.2d
2819
2820        mul             v4.8h,  v16.8h, v2.8h
2821        mla             v4.8h,  v17.8h, v3.8h
2822        subs            \h,  \h,  #2
2823.ifc \type, put
2824        uqrshrn         v4.8b,  v4.8h,  #8
2825        st1             {v4.s}[0], [\dst], \d_strd
2826        st1             {v4.s}[1], [\ds2], \d_strd
2827.else
2828        urshr           v4.8h,  v4.8h,  #4
2829        st1             {v4.d}[0], [\dst], \d_strd
2830        st1             {v4.d}[1], [\ds2], \d_strd
2831.endif
2832        b.le            0f
2833        trn2            v16.2d, v17.2d, v17.2d
2834        b               4b
28350:
2836        ret
2837
283880:     // 8xN, 16xN, ... hv
2839160:
2840320:
2841640:
28421280:
2843        mov             \my,  \h
2844
28451:
2846        add             \sr2, \src, \s_strd
2847        add             \ds2, \dst, \d_strd
2848        lsl             \s_strd, \s_strd, #1
2849        lsl             \d_strd, \d_strd, #1
2850
2851        ld1             {v28.16b},  [\src], \s_strd
2852        ext             v29.16b, v28.16b, v28.16b, #1
2853        umull           v16.8h, v28.8b, v0.8b
2854        umlal           v16.8h, v29.8b, v1.8b
2855
28562:
2857        ld1             {v28.16b},  [\sr2], \s_strd
2858        ld1             {v30.16b},  [\src], \s_strd
2859        ext             v29.16b, v28.16b, v28.16b, #1
2860        ext             v31.16b, v30.16b, v30.16b, #1
2861        umull           v17.8h, v28.8b, v0.8b
2862        umlal           v17.8h, v29.8b, v1.8b
2863        umull           v18.8h, v30.8b, v0.8b
2864        umlal           v18.8h, v31.8b, v1.8b
2865
2866        mul             v4.8h,  v16.8h, v2.8h
2867        mla             v4.8h,  v17.8h, v3.8h
2868        mul             v5.8h,  v17.8h, v2.8h
2869        mla             v5.8h,  v18.8h, v3.8h
2870        subs            \h,  \h,  #2
2871.ifc \type, put
2872        uqrshrn         v4.8b,  v4.8h,  #8
2873        uqrshrn         v5.8b,  v5.8h,  #8
2874        st1             {v4.8b}, [\dst], \d_strd
2875        st1             {v5.8b}, [\ds2], \d_strd
2876.else
2877        urshr           v4.8h,  v4.8h,  #4
2878        urshr           v5.8h,  v5.8h,  #4
2879        st1             {v4.8h}, [\dst], \d_strd
2880        st1             {v5.8h}, [\ds2], \d_strd
2881.endif
2882        b.le            9f
2883        mov             v16.16b, v18.16b
2884        b               2b
28859:
2886        subs            \w,  \w,  #8
2887        b.le            0f
2888        asr             \s_strd,  \s_strd,  #1
2889        asr             \d_strd,  \d_strd,  #1
2890        msub            \src,  \s_strd,  \xmy,  \src
2891        msub            \dst,  \d_strd,  \xmy,  \dst
2892        sub             \src,  \src,  \s_strd,  lsl #1
2893        mov             \h,  \my
2894        add             \src,  \src,  #8
2895.ifc \type, put
2896        add             \dst,  \dst,  #8
2897.else
2898        add             \dst,  \dst,  #16
2899.endif
2900        b               1b
29010:
2902        ret
2903
2904L(\type\()_bilin_hv_tbl):
2905        .hword L(\type\()_bilin_hv_tbl) - 1280b
2906        .hword L(\type\()_bilin_hv_tbl) -  640b
2907        .hword L(\type\()_bilin_hv_tbl) -  320b
2908        .hword L(\type\()_bilin_hv_tbl) -  160b
2909        .hword L(\type\()_bilin_hv_tbl) -   80b
2910        .hword L(\type\()_bilin_hv_tbl) -   40b
2911        .hword L(\type\()_bilin_hv_tbl) -   20b
2912        .hword 0
2913endfunc
2914.endm
2915
2916filter_fn put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10
2917filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6
2918
2919.macro load_filter_row dst, src, inc
2920        asr             w13, \src, #10
2921        ldr             \dst, [x11, w13, sxtw #3]
2922        add             \src, \src, \inc
2923.endm
2924
2925function warp_filter_horz_neon
2926        add             w12, w5,  #512
2927
2928        ld1             {v16.8b, v17.8b}, [x2], x3
2929
2930        load_filter_row d0, w12, w7
2931        uxtl            v16.8h,  v16.8b
2932        load_filter_row d1, w12, w7
2933        uxtl            v17.8h,  v17.8b
2934        load_filter_row d2, w12, w7
2935        sxtl            v0.8h,   v0.8b
2936        load_filter_row d3, w12, w7
2937        sxtl            v1.8h,   v1.8b
2938        load_filter_row d4, w12, w7
2939        sxtl            v2.8h,   v2.8b
2940        load_filter_row d5, w12, w7
2941        sxtl            v3.8h,   v3.8b
2942        load_filter_row d6, w12, w7
2943        sxtl            v4.8h,   v4.8b
2944        load_filter_row d7, w12, w7
2945        sxtl            v5.8h,   v5.8b
2946        ext             v18.16b, v16.16b, v17.16b, #2*1
2947        mul             v23.8h,  v16.8h,  v0.8h
2948        sxtl            v6.8h,   v6.8b
2949        ext             v19.16b, v16.16b, v17.16b, #2*2
2950        mul             v18.8h,  v18.8h,  v1.8h
2951        sxtl            v7.8h,   v7.8b
2952        ext             v20.16b, v16.16b, v17.16b, #2*3
2953        mul             v19.8h,  v19.8h,  v2.8h
2954        ext             v21.16b, v16.16b, v17.16b, #2*4
2955        saddlp          v23.4s,  v23.8h
2956        mul             v20.8h,  v20.8h,  v3.8h
2957        ext             v22.16b, v16.16b, v17.16b, #2*5
2958        saddlp          v18.4s,  v18.8h
2959        mul             v21.8h,  v21.8h,  v4.8h
2960        saddlp          v19.4s,  v19.8h
2961        mul             v22.8h,  v22.8h,  v5.8h
2962        saddlp          v20.4s,  v20.8h
2963        saddlp          v21.4s,  v21.8h
2964        saddlp          v22.4s,  v22.8h
2965        addp            v18.4s,  v23.4s,  v18.4s
2966        ext             v23.16b, v16.16b, v17.16b, #2*6
2967        addp            v19.4s,  v19.4s,  v20.4s
2968        mul             v23.8h,  v23.8h,  v6.8h
2969        ext             v20.16b, v16.16b, v17.16b, #2*7
2970        mul             v20.8h,  v20.8h,  v7.8h
2971        saddlp          v23.4s,  v23.8h
2972        addp            v21.4s,  v21.4s,  v22.4s
2973        saddlp          v20.4s,  v20.8h
2974        addp            v20.4s,  v23.4s,  v20.4s
2975        addp            v18.4s,  v18.4s,  v19.4s
2976        addp            v20.4s,  v21.4s,  v20.4s
2977
2978        add             w5,  w5,  w8
2979
2980        rshrn           v16.4h,  v18.4s,  #3
2981        rshrn2          v16.8h,  v20.4s,  #3
2982
2983        ret
2984endfunc
2985
2986// void dav1d_warp_affine_8x8_8bpc_neon(
2987//         pixel *dst, const ptrdiff_t dst_stride,
2988//         const pixel *src, const ptrdiff_t src_stride,
2989//         const int16_t *const abcd, int mx, int my)
2990.macro warp t, shift
2991function warp_affine_8x8\t\()_8bpc_neon, export=1
2992        ldr             x4,  [x4]
2993        sbfx            x7,  x4, #0,  #16
2994        sbfx            x8,  x4, #16, #16
2995        sbfx            x9,  x4, #32, #16
2996        sbfx            x4,  x4, #48, #16
2997        mov             w10, #8
2998        sub             x2,  x2,  x3, lsl #1
2999        sub             x2,  x2,  x3
3000        sub             x2,  x2,  #3
3001        movrel          x11, X(mc_warp_filter), 64*8
3002        mov             x15, x30
3003.ifnb \t
3004        lsl             x1,  x1,  #1
3005.endif
3006
3007        bl              warp_filter_horz_neon
3008        mov             v24.16b, v16.16b
3009        bl              warp_filter_horz_neon
3010        mov             v25.16b, v16.16b
3011        bl              warp_filter_horz_neon
3012        mov             v26.16b, v16.16b
3013        bl              warp_filter_horz_neon
3014        mov             v27.16b, v16.16b
3015        bl              warp_filter_horz_neon
3016        mov             v28.16b, v16.16b
3017        bl              warp_filter_horz_neon
3018        mov             v29.16b, v16.16b
3019        bl              warp_filter_horz_neon
3020        mov             v30.16b, v16.16b
3021
30221:
3023        add             w14, w6,  #512
3024        bl              warp_filter_horz_neon
3025        mov             v31.16b, v16.16b
3026
3027        load_filter_row d0, w14, w9
3028        load_filter_row d1, w14, w9
3029        load_filter_row d2, w14, w9
3030        load_filter_row d3, w14, w9
3031        load_filter_row d4, w14, w9
3032        load_filter_row d5, w14, w9
3033        load_filter_row d6, w14, w9
3034        load_filter_row d7, w14, w9
3035        transpose_8x8b  v0, v1, v2, v3, v4, v5, v6, v7, v16, v17
3036        sxtl            v0.8h,   v0.8b
3037        sxtl            v1.8h,   v1.8b
3038        sxtl            v2.8h,   v2.8b
3039        sxtl            v3.8h,   v3.8b
3040        sxtl            v4.8h,   v4.8b
3041        sxtl            v5.8h,   v5.8b
3042        sxtl            v6.8h,   v6.8b
3043        sxtl            v7.8h,   v7.8b
3044
3045        // This ordering of smull/smlal/smull2/smlal2 is highly
3046        // beneficial for Cortex A53 here.
3047        smull           v16.4s,  v24.4h,  v0.4h
3048        smlal           v16.4s,  v25.4h,  v1.4h
3049        smlal           v16.4s,  v26.4h,  v2.4h
3050        smlal           v16.4s,  v27.4h,  v3.4h
3051        smlal           v16.4s,  v28.4h,  v4.4h
3052        smlal           v16.4s,  v29.4h,  v5.4h
3053        smlal           v16.4s,  v30.4h,  v6.4h
3054        smlal           v16.4s,  v31.4h,  v7.4h
3055        smull2          v17.4s,  v24.8h,  v0.8h
3056        smlal2          v17.4s,  v25.8h,  v1.8h
3057        smlal2          v17.4s,  v26.8h,  v2.8h
3058        smlal2          v17.4s,  v27.8h,  v3.8h
3059        smlal2          v17.4s,  v28.8h,  v4.8h
3060        smlal2          v17.4s,  v29.8h,  v5.8h
3061        smlal2          v17.4s,  v30.8h,  v6.8h
3062        smlal2          v17.4s,  v31.8h,  v7.8h
3063
3064        mov             v24.16b, v25.16b
3065        mov             v25.16b, v26.16b
3066        sqrshrn         v16.4h,  v16.4s,  #\shift
3067        mov             v26.16b, v27.16b
3068        sqrshrn2        v16.8h,  v17.4s,  #\shift
3069        mov             v27.16b, v28.16b
3070        mov             v28.16b, v29.16b
3071.ifb \t
3072        sqxtun          v16.8b,  v16.8h
3073.endif
3074        mov             v29.16b, v30.16b
3075        mov             v30.16b, v31.16b
3076        subs            w10, w10, #1
3077.ifnb \t
3078        st1             {v16.8h}, [x0], x1
3079.else
3080        st1             {v16.8b}, [x0], x1
3081.endif
3082
3083        add             w6,  w6,  w4
3084        b.gt            1b
3085
3086        br              x15
3087endfunc
3088.endm
3089
3090warp  , 11
3091warp t, 7
3092