1/*
2 * Copyright © 2018, VideoLAN and dav1d authors
3 * Copyright © 2018, Janne Grunau
4 * Copyright © 2018, Martin Storsjo
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice, this
11 *    list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 *    this list of conditions and the following disclaimer in the documentation
15 *    and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
21 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#include "src/arm/asm.S"
30#include "util.S"
31
// avg: dst = sat_u8(round((tmp1 + tmp2) >> 5)) for 16 pixels.
// Reads 16 int16 intermediates from each of [x2] (tmp1) and [x3] (tmp2),
// post-incrementing both pointers by 32 bytes.
// \t0-\t3 are scratch vectors; \dst receives 16 packed output bytes.
.macro avg dst, t0, t1, t2, t3
        ld1             {\t0\().8h,\t1\().8h},   [x2],  32      // tmp1
        ld1             {\t2\().8h,\t3\().8h},   [x3],  32      // tmp2
        add             \t0\().8h,   \t0\().8h,   \t2\().8h
        add             \t1\().8h,   \t1\().8h,   \t3\().8h
        sqrshrun        \dst\().8b,  \t0\().8h,   #5            // rounding narrow; >>5 folds the /2
        sqrshrun2       \dst\().16b, \t1\().8h,   #5            // average into the intermediate scaling
.endm
40
// w_avg: weighted average of the two intermediate buffers,
// effectively dst = sat_u8(round((w*tmp1 + (16-w)*tmp2) >> 8)),
// computed as tmp2 - (tmp2 - tmp1)*w/16 via sqdmulh against
// v30 = -(weight << 11) (set up by bidir_fn).  Same I/O as avg.
.macro w_avg dst, t0, t1, t2, t3
        ld1             {\t0\().8h,\t1\().8h},   [x2],  32      // tmp1
        ld1             {\t2\().8h,\t3\().8h},   [x3],  32      // tmp2
        sub             \t0\().8h,   \t2\().8h,   \t0\().8h     // tmp2 - tmp1
        sub             \t1\().8h,   \t3\().8h,   \t1\().8h
        sqdmulh         \t0\().8h,   \t0\().8h,   v30.8h        // (2*a*-(w<<11))>>16 = -(a*w)>>4
        sqdmulh         \t1\().8h,   \t1\().8h,   v30.8h
        add             \t0\().8h,   \t2\().8h,   \t0\().8h     // tmp2 - (tmp2-tmp1)*w/16
        add             \t1\().8h,   \t3\().8h,   \t1\().8h
        sqrshrun        \dst\().8b,  \t0\().8h,   #4
        sqrshrun2       \dst\().16b, \t1\().8h,   #4
.endm
53
// mask: per-pixel blend of the two intermediate buffers using a 6-bit
// mask m (0..64) read from [x6]:
// effectively dst = sat_u8(round((m*tmp1 + (64-m)*tmp2) >> 10)).
// v31 = 254 (set up by bidir_fn), so m*254 == -2*m (mod 256); shll #8
// widens that to -(m << 9) as s16, the sqdmulh multiplier.
.macro mask dst, t0, t1, t2, t3
        ld1             {v30.16b}, [x6],  16                    // 16 mask bytes m (0..64)
        ld1             {\t0\().8h,\t1\().8h},   [x2],  32      // tmp1
        mul             v30.16b, v30.16b, v31.16b               // m*254 == -2*m (mod 256)
        ld1             {\t2\().8h,\t3\().8h},   [x3],  32      // tmp2
        shll            v28.8h, v30.8b,  #8                     // -(m << 9) as s16
        shll2           v29.8h, v30.16b, #8
        sub             \t0\().8h,   \t2\().8h,   \t0\().8h     // tmp2 - tmp1
        sub             \t1\().8h,   \t3\().8h,   \t1\().8h
        sqdmulh         \t0\().8h,   \t0\().8h,   v28.8h        // (2*a*-(m<<9))>>16 = -(a*m)>>6
        sqdmulh         \t1\().8h,   \t1\().8h,   v29.8h
        add             \t0\().8h,   \t2\().8h,   \t0\().8h     // (m*tmp1 + (64-m)*tmp2)/64
        add             \t1\().8h,   \t3\().8h,   \t1\().8h
        sqrshrun        \dst\().8b,  \t0\().8h,   #4
        sqrshrun2       \dst\().16b, \t1\().8h,   #4
.endm
70
// bidir_fn: emit one bidirectional compound prediction function; \type is
// avg, w_avg or mask and names the per-16-pixel macro above that does the
// arithmetic (keeping the tmp1/tmp2/mask pointers advancing as it goes).
//
// Register arguments:
//   x0  dst pixel pointer            x1  dst stride in bytes
//   x2  tmp1 (int16 intermediates)   x3  tmp2 (int16 intermediates)
//   w4  block width (pow2, 4..128)   w5  block height
//   w6  weight (w_avg only)          x6  mask pointer (mask only)
// Dispatches on width via a relative-offset jump table; each width case
// loops over rows, decrementing w5.
.macro bidir_fn type
function \type\()_8bpc_neon, export=1
        clz             w4,  w4
.ifc \type, w_avg
        dup             v30.8h, w6
        neg             v30.8h, v30.8h
        shl             v30.8h, v30.8h, #11     // v30 = -(weight << 11), see w_avg macro
.endif
.ifc \type, mask
        movi            v31.16b, #256-2         // 254, mask-byte scale for the mask macro
.endif
        adr             x7,  L(\type\()_tbl)
        sub             w4,  w4,  #24           // clz(w) 24..29 -> table index 0..5 (w=128..4)
        ldrh            w4,  [x7, x4, lsl #1]
        \type           v4,  v0,  v1,  v2,  v3  // compute the first 16 output pixels
        sub             x7,  x7,  w4, uxtw
        br              x7
40:     // w == 4: v4 holds 4 rows; interleave stores over two row pointers
        add             x7,  x0,  x1
        lsl             x1,  x1,  #1
4:
        cmp             w5,  #4
        st1             {v4.s}[0],  [x0], x1
        st1             {v4.s}[1],  [x7], x1
        st1             {v4.s}[2],  [x0], x1
        st1             {v4.s}[3],  [x7], x1
        b.eq            0f                      // h == 4: done
        \type           v5,  v0,  v1,  v2,  v3
        cmp             w5,  #8
        st1             {v5.s}[0],  [x0], x1
        st1             {v5.s}[1],  [x7], x1
        st1             {v5.s}[2],  [x0], x1
        st1             {v5.s}[3],  [x7], x1
        b.eq            0f                      // h == 8: done
        \type           v4,  v0,  v1,  v2,  v3  // remaining 8 rows (h == 16)
        st1             {v4.s}[0],  [x0], x1
        st1             {v4.s}[1],  [x7], x1
        \type           v5,  v0,  v1,  v2,  v3
        st1             {v4.s}[2],  [x0], x1
        st1             {v4.s}[3],  [x7], x1
        st1             {v5.s}[0],  [x0], x1
        st1             {v5.s}[1],  [x7], x1
        st1             {v5.s}[2],  [x0], x1
        st1             {v5.s}[3],  [x7], x1
        ret
80:     // w == 8: one vector holds 2 rows; 4 rows per iteration
        add             x7,  x0,  x1
        lsl             x1,  x1,  #1
8:
        st1             {v4.d}[0],  [x0], x1
        \type           v5,  v0,  v1,  v2,  v3
        st1             {v4.d}[1],  [x7], x1
        st1             {v5.d}[0],  [x0], x1
        subs            w5,  w5,  #4
        st1             {v5.d}[1],  [x7], x1
        b.le            0f
        \type           v4,  v0,  v1,  v2,  v3
        b               8b
16:     // w == 16: one vector per row; 4 rows per iteration
        \type           v5,  v0,  v1,  v2,  v3
        st1             {v4.16b}, [x0], x1
        \type           v6,  v0,  v1,  v2,  v3
        st1             {v5.16b}, [x0], x1
        \type           v7,  v0,  v1,  v2,  v3
        st1             {v6.16b}, [x0], x1
        subs            w5,  w5,  #4
        st1             {v7.16b}, [x0], x1
        b.le            0f
        \type           v4,  v0,  v1,  v2,  v3
        b               16b
320:    // w == 32: 2 rows per iteration via the x0/x7 pair, doubled stride
        add             x7,  x0,  x1
        lsl             x1,  x1,  #1
32:
        \type           v5,  v0,  v1,  v2,  v3
        \type           v6,  v0,  v1,  v2,  v3
        st1             {v4.16b,v5.16b}, [x0], x1
        \type           v7,  v0,  v1,  v2,  v3
        subs            w5,  w5,  #2
        st1             {v6.16b,v7.16b}, [x7], x1
        b.le            0f
        \type           v4,  v0,  v1,  v2,  v3
        b               32b
640:    // w == 64: 2 rows per iteration, 64 bytes each
        add             x7,  x0,  x1
        lsl             x1,  x1,  #1
64:
        \type           v5,  v0,  v1,  v2,  v3
        \type           v6,  v0,  v1,  v2,  v3
        \type           v7,  v0,  v1,  v2,  v3
        \type           v16, v0,  v1,  v2,  v3
        \type           v17, v0,  v1,  v2,  v3
        st1             {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
        \type           v18, v0,  v1,  v2,  v3
        \type           v19, v0,  v1,  v2,  v3
        subs            w5,  w5,  #2
        st1             {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
        b.le            0f
        \type           v4, v0,  v1,  v2,  v3
        b               64b
1280:   // w == 128: one row per iteration, split 64+64 bytes over x0/x7
        add             x7,  x0,  #64
128:
        \type           v5,  v0,  v1,  v2,  v3
        \type           v6,  v0,  v1,  v2,  v3
        \type           v7,  v0,  v1,  v2,  v3
        \type           v16, v0,  v1,  v2,  v3
        \type           v17, v0,  v1,  v2,  v3
        st1             {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
        \type           v18, v0,  v1,  v2,  v3
        \type           v19, v0,  v1,  v2,  v3
        subs            w5,  w5,  #1
        st1             {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
        b.le            0f
        \type           v4, v0,  v1,  v2,  v3
        b               128b
0:
        ret
L(\type\()_tbl):
        // Offsets of each width handler, indexed by clz(w)-24 (w=128 first).
        .hword L(\type\()_tbl) - 1280b
        .hword L(\type\()_tbl) -  640b
        .hword L(\type\()_tbl) -  320b
        .hword L(\type\()_tbl) -   16b
        .hword L(\type\()_tbl) -   80b
        .hword L(\type\()_tbl) -   40b
endfunc
.endm
198
// Instantiate the three 8bpc compound prediction functions.
bidir_fn avg
bidir_fn w_avg
bidir_fn mask
202
203
// w_mask_fn: emit w_mask_444/422/420.  Blends the two int16 intermediate
// buffers with a mask m derived from their difference, and also stores the
// mask, downsampled to the chroma layout given by \type (444: full
// resolution, 422: halved horizontally, 420: halved in both directions).
//
// The working value kept in registers is (64 - m) = clamp(6903 - |tmp1 -
// tmp2|, >= 0) >> 8, which lies in [0, 26] (so m is in [38, 64]).  The
// blend itself is then
//     dst = sat_u8(round((tmp1 + (tmp2 - tmp1)*(64-m)/64) >> 4))
//         = effectively (m*tmp1 + (64-m)*tmp2) at the intermediate scale.
//
// Register arguments:
//   x0  dst               x1  dst stride (bytes)
//   x2  tmp1              x3  tmp2
//   w4  width             w5  height
//   x6  mask output       w7  sign (rounding bias for 422/420 averaging)
.macro w_mask_fn type
function w_mask_\type\()_8bpc_neon, export=1
        clz             w8,  w4
        adr             x9,  L(w_mask_\type\()_tbl)
        sub             w8,  w8,  #24              // width -> jump-table index
        ldrh            w8,  [x9,  x8,  lsl #1]
        sub             x9,  x9,  w8,  uxtw
        mov             w10, #6903                 // (6903 - |diff|) >> 8 gives 64 - m
        dup             v0.8h,   w10
.if \type == 444
        movi            v1.16b,  #64               // mask out: m = 64 - (64 - m)
.elseif \type == 422
        dup             v2.8b,   w7
        movi            v3.8b,   #129
        sub             v3.8b,   v3.8b,   v2.8b    // 129 - sign, for the horizontal pair average
.elseif \type == 420
        dup             v2.8h,   w7
        movi            v3.8h,   #1, lsl #8        // 256
        sub             v3.8h,   v3.8h,   v2.8h    // 256 - sign, for the 2x2 average
.endif
        add             x12,  x0,  x1
        lsl             x1,   x1,  #1              // two row pointers x0/x12, doubled stride
        br              x9
4:
        ld1             {v4.8h,   v5.8h},   [x2],  #32  // tmp1 (four rows at once)
        ld1             {v6.8h,   v7.8h},   [x3],  #32  // tmp2 (four rows at once)
        subs            w5,  w5,  #4
        sub             v16.8h,  v6.8h,   v4.8h    // tmp2 - tmp1
        sub             v17.8h,  v7.8h,   v5.8h
        sabd            v18.8h,  v4.8h,   v6.8h    // |tmp1 - tmp2|
        sabd            v19.8h,  v5.8h,   v7.8h
        uqsub           v18.8h,  v0.8h,   v18.8h   // max(6903 - |diff|, 0)
        uqsub           v19.8h,  v0.8h,   v19.8h
        ushr            v18.8h,  v18.8h,  #8       // 64 - m, in [0, 26]
        ushr            v19.8h,  v19.8h,  #8
        shl             v20.8h,  v18.8h,  #9       // (64 - m) << 9 as sqdmulh multiplier
        shl             v21.8h,  v19.8h,  #9
        sqdmulh         v20.8h,  v20.8h,  v16.8h   // (tmp2 - tmp1)*(64 - m) >> 6
        sqdmulh         v21.8h,  v21.8h,  v17.8h
        add             v20.8h,  v20.8h,  v4.8h    // == (m*tmp1 + (64-m)*tmp2)/64
        add             v21.8h,  v21.8h,  v5.8h
        sqrshrun        v22.8b,  v20.8h,  #4
        sqrshrun        v23.8b,  v21.8h,  #4
.if \type == 444
        xtn             v18.8b,   v18.8h
        xtn2            v18.16b,  v19.8h
        sub             v18.16b,  v1.16b,  v18.16b // m = 64 - (64 - m)
        st1             {v18.16b}, [x6],  #16
.elseif \type == 422
        addp            v18.8h,   v18.8h,  v19.8h  // sum horizontal pairs
        xtn             v18.8b,   v18.8h
        uhsub           v18.8b,   v3.8b,   v18.8b  // (m0 + m1 + 1 - sign) >> 1
        st1             {v18.8b},  [x6],  #8
.elseif \type == 420
        trn1            v24.2d,   v18.2d,  v19.2d  // pair up rows 0/2 and 1/3 of the 4-wide rows
        trn2            v25.2d,   v18.2d,  v19.2d
        add             v24.8h,   v24.8h,  v25.8h  // vertical sums
        addp            v18.8h,   v24.8h,  v24.8h  // + horizontal pair sums -> 2x2 totals
        sub             v18.4h,   v3.4h,   v18.4h
        rshrn           v18.8b,   v18.8h,  #2      // (sum(m) + 2 - sign) >> 2
        st1             {v18.s}[0],  [x6],  #4
.endif
        st1             {v22.s}[0],  [x0],  x1
        st1             {v22.s}[1],  [x12], x1
        st1             {v23.s}[0],  [x0],  x1
        st1             {v23.s}[1],  [x12], x1
        b.gt            4b
        ret
8:
        ld1             {v4.8h,   v5.8h},   [x2],  #32     // tmp1, two 8-wide rows
        ld1             {v6.8h,   v7.8h},   [x3],  #32     // tmp2
        subs            w5,  w5,  #2
        sub             v16.8h,  v6.8h,   v4.8h            // tmp2 - tmp1
        sub             v17.8h,  v7.8h,   v5.8h
        sabd            v18.8h,  v4.8h,   v6.8h            // |tmp1 - tmp2|
        sabd            v19.8h,  v5.8h,   v7.8h
        uqsub           v18.8h,  v0.8h,   v18.8h
        uqsub           v19.8h,  v0.8h,   v19.8h
        ushr            v18.8h,  v18.8h,  #8               // 64 - m
        ushr            v19.8h,  v19.8h,  #8
        shl             v20.8h,  v18.8h,  #9
        shl             v21.8h,  v19.8h,  #9
        sqdmulh         v20.8h,  v20.8h,  v16.8h
        sqdmulh         v21.8h,  v21.8h,  v17.8h
        add             v20.8h,  v20.8h,  v4.8h
        add             v21.8h,  v21.8h,  v5.8h
        sqrshrun        v22.8b,  v20.8h,  #4
        sqrshrun        v23.8b,  v21.8h,  #4
.if \type == 444
        xtn             v18.8b,  v18.8h
        xtn2            v18.16b, v19.8h
        sub             v18.16b, v1.16b,  v18.16b
        st1             {v18.16b}, [x6],  #16
.elseif \type == 422
        addp            v18.8h,  v18.8h,  v19.8h
        xtn             v18.8b,  v18.8h
        uhsub           v18.8b,  v3.8b,   v18.8b
        st1             {v18.8b},  [x6],  #8
.elseif \type == 420
        add             v18.8h,  v18.8h,  v19.8h           // vertical sum of the two rows
        addp            v18.8h,  v18.8h,  v18.8h           // + horizontal pair sums
        sub             v18.4h,  v3.4h,   v18.4h
        rshrn           v18.8b,  v18.8h,  #2
        st1             {v18.s}[0],  [x6],  #4
.endif
        st1             {v22.8b},  [x0],  x1
        st1             {v23.8b},  [x12], x1
        b.gt            8b
        ret
1280:
640:
320:
160:    // w >= 16: two rows per outer iteration, 16 columns per inner step
        mov             w11, w4
        sub             x1,  x1,  w4,  uxtw        // stride minus the bytes stored per row
.if \type == 444
        add             x10, x6,  w4,  uxtw        // mask output pointer for the second row
.elseif \type == 422
        add             x10, x6,  x11, lsr #1
.endif
        add             x9,  x3,  w4,  uxtw #1     // tmp2, second row
        add             x7,  x2,  w4,  uxtw #1     // tmp1, second row
161:    // row-pair loop
        mov             w8,  w4                    // column counter
16:     // column loop: 16 pixels of both rows
        ld1             {v4.8h,   v5.8h},   [x2],  #32
        ld1             {v6.8h,   v7.8h},   [x3],  #32
        ld1             {v16.8h,  v17.8h},  [x7],  #32
        ld1             {v18.8h,  v19.8h},  [x9],  #32
        subs            w8,  w8,  #16
        sub             v6.8h,   v6.8h,   v4.8h    // tmp2 - tmp1, row 0
        sub             v7.8h,   v7.8h,   v5.8h
        sub             v18.8h,  v18.8h,  v16.8h   // tmp2 - tmp1, row 1
        sub             v19.8h,  v19.8h,  v17.8h
        abs             v20.8h,  v6.8h             // |diff|
        abs             v21.8h,  v7.8h
        abs             v22.8h,  v18.8h
        abs             v23.8h,  v19.8h
        uqsub           v20.8h,  v0.8h,   v20.8h   // max(6903 - |diff|, 0)
        uqsub           v21.8h,  v0.8h,   v21.8h
        uqsub           v22.8h,  v0.8h,   v22.8h
        uqsub           v23.8h,  v0.8h,   v23.8h
        ushr            v20.8h,  v20.8h,  #8       // 64 - m
        ushr            v21.8h,  v21.8h,  #8
        ushr            v22.8h,  v22.8h,  #8
        ushr            v23.8h,  v23.8h,  #8
        shl             v24.8h,  v20.8h,  #9
        shl             v25.8h,  v21.8h,  #9
        shl             v26.8h,  v22.8h,  #9
        shl             v27.8h,  v23.8h,  #9
        sqdmulh         v24.8h,  v24.8h,  v6.8h
        sqdmulh         v25.8h,  v25.8h,  v7.8h
        sqdmulh         v26.8h,  v26.8h,  v18.8h
        sqdmulh         v27.8h,  v27.8h,  v19.8h
        add             v24.8h,  v24.8h,  v4.8h
        add             v25.8h,  v25.8h,  v5.8h
        add             v26.8h,  v26.8h,  v16.8h
        add             v27.8h,  v27.8h,  v17.8h
        sqrshrun        v24.8b,  v24.8h,  #4
        sqrshrun        v25.8b,  v25.8h,  #4
        sqrshrun        v26.8b,  v26.8h,  #4
        sqrshrun        v27.8b,  v27.8h,  #4
.if \type == 444
        xtn             v20.8b,  v20.8h
        xtn2            v20.16b, v21.8h
        xtn             v21.8b,  v22.8h
        xtn2            v21.16b, v23.8h
        sub             v20.16b, v1.16b,  v20.16b
        sub             v21.16b, v1.16b,  v21.16b
        st1             {v20.16b}, [x6],  #16
        st1             {v21.16b}, [x10], #16
.elseif \type == 422
        addp            v20.8h,  v20.8h,  v21.8h
        addp            v21.8h,  v22.8h,  v23.8h
        xtn             v20.8b,  v20.8h
        xtn             v21.8b,  v21.8h
        uhsub           v20.8b,  v3.8b,   v20.8b
        uhsub           v21.8b,  v3.8b,   v21.8b
        st1             {v20.8b},  [x6],  #8
        st1             {v21.8b},  [x10], #8
.elseif \type == 420
        add             v20.8h,  v20.8h,  v22.8h   // vertical sums across the row pair
        add             v21.8h,  v21.8h,  v23.8h
        addp            v20.8h,  v20.8h,  v21.8h   // + horizontal pair sums
        sub             v20.8h,  v3.8h,   v20.8h
        rshrn           v20.8b,  v20.8h,  #2
        st1             {v20.8b},  [x6],  #8
.endif
        st1             {v24.8b,  v25.8b},  [x0],  #16
        st1             {v26.8b,  v27.8b},  [x12], #16
        b.gt            16b
        subs            w5,  w5,  #2
        add             x2,  x2,  w4,  uxtw #1     // skip the row already consumed via x7/x9
        add             x3,  x3,  w4,  uxtw #1
        add             x7,  x7,  w4,  uxtw #1
        add             x9,  x9,  w4,  uxtw #1
.if \type == 444
        add             x6,  x6,  w4,  uxtw        // skip the mask row written via x10
        add             x10, x10, w4,  uxtw
.elseif \type == 422
        add             x6,  x6,  x11, lsr #1
        add             x10, x10, x11, lsr #1
.endif
        add             x0,  x0,  x1
        add             x12, x12, x1
        b.gt            161b
        ret
L(w_mask_\type\()_tbl):
        // Offsets of each width handler, indexed by clz(w)-24 (w=128 first).
        .hword L(w_mask_\type\()_tbl) - 1280b
        .hword L(w_mask_\type\()_tbl) -  640b
        .hword L(w_mask_\type\()_tbl) -  320b
        .hword L(w_mask_\type\()_tbl) -  160b
        .hword L(w_mask_\type\()_tbl) -    8b
        .hword L(w_mask_\type\()_tbl) -    4b
endfunc
.endm
420
// Instantiate w_mask for each chroma subsampling layout.
w_mask_fn 444
w_mask_fn 422
w_mask_fn 420
424
425
// blend_8bpc_neon: masked blend of a prediction buffer over dst,
//     dst = round((mask*tmp + (64 - mask)*dst) >> 6),  mask in 0..64.
//
// Register arguments:
//   x0  dst             x1  dst stride (bytes)
//   x2  tmp (pixels)    w3  width (4..32)
//   w4  height          x5  mask pointer (one byte per pixel)
// Processes two rows per iteration via the x0/x8 pointer pair with
// doubled stride; dispatches on width via a jump table.
function blend_8bpc_neon, export=1
        adr             x6,  L(blend_tbl)
        clz             w3,  w3
        sub             w3,  w3,  #26              // clz(w) 27..29 -> table index
        ldrh            w3,  [x6,  x3,  lsl #1]
        sub             x6,  x6,  w3,  uxtw
        movi            v4.16b,  #64
        add             x8,  x0,  x1
        lsl             x1,  x1,  #1
        br              x6
4:      // w == 4: two rows per 8-byte vector
        ld1             {v2.8b},     [x5],  #8
        ld1             {v1.d}[0],   [x2],  #8
        ld1             {v0.s}[0],   [x0]
        subs            w4,  w4,  #2
        ld1             {v0.s}[1],   [x8]
        sub             v3.8b,   v4.8b,   v2.8b    // 64 - mask
        umull           v5.8h,   v1.8b,   v2.8b    // tmp*mask
        umlal           v5.8h,   v0.8b,   v3.8b    // + dst*(64 - mask)
        rshrn           v6.8b,   v5.8h,   #6
        st1             {v6.s}[0],   [x0],  x1
        st1             {v6.s}[1],   [x8],  x1
        b.gt            4b
        ret
8:      // w == 8: two rows per 16-byte vector
        ld1             {v2.16b},  [x5],  #16
        ld1             {v1.16b},  [x2],  #16
        ld1             {v0.d}[0],   [x0]
        ld1             {v0.d}[1],   [x8]
        sub             v3.16b,  v4.16b,  v2.16b
        subs            w4,  w4,  #2
        umull           v5.8h,   v1.8b,   v2.8b
        umlal           v5.8h,   v0.8b,   v3.8b
        umull2          v6.8h,   v1.16b,  v2.16b
        umlal2          v6.8h,   v0.16b,  v3.16b
        rshrn           v7.8b,   v5.8h,   #6
        rshrn2          v7.16b,  v6.8h,   #6
        st1             {v7.d}[0],   [x0],  x1
        st1             {v7.d}[1],   [x8],  x1
        b.gt            8b
        ret
16:     // w == 16: one vector per row, two rows per iteration
        ld1             {v1.16b,  v2.16b},  [x5],  #32
        ld1             {v5.16b,  v6.16b},  [x2],  #32
        ld1             {v0.16b},  [x0]
        subs            w4,  w4,  #2
        sub             v7.16b,  v4.16b,  v1.16b
        sub             v20.16b, v4.16b,  v2.16b
        ld1             {v3.16b},  [x8]
        umull           v16.8h,  v5.8b,   v1.8b
        umlal           v16.8h,  v0.8b,   v7.8b
        umull2          v17.8h,  v5.16b,  v1.16b
        umlal2          v17.8h,  v0.16b,  v7.16b
        umull           v21.8h,  v6.8b,   v2.8b
        umlal           v21.8h,  v3.8b,   v20.8b
        umull2          v22.8h,  v6.16b,  v2.16b
        umlal2          v22.8h,  v3.16b,  v20.16b
        rshrn           v18.8b,  v16.8h,  #6
        rshrn2          v18.16b, v17.8h,  #6
        rshrn           v19.8b,  v21.8h,  #6
        rshrn2          v19.16b, v22.8h,  #6
        st1             {v18.16b}, [x0],  x1
        st1             {v19.16b}, [x8],  x1
        b.gt            16b
        ret
32:     // w == 32: two vectors per row, two rows per iteration
        ld1             {v0.16b,  v1.16b,  v2.16b,  v3.16b},  [x5],  #64
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2],  #64
        ld1             {v20.16b, v21.16b}, [x0]
        subs            w4,  w4,  #2
        ld1             {v22.16b, v23.16b}, [x8]
        sub             v5.16b,  v4.16b,  v0.16b
        sub             v6.16b,  v4.16b,  v1.16b
        sub             v30.16b, v4.16b,  v2.16b
        sub             v31.16b, v4.16b,  v3.16b
        umull           v24.8h,  v16.8b,  v0.8b
        umlal           v24.8h,  v20.8b,  v5.8b
        umull2          v26.8h,  v16.16b, v0.16b
        umlal2          v26.8h,  v20.16b, v5.16b
        umull           v28.8h,  v17.8b,  v1.8b
        umlal           v28.8h,  v21.8b,  v6.8b
        umull2          v7.8h,   v17.16b, v1.16b
        umlal2          v7.8h,   v21.16b, v6.16b
        umull           v27.8h,  v18.8b,  v2.8b
        umlal           v27.8h,  v22.8b,  v30.8b
        umull2          v1.8h,   v18.16b, v2.16b
        umlal2          v1.8h,   v22.16b, v30.16b
        umull           v29.8h,  v19.8b,  v3.8b
        umlal           v29.8h,  v23.8b,  v31.8b
        umull2          v21.8h,  v19.16b, v3.16b
        umlal2          v21.8h,  v23.16b, v31.16b
        rshrn           v24.8b,  v24.8h,  #6
        rshrn2          v24.16b, v26.8h,  #6
        rshrn           v25.8b,  v28.8h,  #6
        rshrn2          v25.16b, v7.8h,   #6
        rshrn           v27.8b,  v27.8h,  #6
        rshrn2          v27.16b, v1.8h,   #6
        rshrn           v28.8b,  v29.8h,  #6
        rshrn2          v28.16b, v21.8h,  #6
        st1             {v24.16b, v25.16b}, [x0],  x1
        st1             {v27.16b, v28.16b}, [x8],  x1
        b.gt            32b
        ret
L(blend_tbl):
        .hword L(blend_tbl) - 32b
        .hword L(blend_tbl) - 16b
        .hword L(blend_tbl) -  8b
        .hword L(blend_tbl) -  4b
endfunc
535
// blend_h_8bpc_neon: OBMC blend along a horizontal edge.  Each row uses a
// single coefficient from the obmc_masks[] table (indexed starting at
// obmc_masks + h):
//     dst = round((mask*tmp + (64 - mask)*dst) >> 6)
// Only the first h - h/4 rows are processed; the rest keep dst unchanged.
//
// Register arguments:
//   x0  dst             x1  dst stride (bytes)
//   x2  tmp (pixels)    w3  width (2..128)
//   w4  height          (x5 is loaded internally with the mask table)
function blend_h_8bpc_neon, export=1
        adr             x6,  L(blend_h_tbl)
        movrel          x5,  X(obmc_masks)
        add             x5,  x5,  w4,  uxtw        // x5 = &obmc_masks[h]
        sub             w4,  w4,  w4,  lsr #2      // rows to blend = h - h/4
        clz             w7,  w3
        movi            v4.16b,  #64
        add             x8,  x0,  x1               // second-row pointer
        lsl             x1,  x1,  #1
        sub             w7,  w7,  #24
        ldrh            w7,  [x6,  x7,  lsl #1]
        sub             x6,  x6,  w7, uxtw
        br              x6
2:      // w == 2
        ld1             {v0.h}[0],   [x5],  #2     // two row coefficients
        ld1             {v1.s}[0],   [x2],  #4
        subs            w4,  w4,  #2
        ld1             {v2.h}[0],   [x0]
        zip1            v0.8b,   v0.8b,   v0.8b    // duplicate each coeff across its row
        sub             v3.8b,   v4.8b,   v0.8b    // 64 - mask
        ld1             {v2.h}[1],   [x8]
        umull           v5.8h,   v1.8b,   v0.8b    // tmp*mask
        umlal           v5.8h,   v2.8b,   v3.8b    // + dst*(64 - mask)
        rshrn           v5.8b,   v5.8h,   #6
        st1             {v5.h}[0],   [x0],  x1
        st1             {v5.h}[1],   [x8],  x1
        b.gt            2b
        ret
4:      // w == 4
        ld2r            {v0.8b,   v1.8b},   [x5],  #2  // broadcast the two row coeffs
        ld1             {v2.8b},   [x2],  #8
        subs            w4,  w4,  #2
        ext             v0.8b,   v0.8b,   v1.8b,   #4  // low half row 0, high half row 1
        ld1             {v3.s}[0],   [x0]
        sub             v5.8b,   v4.8b,   v0.8b
        ld1             {v3.s}[1],   [x8]
        umull           v6.8h,   v2.8b,   v0.8b
        umlal           v6.8h,   v3.8b,   v5.8b
        rshrn           v6.8b,   v6.8h,   #6
        st1             {v6.s}[0],   [x0],  x1
        st1             {v6.s}[1],   [x8],  x1
        b.gt            4b
        ret
8:      // w == 8
        ld2r            {v0.16b,  v1.16b},  [x5],  #2
        ld1             {v2.16b},  [x2],  #16
        ld1             {v3.d}[0],   [x0]
        ext             v0.16b,  v0.16b,  v1.16b,  #8
        sub             v5.16b,  v4.16b,  v0.16b
        ld1             {v3.d}[1],   [x8]
        subs            w4,  w4,  #2
        umull           v6.8h,   v0.8b,   v2.8b
        umlal           v6.8h,   v3.8b,   v5.8b
        umull2          v7.8h,   v0.16b,  v2.16b
        umlal2          v7.8h,   v3.16b,  v5.16b
        rshrn           v16.8b,  v6.8h,   #6
        rshrn2          v16.16b, v7.8h,   #6
        st1             {v16.d}[0],  [x0],  x1
        st1             {v16.d}[1],  [x8],  x1
        b.gt            8b
        ret
16:     // w == 16: v0 = row-0 coeff, v1 = row-1 coeff
        ld2r            {v0.16b,  v1.16b},  [x5],  #2
        ld1             {v2.16b,  v3.16b},  [x2],  #32
        ld1             {v5.16b},  [x0]
        sub             v7.16b,  v4.16b,  v0.16b
        sub             v16.16b, v4.16b,  v1.16b
        ld1             {v6.16b},  [x8]
        subs            w4,  w4,  #2
        umull           v17.8h,  v0.8b,   v2.8b
        umlal           v17.8h,  v5.8b,   v7.8b
        umull2          v18.8h,  v0.16b,  v2.16b
        umlal2          v18.8h,  v5.16b,  v7.16b
        umull           v19.8h,  v1.8b,   v3.8b
        umlal           v19.8h,  v6.8b,   v16.8b
        umull2          v20.8h,  v1.16b,  v3.16b
        umlal2          v20.8h,  v6.16b,  v16.16b
        rshrn           v21.8b,  v17.8h,  #6
        rshrn2          v21.16b, v18.8h,  #6
        rshrn           v22.8b,  v19.8h,  #6
        rshrn2          v22.16b, v20.8h,  #6
        st1             {v21.16b}, [x0],  x1
        st1             {v22.16b}, [x8],  x1
        b.gt            16b
        ret
1280:
640:
320:    // w >= 32: inner loop over 32 columns, two rows per outer iteration
        sub             x1,  x1,  w3,  uxtw        // stride minus bytes stored per row
        add             x7,  x2,  w3,  uxtw        // tmp pointer for the second row
321:    // row-pair loop
        ld2r            {v0.16b,  v1.16b},  [x5],  #2
        mov             w6,  w3                    // column counter
        sub             v20.16b, v4.16b,  v0.16b
        sub             v21.16b, v4.16b,  v1.16b
32:     // column loop
        ld1             {v16.16b, v17.16b}, [x2],  #32
        ld1             {v2.16b,  v3.16b},  [x0]
        subs            w6,  w6,  #32
        umull           v23.8h,  v0.8b,   v16.8b
        umlal           v23.8h,  v2.8b,   v20.8b
        ld1             {v18.16b, v19.16b}, [x7],  #32
        umull2          v27.8h,  v0.16b,  v16.16b
        umlal2          v27.8h,  v2.16b,  v20.16b
        ld1             {v6.16b,  v7.16b},  [x8]
        umull           v24.8h,  v0.8b,   v17.8b
        umlal           v24.8h,  v3.8b,   v20.8b
        umull2          v28.8h,  v0.16b,  v17.16b
        umlal2          v28.8h,  v3.16b,  v20.16b
        umull           v25.8h,  v1.8b,   v18.8b
        umlal           v25.8h,  v6.8b,   v21.8b
        umull2          v5.8h,   v1.16b,  v18.16b
        umlal2          v5.8h,   v6.16b,  v21.16b
        rshrn           v29.8b,  v23.8h,  #6
        rshrn2          v29.16b, v27.8h,  #6
        umull           v26.8h,  v1.8b,   v19.8b
        umlal           v26.8h,  v7.8b,   v21.8b
        umull2          v31.8h,  v1.16b,  v19.16b
        umlal2          v31.8h,  v7.16b,  v21.16b
        rshrn           v30.8b,  v24.8h,  #6
        rshrn2          v30.16b, v28.8h,  #6
        rshrn           v23.8b,  v25.8h,  #6
        rshrn2          v23.16b, v5.8h,   #6
        rshrn           v24.8b,  v26.8h,  #6
        st1             {v29.16b, v30.16b}, [x0],  #32
        rshrn2          v24.16b, v31.8h,  #6
        st1             {v23.16b, v24.16b}, [x8],  #32
        b.gt            32b
        subs            w4,  w4,  #2
        add             x0,  x0,  x1
        add             x8,  x8,  x1
        add             x2,  x2,  w3,  uxtw        // skip the row consumed via x7
        add             x7,  x7,  w3,  uxtw
        b.gt            321b
        ret
L(blend_h_tbl):
        .hword L(blend_h_tbl) - 1280b
        .hword L(blend_h_tbl) -  640b
        .hword L(blend_h_tbl) -  320b
        .hword L(blend_h_tbl) -   16b
        .hword L(blend_h_tbl) -    8b
        .hword L(blend_h_tbl) -    4b
        .hword L(blend_h_tbl) -    2b
endfunc
680
681function blend_v_8bpc_neon, export=1
682        adr             x6,  L(blend_v_tbl)
683        movrel          x5,  X(obmc_masks)
684        add             x5,  x5,  w3,  uxtw
685        clz             w3,  w3
686        movi            v4.16b,  #64
687        add             x8,  x0,  x1
688        lsl             x1,  x1,  #1
689        sub             w3,  w3,  #26
690        ldrh            w3,  [x6,  x3,  lsl #1]
691        sub             x6,  x6,  w3,  uxtw
692        br              x6
69320:
694        ld1r            {v0.8b},   [x5]
695        sub             v1.8b,   v4.8b,   v0.8b
6962:
697        ld1             {v2.h}[0],   [x2],  #2
698        ld1             {v3.b}[0],   [x0]
699        subs            w4,  w4,  #2
700        ld1             {v2.b}[1],   [x2]
701        ld1             {v3.b}[1],   [x8]
702        umull           v5.8h,   v2.8b,   v0.8b
703        umlal           v5.8h,   v3.8b,   v1.8b
704        rshrn           v5.8b,   v5.8h,   #6
705        add             x2,  x2,  #2
706        st1             {v5.b}[0],   [x0],  x1
707        st1             {v5.b}[1],   [x8],  x1
708        b.gt            2b
709        ret
71040:
711        ld1r            {v0.2s},   [x5]
712        sub             x1,  x1,  #2
713        sub             v1.8b,   v4.8b,   v0.8b
7144:
715        ld1             {v2.8b},   [x2],  #8
716        ld1             {v3.s}[0],   [x0]
717        ld1             {v3.s}[1],   [x8]
718        subs            w4,  w4,  #2
719        umull           v5.8h,   v2.8b,   v0.8b
720        umlal           v5.8h,   v3.8b,   v1.8b
721        rshrn           v5.8b,   v5.8h,   #6
722        st1             {v5.h}[0],   [x0],  #2
723        st1             {v5.h}[2],   [x8],  #2
724        st1             {v5.b}[2],   [x0],  x1
725        st1             {v5.b}[6],   [x8],  x1
726        b.gt            4b
727        ret
72880:
729        ld1r            {v0.2d},   [x5]
730        sub             x1,  x1,  #4
731        sub             v1.16b,  v4.16b,  v0.16b
7328:
733        ld1             {v2.16b},  [x2],  #16
734        ld1             {v3.d}[0],   [x0]
735        ld1             {v3.d}[1],   [x8]
736        subs            w4,  w4,  #2
737        umull           v5.8h,  v0.8b,  v2.8b
738        umlal           v5.8h,  v3.8b,  v1.8b
739        umull2          v6.8h,  v0.16b, v2.16b
740        umlal2          v6.8h,  v3.16b, v1.16b
741        rshrn           v7.8b,  v5.8h,  #6
742        rshrn2          v7.16b, v6.8h,  #6
743        st1             {v7.s}[0],   [x0],  #4
744        st1             {v7.s}[2],   [x8],  #4
745        st1             {v7.h}[2],   [x0],  x1
746        st1             {v7.h}[6],   [x8],  x1
747        b.gt            8b
748        ret
749160:
750        ld1             {v0.16b},  [x5]
751        sub             x1,  x1,  #8
752        sub             v2.16b,  v4.16b,  v0.16b
75316:
754        ld1             {v5.16b,  v6.16b},  [x2],  #32
755        ld1             {v7.16b},  [x0]
756        subs            w4,  w4,  #2
757        ld1             {v16.16b}, [x8]
758        umull           v17.8h,  v5.8b,   v0.8b
759        umlal           v17.8h,  v7.8b,   v2.8b
760        umull2          v18.8h,  v5.16b,  v0.16b
761        umlal2          v18.8h,  v7.16b,  v2.16b
762        umull           v20.8h,  v6.8b,   v0.8b
763        umlal           v20.8h,  v16.8b,  v2.8b
764        umull2          v21.8h,  v6.16b,  v0.16b
765        umlal2          v21.8h,  v16.16b, v2.16b
766        rshrn           v19.8b,  v17.8h,  #6
767        rshrn2          v19.16b, v18.8h,  #6
768        rshrn           v22.8b,  v20.8h,  #6
769        rshrn2          v22.16b, v21.8h,  #6
770        st1             {v19.8b},  [x0],  #8
771        st1             {v22.8b},  [x8],  #8
772        st1             {v19.s}[2],  [x0],  x1
773        st1             {v22.s}[2],  [x8],  x1
774        b.gt            16b
775        ret
776320:
777        ld1             {v0.16b,  v1.16b},  [x5]
778        sub             x1,  x1,  #16
779        sub             v2.16b,  v4.16b,  v0.16b
780        sub             v3.8b,   v4.8b,   v1.8b
78132:
782        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2],  #64
783        ld1             {v5.16b,  v6.16b},  [x0]
784        subs            w4,  w4,  #2
785        ld1             {v20.16b, v21.16b}, [x8]
786        umull           v22.8h,  v16.8b,  v0.8b
787        umlal           v22.8h,  v5.8b,   v2.8b
788        umull2          v23.8h,  v16.16b, v0.16b
789        umlal2          v23.8h,  v5.16b,  v2.16b
790        umull           v28.8h,  v17.8b,  v1.8b
791        umlal           v28.8h,  v6.8b,   v3.8b
792        umull           v30.8h,  v18.8b,  v0.8b
793        umlal           v30.8h,  v20.8b,  v2.8b
794        umull2          v31.8h,  v18.16b, v0.16b
795        umlal2          v31.8h,  v20.16b, v2.16b
796        umull           v25.8h,  v19.8b,  v1.8b
797        umlal           v25.8h,  v21.8b,  v3.8b
798        rshrn           v24.8b,  v22.8h,  #6
799        rshrn2          v24.16b, v23.8h,  #6
800        rshrn           v28.8b,  v28.8h,  #6
801        rshrn           v30.8b,  v30.8h,  #6
802        rshrn2          v30.16b, v31.8h,  #6
803        rshrn           v27.8b,  v25.8h,  #6
804        st1             {v24.16b}, [x0],  #16
805        st1             {v30.16b}, [x8],  #16
806        st1             {v28.8b},  [x0],  x1
807        st1             {v27.8b},  [x8],  x1
808        b.gt            32b
809        ret
810L(blend_v_tbl):
811        .hword L(blend_v_tbl) - 320b
812        .hword L(blend_v_tbl) - 160b
813        .hword L(blend_v_tbl) -  80b
814        .hword L(blend_v_tbl) -  40b
815        .hword L(blend_v_tbl) -  20b
816endfunc
817
818
819// This has got the same signature as the put_8tap functions,
820// and assumes that x8 is set to (clz(w)-24).
function put_neon
        // Plain (unfiltered) copy of a w x h block of 8-bit pixels.
        // Register roles (as used by the loads/stores below, following the
        // put_8tap signature): x0 = dst, x1 = dst stride, x2 = src,
        // x3 = src stride, w5 = h, x8 = (clz(w)-24) jump table index.
        adr             x9,  L(put_tbl)
        ldrh            w8,  [x9, x8, lsl #1]   // entries are offsets back from L(put_tbl)
        sub             x9,  x9,  w8, uxtw
        br              x9

2:      // w == 2: two rows of 2 bytes per iteration
        ld1             {v0.h}[0], [x2], x3
        ld1             {v1.h}[0], [x2], x3
        subs            w5,  w5,  #2
        st1             {v0.h}[0], [x0], x1
        st1             {v1.h}[0], [x0], x1
        b.gt            2b
        ret
4:      // w == 4: two rows of 4 bytes per iteration
        ld1             {v0.s}[0], [x2], x3
        ld1             {v1.s}[0], [x2], x3
        subs            w5,  w5,  #2
        st1             {v0.s}[0], [x0], x1
        st1             {v1.s}[0], [x0], x1
        b.gt            4b
        ret
8:      // w == 8: two rows of 8 bytes per iteration
        ld1             {v0.8b}, [x2], x3
        ld1             {v1.8b}, [x2], x3
        subs            w5,  w5,  #2
        st1             {v0.8b}, [x0], x1
        st1             {v1.8b}, [x0], x1
        b.gt            8b
        ret
160:    // w == 16: set up a second src/dst row pointer, double the strides
        add             x8,  x0,  x1
        lsl             x1,  x1,  #1
        add             x9,  x2,  x3
        lsl             x3,  x3,  #1
16:     // copy two 16-byte rows per iteration via the paired pointers
        ld1             {v0.16b}, [x2], x3
        ld1             {v1.16b}, [x9], x3
        subs            w5,  w5,  #2
        st1             {v0.16b}, [x0], x1
        st1             {v1.16b}, [x8], x1
        b.gt            16b
        ret
32:     // w == 32: one 32-byte row per iteration via GPR pairs
        ldp             x6,  x7,  [x2]
        ldp             x8,  x9,  [x2, #16]
        stp             x6,  x7,  [x0]
        subs            w5,  w5,  #1
        stp             x8,  x9,  [x0, #16]
        add             x2,  x2,  x3
        add             x0,  x0,  x1
        b.gt            32b
        ret
64:     // w == 64: one 64-byte row per iteration via GPR pairs
        ldp             x6,  x7,  [x2]
        ldp             x8,  x9,  [x2, #16]
        stp             x6,  x7,  [x0]
        ldp             x10, x11, [x2, #32]
        stp             x8,  x9,  [x0, #16]
        subs            w5,  w5,  #1
        ldp             x12, x13, [x2, #48]
        stp             x10, x11, [x0, #32]
        stp             x12, x13, [x0, #48]
        add             x2,  x2,  x3
        add             x0,  x0,  x1
        b.gt            64b
        ret
128:    // w == 128: one 128-byte row per iteration via q-register pairs
        ldp             q0,  q1,  [x2]
        ldp             q2,  q3,  [x2, #32]
        stp             q0,  q1,  [x0]
        ldp             q4,  q5,  [x2, #64]
        stp             q2,  q3,  [x0, #32]
        ldp             q6,  q7,  [x2, #96]
        subs            w5,  w5,  #1
        stp             q4,  q5,  [x0, #64]
        stp             q6,  q7,  [x0, #96]
        add             x2,  x2,  x3
        add             x0,  x0,  x1
        b.gt            128b
        ret

L(put_tbl):
        // Backward offsets from the table base, indexed by clz(w)-24
        // (largest width first).
        .hword L(put_tbl) - 128b
        .hword L(put_tbl) -  64b
        .hword L(put_tbl) -  32b
        .hword L(put_tbl) - 160b
        .hword L(put_tbl) -   8b
        .hword L(put_tbl) -   4b
        .hword L(put_tbl) -   2b
endfunc
912
913
914// This has got the same signature as the prep_8tap functions,
915// and assumes that x8 is set to (clz(w)-24), and x7 to w*2.
function prep_neon
        // Convert a w x h block of 8-bit pixels to the 16-bit intermediate
        // ("prep") format: each pixel is widened and shifted left by 4
        // (ushll #4), then stored row after row into the tmp buffer.
        // Register roles (per the prep_8tap signature, see comment above):
        // x0 = tmp dst, x1 = src, x2 = src stride, w3 = w, w4 = h,
        // x7 = w*2 (output row size in bytes), x8 = (clz(w)-24) table index.
        adr             x9,  L(prep_tbl)
        ldrh            w8,  [x9, x8, lsl #1]   // entries are offsets back from L(prep_tbl)
        sub             x9,  x9,  w8, uxtw
        br              x9

4:      // w == 4: two rows per iteration
        ld1             {v0.s}[0], [x1], x2
        ld1             {v1.s}[0], [x1], x2
        subs            w4,  w4,  #2
        ushll           v0.8h, v0.8b, #4        // pixel << 4
        ushll           v1.8h, v1.8b, #4
        st1             {v0.4h, v1.4h}, [x0], #16
        b.gt            4b
        ret
8:      // w == 8: two rows per iteration
        ld1             {v0.8b}, [x1], x2
        ld1             {v1.8b}, [x1], x2
        subs            w4,  w4,  #2
        ushll           v0.8h, v0.8b, #4
        ushll           v1.8h, v1.8b, #4
        st1             {v0.8h, v1.8h}, [x0], #32
        b.gt            8b
        ret
160:    // w == 16: second src row pointer, doubled src stride
        add             x9,  x1,  x2
        lsl             x2,  x2,  #1
16:     // two 16-pixel rows per iteration
        ld1             {v0.16b}, [x1], x2
        ld1             {v1.16b}, [x9], x2
        subs            w4,  w4,  #2
        ushll           v4.8h, v0.8b,  #4
        ushll2          v5.8h, v0.16b, #4
        ushll           v6.8h, v1.8b,  #4
        ushll2          v7.8h, v1.16b, #4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
        b.gt            16b
        ret
320:    // w == 32: x8 writes the second half of each output row (w bytes in)
        add             x8,  x0,  w3, uxtw
32:     // two rows per iteration, stores interleaved with the widening
        ld1             {v0.16b, v1.16b},  [x1], x2
        subs            w4,  w4,  #2
        ushll           v4.8h,  v0.8b,  #4
        ushll2          v5.8h,  v0.16b, #4
        ld1             {v2.16b, v3.16b},  [x1], x2
        ushll           v6.8h,  v1.8b,  #4
        ushll2          v7.8h,  v1.16b, #4
        ushll           v16.8h, v2.8b,  #4
        st1             {v4.8h,  v5.8h},  [x0], x7
        ushll2          v17.8h, v2.16b, #4
        st1             {v6.8h,  v7.8h},  [x8], x7
        ushll           v18.8h, v3.8b,  #4
        st1             {v16.8h, v17.8h}, [x0], x7
        ushll2          v19.8h, v3.16b, #4
        st1             {v18.8h, v19.8h}, [x8], x7
        b.gt            32b
        ret
640:    // w == 64: two write pointers 32 bytes apart, x6 = 64 byte advance
        add             x8,  x0,  #32
        mov             x6,  #64
64:     // one 64-pixel row per iteration
        ldp             q0,  q1,  [x1]
        subs            w4,  w4,  #1
        ushll           v4.8h,  v0.8b,  #4
        ushll2          v5.8h,  v0.16b, #4
        ldp             q2,  q3,  [x1, #32]
        ushll           v6.8h,  v1.8b,  #4
        ushll2          v7.8h,  v1.16b, #4
        add             x1,  x1,  x2
        ushll           v16.8h, v2.8b,  #4
        st1             {v4.8h,  v5.8h},  [x0], x6
        ushll2          v17.8h, v2.16b, #4
        ushll           v18.8h, v3.8b,  #4
        st1             {v6.8h,  v7.8h},  [x8], x6
        ushll2          v19.8h, v3.16b, #4
        st1             {v16.8h, v17.8h}, [x0], x6
        st1             {v18.8h, v19.8h}, [x8], x6
        b.gt            64b
        ret
1280:   // w == 128: two write pointers 64 bytes apart, x6 = 128 byte advance
        add             x8,  x0,  #64
        mov             x6,  #128
128:    // one 128-pixel row per iteration, using all of v0-v7/v16-v31
        ldp             q0,  q1,  [x1]
        ldp             q2,  q3,  [x1, #32]
        ushll           v16.8h,  v0.8b,  #4
        ushll2          v17.8h,  v0.16b, #4
        ushll           v18.8h,  v1.8b,  #4
        ushll2          v19.8h,  v1.16b, #4
        ushll           v20.8h,  v2.8b,  #4
        ushll2          v21.8h,  v2.16b, #4
        ldp             q4,  q5,  [x1, #64]
        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x6
        ushll           v22.8h,  v3.8b,  #4
        ushll2          v23.8h,  v3.16b, #4
        ushll           v24.8h,  v4.8b,  #4
        ushll2          v25.8h,  v4.16b, #4
        ushll           v26.8h,  v5.8b,  #4
        ushll2          v27.8h,  v5.16b, #4
        ldp             q6,  q7,  [x1, #96]
        st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x8], x6
        ushll           v28.8h,  v6.8b,  #4
        ushll2          v29.8h,  v6.16b, #4
        ushll           v30.8h,  v7.8b,  #4
        ushll2          v31.8h,  v7.16b, #4
        subs            w4,  w4,  #1
        add             x1,  x1,  x2
        st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], x6
        st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x6
        b.gt            128b
        ret

L(prep_tbl):
        // Backward offsets from the table base, indexed by clz(w)-24
        // (largest width first; no w == 2 case for prep).
        .hword L(prep_tbl) - 1280b
        .hword L(prep_tbl) -  640b
        .hword L(prep_tbl) -  320b
        .hword L(prep_tbl) -  160b
        .hword L(prep_tbl) -    8b
        .hword L(prep_tbl) -    4b
endfunc
1037
1038
// Load lane 0 (element size \wd) of each listed register, reading from the
// two row pointers \s0 and \s1 alternately and post-incrementing each
// pointer by \strd.  \d2-\d6 are optional (skipped when blank).
.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
        ld1             {\d0\wd}[0], [\s0], \strd
        ld1             {\d1\wd}[0], [\s1], \strd
.ifnb \d2
        ld1             {\d2\wd}[0], [\s0], \strd
        ld1             {\d3\wd}[0], [\s1], \strd
.endif
.ifnb \d4
        ld1             {\d4\wd}[0], [\s0], \strd
.endif
.ifnb \d5
        ld1             {\d5\wd}[0], [\s1], \strd
.endif
.ifnb \d6
        ld1             {\d6\wd}[0], [\s0], \strd
.endif
.endm
// Same as load_slice, but loading whole registers (arrangement \wd, e.g.
// .8b/.16b) instead of a single lane.  \d2-\d6 are optional.
.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
        ld1             {\d0\wd}, [\s0], \strd
        ld1             {\d1\wd}, [\s1], \strd
.ifnb \d2
        ld1             {\d2\wd}, [\s0], \strd
        ld1             {\d3\wd}, [\s1], \strd
.endif
.ifnb \d4
        ld1             {\d4\wd}, [\s0], \strd
.endif
.ifnb \d5
        ld1             {\d5\wd}, [\s1], \strd
.endif
.ifnb \d6
        ld1             {\d6\wd}, [\s0], \strd
.endif
.endm
// Convenience wrappers fixing the element/register width for the two
// loaders above: .h/.s lane loads and .8b/.16b full-register loads.
.macro load_h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_slice      \s0, \s1, \strd, .h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_slice      \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
.macro load_8b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_reg        \s0, \s1, \strd, .8b, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
.macro load_16b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_reg        \s0, \s1, \strd, .16b, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
// Pair each register's even elements with the following register's
// (trn1), so \r0 = {r0,r1,...}, \r1 = {r1,r2,...}, etc.  \r4 is only
// read.  \r3/\r4 are optional (processed together when present).
.macro interleave_1 wd, r0, r1, r2, r3, r4
        trn1            \r0\wd, \r0\wd, \r1\wd
        trn1            \r1\wd, \r1\wd, \r2\wd
.ifnb \r3
        trn1            \r2\wd, \r2\wd, \r3\wd
        trn1            \r3\wd, \r3\wd, \r4\wd
.endif
.endm
// interleave_1 specialized for .4h and .2s arrangements.
.macro interleave_1_h r0, r1, r2, r3, r4
        interleave_1    .4h, \r0, \r1, \r2, \r3, \r4
.endm
.macro interleave_1_s r0, r1, r2, r3, r4
        interleave_1    .2s, \r0, \r1, \r2, \r3, \r4
.endm
// Like interleave_1 but pairing each register with the one two steps
// ahead: \r0 with \r2, \r1 with \r3, etc.  \r4/\r5 are only read.
.macro interleave_2 wd, r0, r1, r2, r3, r4, r5
        trn1            \r0\wd,  \r0\wd, \r2\wd
        trn1            \r1\wd,  \r1\wd, \r3\wd
        trn1            \r2\wd,  \r2\wd, \r4\wd
        trn1            \r3\wd,  \r3\wd, \r5\wd
.endm
// interleave_2 specialized for the .2s arrangement.
.macro interleave_2_s r0, r1, r2, r3, r4, r5
        interleave_2    .2s, \r0, \r1, \r2, \r3, \r4, \r5
.endm
// Widen the low 8 bytes of each listed register from u8 to u16, in
// place.  \r2-\r6 are optional (skipped when blank).
.macro uxtl_b r0, r1, r2, r3, r4, r5, r6
        uxtl            \r0\().8h, \r0\().8b
        uxtl            \r1\().8h, \r1\().8b
.ifnb \r2
        uxtl            \r2\().8h, \r2\().8b
        uxtl            \r3\().8h, \r3\().8b
.endif
.ifnb \r4
        uxtl            \r4\().8h, \r4\().8b
.endif
.ifnb \r5
        uxtl            \r5\().8h, \r5\().8b
.endif
.ifnb \r6
        uxtl            \r6\().8h, \r6\().8b
.endif
.endm
// 4-tap filter: \d = s0*c0 + s1*c1 + s2*c2 + s3*c3, with the
// coefficients taken from v0.h[0..3].
.macro mul_mla_4 d, s0, s1, s2, s3, wd
        mul             \d\wd,  \s0\wd,  v0.h[0]
        mla             \d\wd,  \s1\wd,  v0.h[1]
        mla             \d\wd,  \s2\wd,  v0.h[2]
        mla             \d\wd,  \s3\wd,  v0.h[3]
.endm
1131// Interleaving the mul/mla chains actually hurts performance
1132// significantly on Cortex A53, thus keeping mul/mla tightly
1133// chained like this.
// 8-tap filter producing two outputs; the second output's source window
// starts one register later (\s1..\s8).  Coefficients in v0.h[0..7].
.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
        mul             \d0\().8h, \s0\().8h, v0.h[0]
        mla             \d0\().8h, \s1\().8h, v0.h[1]
        mla             \d0\().8h, \s2\().8h, v0.h[2]
        mla             \d0\().8h, \s3\().8h, v0.h[3]
        mla             \d0\().8h, \s4\().8h, v0.h[4]
        mla             \d0\().8h, \s5\().8h, v0.h[5]
        mla             \d0\().8h, \s6\().8h, v0.h[6]
        mla             \d0\().8h, \s7\().8h, v0.h[7]
        mul             \d1\().8h, \s1\().8h, v0.h[0]
        mla             \d1\().8h, \s2\().8h, v0.h[1]
        mla             \d1\().8h, \s3\().8h, v0.h[2]
        mla             \d1\().8h, \s4\().8h, v0.h[3]
        mla             \d1\().8h, \s5\().8h, v0.h[4]
        mla             \d1\().8h, \s6\().8h, v0.h[5]
        mla             \d1\().8h, \s7\().8h, v0.h[6]
        mla             \d1\().8h, \s8\().8h, v0.h[7]
.endm
// As mul_mla_8_1, but the second output's source window starts two
// registers later (\s2..\s9).
.macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
        mul             \d0\().8h, \s0\().8h, v0.h[0]
        mla             \d0\().8h, \s1\().8h, v0.h[1]
        mla             \d0\().8h, \s2\().8h, v0.h[2]
        mla             \d0\().8h, \s3\().8h, v0.h[3]
        mla             \d0\().8h, \s4\().8h, v0.h[4]
        mla             \d0\().8h, \s5\().8h, v0.h[5]
        mla             \d0\().8h, \s6\().8h, v0.h[6]
        mla             \d0\().8h, \s7\().8h, v0.h[7]
        mul             \d1\().8h, \s2\().8h, v0.h[0]
        mla             \d1\().8h, \s3\().8h, v0.h[1]
        mla             \d1\().8h, \s4\().8h, v0.h[2]
        mla             \d1\().8h, \s5\().8h, v0.h[3]
        mla             \d1\().8h, \s6\().8h, v0.h[4]
        mla             \d1\().8h, \s7\().8h, v0.h[5]
        mla             \d1\().8h, \s8\().8h, v0.h[6]
        mla             \d1\().8h, \s9\().8h, v0.h[7]
.endm
// As mul_mla_8_1, but the second output's source window starts four
// registers later (\s4..\s11).
.macro mul_mla_8_4 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11
        mul             \d0\().8h, \s0\().8h,  v0.h[0]
        mla             \d0\().8h, \s1\().8h,  v0.h[1]
        mla             \d0\().8h, \s2\().8h,  v0.h[2]
        mla             \d0\().8h, \s3\().8h,  v0.h[3]
        mla             \d0\().8h, \s4\().8h,  v0.h[4]
        mla             \d0\().8h, \s5\().8h,  v0.h[5]
        mla             \d0\().8h, \s6\().8h,  v0.h[6]
        mla             \d0\().8h, \s7\().8h,  v0.h[7]
        mul             \d1\().8h, \s4\().8h,  v0.h[0]
        mla             \d1\().8h, \s5\().8h,  v0.h[1]
        mla             \d1\().8h, \s6\().8h,  v0.h[2]
        mla             \d1\().8h, \s7\().8h,  v0.h[3]
        mla             \d1\().8h, \s8\().8h,  v0.h[4]
        mla             \d1\().8h, \s9\().8h,  v0.h[5]
        mla             \d1\().8h, \s10\().8h, v0.h[6]
        mla             \d1\().8h, \s11\().8h, v0.h[7]
.endm
// Saturating rounding narrow (u16 -> u8) by #\shift, in place, for 1, 2
// or 4 registers.  \r1-\r3 are optional (\r2/\r3 come as a pair).
.macro sqrshrun_b shift, r0, r1, r2, r3
        sqrshrun        \r0\().8b, \r0\().8h,  #\shift
.ifnb \r1
        sqrshrun        \r1\().8b, \r1\().8h,  #\shift
.endif
.ifnb \r2
        sqrshrun        \r2\().8b, \r2\().8h,  #\shift
        sqrshrun        \r3\().8b, \r3\().8h,  #\shift
.endif
.endm
// Signed rounding shift right by #\shift, in place, for 1, 2 or 4
// registers of 8 halfwords.  \r1-\r3 are optional.
.macro srshr_h shift, r0, r1, r2, r3
        srshr           \r0\().8h, \r0\().8h,  #\shift
.ifnb \r1
        srshr           \r1\().8h, \r1\().8h,  #\shift
.endif
.ifnb \r2
        srshr           \r2\().8h, \r2\().8h,  #\shift
        srshr           \r3\().8h, \r3\().8h,  #\shift
.endif
.endm
// Store 2 or 4 two-byte rows from consecutive .h lanes of \reg,
// alternating between the two dst pointers x0 and x8.
.macro st_h strd, reg, lanes
        st1             {\reg\().h}[0], [x0], \strd
        st1             {\reg\().h}[1], [x8], \strd
.if \lanes > 2
        st1             {\reg\().h}[2], [x0], \strd
        st1             {\reg\().h}[3], [x8], \strd
.endif
.endm
// Store four-byte rows from .s lanes [0]/[1] of each register to the
// alternating dst pointers x0/x8.  \r1 is optional.
.macro st_s strd, r0, r1
        st1             {\r0\().s}[0], [x0], \strd
        st1             {\r0\().s}[1], [x8], \strd
.ifnb \r1
        st1             {\r1\().s}[0], [x0], \strd
        st1             {\r1\().s}[1], [x8], \strd
.endif
.endm
// Store eight-byte rows from .d lanes [0]/[1] of each register to the
// alternating dst pointers x0/x8.  \r1 is optional.
.macro st_d strd, r0, r1
        st1             {\r0\().d}[0], [x0], \strd
        st1             {\r0\().d}[1], [x8], \strd
.ifnb \r1
        st1             {\r1\().d}[0], [x0], \strd
        st1             {\r1\().d}[1], [x8], \strd
.endif
.endm
// Final stage for 4-pixel-wide rows: for "put", narrow to pixels
// (sqrshrun #6) and store 4-byte rows; for "prep", round-shift by #2 and
// store the 16-bit rows (8 bytes each).
.macro shift_store_4 type, strd, r0, r1
.ifc \type, put
        sqrshrun_b      6,     \r0, \r1
        st_s            \strd, \r0, \r1
.else
        srshr_h         2,     \r0, \r1
        st_d            \strd, \r0, \r1
.endif
.endm
// Store whole registers (arrangement \wd) to the alternating dst
// pointers x0/x8.  \r2/\r3 optional as a pair; \r4-\r7 optional as a
// group of four.
.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7
        st1             {\r0\wd}, [x0], \strd
        st1             {\r1\wd}, [x8], \strd
.ifnb \r2
        st1             {\r2\wd}, [x0], \strd
        st1             {\r3\wd}, [x8], \strd
.endif
.ifnb \r4
        st1             {\r4\wd}, [x0], \strd
        st1             {\r5\wd}, [x8], \strd
        st1             {\r6\wd}, [x0], \strd
        st1             {\r7\wd}, [x8], \strd
.endif
.endm
// st_reg specialized for .8b and .16b register stores.
.macro st_8b strd, r0, r1, r2, r3, r4, r5, r6, r7
        st_reg          \strd, .8b,  \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
.endm
.macro st_16b strd, r0, r1, r2, r3, r4, r5, r6, r7
        st_reg          \strd, .16b, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
.endm
// Final stage for 8-pixel-wide rows: "put" narrows to 8-bit pixels
// (sqrshrun #6); "prep" round-shifts by #2 and stores 16-bit rows.
.macro shift_store_8 type, strd, r0, r1, r2, r3
.ifc \type, put
        sqrshrun_b      6,     \r0, \r1, \r2, \r3
        st_8b           \strd, \r0, \r1, \r2, \r3
.else
        srshr_h         2,     \r0, \r1, \r2, \r3
        st_16b          \strd, \r0, \r1, \r2, \r3
.endif
.endm
// Final stage for 16-pixel-wide rows: "put" packs each pair of 8h
// registers into one 16b pixel register (sqrshrun/sqrshrun2 #6); "prep"
// round-shifts by #2 and stores two 8h registers per dst pointer.
.macro shift_store_16 type, strd, r0, r1, r2, r3
.ifc \type, put
        sqrshrun        \r0\().8b,  \r0\().8h, #6
        sqrshrun2       \r0\().16b, \r1\().8h, #6
        sqrshrun        \r2\().8b,  \r2\().8h, #6
        sqrshrun2       \r2\().16b, \r3\().8h, #6
        st_16b          \strd, \r0, \r2
.else
        srshr_h         2,     \r0, \r1, \r2, \r3
        st1             {\r0\().8h, \r1\().8h}, [x0], \strd
        st1             {\r2\().8h, \r3\().8h}, [x8], \strd
.endif
.endm
1283
// Emit the exported entry point \op\()_8tap_\type\()_8bpc_neon: it loads
// the packed horizontal/vertical filter-type codes (REGULAR/SMOOTH/SHARP,
// defined below) into x8/x9 and tail-branches to the shared
// \op\()_8tap_neon implementation.
.macro make_8tap_fn op, type, type_h, type_v
function \op\()_8tap_\type\()_8bpc_neon, export=1
        mov             x8,  \type_h
        mov             x9,  \type_v
        b               \op\()_8tap_neon
endfunc
.endm
1291
1292// No spaces in these expressions, due to gas-preprocessor.
1293#define REGULAR ((0*15<<7)|3*15)
1294#define SMOOTH  ((1*15<<7)|4*15)
1295#define SHARP   ((2*15<<7)|3*15)
1296
1297.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv
1298make_8tap_fn \type, regular,        REGULAR, REGULAR
1299make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
1300make_8tap_fn \type, regular_sharp,  REGULAR, SHARP
1301make_8tap_fn \type, smooth,         SMOOTH,  SMOOTH
1302make_8tap_fn \type, smooth_regular, SMOOTH,  REGULAR
1303make_8tap_fn \type, smooth_sharp,   SMOOTH,  SHARP
1304make_8tap_fn \type, sharp,          SHARP,   SHARP
1305make_8tap_fn \type, sharp_regular,  SHARP,   REGULAR
1306make_8tap_fn \type, sharp_smooth,   SHARP,   SMOOTH
1307
1308function \type\()_8tap_neon
1309        mov             w10,  #0x4081  // (1 << 14) | (1 << 7) | (1 << 0)
1310        mul             \mx,  \mx, w10
1311        mul             \my,  \my, w10
1312        add             \mx,  \mx, w8 // mx, 8tap_h, 4tap_h
1313        add             \my,  \my, w9 // my, 8tap_v, 4tap_v
1314.ifc \type, prep
1315        uxtw            \d_strd, \w
1316        lsl             \d_strd, \d_strd, #1
1317.endif
1318
1319        clz             w8,  \w
1320        tst             \mx, #(0x7f << 14)
1321        sub             w8,  w8,  #24
1322        movrel          x10, X(mc_subpel_filters), -8
1323        b.ne            L(\type\()_8tap_h)
1324        tst             \my, #(0x7f << 14)
1325        b.ne            L(\type\()_8tap_v)
1326        b               \type\()_neon
1327
1328L(\type\()_8tap_h):
1329        cmp             \w,  #4
1330        ubfx            w9,  \mx, #7, #7
1331        and             \mx, \mx, #0x7f
1332        b.le            4f
1333        mov             \mx,  w9
13344:
1335        tst             \my,  #(0x7f << 14)
1336        add             \xmx, x10, \mx, uxtw #3
1337        b.ne            L(\type\()_8tap_hv)
1338
1339        adr             x9,  L(\type\()_8tap_h_tbl)
1340        ldrh            w8,  [x9, x8, lsl #1]
1341        sub             x9,  x9,  w8, uxtw
1342        br              x9
1343
134420:     // 2xN h
1345.ifc \type, put
1346        add             \xmx,  \xmx,  #2
1347        ld1             {v0.s}[0], [\xmx]
1348        sub             \src,  \src,  #1
1349        add             \ds2,  \dst,  \d_strd
1350        add             \sr2,  \src,  \s_strd
1351        lsl             \d_strd,  \d_strd,  #1
1352        lsl             \s_strd,  \s_strd,  #1
1353        sxtl            v0.8h,  v0.8b
13542:
1355        ld1             {v4.8b},  [\src], \s_strd
1356        ld1             {v6.8b},  [\sr2], \s_strd
1357        uxtl            v4.8h,  v4.8b
1358        uxtl            v6.8h,  v6.8b
1359        ext             v5.16b, v4.16b, v4.16b, #2
1360        ext             v7.16b, v6.16b, v6.16b, #2
1361        subs            \h,  \h,  #2
1362        trn1            v3.2s,  v4.2s,  v6.2s
1363        trn2            v6.2s,  v4.2s,  v6.2s
1364        trn1            v4.2s,  v5.2s,  v7.2s
1365        trn2            v7.2s,  v5.2s,  v7.2s
1366        mul             v3.4h,  v3.4h,  v0.h[0]
1367        mla             v3.4h,  v4.4h,  v0.h[1]
1368        mla             v3.4h,  v6.4h,  v0.h[2]
1369        mla             v3.4h,  v7.4h,  v0.h[3]
1370        srshr           v3.4h,  v3.4h,  #2
1371        sqrshrun        v3.8b,  v3.8h,  #4
1372        st1             {v3.h}[0], [\dst], \d_strd
1373        st1             {v3.h}[1], [\ds2], \d_strd
1374        b.gt            2b
1375        ret
1376.endif
1377
137840:     // 4xN h
1379        add             \xmx,  \xmx,  #2
1380        ld1             {v0.s}[0], [\xmx]
1381        sub             \src,  \src,  #1
1382        add             \ds2,  \dst,  \d_strd
1383        add             \sr2,  \src,  \s_strd
1384        lsl             \d_strd,  \d_strd,  #1
1385        lsl             \s_strd,  \s_strd,  #1
1386        sxtl            v0.8h,  v0.8b
13874:
1388        ld1             {v16.8b}, [\src], \s_strd
1389        ld1             {v20.8b}, [\sr2], \s_strd
1390        uxtl            v16.8h,  v16.8b
1391        uxtl            v20.8h,  v20.8b
1392        ext             v17.16b, v16.16b, v16.16b, #2
1393        ext             v18.16b, v16.16b, v16.16b, #4
1394        ext             v19.16b, v16.16b, v16.16b, #6
1395        ext             v21.16b, v20.16b, v20.16b, #2
1396        ext             v22.16b, v20.16b, v20.16b, #4
1397        ext             v23.16b, v20.16b, v20.16b, #6
1398        subs            \h,  \h,  #2
1399        mul             v16.4h,  v16.4h,  v0.h[0]
1400        mla             v16.4h,  v17.4h,  v0.h[1]
1401        mla             v16.4h,  v18.4h,  v0.h[2]
1402        mla             v16.4h,  v19.4h,  v0.h[3]
1403        mul             v20.4h,  v20.4h,  v0.h[0]
1404        mla             v20.4h,  v21.4h,  v0.h[1]
1405        mla             v20.4h,  v22.4h,  v0.h[2]
1406        mla             v20.4h,  v23.4h,  v0.h[3]
1407        srshr           v16.4h,  v16.4h,  #2
1408        srshr           v20.4h,  v20.4h,  #2
1409.ifc \type, put
1410        sqrshrun        v16.8b,  v16.8h,  #4
1411        sqrshrun        v20.8b,  v20.8h,  #4
1412        st1             {v16.s}[0], [\dst], \d_strd
1413        st1             {v20.s}[0], [\ds2], \d_strd
1414.else
1415        st1             {v16.4h}, [\dst], \d_strd
1416        st1             {v20.4h}, [\ds2], \d_strd
1417.endif
1418        b.gt            4b
1419        ret
1420
142180:     // 8xN h
1422        ld1             {v0.8b}, [\xmx]
1423        sub             \src,  \src,  #3
1424        add             \ds2,  \dst,  \d_strd
1425        add             \sr2,  \src,  \s_strd
1426        lsl             \d_strd,  \d_strd,  #1
1427        lsl             \s_strd,  \s_strd,  #1
1428        sxtl            v0.8h, v0.8b
14298:
1430        ld1             {v16.8b, v17.8b},  [\src], \s_strd
1431        ld1             {v20.8b, v21.8b},  [\sr2], \s_strd
1432        uxtl            v16.8h,  v16.8b
1433        uxtl            v17.8h,  v17.8b
1434        uxtl            v20.8h,  v20.8b
1435        uxtl            v21.8h,  v21.8b
1436
1437        mul             v18.8h,  v16.8h,  v0.h[0]
1438        mul             v22.8h,  v20.8h,  v0.h[0]
1439.irpc i, 1234567
1440        ext             v19.16b, v16.16b, v17.16b, #(2*\i)
1441        ext             v23.16b, v20.16b, v21.16b, #(2*\i)
1442        mla             v18.8h,  v19.8h,  v0.h[\i]
1443        mla             v22.8h,  v23.8h,  v0.h[\i]
1444.endr
1445        subs            \h,  \h,  #2
1446        srshr           v18.8h,  v18.8h, #2
1447        srshr           v22.8h,  v22.8h, #2
1448.ifc \type, put
1449        sqrshrun        v18.8b,  v18.8h, #4
1450        sqrshrun        v22.8b,  v22.8h, #4
1451        st1             {v18.8b}, [\dst], \d_strd
1452        st1             {v22.8b}, [\ds2], \d_strd
1453.else
1454        st1             {v18.8h}, [\dst], \d_strd
1455        st1             {v22.8h}, [\ds2], \d_strd
1456.endif
1457        b.gt            8b
1458        ret
1459160:
1460320:
1461640:
14621280:   // 16xN, 32xN, ... h
1463        ld1             {v0.8b}, [\xmx]
1464        sub             \src,  \src,  #3
1465        add             \ds2,  \dst,  \d_strd
1466        add             \sr2,  \src,  \s_strd
1467        lsl             \s_strd,  \s_strd,  #1
1468        sxtl            v0.8h, v0.8b
1469
1470        sub             \s_strd,  \s_strd,  \w, uxtw
1471        sub             \s_strd,  \s_strd,  #8
1472.ifc \type, put
1473        lsl             \d_strd,  \d_strd,  #1
1474        sub             \d_strd,  \d_strd,  \w, uxtw
1475.endif
1476161:
1477        ld1             {v16.8b, v17.8b, v18.8b},  [\src], #24
1478        ld1             {v20.8b, v21.8b, v22.8b},  [\sr2], #24
1479        mov             \mx, \w
1480        uxtl            v16.8h,  v16.8b
1481        uxtl            v17.8h,  v17.8b
1482        uxtl            v18.8h,  v18.8b
1483        uxtl            v20.8h,  v20.8b
1484        uxtl            v21.8h,  v21.8b
1485        uxtl            v22.8h,  v22.8b
1486
148716:
1488        mul             v24.8h,  v16.8h,  v0.h[0]
1489        mul             v25.8h,  v17.8h,  v0.h[0]
1490        mul             v26.8h,  v20.8h,  v0.h[0]
1491        mul             v27.8h,  v21.8h,  v0.h[0]
1492.irpc i, 1234567
1493        ext             v28.16b, v16.16b, v17.16b, #(2*\i)
1494        ext             v29.16b, v17.16b, v18.16b, #(2*\i)
1495        ext             v30.16b, v20.16b, v21.16b, #(2*\i)
1496        ext             v31.16b, v21.16b, v22.16b, #(2*\i)
1497        mla             v24.8h,  v28.8h,  v0.h[\i]
1498        mla             v25.8h,  v29.8h,  v0.h[\i]
1499        mla             v26.8h,  v30.8h,  v0.h[\i]
1500        mla             v27.8h,  v31.8h,  v0.h[\i]
1501.endr
1502        srshr           v24.8h,  v24.8h, #2
1503        srshr           v25.8h,  v25.8h, #2
1504        srshr           v26.8h,  v26.8h, #2
1505        srshr           v27.8h,  v27.8h, #2
1506        subs            \mx, \mx, #16
1507.ifc \type, put
1508        sqrshrun        v24.8b,  v24.8h, #4
1509        sqrshrun2       v24.16b, v25.8h, #4
1510        sqrshrun        v26.8b,  v26.8h, #4
1511        sqrshrun2       v26.16b, v27.8h, #4
1512        st1             {v24.16b}, [\dst], #16
1513        st1             {v26.16b}, [\ds2], #16
1514.else
1515        st1             {v24.8h, v25.8h}, [\dst], #32
1516        st1             {v26.8h, v27.8h}, [\ds2], #32
1517.endif
1518        b.le            9f
1519
1520        mov             v16.16b, v18.16b
1521        mov             v20.16b, v22.16b
1522        ld1             {v17.8b, v18.8b}, [\src], #16
1523        ld1             {v21.8b, v22.8b}, [\sr2], #16
1524        uxtl            v17.8h,  v17.8b
1525        uxtl            v18.8h,  v18.8b
1526        uxtl            v21.8h,  v21.8b
1527        uxtl            v22.8h,  v22.8b
1528        b               16b
1529
15309:
1531        add             \dst,  \dst,  \d_strd
1532        add             \ds2,  \ds2,  \d_strd
1533        add             \src,  \src,  \s_strd
1534        add             \sr2,  \sr2,  \s_strd
1535
1536        subs            \h,  \h,  #2
1537        b.gt            161b
1538        ret
1539
// Relative jump table for the horizontal 8tap filter. Each entry is the
// 16-bit distance from the table base back to the handler for one block
// width (128, 64, 32, 16, 8, 4, 2 pixels); the dispatch site loads the
// entry with ldrh and subtracts it from the table address.
L(\type\()_8tap_h_tbl):
        .hword L(\type\()_8tap_h_tbl) - 1280b
        .hword L(\type\()_8tap_h_tbl) -  640b
        .hword L(\type\()_8tap_h_tbl) -  320b
        .hword L(\type\()_8tap_h_tbl) -  160b
        .hword L(\type\()_8tap_h_tbl) -   80b
        .hword L(\type\()_8tap_h_tbl) -   40b
        .hword L(\type\()_8tap_h_tbl) -   20b
        .hword 0                                // padding entry
1549
1550
L(\type\()_8tap_v):
        // Vertical-only 8tap path. \my packs two 7-bit filter indices:
        // bits [13:7] are used when h > 4, bits [6:0] when h <= 4.
        cmp             \h,  #4
        ubfx            w9,  \my, #7, #7        // index for the h > 4 case
        and             \my, \my, #0x7f         // index for the h <= 4 case
        b.le            4f
        mov             \my, w9                 // h > 4: use the other index
4:
        // \xmy = address of the 8-byte coefficient row; x10 is presumably
        // the base of the filter coefficient table -- TODO confirm (set up
        // before this chunk).
        add             \xmy, x10, \my, uxtw #3

        // Dispatch on log2(width) (x8) via the relative jump table below.
        adr             x9,  L(\type\()_8tap_v_tbl)
        ldrh            w8,  [x9, x8, lsl #1]
        sub             x9,  x9,  w8, uxtw
        br              x9
1564
20:     // 2xN v
.ifc \type, put                                 // 2px-wide blocks exist only for put
        b.gt            28f                     // h > 4: full 8-tap path below

        cmp             \h,  #2
        add             \xmy, \xmy, #2          // skip outer taps; use middle 4 coefs
        ld1             {v0.s}[0], [\xmy]
        sub             \src,  \src,  \s_strd   // back up one row of filter history
        add             \ds2,  \dst,  \d_strd   // second output row
        add             \sr2,  \src,  \s_strd   // second input row
        lsl             \s_strd,  \s_strd,  #1  // two rows per iteration
        lsl             \d_strd,  \d_strd,  #1
        sxtl            v0.8h, v0.8b            // widen coefs to 16 bit

        // 2x2 v
        load_h          \src, \sr2, \s_strd, v1, v2, v3, v4, v5
        interleave_1_h  v1, v2, v3, v4, v5
        b.gt            24f                     // h == 4: need two more rows
        uxtl_b          v1, v2, v3, v4
        mul_mla_4       v6, v1, v2, v3, v4, .4h // 4-tap vertical MAC
        sqrshrun_b      6,  v6                  // round/narrow by 6 to 8 bit
        st_h            \d_strd, v6, 2
        ret

24:     // 2x4 v
        load_h          \sr2, \src, \s_strd, v6, v7
        interleave_1_h  v5, v6, v7
        interleave_2_s  v1, v2, v3, v4, v5, v6
        uxtl_b          v1, v2, v3, v4
        mul_mla_4       v6, v1, v2, v3, v4, .8h
        sqrshrun_b      6,  v6
        st_h            \d_strd, v6, 4
        ret

28:     // 2x8, 2x16 v
        ld1             {v0.8b}, [\xmy]         // all 8 taps
        sub             \sr2,  \src,  \s_strd, lsl #1
        add             \ds2,  \dst,  \d_strd
        sub             \src,  \sr2,  \s_strd   // back up 3 rows of history
        lsl             \d_strd,  \d_strd,  #1
        lsl             \s_strd,  \s_strd,  #1
        sxtl            v0.8h, v0.8b

        // Prime v1-v7 with the first 7 rows of interleaved history.
        load_h          \src, \sr2, \s_strd, v1,  v2,  v3,  v4, v5, v6, v7
        interleave_1_h  v1,  v2,  v3,  v4,  v5
        interleave_1_h  v5,  v6,  v7
        interleave_2_s  v1,  v2,  v3,  v4,  v5,  v6
        uxtl_b          v1,  v2,  v3,  v4
216:    // main loop: 8 rows per iteration
        subs            \h,  \h,  #8
        load_h          \sr2, \src, \s_strd, v16, v17, v18, v19
        load_h          \sr2, \src, \s_strd, v20, v21, v22, v23
        interleave_1_h  v7,  v16, v17, v18, v19
        interleave_1_h  v19, v20, v21, v22, v23
        interleave_2_s  v5,  v6,  v7,  v16, v17, v18
        interleave_2_s  v17, v18, v19, v20, v21, v22
        uxtl_b          v5,  v6,  v7,  v16
        uxtl_b          v17, v18, v19, v20
        mul_mla_8_4     v30, v31, v1,  v2,  v3,  v4,  v5,  v6,  v7,  v16, v17, v18, v19, v20
        sqrshrun_b      6,   v30, v31
        st_h            \d_strd, v30, 4
        st_h            \d_strd, v31, 4
        b.le            0f
        // Rotate the row history for the next 8 output rows.
        mov             v1.16b,  v17.16b
        mov             v2.16b,  v18.16b
        mov             v3.16b,  v19.16b
        mov             v4.16b,  v20.16b
        mov             v5.16b,  v21.16b
        mov             v6.16b,  v22.16b
        mov             v7.16b,  v23.16b
        b               216b
0:
        ret
.endif
1639
164040:
1641        b.gt            480f
1642
1643        // 4x2, 4x4 v
1644        cmp             \h,  #2
1645        add             \xmy, \xmy, #2
1646        ld1             {v0.s}[0], [\xmy]
1647        sub             \src, \src, \s_strd
1648        add             \ds2, \dst, \d_strd
1649        add             \sr2, \src, \s_strd
1650        lsl             \s_strd, \s_strd, #1
1651        lsl             \d_strd, \d_strd, #1
1652        sxtl            v0.8h, v0.8b
1653
1654        load_s          \src, \sr2, \s_strd, v1, v2, v3, v4, v5
1655        interleave_1_s  v1, v2, v3, v4, v5
1656        uxtl_b          v1, v2, v3, v4
1657        mul_mla_4       v6, v1, v2, v3, v4, .8h
1658        shift_store_4   \type, \d_strd, v6
1659        b.le            0f
1660        load_s          \sr2, \src, \s_strd, v6, v7
1661        interleave_1_s  v5, v6, v7
1662        uxtl_b          v5, v6
1663        mul_mla_4       v7, v3, v4, v5, v6, .8h
1664        shift_store_4   \type, \d_strd, v7
16650:
1666        ret
1667
1668480:    // 4x8, 4x16 v
1669        ld1             {v0.8b}, [\xmy]
1670        sub             \sr2, \src, \s_strd, lsl #1
1671        add             \ds2, \dst, \d_strd
1672        sub             \src, \sr2, \s_strd
1673        lsl             \s_strd, \s_strd, #1
1674        lsl             \d_strd, \d_strd, #1
1675        sxtl            v0.8h, v0.8b
1676
1677        load_s          \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
1678        interleave_1_s  v16, v17, v18
1679        interleave_1_s  v18, v19, v20, v21, v22
1680        uxtl_b          v16, v17
1681        uxtl_b          v18, v19, v20, v21
1682
168348:
1684        subs            \h,  \h,  #4
1685        load_s          \sr2, \src, \s_strd, v23, v24, v25, v26
1686        interleave_1_s  v22, v23, v24, v25, v26
1687        uxtl_b          v22, v23, v24, v25
1688        mul_mla_8_2     v1,  v2,  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
1689        shift_store_4   \type, \d_strd, v1, v2
1690        b.le            0f
1691        subs            \h,  \h,  #4
1692        load_s          \sr2,  \src, \s_strd, v27, v16, v17, v18
1693        interleave_1_s  v26, v27, v16, v17, v18
1694        uxtl_b          v26, v27, v16, v17
1695        mul_mla_8_2     v1,  v2,  v20, v21, v22, v23, v24, v25, v26, v27, v16, v17
1696        shift_store_4   \type, \d_strd, v1, v2
1697        b.le            0f
1698        subs            \h,  \h,  #4
1699        load_s          \sr2, \src, \s_strd, v19, v20, v21, v22
1700        interleave_1_s  v18, v19, v20, v21, v22
1701        uxtl_b          v18, v19, v20, v21
1702        mul_mla_8_2     v1,  v2,  v24, v25, v26, v27, v16, v17, v18, v19, v20, v21
1703        shift_store_4   \type, \d_strd, v1, v2
1704        b.gt            48b
17050:
1706        ret
1707
170880:
1709        b.gt            880f
1710
1711        // 8x2, 8x4 v
1712        cmp             \h,  #2
1713        add             \xmy, \xmy, #2
1714        ld1             {v0.s}[0], [\xmy]
1715        sub             \src, \src, \s_strd
1716        add             \ds2, \dst, \d_strd
1717        add             \sr2, \src, \s_strd
1718        lsl             \s_strd, \s_strd, #1
1719        lsl             \d_strd, \d_strd, #1
1720        sxtl            v0.8h, v0.8b
1721
1722        load_8b         \src, \sr2, \s_strd, v1, v2, v3, v4, v5
1723        uxtl_b          v1, v2, v3, v4, v5
1724        mul_mla_4       v6, v1, v2, v3, v4, .8h
1725        mul_mla_4       v7, v2, v3, v4, v5, .8h
1726        shift_store_8   \type, \d_strd, v6, v7
1727        b.le            0f
1728        load_8b         \sr2, \src, \s_strd, v6, v7
1729        uxtl_b          v6, v7
1730        mul_mla_4       v1, v3, v4, v5, v6, .8h
1731        mul_mla_4       v2, v4, v5, v6, v7, .8h
1732        shift_store_8   \type, \d_strd, v1, v2
17330:
1734        ret
1735
880:    // 8x6, 8x8, 8x16, 8x32 v
1680:   // 16x8, 16x16, ...
320:    // 32x8, 32x16, ...
640:
1280:
        // Shared path for all wide/tall vertical cases: processed in
        // 8-column strips, full 8-tap filter.
        ld1             {v0.8b}, [\xmy]         // all 8 taps
        sub             \src, \src, \s_strd
        sub             \src, \src, \s_strd, lsl #1     // back up 3 rows total
        sxtl            v0.8h, v0.8b
        mov             \my,  \h                // save height for the strip loop
168:    // per-8-column-strip loop
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1    // two rows per iteration
        lsl             \d_strd, \d_strd, #1

        // Prime v16-v22 with the 7 rows of filter history.
        load_8b         \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
        uxtl_b          v16, v17, v18, v19, v20, v21, v22

88:     // row loop: unrolled 2+2+2+2+4 rows, rotating the history
        // registers through v16-v27 so each step needs no extra moves.
        subs            \h,  \h,  #2
        load_8b         \sr2, \src, \s_strd, v23, v24
        uxtl_b          v23, v24
        mul_mla_8_1     v1,  v2,  v16, v17, v18, v19, v20, v21, v22, v23, v24
        shift_store_8   \type, \d_strd, v1, v2
        b.le            9f
        subs            \h,  \h,  #2
        load_8b         \sr2, \src, \s_strd, v25, v26
        uxtl_b          v25, v26
        mul_mla_8_1     v3,  v4,  v18, v19, v20, v21, v22, v23, v24, v25, v26
        shift_store_8   \type, \d_strd, v3, v4
        b.le            9f
        subs            \h,  \h,  #2
        load_8b         \sr2, \src, \s_strd, v27, v16
        uxtl_b          v27, v16
        mul_mla_8_1     v1,  v2,  v20, v21, v22, v23, v24, v25, v26, v27, v16
        shift_store_8   \type, \d_strd, v1, v2
        b.le            9f
        subs            \h,  \h,  #2
        load_8b         \sr2, \src, \s_strd, v17, v18
        uxtl_b          v17, v18
        mul_mla_8_1     v3,  v4,  v22, v23, v24, v25, v26, v27, v16, v17, v18
        shift_store_8   \type, \d_strd, v3, v4
        b.le            9f
        subs            \h,  \h,  #4
        load_8b         \sr2, \src, \s_strd, v19, v20, v21, v22
        uxtl_b          v19, v20, v21, v22
        mul_mla_8_1     v1,  v2,  v24, v25, v26, v27, v16, v17, v18, v19, v20
        mul_mla_8_1     v3,  v4,  v26, v27, v16, v17, v18, v19, v20, v21, v22
        shift_store_8   \type, \d_strd, v1, v2, v3, v4
        b.gt            88b
9:      // advance to the next 8-column strip
        subs            \w,  \w,  #8
        b.le            0f
        asr             \s_strd, \s_strd, #1    // undo the per-strip doubling
        asr             \d_strd, \d_strd, #1
        // Rewind the pointers by the rows consumed; \xmy is presumably the
        // 64-bit alias of \my, holding the saved height -- TODO confirm.
        msub            \src, \s_strd, \xmy, \src
        msub            \dst, \d_strd, \xmy, \dst
        sub             \src, \src, \s_strd, lsl #3     // also undo the 8-row history advance
        mov             \h,  \my                // restore height
        add             \src, \src, #8          // step 8 pixels right
.ifc \type, put
        add             \dst, \dst, #8          // 8-bit output
.else
        add             \dst, \dst, #16         // 16-bit intermediate output
.endif
        b               168b
0:
        ret
1805
1806160:
1807        b.gt            1680b
1808
1809        // 16x2, 16x4 v
1810        add             \xmy, \xmy, #2
1811        ld1             {v0.s}[0], [\xmy]
1812        sub             \src, \src, \s_strd
1813        add             \ds2, \dst, \d_strd
1814        add             \sr2, \src, \s_strd
1815        lsl             \s_strd, \s_strd, #1
1816        lsl             \d_strd, \d_strd, #1
1817        sxtl            v0.8h, v0.8b
1818
1819        cmp             \h,  #2
1820        load_16b        \src, \sr2, \s_strd, v1,  v2,  v3,  v4,  v5
1821        uxtl            v16.8h, v1.8b
1822        uxtl            v17.8h, v2.8b
1823        uxtl            v18.8h, v3.8b
1824        uxtl            v19.8h, v4.8b
1825        uxtl            v20.8h, v5.8b
1826        uxtl2           v23.8h, v1.16b
1827        uxtl2           v24.8h, v2.16b
1828        uxtl2           v25.8h, v3.16b
1829        uxtl2           v26.8h, v4.16b
1830        uxtl2           v27.8h, v5.16b
1831        mul_mla_4       v1,  v16, v17, v18, v19, .8h
1832        mul_mla_4       v16, v17, v18, v19, v20, .8h
1833        mul_mla_4       v2,  v23, v24, v25, v26, .8h
1834        mul_mla_4       v17, v24, v25, v26, v27, .8h
1835        shift_store_16  \type, \d_strd, v1, v2, v16, v17
1836        b.le            0f
1837        load_16b        \sr2, \src, \s_strd, v6,  v7
1838        uxtl            v21.8h, v6.8b
1839        uxtl            v22.8h, v7.8b
1840        uxtl2           v28.8h, v6.16b
1841        uxtl2           v29.8h, v7.16b
1842        mul_mla_4       v1,  v18, v19, v20, v21, .8h
1843        mul_mla_4       v3,  v19, v20, v21, v22, .8h
1844        mul_mla_4       v2,  v25, v26, v27, v28, .8h
1845        mul_mla_4       v4,  v26, v27, v28, v29, .8h
1846        shift_store_16  \type, \d_strd, v1, v2, v3, v4
18470:
1848        ret
1849
// Relative jump table for the vertical 8tap filter; same layout as the
// horizontal table above (16-bit offsets subtracted from the base, widest
// width first).
L(\type\()_8tap_v_tbl):
        .hword L(\type\()_8tap_v_tbl) - 1280b
        .hword L(\type\()_8tap_v_tbl) -  640b
        .hword L(\type\()_8tap_v_tbl) -  320b
        .hword L(\type\()_8tap_v_tbl) -  160b
        .hword L(\type\()_8tap_v_tbl) -   80b
        .hword L(\type\()_8tap_v_tbl) -   40b
        .hword L(\type\()_8tap_v_tbl) -   20b
        .hword 0                                // padding entry
1859
L(\type\()_8tap_hv):
        // Combined horizontal+vertical 8tap path. As in the v entry, \my
        // packs two 7-bit vertical filter indices: bits [13:7] for h > 4,
        // bits [6:0] for h <= 4.
        cmp             \h,  #4
        ubfx            w9,  \my, #7, #7        // index for the h > 4 case
        and             \my, \my, #0x7f         // index for the h <= 4 case
        b.le            4f
        mov             \my,  w9                // h > 4: use the other index
4:
        // \xmy = vertical coefficient row; x10 is presumably the filter
        // table base -- TODO confirm (set up before this chunk).
        add             \xmy,  x10, \my, uxtw #3

        // Dispatch on log2(width) (x8) via the relative jump table
        // L(\type\()_8tap_hv_tbl) (defined after this chunk).
        adr             x9,  L(\type\()_8tap_hv_tbl)
        ldrh            w8,  [x9, x8, lsl #1]
        sub             x9,  x9,  w8, uxtw
        br              x9
1873
187420:
1875.ifc \type, put
1876        add             \xmx,  \xmx,  #2
1877        ld1             {v0.s}[0],  [\xmx]
1878        b.gt            280f
1879        add             \xmy,  \xmy,  #2
1880        ld1             {v1.s}[0],  [\xmy]
1881
1882        // 2x2, 2x4 hv
1883        sub             \sr2, \src, #1
1884        sub             \src, \sr2, \s_strd
1885        add             \ds2, \dst, \d_strd
1886        lsl             \s_strd, \s_strd, #1
1887        lsl             \d_strd, \d_strd, #1
1888        sxtl            v0.8h,  v0.8b
1889        sxtl            v1.8h,  v1.8b
1890        mov             x15, x30
1891
1892        ld1             {v28.8b}, [\src], \s_strd
1893        uxtl            v28.8h,  v28.8b
1894        ext             v29.16b, v28.16b, v28.16b, #2
1895        mul             v28.4h,  v28.4h,  v0.4h
1896        mul             v29.4h,  v29.4h,  v0.4h
1897        addp            v28.4h,  v28.4h,  v29.4h
1898        addp            v16.4h,  v28.4h,  v28.4h
1899        srshr           v16.4h,  v16.4h,  #2
1900        bl              L(\type\()_8tap_filter_2)
1901
1902        trn1            v16.2s, v16.2s, v28.2s
1903        mov             v17.8b, v28.8b
1904
19052:
1906        bl              L(\type\()_8tap_filter_2)
1907
1908        ext             v18.8b, v17.8b, v28.8b, #4
1909        smull           v2.4s,  v16.4h, v1.h[0]
1910        smlal           v2.4s,  v17.4h, v1.h[1]
1911        smlal           v2.4s,  v18.4h, v1.h[2]
1912        smlal           v2.4s,  v28.4h, v1.h[3]
1913
1914        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
1915        sqxtun          v2.8b,  v2.8h
1916        subs            \h,  \h,  #2
1917        st1             {v2.h}[0], [\dst], \d_strd
1918        st1             {v2.h}[1], [\ds2], \d_strd
1919        b.le            0f
1920        mov             v16.8b, v18.8b
1921        mov             v17.8b, v28.8b
1922        b               2b
1923
1924280:    // 2x8, 2x16, 2x32 hv
1925        ld1             {v1.8b},  [\xmy]
1926        sub             \src, \src, #1
1927        sub             \sr2, \src, \s_strd, lsl #1
1928        sub             \src, \sr2, \s_strd
1929        add             \ds2, \dst, \d_strd
1930        lsl             \s_strd, \s_strd, #1
1931        lsl             \d_strd, \d_strd, #1
1932        sxtl            v0.8h,  v0.8b
1933        sxtl            v1.8h,  v1.8b
1934        mov             x15, x30
1935
1936        ld1             {v28.8b}, [\src], \s_strd
1937        uxtl            v28.8h,  v28.8b
1938        ext             v29.16b, v28.16b, v28.16b, #2
1939        mul             v28.4h,  v28.4h,  v0.4h
1940        mul             v29.4h,  v29.4h,  v0.4h
1941        addp            v28.4h,  v28.4h,  v29.4h
1942        addp            v16.4h,  v28.4h,  v28.4h
1943        srshr           v16.4h,  v16.4h,  #2
1944
1945        bl              L(\type\()_8tap_filter_2)
1946        trn1            v16.2s, v16.2s, v28.2s
1947        mov             v17.8b, v28.8b
1948        bl              L(\type\()_8tap_filter_2)
1949        ext             v18.8b, v17.8b, v28.8b, #4
1950        mov             v19.8b, v28.8b
1951        bl              L(\type\()_8tap_filter_2)
1952        ext             v20.8b, v19.8b, v28.8b, #4
1953        mov             v21.8b, v28.8b
1954
195528:
1956        bl              L(\type\()_8tap_filter_2)
1957        ext             v22.8b, v21.8b, v28.8b, #4
1958        smull           v2.4s,  v16.4h, v1.h[0]
1959        smlal           v2.4s,  v17.4h, v1.h[1]
1960        smlal           v2.4s,  v18.4h, v1.h[2]
1961        smlal           v2.4s,  v19.4h, v1.h[3]
1962        smlal           v2.4s,  v20.4h, v1.h[4]
1963        smlal           v2.4s,  v21.4h, v1.h[5]
1964        smlal           v2.4s,  v22.4h, v1.h[6]
1965        smlal           v2.4s,  v28.4h, v1.h[7]
1966
1967        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
1968        sqxtun          v2.8b,  v2.8h
1969        subs            \h,  \h,  #2
1970        st1             {v2.h}[0], [\dst], \d_strd
1971        st1             {v2.h}[1], [\ds2], \d_strd
1972        b.le            0f
1973        mov             v16.8b, v18.8b
1974        mov             v17.8b, v19.8b
1975        mov             v18.8b, v20.8b
1976        mov             v19.8b, v21.8b
1977        mov             v20.8b, v22.8b
1978        mov             v21.8b, v28.8b
1979        b               28b
1980
19810:
1982        br              x15
1983
1984L(\type\()_8tap_filter_2):
1985        ld1             {v28.8b},  [\sr2], \s_strd
1986        ld1             {v30.8b},  [\src], \s_strd
1987        uxtl            v28.8h,  v28.8b
1988        uxtl            v30.8h,  v30.8b
1989        ext             v29.16b, v28.16b, v28.16b, #2
1990        ext             v31.16b, v30.16b, v30.16b, #2
1991        trn1            v27.2s,  v28.2s,  v30.2s
1992        trn2            v30.2s,  v28.2s,  v30.2s
1993        trn1            v28.2s,  v29.2s,  v31.2s
1994        trn2            v31.2s,  v29.2s,  v31.2s
1995        mul             v27.4h,  v27.4h,  v0.h[0]
1996        mla             v27.4h,  v28.4h,  v0.h[1]
1997        mla             v27.4h,  v30.4h,  v0.h[2]
1998        mla             v27.4h,  v31.4h,  v0.h[3]
1999        srshr           v28.4h,  v27.4h,  #2
2000        ret
2001.endif
2002
200340:
2004        add             \xmx, \xmx, #2
2005        ld1             {v0.s}[0],  [\xmx]
2006        b.gt            480f
2007        add             \xmy, \xmy,  #2
2008        ld1             {v1.s}[0],  [\xmy]
2009        sub             \sr2, \src, #1
2010        sub             \src, \sr2, \s_strd
2011        add             \ds2, \dst, \d_strd
2012        lsl             \s_strd, \s_strd, #1
2013        lsl             \d_strd, \d_strd, #1
2014        sxtl            v0.8h,  v0.8b
2015        sxtl            v1.8h,  v1.8b
2016        mov             x15, x30
2017
2018        // 4x2, 4x4 hv
2019        ld1             {v26.8b}, [\src], \s_strd
2020        uxtl            v26.8h,  v26.8b
2021        ext             v28.16b, v26.16b, v26.16b, #2
2022        ext             v29.16b, v26.16b, v26.16b, #4
2023        ext             v30.16b, v26.16b, v26.16b, #6
2024        mul             v31.4h,  v26.4h,  v0.h[0]
2025        mla             v31.4h,  v28.4h,  v0.h[1]
2026        mla             v31.4h,  v29.4h,  v0.h[2]
2027        mla             v31.4h,  v30.4h,  v0.h[3]
2028        srshr           v16.4h,  v31.4h,  #2
2029
2030        bl              L(\type\()_8tap_filter_4)
2031        mov             v17.8b, v28.8b
2032        mov             v18.8b, v29.8b
2033
20344:
2035        bl              L(\type\()_8tap_filter_4)
2036        // Interleaving the mul/mla chains actually hurts performance
2037        // significantly on Cortex A53, thus keeping mul/mla tightly
2038        // chained like this.
2039        smull           v2.4s,  v16.4h, v1.h[0]
2040        smlal           v2.4s,  v17.4h, v1.h[1]
2041        smlal           v2.4s,  v18.4h, v1.h[2]
2042        smlal           v2.4s,  v28.4h, v1.h[3]
2043        smull           v3.4s,  v17.4h, v1.h[0]
2044        smlal           v3.4s,  v18.4h, v1.h[1]
2045        smlal           v3.4s,  v28.4h, v1.h[2]
2046        smlal           v3.4s,  v29.4h, v1.h[3]
2047        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
2048        sqrshrn         v3.4h,  v3.4s,  #\shift_hv
2049        subs            \h,  \h,  #2
2050.ifc \type, put
2051        sqxtun          v2.8b,  v2.8h
2052        sqxtun          v3.8b,  v3.8h
2053        st1             {v2.s}[0], [\dst], \d_strd
2054        st1             {v3.s}[0], [\ds2], \d_strd
2055.else
2056        st1             {v2.4h}, [\dst], \d_strd
2057        st1             {v3.4h}, [\ds2], \d_strd
2058.endif
2059        b.le            0f
2060        mov             v16.8b,  v18.8b
2061        mov             v17.8b,  v28.8b
2062        mov             v18.8b,  v29.8b
2063        b               4b
2064
480:    // 4x8, 4x16, 4x32 hv
        ld1             {v1.8b},  [\xmy]        // all 8 vertical taps
        sub             \src, \src, #1          // center the horizontal filter
        sub             \sr2, \src, \s_strd, lsl #1
        sub             \src, \sr2, \s_strd     // back up 3 rows of history
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1    // two rows per iteration
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h,  v0.8b
        sxtl            v1.8h,  v1.8b
        mov             x15, x30                // save LR; bl is used below

        // Horizontally filter the first history row into v16.
        ld1             {v26.8b}, [\src], \s_strd
        uxtl            v26.8h,  v26.8b
        ext             v28.16b, v26.16b, v26.16b, #2
        ext             v29.16b, v26.16b, v26.16b, #4
        ext             v30.16b, v26.16b, v26.16b, #6
        mul             v31.4h,  v26.4h,  v0.h[0]
        mla             v31.4h,  v28.4h,  v0.h[1]
        mla             v31.4h,  v29.4h,  v0.h[2]
        mla             v31.4h,  v30.4h,  v0.h[3]
        srshr           v16.4h,  v31.4h,  #2

        // Build the remaining 6 rows of history, two per call.
        bl              L(\type\()_8tap_filter_4)
        mov             v17.8b, v28.8b
        mov             v18.8b, v29.8b
        bl              L(\type\()_8tap_filter_4)
        mov             v19.8b, v28.8b
        mov             v20.8b, v29.8b
        bl              L(\type\()_8tap_filter_4)
        mov             v21.8b, v28.8b
        mov             v22.8b, v29.8b

48:     // loop: 2 output rows per iteration
        bl              L(\type\()_8tap_filter_4)
        smull           v2.4s,  v16.4h, v1.h[0] // row 0: 8-tap vertical MAC
        smlal           v2.4s,  v17.4h, v1.h[1]
        smlal           v2.4s,  v18.4h, v1.h[2]
        smlal           v2.4s,  v19.4h, v1.h[3]
        smlal           v2.4s,  v20.4h, v1.h[4]
        smlal           v2.4s,  v21.4h, v1.h[5]
        smlal           v2.4s,  v22.4h, v1.h[6]
        smlal           v2.4s,  v28.4h, v1.h[7]
        smull           v3.4s,  v17.4h, v1.h[0] // row 1
        smlal           v3.4s,  v18.4h, v1.h[1]
        smlal           v3.4s,  v19.4h, v1.h[2]
        smlal           v3.4s,  v20.4h, v1.h[3]
        smlal           v3.4s,  v21.4h, v1.h[4]
        smlal           v3.4s,  v22.4h, v1.h[5]
        smlal           v3.4s,  v28.4h, v1.h[6]
        smlal           v3.4s,  v29.4h, v1.h[7]
        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
        sqrshrn         v3.4h,  v3.4s,  #\shift_hv
        subs            \h,  \h,  #2
.ifc \type, put
        sqxtun          v2.8b,  v2.8h           // put: saturate to 8 bit
        sqxtun          v3.8b,  v3.8h
        st1             {v2.s}[0], [\dst], \d_strd
        st1             {v3.s}[0], [\ds2], \d_strd
.else
        st1             {v2.4h}, [\dst], \d_strd        // prep: store 16 bit
        st1             {v3.4h}, [\ds2], \d_strd
.endif
        b.le            0f
        mov             v16.8b,  v18.8b         // rotate vertical history
        mov             v17.8b,  v19.8b
        mov             v18.8b,  v20.8b
        mov             v19.8b,  v21.8b
        mov             v20.8b,  v22.8b
        mov             v21.8b,  v28.8b
        mov             v22.8b,  v29.8b
        b               48b
0:
        br              x15                     // return via the saved LR
2139
// Subroutine: horizontally filter one row from \sr2 and one from \src
// with the 4 coefficients in v0. Returns 4 filtered pixels per row,
// rounded down by 2 bits: v28.4h = row from \sr2, v29.4h = row from
// \src. Advances both pointers by \s_strd. Clobbers v26-v31.
L(\type\()_8tap_filter_4):
        ld1             {v26.8b}, [\sr2], \s_strd
        ld1             {v27.8b}, [\src], \s_strd
        uxtl            v26.8h,  v26.8b
        uxtl            v27.8h,  v27.8b
        ext             v28.16b, v26.16b, v26.16b, #2   // row shifted by 1, 2, 3 px
        ext             v29.16b, v26.16b, v26.16b, #4
        ext             v30.16b, v26.16b, v26.16b, #6
        mul             v31.4h,  v26.4h,  v0.h[0]
        mla             v31.4h,  v28.4h,  v0.h[1]
        mla             v31.4h,  v29.4h,  v0.h[2]
        mla             v31.4h,  v30.4h,  v0.h[3]
        ext             v28.16b, v27.16b, v27.16b, #2
        ext             v29.16b, v27.16b, v27.16b, #4
        ext             v30.16b, v27.16b, v27.16b, #6
        mul             v27.4h,  v27.4h,  v0.h[0]
        mla             v27.4h,  v28.4h,  v0.h[1]
        mla             v27.4h,  v29.4h,  v0.h[2]
        mla             v27.4h,  v30.4h,  v0.h[3]
        srshr           v28.4h,  v31.4h,  #2            // 2-bit intermediate rounding
        srshr           v29.4h,  v27.4h,  #2
        ret
2162
216380:
2164160:
2165320:
2166        b.gt            880f
2167        add             \xmy,  \xmy,  #2
2168        ld1             {v0.8b},  [\xmx]
2169        ld1             {v1.s}[0],  [\xmy]
2170        sub             \src,  \src,  #3
2171        sub             \src,  \src,  \s_strd
2172        sxtl            v0.8h,  v0.8b
2173        sxtl            v1.8h,  v1.8b
2174        mov             x15, x30
2175        mov             \my,  \h
2176
2177164:    // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
2178        add             \ds2,  \dst,  \d_strd
2179        add             \sr2,  \src,  \s_strd
2180        lsl             \d_strd, \d_strd, #1
2181        lsl             \s_strd, \s_strd, #1
2182
2183        ld1             {v28.8b, v29.8b},  [\src], \s_strd
2184        uxtl            v28.8h,  v28.8b
2185        uxtl            v29.8h,  v29.8b
2186        mul             v24.8h,  v28.8h,  v0.h[0]
2187.irpc i, 1234567
2188        ext             v26.16b, v28.16b, v29.16b, #(2*\i)
2189        mla             v24.8h,  v26.8h,  v0.h[\i]
2190.endr
2191        srshr           v16.8h,  v24.8h, #2
2192
2193        bl              L(\type\()_8tap_filter_8)
2194        mov             v17.16b, v24.16b
2195        mov             v18.16b, v25.16b
2196
21978:
2198        smull           v2.4s,  v16.4h, v1.h[0]
2199        smull2          v3.4s,  v16.8h, v1.h[0]
2200        bl              L(\type\()_8tap_filter_8)
2201        smull           v4.4s,  v17.4h, v1.h[0]
2202        smull2          v5.4s,  v17.8h, v1.h[0]
2203        smlal           v2.4s,  v17.4h, v1.h[1]
2204        smlal2          v3.4s,  v17.8h, v1.h[1]
2205        smlal           v4.4s,  v18.4h, v1.h[1]
2206        smlal2          v5.4s,  v18.8h, v1.h[1]
2207        smlal           v2.4s,  v18.4h, v1.h[2]
2208        smlal2          v3.4s,  v18.8h, v1.h[2]
2209        smlal           v4.4s,  v24.4h, v1.h[2]
2210        smlal2          v5.4s,  v24.8h, v1.h[2]
2211        smlal           v2.4s,  v24.4h, v1.h[3]
2212        smlal2          v3.4s,  v24.8h, v1.h[3]
2213        smlal           v4.4s,  v25.4h, v1.h[3]
2214        smlal2          v5.4s,  v25.8h, v1.h[3]
2215        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
2216        sqrshrn2        v2.8h,  v3.4s,  #\shift_hv
2217        sqrshrn         v4.4h,  v4.4s,  #\shift_hv
2218        sqrshrn2        v4.8h,  v5.4s,  #\shift_hv
2219        subs            \h,  \h,  #2
2220.ifc \type, put
2221        sqxtun          v2.8b,  v2.8h
2222        sqxtun          v4.8b,  v4.8h
2223        st1             {v2.8b}, [\dst], \d_strd
2224        st1             {v4.8b}, [\ds2], \d_strd
2225.else
2226        st1             {v2.8h}, [\dst], \d_strd
2227        st1             {v4.8h}, [\ds2], \d_strd
2228.endif
2229        b.le            9f
2230        mov             v16.16b, v18.16b
2231        mov             v17.16b, v24.16b
2232        mov             v18.16b, v25.16b
2233        b               8b
22349:
2235        subs            \w,  \w,  #8
2236        b.le            0f
2237        asr             \s_strd,  \s_strd,  #1
2238        asr             \d_strd,  \d_strd,  #1
2239        msub            \src,  \s_strd,  \xmy,  \src
2240        msub            \dst,  \d_strd,  \xmy,  \dst
2241        sub             \src,  \src,  \s_strd,  lsl #2
2242        mov             \h,  \my
2243        add             \src,  \src,  #8
2244.ifc \type, put
2245        add             \dst,  \dst,  #8
2246.else
2247        add             \dst,  \dst,  #16
2248.endif
2249        b               164b
2250
880:    // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
640:
1280:
        // Full 8-tap horizontal + 8-tap vertical, processed in 8-column
        // strips with a 7-row vertical history in v16-v22.
        ld1             {v0.8b},  [\xmx]        // 8 horizontal taps
        ld1             {v1.8b},  [\xmy]        // 8 vertical taps
        sub             \src,  \src,  #3        // center the horizontal filter
        sub             \src,  \src,  \s_strd
        sub             \src,  \src,  \s_strd, lsl #1   // back up 3 rows total
        sxtl            v0.8h,  v0.8b
        sxtl            v1.8h,  v1.8b
        mov             x15, x30                // save LR; bl is used below
        mov             \my,  \h                // save height for the strip loop

168:    // per-8-column-strip loop
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \d_strd, \d_strd, #1    // two rows per iteration
        lsl             \s_strd, \s_strd, #1

        // Horizontally filter the first history row into v16.
        ld1             {v28.8b, v29.8b},  [\src], \s_strd
        uxtl            v28.8h,  v28.8b
        uxtl            v29.8h,  v29.8b
        mul             v24.8h,  v28.8h,  v0.h[0]
.irpc i, 1234567
        ext             v26.16b, v28.16b, v29.16b, #(2*\i)
        mla             v24.8h,  v26.8h,  v0.h[\i]
.endr
        srshr           v16.8h,  v24.8h, #2

        // Build the remaining 6 rows of history, two per call.
        bl              L(\type\()_8tap_filter_8)
        mov             v17.16b, v24.16b
        mov             v18.16b, v25.16b
        bl              L(\type\()_8tap_filter_8)
        mov             v19.16b, v24.16b
        mov             v20.16b, v25.16b
        bl              L(\type\()_8tap_filter_8)
        mov             v21.16b, v24.16b
        mov             v22.16b, v25.16b

88:     // loop: 2 output rows per iteration
        smull           v2.4s,  v16.4h, v1.h[0] // row 0, low half: 8-tap MAC
        smull2          v3.4s,  v16.8h, v1.h[0] // row 0, high half
        bl              L(\type\()_8tap_filter_8)
        smull           v4.4s,  v17.4h, v1.h[0] // row 1
        smull2          v5.4s,  v17.8h, v1.h[0]
        smlal           v2.4s,  v17.4h, v1.h[1]
        smlal2          v3.4s,  v17.8h, v1.h[1]
        smlal           v4.4s,  v18.4h, v1.h[1]
        smlal2          v5.4s,  v18.8h, v1.h[1]
        smlal           v2.4s,  v18.4h, v1.h[2]
        smlal2          v3.4s,  v18.8h, v1.h[2]
        smlal           v4.4s,  v19.4h, v1.h[2]
        smlal2          v5.4s,  v19.8h, v1.h[2]
        smlal           v2.4s,  v19.4h, v1.h[3]
        smlal2          v3.4s,  v19.8h, v1.h[3]
        smlal           v4.4s,  v20.4h, v1.h[3]
        smlal2          v5.4s,  v20.8h, v1.h[3]
        smlal           v2.4s,  v20.4h, v1.h[4]
        smlal2          v3.4s,  v20.8h, v1.h[4]
        smlal           v4.4s,  v21.4h, v1.h[4]
        smlal2          v5.4s,  v21.8h, v1.h[4]
        smlal           v2.4s,  v21.4h, v1.h[5]
        smlal2          v3.4s,  v21.8h, v1.h[5]
        smlal           v4.4s,  v22.4h, v1.h[5]
        smlal2          v5.4s,  v22.8h, v1.h[5]
        smlal           v2.4s,  v22.4h, v1.h[6]
        smlal2          v3.4s,  v22.8h, v1.h[6]
        smlal           v4.4s,  v24.4h, v1.h[6]
        smlal2          v5.4s,  v24.8h, v1.h[6]
        smlal           v2.4s,  v24.4h, v1.h[7]
        smlal2          v3.4s,  v24.8h, v1.h[7]
        smlal           v4.4s,  v25.4h, v1.h[7]
        smlal2          v5.4s,  v25.8h, v1.h[7]
        sqrshrn         v2.4h,  v2.4s,  #\shift_hv      // final rounding shift
        sqrshrn2        v2.8h,  v3.4s,  #\shift_hv
        sqrshrn         v4.4h,  v4.4s,  #\shift_hv
        sqrshrn2        v4.8h,  v5.4s,  #\shift_hv
        subs            \h,  \h,  #2
.ifc \type, put
        sqxtun          v2.8b,  v2.8h           // put: saturate to 8 bit
        sqxtun          v4.8b,  v4.8h
        st1             {v2.8b}, [\dst], \d_strd
        st1             {v4.8b}, [\ds2], \d_strd
.else
        st1             {v2.8h}, [\dst], \d_strd        // prep: store 16 bit
        st1             {v4.8h}, [\ds2], \d_strd
.endif
        b.le            9f
        mov             v16.16b, v18.16b        // rotate vertical history
        mov             v17.16b, v19.16b
        mov             v18.16b, v20.16b
        mov             v19.16b, v21.16b
        mov             v20.16b, v22.16b
        mov             v21.16b, v24.16b
        mov             v22.16b, v25.16b
        b               88b
9:      // advance to the next 8-column strip
        subs            \w,  \w,  #8
        b.le            0f
        asr             \s_strd,  \s_strd,  #1  // undo the per-strip doubling
        asr             \d_strd,  \d_strd,  #1
        // Rewind the pointers by the rows consumed; \xmy is presumably the
        // 64-bit alias of \my, holding the saved height -- TODO confirm.
        msub            \src,  \s_strd,  \xmy,  \src
        msub            \dst,  \d_strd,  \xmy,  \dst
        sub             \src,  \src,  \s_strd,  lsl #3  // also undo the history advance
        mov             \h,  \my                // restore height
        add             \src,  \src,  #8        // step 8 pixels right
.ifc \type, put
        add             \dst,  \dst,  #8        // 8-bit output
.else
        add             \dst,  \dst,  #16       // 16-bit intermediate output
.endif
        b               168b
0:
        br              x15                     // return via the saved LR
2365
L(\type\()_8tap_filter_8):
        // Horizontally 8-tap-filter two new source rows.
        // In:  v0 = 8 filter coefficients (16-bit lanes),
        //      \sr2/\src = row pointers (advanced by \s_strd each).
        // Out: v24 = first row, v25 = second row, both rounded >>2.
        // Clobbers v26-v31.
        ld1             {v28.8b, v29.8b},  [\sr2], \s_strd
        ld1             {v30.8b, v31.8b},  [\src], \s_strd
        uxtl            v28.8h,  v28.8b
        uxtl            v29.8h,  v29.8b
        uxtl            v30.8h,  v30.8b
        uxtl            v31.8h,  v31.8b
        // Tap 0 directly, then taps 1-7 via sliding 2-byte windows (ext).
        mul             v24.8h,  v28.8h,  v0.h[0]
        mul             v25.8h,  v30.8h,  v0.h[0]
.irpc i, 1234567
        ext             v26.16b, v28.16b, v29.16b, #(2*\i)
        ext             v27.16b, v30.16b, v31.16b, #(2*\i)
        mla             v24.8h,  v26.8h,  v0.h[\i]
        mla             v25.8h,  v27.8h,  v0.h[\i]
.endr
        srshr           v24.8h,  v24.8h, #2
        srshr           v25.8h,  v25.8h, #2
        ret
2384
L(\type\()_8tap_hv_tbl):
        // Relative offsets (subtracted from the table address) to the
        // per-width entry points; entry labels are 10x the block width.
        .hword L(\type\()_8tap_hv_tbl) - 1280b
        .hword L(\type\()_8tap_hv_tbl) -  640b
        .hword L(\type\()_8tap_hv_tbl) -  320b
        .hword L(\type\()_8tap_hv_tbl) -  160b
        .hword L(\type\()_8tap_hv_tbl) -   80b
        .hword L(\type\()_8tap_hv_tbl) -   40b
        .hword L(\type\()_8tap_hv_tbl) -   20b
        .hword 0
endfunc
2395
2396
function \type\()_bilin_8bpc_neon, export=1
        // Bilinear MC entry. Weight setup (replicated per byte lane):
        //   v0 = 16-mx, v1 = mx  (horizontal pair)
        //   v2 = 16-my, v3 = my  (vertical pair)
        dup             v1.16b, \mx
        dup             v3.16b, \my
        mov             w9,  #16
        sub             w8, w9, \mx
        sub             w9, w9, \my
        dup             v0.16b, w8
        dup             v2.16b, w9
.ifc \type, prep
        // prep writes 16-bit intermediates to a packed buffer: stride = 2*w.
        uxtw            \d_strd, \w
        lsl             \d_strd, \d_strd, #1
.endif

        // Jump-table index from the width: clz(w)-24 maps w=128..2 to 0..6
        // (the tables list the 1280..20 entry points in that order).
        clz             w8,  \w
        sub             w8,  w8,  #24
        // Dispatch on which of mx/my are nonzero; plain copy if neither.
        cbnz            \mx, L(\type\()_bilin_h)
        cbnz            \my, L(\type\()_bilin_v)
        b               \type\()_neon
2415
L(\type\()_bilin_h):
        cbnz            \my, L(\type\()_bilin_hv)

        // Horizontal-only 2-tap filter: weights v0 (16-mx) and v1 (mx);
        // put rounds >>4 to pixels, prep keeps the 16-bit sums.
        adr             x9,  L(\type\()_bilin_h_tbl)
        ldrh            w8,  [x9, x8, lsl #1]
        sub             x9,  x9,  w8, uxtw
        br              x9

20:     // 2xN h
.ifc \type, put
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \d_strd,  \d_strd,  #1
        lsl             \s_strd,  \s_strd,  #1
2:
        // Two rows per iteration, interleaved (trn1) so a single
        // umull/umlal pair filters both.
        ld1             {v4.s}[0],  [\src], \s_strd
        ld1             {v6.s}[0],  [\sr2], \s_strd
        ext             v5.8b,  v4.8b,  v4.8b, #1
        ext             v7.8b,  v6.8b,  v6.8b, #1
        trn1            v4.4h,  v4.4h,  v6.4h
        trn1            v5.4h,  v5.4h,  v7.4h
        subs            \h,  \h,  #2
        umull           v4.8h,  v4.8b,  v0.8b
        umlal           v4.8h,  v5.8b,  v1.8b
        uqrshrn         v4.8b,  v4.8h,  #4
        st1             {v4.h}[0], [\dst], \d_strd
        st1             {v4.h}[1], [\ds2], \d_strd
        b.gt            2b
        ret
.endif

40:     // 4xN h
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \d_strd,  \d_strd,  #1
        lsl             \s_strd,  \s_strd,  #1
4:
        ld1             {v4.8b}, [\src], \s_strd
        ld1             {v6.8b}, [\sr2], \s_strd
        // v5/v7 = the same rows shifted one pixel left (second tap).
        ext             v5.8b,  v4.8b,  v4.8b, #1
        ext             v7.8b,  v6.8b,  v6.8b, #1
        trn1            v4.2s,  v4.2s,  v6.2s
        trn1            v5.2s,  v5.2s,  v7.2s
        subs            \h,  \h,  #2
        umull           v4.8h,  v4.8b,  v0.8b
        umlal           v4.8h,  v5.8b,  v1.8b
.ifc \type, put
        uqrshrn         v4.8b,  v4.8h,  #4
        st1             {v4.s}[0], [\dst], \d_strd
        st1             {v4.s}[1], [\ds2], \d_strd
.else
        st1             {v4.d}[0], [\dst], \d_strd
        st1             {v4.d}[1], [\ds2], \d_strd
.endif
        b.gt            4b
        ret

80:     // 8xN h
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \d_strd,  \d_strd,  #1
        lsl             \s_strd,  \s_strd,  #1
8:
        ld1             {v4.16b}, [\src], \s_strd
        ld1             {v6.16b}, [\sr2], \s_strd
        ext             v5.16b, v4.16b, v4.16b, #1
        ext             v7.16b, v6.16b, v6.16b, #1
        subs            \h,  \h,  #2
        umull           v4.8h,  v4.8b,  v0.8b
        umull           v6.8h,  v6.8b,  v0.8b
        umlal           v4.8h,  v5.8b,  v1.8b
        umlal           v6.8h,  v7.8b,  v1.8b
.ifc \type, put
        uqrshrn         v4.8b,  v4.8h,  #4
        uqrshrn         v6.8b,  v6.8h,  #4
        st1             {v4.8b}, [\dst], \d_strd
        st1             {v6.8b}, [\ds2], \d_strd
.else
        st1             {v4.8h}, [\dst], \d_strd
        st1             {v6.8h}, [\ds2], \d_strd
.endif
        b.gt            8b
        ret
160:
320:
640:
1280:   // 16xN, 32xN, ... h
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \s_strd,  \s_strd,  #1

        // The inner loop advances src by w+8 bytes per row (8-byte preload
        // plus w bytes of 16-byte loads); pre-subtract that from the stride
        // so the pointer bump at 9: lands on the next row pair.
        sub             \s_strd,  \s_strd,  \w, uxtw
        sub             \s_strd,  \s_strd,  #8
.ifc \type, put
        lsl             \d_strd,  \d_strd,  #1
        sub             \d_strd,  \d_strd,  \w, uxtw
.endif
161:
        // Preload 8 bytes into the high half so ext #8/#9 below yield the
        // current pixels and their one-pixel-shifted neighbours.
        ld1             {v16.d}[1],  [\src], #8
        ld1             {v20.d}[1],  [\sr2], #8
        mov             \mx, \w

16:
        ld1             {v18.16b},  [\src], #16
        ld1             {v22.16b},  [\sr2], #16
        ext             v17.16b, v16.16b, v18.16b, #8
        ext             v19.16b, v16.16b, v18.16b, #9
        ext             v21.16b, v20.16b, v22.16b, #8
        ext             v23.16b, v20.16b, v22.16b, #9
        umull           v16.8h,  v17.8b,  v0.8b
        umull2          v17.8h,  v17.16b, v0.16b
        umull           v20.8h,  v21.8b,  v0.8b
        umull2          v21.8h,  v21.16b, v0.16b
        umlal           v16.8h,  v19.8b,  v1.8b
        umlal2          v17.8h,  v19.16b, v1.16b
        umlal           v20.8h,  v23.8b,  v1.8b
        umlal2          v21.8h,  v23.16b, v1.16b
        subs            \mx, \mx, #16
.ifc \type, put
        uqrshrn         v16.8b,  v16.8h, #4
        uqrshrn2        v16.16b, v17.8h, #4
        uqrshrn         v20.8b,  v20.8h, #4
        uqrshrn2        v20.16b, v21.8h, #4
        st1             {v16.16b}, [\dst], #16
        st1             {v20.16b}, [\ds2], #16
.else
        st1             {v16.8h, v17.8h}, [\dst], #32
        st1             {v20.8h, v21.8h}, [\ds2], #32
.endif
        b.le            9f

        mov             v16.16b, v18.16b
        mov             v20.16b, v22.16b
        b               16b

9:
        // Advance all four pointers to the next pair of rows.
        add             \dst,  \dst,  \d_strd
        add             \ds2,  \ds2,  \d_strd
        add             \src,  \src,  \s_strd
        add             \sr2,  \sr2,  \s_strd

        subs            \h,  \h,  #2
        b.gt            161b
        ret

L(\type\()_bilin_h_tbl):
        // Offsets to the per-width entries above (labels = 10x width).
        .hword L(\type\()_bilin_h_tbl) - 1280b
        .hword L(\type\()_bilin_h_tbl) -  640b
        .hword L(\type\()_bilin_h_tbl) -  320b
        .hword L(\type\()_bilin_h_tbl) -  160b
        .hword L(\type\()_bilin_h_tbl) -   80b
        .hword L(\type\()_bilin_h_tbl) -   40b
        .hword L(\type\()_bilin_h_tbl) -   20b
        .hword 0
2570
2571
L(\type\()_bilin_v):
        // Vertical-only 2-tap filter: weights v2 (16-my) and v3 (my).
        cmp             \h,  #4
        adr             x9,  L(\type\()_bilin_v_tbl)
        ldrh            w8,  [x9, x8, lsl #1]
        sub             x9,  x9,  w8, uxtw
        br              x9

20:     // 2xN v
.ifc \type, put
        cmp             \h,  #2
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \s_strd,  \s_strd,  #1
        lsl             \d_strd,  \d_strd,  #1

        // 2x2 v
        ld1             {v16.h}[0], [\src], \s_strd
        b.gt            24f
        ld1             {v17.h}[0], [\sr2], \s_strd
        ld1             {v18.h}[0], [\src], \s_strd
        // Pair up consecutive rows so one umull/umlal filters both outputs.
        trn1            v16.4h, v16.4h, v17.4h
        trn1            v17.4h, v17.4h, v18.4h
        umull           v4.8h,  v16.8b,  v2.8b
        umlal           v4.8h,  v17.8b,  v3.8b
        uqrshrn         v4.8b,  v4.8h,  #4
        st1             {v4.h}[0], [\dst]
        st1             {v4.h}[1], [\ds2]
        ret
24:     // 2x4, 2x8, ... v
        ld1             {v17.h}[0], [\sr2], \s_strd
        ld1             {v18.h}[0], [\src], \s_strd
        ld1             {v19.h}[0], [\sr2], \s_strd
        ld1             {v20.h}[0], [\src], \s_strd
        // Interleave 5 rows into two vectors covering 4 output rows.
        trn1            v16.4h, v16.4h, v17.4h
        trn1            v17.4h, v17.4h, v18.4h
        trn1            v18.4h, v18.4h, v19.4h
        trn1            v19.4h, v19.4h, v20.4h
        trn1            v16.2s, v16.2s, v18.2s
        trn1            v17.2s, v17.2s, v19.2s
        umull           v4.8h,  v16.8b,  v2.8b
        umlal           v4.8h,  v17.8b,  v3.8b
        subs            \h,  \h,  #4
        uqrshrn         v4.8b,  v4.8h,  #4
        st1             {v4.h}[0], [\dst], \d_strd
        st1             {v4.h}[1], [\ds2], \d_strd
        st1             {v4.h}[2], [\dst], \d_strd
        st1             {v4.h}[3], [\ds2], \d_strd
        b.le            0f
        // Carry the last loaded row over as the first tap of the next group.
        mov             v16.8b, v20.8b
        b               24b
0:
        ret
.endif

40:     // 4xN v
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \s_strd,  \s_strd,  #1
        lsl             \d_strd,  \d_strd,  #1
        ld1             {v16.s}[0], [\src], \s_strd
4:
        ld1             {v17.s}[0], [\sr2], \s_strd
        ld1             {v18.s}[0], [\src], \s_strd
        trn1            v16.2s, v16.2s, v17.2s
        trn1            v17.2s, v17.2s, v18.2s
        umull           v4.8h,  v16.8b,  v2.8b
        umlal           v4.8h,  v17.8b,  v3.8b
        subs            \h,  \h,  #2
.ifc \type, put
        uqrshrn         v4.8b,  v4.8h,  #4
        st1             {v4.s}[0], [\dst], \d_strd
        st1             {v4.s}[1], [\ds2], \d_strd
.else
        st1             {v4.d}[0], [\dst], \d_strd
        st1             {v4.d}[1], [\ds2], \d_strd
.endif
        b.le            0f
        mov             v16.8b, v18.8b
        b               4b
0:
        ret

80:     // 8xN v
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \s_strd,  \s_strd,  #1
        lsl             \d_strd,  \d_strd,  #1
        ld1             {v16.8b}, [\src], \s_strd
8:
        ld1             {v17.8b}, [\sr2], \s_strd
        ld1             {v18.8b}, [\src], \s_strd
        umull           v4.8h,  v16.8b,  v2.8b
        umull           v5.8h,  v17.8b,  v2.8b
        umlal           v4.8h,  v17.8b,  v3.8b
        umlal           v5.8h,  v18.8b,  v3.8b
        subs            \h,  \h,  #2
.ifc \type, put
        uqrshrn         v4.8b,  v4.8h,  #4
        uqrshrn         v5.8b,  v5.8h,  #4
        st1             {v4.8b}, [\dst], \d_strd
        st1             {v5.8b}, [\ds2], \d_strd
.else
        st1             {v4.8h}, [\dst], \d_strd
        st1             {v5.8h}, [\ds2], \d_strd
.endif
        b.le            0f
        mov             v16.8b, v18.8b
        b               8b
0:
        ret

160:    // 16xN, 32xN, ...
320:
640:
1280:
        // \my saves the height so it can be restored per 16-wide strip.
        mov             \my,  \h
1:
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        ld1             {v16.16b}, [\src], \s_strd
2:
        ld1             {v17.16b}, [\sr2], \s_strd
        ld1             {v18.16b}, [\src], \s_strd
        umull           v4.8h,  v16.8b,  v2.8b
        umull2          v5.8h,  v16.16b, v2.16b
        umull           v6.8h,  v17.8b,  v2.8b
        umull2          v7.8h,  v17.16b, v2.16b
        umlal           v4.8h,  v17.8b,  v3.8b
        umlal2          v5.8h,  v17.16b, v3.16b
        umlal           v6.8h,  v18.8b,  v3.8b
        umlal2          v7.8h,  v18.16b, v3.16b
        subs            \h,  \h,  #2
.ifc \type, put
        uqrshrn         v4.8b,  v4.8h,  #4
        uqrshrn2        v4.16b, v5.8h,  #4
        uqrshrn         v6.8b,  v6.8h,  #4
        uqrshrn2        v6.16b, v7.8h,  #4
        st1             {v4.16b}, [\dst], \d_strd
        st1             {v6.16b}, [\ds2], \d_strd
.else
        st1             {v4.8h, v5.8h}, [\dst], \d_strd
        st1             {v6.8h, v7.8h}, [\ds2], \d_strd
.endif
        b.le            9f
        mov             v16.16b, v18.16b
        b               2b
9:
        // Column strip done; rewind to the top and step 16 pixels right.
        subs            \w,  \w,  #16
        b.le            0f
        asr             \s_strd, \s_strd, #1
        asr             \d_strd, \d_strd, #1
        msub            \src, \s_strd, \xmy, \src
        msub            \dst, \d_strd, \xmy, \dst
        sub             \src, \src, \s_strd, lsl #1
        mov             \h,  \my
        add             \src, \src, #16
.ifc \type, put
        add             \dst, \dst, #16
.else
        add             \dst, \dst, #32
.endif
        b               1b
0:
        ret

L(\type\()_bilin_v_tbl):
        // Offsets to the per-width entries above (labels = 10x width).
        .hword L(\type\()_bilin_v_tbl) - 1280b
        .hword L(\type\()_bilin_v_tbl) -  640b
        .hword L(\type\()_bilin_v_tbl) -  320b
        .hword L(\type\()_bilin_v_tbl) -  160b
        .hword L(\type\()_bilin_v_tbl) -   80b
        .hword L(\type\()_bilin_v_tbl) -   40b
        .hword L(\type\()_bilin_v_tbl) -   20b
        .hword 0
2749
L(\type\()_bilin_hv):
        // Combined 2-tap h+v: the horizontal pass produces 16-bit rows
        // (4 fractional bits), the vertical pass weights them with the
        // widened 16-bit v2/v3, so put narrows with the combined >>8.
        uxtl            v2.8h, v2.8b
        uxtl            v3.8h, v3.8b
        adr             x9,  L(\type\()_bilin_hv_tbl)
        ldrh            w8,  [x9, x8, lsl #1]
        sub             x9,  x9,  w8, uxtw
        br              x9

20:     // 2xN hv
.ifc \type, put
        add             \sr2, \src, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        // Horizontally filter the lead-in row into v16.
        ld1             {v28.s}[0],  [\src], \s_strd
        ext             v29.8b, v28.8b, v28.8b, #1
        umull           v16.8h, v28.8b, v0.8b
        umlal           v16.8h, v29.8b, v1.8b

2:
        ld1             {v28.s}[0],  [\sr2], \s_strd
        ld1             {v30.s}[0],  [\src], \s_strd
        ext             v29.8b, v28.8b, v28.8b, #1
        ext             v31.8b, v30.8b, v30.8b, #1
        trn1            v28.4h, v28.4h, v30.4h
        trn1            v29.4h, v29.4h, v31.4h
        umull           v17.8h, v28.8b, v0.8b
        umlal           v17.8h, v29.8b, v1.8b

        // v16 = rows n,n+1; v17 = rows n+1,n+2 for the vertical taps.
        trn1            v16.2s, v16.2s, v17.2s

        mul             v4.4h,  v16.4h, v2.4h
        mla             v4.4h,  v17.4h, v3.4h
        uqrshrn         v4.8b,  v4.8h,  #8
        subs            \h,  \h,  #2
        st1             {v4.h}[0], [\dst], \d_strd
        st1             {v4.h}[1], [\ds2], \d_strd
        b.le            0f
        // Keep the newest filtered row as the next iteration's first tap.
        trn2            v16.2s, v17.2s, v17.2s
        b               2b
0:
        ret
.endif

40:     // 4xN hv
        add             \sr2, \src, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        ld1             {v28.8b},  [\src], \s_strd
        ext             v29.8b, v28.8b, v28.8b, #1
        umull           v16.8h, v28.8b, v0.8b
        umlal           v16.8h, v29.8b, v1.8b

4:
        ld1             {v28.8b},  [\sr2], \s_strd
        ld1             {v30.8b},  [\src], \s_strd
        ext             v29.8b, v28.8b, v28.8b, #1
        ext             v31.8b, v30.8b, v30.8b, #1
        trn1            v28.2s, v28.2s, v30.2s
        trn1            v29.2s, v29.2s, v31.2s
        umull           v17.8h, v28.8b, v0.8b
        umlal           v17.8h, v29.8b, v1.8b

        trn1            v16.2d, v16.2d, v17.2d

        mul             v4.8h,  v16.8h, v2.8h
        mla             v4.8h,  v17.8h, v3.8h
        subs            \h,  \h,  #2
.ifc \type, put
        uqrshrn         v4.8b,  v4.8h,  #8
        st1             {v4.s}[0], [\dst], \d_strd
        st1             {v4.s}[1], [\ds2], \d_strd
.else
        // prep: drop only the vertical rounding, keep 4 fractional bits.
        urshr           v4.8h,  v4.8h,  #4
        st1             {v4.d}[0], [\dst], \d_strd
        st1             {v4.d}[1], [\ds2], \d_strd
.endif
        b.le            0f
        trn2            v16.2d, v17.2d, v17.2d
        b               4b
0:
        ret

80:     // 8xN, 16xN, ... hv
160:
320:
640:
1280:
        mov             \my,  \h

1:
        add             \sr2, \src, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        ld1             {v28.16b},  [\src], \s_strd
        ext             v29.16b, v28.16b, v28.16b, #1
        umull           v16.8h, v28.8b, v0.8b
        umlal           v16.8h, v29.8b, v1.8b

2:
        // Horizontally filter two new rows (v17, v18), then vertically
        // blend the three available rows into two outputs.
        ld1             {v28.16b},  [\sr2], \s_strd
        ld1             {v30.16b},  [\src], \s_strd
        ext             v29.16b, v28.16b, v28.16b, #1
        ext             v31.16b, v30.16b, v30.16b, #1
        umull           v17.8h, v28.8b, v0.8b
        umlal           v17.8h, v29.8b, v1.8b
        umull           v18.8h, v30.8b, v0.8b
        umlal           v18.8h, v31.8b, v1.8b

        mul             v4.8h,  v16.8h, v2.8h
        mla             v4.8h,  v17.8h, v3.8h
        mul             v5.8h,  v17.8h, v2.8h
        mla             v5.8h,  v18.8h, v3.8h
        subs            \h,  \h,  #2
.ifc \type, put
        uqrshrn         v4.8b,  v4.8h,  #8
        uqrshrn         v5.8b,  v5.8h,  #8
        st1             {v4.8b}, [\dst], \d_strd
        st1             {v5.8b}, [\ds2], \d_strd
.else
        urshr           v4.8h,  v4.8h,  #4
        urshr           v5.8h,  v5.8h,  #4
        st1             {v4.8h}, [\dst], \d_strd
        st1             {v5.8h}, [\ds2], \d_strd
.endif
        b.le            9f
        mov             v16.16b, v18.16b
        b               2b
9:
        // 8-wide column strip done; rewind (plus the one-row lead-in on
        // src) and step right if width remains.
        subs            \w,  \w,  #8
        b.le            0f
        asr             \s_strd,  \s_strd,  #1
        asr             \d_strd,  \d_strd,  #1
        msub            \src,  \s_strd,  \xmy,  \src
        msub            \dst,  \d_strd,  \xmy,  \dst
        sub             \src,  \src,  \s_strd,  lsl #1
        mov             \h,  \my
        add             \src,  \src,  #8
.ifc \type, put
        add             \dst,  \dst,  #8
.else
        add             \dst,  \dst,  #16
.endif
        b               1b
0:
        ret

L(\type\()_bilin_hv_tbl):
        // Offsets to the per-width entries above (labels = 10x width).
        .hword L(\type\()_bilin_hv_tbl) - 1280b
        .hword L(\type\()_bilin_hv_tbl) -  640b
        .hword L(\type\()_bilin_hv_tbl) -  320b
        .hword L(\type\()_bilin_hv_tbl) -  160b
        .hword L(\type\()_bilin_hv_tbl) -   80b
        .hword L(\type\()_bilin_hv_tbl) -   40b
        .hword L(\type\()_bilin_hv_tbl) -   20b
        .hword 0
endfunc
2912.endm
2913
// Instantiate the put (8-bit pixels out) and prep (16-bit intermediates
// out) variants with their register assignments; the final argument
// (10 vs 6) appears to supply \shift_hv, the combined h+v rounding
// shift used above — confirm against the filter_fn macro definition.
filter_fn put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10
filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6
2916
// Load one 8-byte filter row: \dst (a D register) receives the 8
// coefficient bytes at index (\src >> 10) from the table based at x11,
// then \src is advanced by \inc. Clobbers w13.
.macro load_filter_row dst, src, inc
        asr             w13, \src, #10
        ldr             \dst, [x11, w13, sxtw #3]
        add             \src, \src, \inc
.endm
2922
function warp_filter_horz_neon
        // Horizontally filter one row of 8 output pixels for warp.
        // In:  x2 = src (advanced by one row, stride x3),
        //      w5 = x filter position (advanced by w8 on return),
        //      w7 = per-pixel position increment,
        //      x11 = warp filter table base.
        // Out: v16.8h = 8 filtered values, rounded >>3.
        // Clobbers v0-v7, v17-v23, w12, w13.
        add             w12, w5,  #512         // bias before the >>10 index extraction

        ld1             {v16.8b, v17.8b}, [x2], x3

        // Fetch the 8 per-pixel filters (d0-d7) interleaved with widening
        // the source and coefficients to 16 bits.
        load_filter_row d0, w12, w7
        uxtl            v16.8h,  v16.8b
        load_filter_row d1, w12, w7
        uxtl            v17.8h,  v17.8b
        load_filter_row d2, w12, w7
        sxtl            v0.8h,   v0.8b
        load_filter_row d3, w12, w7
        sxtl            v1.8h,   v1.8b
        load_filter_row d4, w12, w7
        sxtl            v2.8h,   v2.8b
        load_filter_row d5, w12, w7
        sxtl            v3.8h,   v3.8b
        load_filter_row d6, w12, w7
        sxtl            v4.8h,   v4.8b
        load_filter_row d7, w12, w7
        sxtl            v5.8h,   v5.8b
        // Multiply each one-pixel-shifted window by its filter, then
        // reduce each product vector to a single sum per output pixel
        // via saddlp/addp trees.
        ext             v18.16b, v16.16b, v17.16b, #2*1
        mul             v23.8h,  v16.8h,  v0.8h
        sxtl            v6.8h,   v6.8b
        ext             v19.16b, v16.16b, v17.16b, #2*2
        mul             v18.8h,  v18.8h,  v1.8h
        sxtl            v7.8h,   v7.8b
        ext             v20.16b, v16.16b, v17.16b, #2*3
        mul             v19.8h,  v19.8h,  v2.8h
        ext             v21.16b, v16.16b, v17.16b, #2*4
        saddlp          v23.4s,  v23.8h
        mul             v20.8h,  v20.8h,  v3.8h
        ext             v22.16b, v16.16b, v17.16b, #2*5
        saddlp          v18.4s,  v18.8h
        mul             v21.8h,  v21.8h,  v4.8h
        saddlp          v19.4s,  v19.8h
        mul             v22.8h,  v22.8h,  v5.8h
        saddlp          v20.4s,  v20.8h
        saddlp          v21.4s,  v21.8h
        saddlp          v22.4s,  v22.8h
        addp            v18.4s,  v23.4s,  v18.4s
        ext             v23.16b, v16.16b, v17.16b, #2*6
        addp            v19.4s,  v19.4s,  v20.4s
        mul             v23.8h,  v23.8h,  v6.8h
        ext             v20.16b, v16.16b, v17.16b, #2*7
        mul             v20.8h,  v20.8h,  v7.8h
        saddlp          v23.4s,  v23.8h
        addp            v21.4s,  v21.4s,  v22.4s
        saddlp          v20.4s,  v20.8h
        addp            v20.4s,  v23.4s,  v20.4s
        addp            v18.4s,  v18.4s,  v19.4s
        addp            v20.4s,  v21.4s,  v20.4s

        // Advance the x filter position for the next row.
        add             w5,  w5,  w8

        rshrn           v16.4h,  v18.4s,  #3
        rshrn2          v16.8h,  v20.4s,  #3

        ret
endfunc
2983
2984// void dav1d_warp_affine_8x8_8bpc_neon(
2985//         pixel *dst, const ptrdiff_t dst_stride,
2986//         const pixel *src, const ptrdiff_t src_stride,
2987//         const int16_t *const abcd, int mx, int my)
.macro warp t, shift
function warp_affine_8x8\t\()_8bpc_neon, export=1
        // Unpack the four packed int16 coefficients from abcd[]:
        //   w7 = per-pixel / w8 = per-row horizontal position step,
        //   w9 = per-pixel / w4 = per-row vertical position step
        // (as consumed by warp_filter_horz_neon and the 1: loop below).
        ldr             x4,  [x4]
        sbfx            x7,  x4, #0,  #16
        sbfx            x8,  x4, #16, #16
        sbfx            x9,  x4, #32, #16
        sbfx            x4,  x4, #48, #16
        mov             w10, #8                 // 8 output rows
        // Step src back 3 rows and 3 columns: the 8-tap filters need
        // 3 pixels/rows of context before the block.
        sub             x2,  x2,  x3, lsl #1
        sub             x2,  x2,  x3
        sub             x2,  x2,  #3
        movrel          x11, X(mc_warp_filter), 64*8
        // Save the return address: the bl calls below clobber x30.
        mov             x15, x30
.ifnb \t
        lsl             x1,  x1,  #1            // 16-bit output: double the dst stride
.endif

        // Horizontally filter the 7 lead-in rows into v24-v30.
        bl              warp_filter_horz_neon
        mov             v24.16b, v16.16b
        bl              warp_filter_horz_neon
        mov             v25.16b, v16.16b
        bl              warp_filter_horz_neon
        mov             v26.16b, v16.16b
        bl              warp_filter_horz_neon
        mov             v27.16b, v16.16b
        bl              warp_filter_horz_neon
        mov             v28.16b, v16.16b
        bl              warp_filter_horz_neon
        mov             v29.16b, v16.16b
        bl              warp_filter_horz_neon
        mov             v30.16b, v16.16b

1:
        // One output row per iteration: filter one more source row (v31),
        // fetch the 8 per-column vertical filters, transpose them so each
        // of v0-v7 holds one tap for all 8 columns.
        add             w14, w6,  #512
        bl              warp_filter_horz_neon
        mov             v31.16b, v16.16b

        load_filter_row d0, w14, w9
        load_filter_row d1, w14, w9
        load_filter_row d2, w14, w9
        load_filter_row d3, w14, w9
        load_filter_row d4, w14, w9
        load_filter_row d5, w14, w9
        load_filter_row d6, w14, w9
        load_filter_row d7, w14, w9
        transpose_8x8b  v0, v1, v2, v3, v4, v5, v6, v7, v16, v17
        sxtl            v0.8h,   v0.8b
        sxtl            v1.8h,   v1.8b
        sxtl            v2.8h,   v2.8b
        sxtl            v3.8h,   v3.8b
        sxtl            v4.8h,   v4.8b
        sxtl            v5.8h,   v5.8b
        sxtl            v6.8h,   v6.8b
        sxtl            v7.8h,   v7.8b

        // This ordering of smull/smlal/smull2/smlal2 is highly
        // beneficial for Cortex A53 here.
        smull           v16.4s,  v24.4h,  v0.4h
        smlal           v16.4s,  v25.4h,  v1.4h
        smlal           v16.4s,  v26.4h,  v2.4h
        smlal           v16.4s,  v27.4h,  v3.4h
        smlal           v16.4s,  v28.4h,  v4.4h
        smlal           v16.4s,  v29.4h,  v5.4h
        smlal           v16.4s,  v30.4h,  v6.4h
        smlal           v16.4s,  v31.4h,  v7.4h
        smull2          v17.4s,  v24.8h,  v0.8h
        smlal2          v17.4s,  v25.8h,  v1.8h
        smlal2          v17.4s,  v26.8h,  v2.8h
        smlal2          v17.4s,  v27.8h,  v3.8h
        smlal2          v17.4s,  v28.8h,  v4.8h
        smlal2          v17.4s,  v29.8h,  v5.8h
        smlal2          v17.4s,  v30.8h,  v6.8h
        smlal2          v17.4s,  v31.8h,  v7.8h

        // Slide the 8-row window down by one (interleaved with the
        // narrowing for scheduling).
        mov             v24.16b, v25.16b
        mov             v25.16b, v26.16b
        sqrshrn         v16.4h,  v16.4s,  #\shift
        mov             v26.16b, v27.16b
        sqrshrn2        v16.8h,  v17.4s,  #\shift
        mov             v27.16b, v28.16b
        mov             v28.16b, v29.16b
.ifb \t
        sqxtun          v16.8b,  v16.8h         // put: saturate to 8-bit pixels
.endif
        mov             v29.16b, v30.16b
        mov             v30.16b, v31.16b
        subs            w10, w10, #1
.ifnb \t
        st1             {v16.8h}, [x0], x1
.else
        st1             {v16.8b}, [x0], x1
.endif

        add             w6,  w6,  w4            // advance the vertical filter position
        b.gt            1b

        br              x15
endfunc
.endm
3087
// 8-bit output variant (shift 11) and 16-bit "t" variant (shift 7).
warp  , 11
warp t, 7
3090
3091// void dav1d_emu_edge_8bpc_neon(
3092//         const intptr_t bw, const intptr_t bh,
3093//         const intptr_t iw, const intptr_t ih,
3094//         const intptr_t x, const intptr_t y,
3095//         pixel *dst, const ptrdiff_t dst_stride,
3096//         const pixel *ref, const ptrdiff_t ref_stride)
// Pad a bw x bh pixel block in dst by replicating the edges of the
// iw x ih reference frame, for blocks whose source position (x, y)
// lies (partially) outside the frame.
//
// Register roles on entry (AAPCS64; the 9th/10th arguments arrive on
// the stack, see the C prototype above):
//   x0 = bw, x1 = bh, x2 = iw, x3 = ih, x4 = x, x5 = y,
//   x6 = dst, x7 = dst_stride, [sp] = ref, [sp+8] = ref_stride
function emu_edge_8bpc_neon, export=1
        ldp             x8,  x9,  [sp]         // x8 = ref, x9 = ref_stride

        // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
        // ref += iclip(x, 0, iw - 1)
        // The "bic xN, xN, xN, asr #63" idiom clamps negatives to zero:
        // asr #63 is all-ones exactly for negative values, so bic
        // clears them and leaves non-negative values untouched.
        sub             x12, x3,  #1           // ih - 1
        cmp             x5,  x3
        sub             x13, x2,  #1           // iw - 1
        csel            x12, x12, x5,  ge      // min(y, ih - 1)
        cmp             x4,  x2
        bic             x12, x12, x12, asr #63 // max(min(y, ih - 1), 0)
        csel            x13, x13, x4,  ge      // min(x, iw - 1)
        bic             x13, x13, x13, asr #63 // max(min(x, iw - 1), 0)
        madd            x8,  x12, x9,  x8      // ref += iclip() * stride
        add             x8,  x8,  x13          // ref += iclip()

        // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
        // top_ext = iclip(-y, 0, bh - 1)
        add             x10, x5,  x1           // y + bh
        neg             x5,  x5                // -y
        sub             x10, x10, x3           // y + bh - ih
        sub             x12, x1,  #1           // bh - 1
        cmp             x10, x1
        bic             x5,  x5,  x5,  asr #63 // max(-y, 0)
        csel            x10, x10, x12, lt      // min(y + bh - ih, bh-1)
        cmp             x5,  x1
        bic             x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0)
        csel            x5,  x5,  x12, lt      // min(max(-y, 0), bh-1)

        // right_ext = iclip(x + bw - iw, 0, bw - 1)
        // left_ext = iclip(-x, 0, bw - 1)
        add             x11, x4,  x0           // x + bw
        neg             x4,  x4                // -x
        sub             x11, x11, x2           // x + bw - iw
        sub             x13, x0,  #1           // bw - 1
        cmp             x11, x0
        bic             x4,  x4,  x4,  asr #63 // max(-x, 0)
        csel            x11, x11, x13, lt      // min(x + bw - iw, bw-1)
        cmp             x4,  x0
        bic             x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0)
        csel            x4,  x4,  x13, lt      // min(max(-x, 0), bw - 1)

        // From here on: x4 = left_ext, x11 = right_ext,
        //               x5 = top_ext,  x10 = bottom_ext.
        // center_h = bh - top_ext - bottom_ext
        // dst += top_ext * PXSTRIDE(dst_stride)
        // center_w = bw - left_ext - right_ext
        sub             x1,  x1,  x5           // bh - top_ext
        madd            x6,  x5,  x7,  x6      // dst += top_ext * stride
        sub             x2,  x0,  x4           // bw - left_ext
        sub             x1,  x1,  x10          // center_h = bh - top_ext - bottom_ext
        sub             x2,  x2,  x11          // center_w = bw - left_ext - right_ext

        mov             x14, x6                // backup of dst (first center row)

// Horizontal pass over the center_h in-frame rows: optionally splat the
// leftmost in-frame pixel over the left_ext bytes, copy center_w bytes
// from ref, then optionally splat the rightmost in-frame pixel over the
// right_ext bytes.  Each store loop rounds its byte count up to a
// multiple of 16 (splats) or 32 (copy); overshoot from the left splat
// and the center copy is overwritten by the stores that follow them.
// NOTE(review): the trailing right-edge splat can round up past
// dst + bw — presumably dst rows carry enough padding for this;
// verify against the emu_edge buffer layout at the callers.
.macro v_loop need_left, need_right
0:
.if \need_left
        ld1r            {v0.16b}, [x8]         // splat leftmost in-frame pixel
        mov             x12, x6                // out = dst
        mov             x3,  x4                // left_ext bytes to fill
1:
        subs            x3,  x3,  #16
        st1             {v0.16b}, [x12], #16
        b.gt            1b
.endif
        mov             x13, x8                // in = ref
        add             x12, x6,  x4           // out = dst + left_ext
        mov             x3,  x2                // center_w bytes to copy
1:
        ld1             {v0.16b, v1.16b}, [x13], #32
        subs            x3,  x3,  #32
        st1             {v0.16b, v1.16b}, [x12], #32
        b.gt            1b
.if \need_right
        add             x3,  x8,  x2           // in + center_w
        sub             x3,  x3,  #1           // in + center_w - 1
        add             x12, x6,  x4           // dst + left_ext
        ld1r            {v0.16b}, [x3]         // splat rightmost in-frame pixel
        add             x12, x12, x2           // out = dst + left_ext + center_w
        mov             x3,  x11               // right_ext bytes to fill
1:
        subs            x3,  x3,  #16
        st1             {v0.16b}, [x12], #16
        b.gt            1b
.endif

        subs            x1,  x1,  #1           // center_h--
        add             x6,  x6,  x7           // dst += dst_stride
        add             x8,  x8,  x9           // ref += ref_stride
        b.gt            0b
.endm

        // Dispatch to the v_loop variant matching which horizontal
        // extensions are needed (left_ext in x4, right_ext in x11), so
        // the per-row loop carries no runtime checks.
        cbz             x4,  2f
        // need_left
        cbz             x11, 3f
        // need_left + need_right
        v_loop          1,   1
        b               5f

2:
        // !need_left
        cbz             x11, 4f
        // !need_left + need_right
        v_loop          0,   1
        b               5f

3:
        // need_left + !need_right
        v_loop          1,   0
        b               5f

4:
        // !need_left + !need_right
        v_loop          0,   0

5:
        // Vertical pass: replicate the last center row downwards over
        // the bottom_ext rows, and the first center row upwards over
        // the top_ext rows, processing 32-pixel-wide column strips.
        cbz             x10, 3f
        // need_bottom
        sub             x8,  x6,  x7           // ref = dst - stride (last center row)
        mov             x4,  x0                // x4 = remaining width
1:
        ld1             {v0.16b, v1.16b}, [x8], #32
        mov             x3,  x10               // x3 = bottom_ext rows
2:
        subs            x3,  x3,  #1
        st1             {v0.16b, v1.16b}, [x6], x7
        b.gt            2b
        msub            x6,  x7,  x10,  x6     // dst -= bottom_ext * stride
        subs            x4,  x4,  #32          // bw -= 32
        add             x6,  x6,  #32          // dst += 32 (next column strip)
        b.gt            1b

3:
        cbz             x5,  3f
        // need_top
        msub            x6,  x7,  x5,  x14     // dst = stored_dst - top_ext * stride
1:
        ld1             {v0.16b, v1.16b}, [x14], #32  // x14 = first center row
        mov             x3,  x5                // x3 = top_ext rows
2:
        subs            x3,  x3,  #1
        st1             {v0.16b, v1.16b}, [x6], x7
        b.gt            2b
        msub            x6,  x7,  x5,  x6      // dst -= top_ext * stride
        subs            x0,  x0,  #32          // bw -= 32
        add             x6,  x6,  #32          // dst += 32 (next column strip)
        b.gt            1b

3:
        ret
endfunc
3248