/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Janne Grunau
 * Copyright © 2018, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
28
29#include "src/arm/asm.S"
30#include "util.S"
31
// avg: plain average of 16 values.
// Loads 16 signed 16-bit values from each of [x2] and [x3] (both pointers
// are post-incremented by 32 bytes), adds them lane-wise and narrows the
// sums to 16 unsigned bytes in dst with a rounding, saturating right shift
// by 5.
//   dst      vector register receiving the 16 output bytes
//   t0-t3    scratch vector registers, clobbered
.macro avg dst, t0, t1, t2, t3
        ld1             {\t0\().8h,\t1\().8h},   [x2],  32
        ld1             {\t2\().8h,\t3\().8h},   [x3],  32
        add             \t0\().8h,   \t0\().8h,   \t2\().8h
        add             \t1\().8h,   \t1\().8h,   \t3\().8h
        sqrshrun        \dst\().8b,  \t0\().8h,   #5
        sqrshrun2       \dst\().16b, \t1\().8h,   #5
.endm
40
// w_avg: weighted average of 16 values.
// Loads 16 signed 16-bit values a from [x2] and b from [x3] (both pointers
// are post-incremented by 32 bytes) and computes, per lane,
//     b + sqdmulh(b - a, v30)
// which is then narrowed to 16 unsigned bytes in dst with a rounding,
// saturating right shift by 4.
// v30.8h must be set up by the caller. bidir_fn in this file loads it with
// (-w6) << 11, which turns the sqdmulh term into ((a - b) * w6) >> 4 as long
// as the shifted weight still fits in a signed 16-bit lane.
//   dst      vector register receiving the 16 output bytes
//   t0-t3    scratch vector registers, clobbered
.macro w_avg dst, t0, t1, t2, t3
        ld1             {\t0\().8h,\t1\().8h},   [x2],  32
        ld1             {\t2\().8h,\t3\().8h},   [x3],  32
        // t0/t1 = b - a
        sub             \t0\().8h,   \t2\().8h,   \t0\().8h
        sub             \t1\().8h,   \t3\().8h,   \t1\().8h
        // scale the difference by the multiplier in v30
        sqdmulh         \t0\().8h,   \t0\().8h,   v30.8h
        sqdmulh         \t1\().8h,   \t1\().8h,   v30.8h
        // add b back in
        add             \t0\().8h,   \t2\().8h,   \t0\().8h
        add             \t1\().8h,   \t3\().8h,   \t1\().8h
        sqrshrun        \dst\().8b,  \t0\().8h,   #4
        sqrshrun2       \dst\().16b, \t1\().8h,   #4
.endm
53
// mask: per-pixel masked blend of 16 values.
// Loads 16 mask bytes m from [x6] (post-incremented by 16) and 16 signed
// 16-bit values a from [x2] and b from [x3] (both post-incremented by 32),
// then computes, per lane,
//     b + sqdmulh(b - a, (m * v31) << 8)
// and narrows to 16 unsigned bytes in dst with a rounding, saturating right
// shift by 4.
// v31.16b must be set up by the caller. bidir_fn in this file loads every
// byte with 254 (-2 modulo 256), so the multiplier is -(m << 9) and the
// sqdmulh term becomes ((a - b) * m) >> 6 (for m small enough that -2 * m
// fits in a signed byte, e.g. 0..64).
//   dst      vector register receiving the 16 output bytes
//   t0-t3    scratch vector registers, clobbered
// Also clobbers v28, v29 and v30.
.macro mask dst, t0, t1, t2, t3
        ld1             {v30.16b}, [x6],  16
        ld1             {\t0\().8h,\t1\().8h},   [x2],  32
        // byte-wise multiply (modulo 256) by the constant in v31
        mul             v30.16b, v30.16b, v31.16b
        ld1             {\t2\().8h,\t3\().8h},   [x3],  32
        // widen to 16 bit, placing each product in the top byte of its lane
        shll            v28.8h, v30.8b,  #8
        shll2           v29.8h, v30.16b, #8
        // t0/t1 = b - a
        sub             \t0\().8h,   \t2\().8h,   \t0\().8h
        sub             \t1\().8h,   \t3\().8h,   \t1\().8h
        sqdmulh         \t0\().8h,   \t0\().8h,   v28.8h
        sqdmulh         \t1\().8h,   \t1\().8h,   v29.8h
        // add b back in
        add             \t0\().8h,   \t2\().8h,   \t0\().8h
        add             \t1\().8h,   \t3\().8h,   \t1\().8h
        sqrshrun        \dst\().8b,  \t0\().8h,   #4
        sqrshrun2       \dst\().16b, \t1\().8h,   #4
.endm
70
// bidir_fn type: emits the exported function <type>_8bpc_neon, where type
// names one of the per-vector macros defined earlier in this file (avg,
// w_avg or mask). Every invocation of that macro yields 16 output bytes.
//
// Register usage, as read from the code below:
//   x0      output pointer
//   x1      output stride in bytes (used as the post-index increment)
//   x2, x3  the two 16-bit input buffers consumed by the type macro
//   w4      width; only clz(w4) is used, to select one of six code paths
//           (the labels suggest widths 4, 8, 16, 32, 64 and 128)
//   w5      row count
//   w6      w_avg only: weight, preloaded into v30.8h as (-w6) << 11
//   x6      mask only: pointer to the mask bytes read by the mask macro
//   x7      scratch: jump target first, then a second output pointer
//   v0-v3   scratch for the type macro
//   v4-v7, v16-v19  output pixels
// The mask variant additionally uses v28-v31. No callee-saved register
// (x19-x28, v8-v15) is touched.
// NOTE(review): the C-level prototypes are not visible in this file; the
// argument order above is inferred from register use only - confirm.
.macro bidir_fn type
function \type\()_8bpc_neon, export=1
        clz             w4,  w4
.ifc \type, w_avg
        // v30.8h = (-w6) << 11, the multiplier consumed by the w_avg macro
        dup             v30.8h, w6
        neg             v30.8h, v30.8h
        shl             v30.8h, v30.8h, #11
.endif
.ifc \type, mask
        // every byte of v31 = 254 (-2 modulo 256), used by the mask macro
        movi            v31.16b, #256-2
.endif
        // Dispatch on width: the table holds 16-bit distances from the table
        // back to each entry label, indexed by clz(w4) - 24.
        adr             x7,  L(\type\()_tbl)
        sub             w4,  w4,  #24
        ldrh            w4,  [x7, x4, lsl #1]
        // The first 16 output bytes are computed before the jump, so every
        // path below starts with v4 already filled.
        \type           v4,  v0,  v1,  v2,  v3
        sub             x7,  x7,  w4, uxtw
        br              x7
40:
        // Width 4: x0 walks even rows, x7 odd rows, x1 becomes two strides.
        add             x7,  x0,  x1
        lsl             x1,  x1,  #1
4:
        // Each vector holds four rows of four pixels. The stores leave the
        // flags alone, so b.eq still tests the cmp above it.
        cmp             w5,  #4
        st1             {v4.s}[0],  [x0], x1
        st1             {v4.s}[1],  [x7], x1
        st1             {v4.s}[2],  [x0], x1
        st1             {v4.s}[3],  [x7], x1
        b.eq            0f
        \type           v5,  v0,  v1,  v2,  v3
        cmp             w5,  #8
        st1             {v5.s}[0],  [x0], x1
        st1             {v5.s}[1],  [x7], x1
        st1             {v5.s}[2],  [x0], x1
        st1             {v5.s}[3],  [x7], x1
        b.eq            0f
        // Any other row count: write eight more rows (16 in total) and
        // return. Presumably the row count is one of 4, 8, 16 - confirm.
        \type           v4,  v0,  v1,  v2,  v3
        st1             {v4.s}[0],  [x0], x1
        st1             {v4.s}[1],  [x7], x1
        \type           v5,  v0,  v1,  v2,  v3
        st1             {v4.s}[2],  [x0], x1
        st1             {v4.s}[3],  [x7], x1
        st1             {v5.s}[0],  [x0], x1
        st1             {v5.s}[1],  [x7], x1
        st1             {v5.s}[2],  [x0], x1
        st1             {v5.s}[3],  [x7], x1
        ret
80:
        // Width 8: x0 walks even rows, x7 odd rows, x1 becomes two strides.
        add             x7,  x0,  x1
        lsl             x1,  x1,  #1
8:
        // Each vector holds two rows of eight pixels; four rows per pass.
        st1             {v4.d}[0],  [x0], x1
        \type           v5,  v0,  v1,  v2,  v3
        st1             {v4.d}[1],  [x7], x1
        st1             {v5.d}[0],  [x0], x1
        subs            w5,  w5,  #4
        st1             {v5.d}[1],  [x7], x1
        b.le            0f
        // compute the first vector of the next pass before looping
        \type           v4,  v0,  v1,  v2,  v3
        b               8b
16:
        // Width 16: one vector per row, single pointer with the original
        // stride; four rows per pass.
        \type           v5,  v0,  v1,  v2,  v3
        st1             {v4.16b}, [x0], x1
        \type           v6,  v0,  v1,  v2,  v3
        st1             {v5.16b}, [x0], x1
        \type           v7,  v0,  v1,  v2,  v3
        st1             {v6.16b}, [x0], x1
        subs            w5,  w5,  #4
        st1             {v7.16b}, [x0], x1
        b.le            0f
        \type           v4,  v0,  v1,  v2,  v3
        b               16b
320:
        // Width 32: x0 walks even rows, x7 odd rows, x1 becomes two strides.
        add             x7,  x0,  x1
        lsl             x1,  x1,  #1
32:
        // Two vectors per row; two rows per pass.
        \type           v5,  v0,  v1,  v2,  v3
        \type           v6,  v0,  v1,  v2,  v3
        st1             {v4.16b,v5.16b}, [x0], x1
        \type           v7,  v0,  v1,  v2,  v3
        subs            w5,  w5,  #2
        st1             {v6.16b,v7.16b}, [x7], x1
        b.le            0f
        \type           v4,  v0,  v1,  v2,  v3
        b               32b
640:
        // Width 64: x0 walks even rows, x7 odd rows, x1 becomes two strides.
        add             x7,  x0,  x1
        lsl             x1,  x1,  #1
64:
        // Four vectors per row; two rows per pass.
        \type           v5,  v0,  v1,  v2,  v3
        \type           v6,  v0,  v1,  v2,  v3
        \type           v7,  v0,  v1,  v2,  v3
        \type           v16, v0,  v1,  v2,  v3
        \type           v17, v0,  v1,  v2,  v3
        st1             {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
        \type           v18, v0,  v1,  v2,  v3
        \type           v19, v0,  v1,  v2,  v3
        subs            w5,  w5,  #2
        st1             {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
        b.le            0f
        \type           v4, v0,  v1,  v2,  v3
        b               64b
1280:
        // Width 128: x7 addresses the right half (x0 + 64) of the same row;
        // the stride is not doubled here.
        add             x7,  x0,  #64
128:
        // Eight vectors per row; one row per pass.
        \type           v5,  v0,  v1,  v2,  v3
        \type           v6,  v0,  v1,  v2,  v3
        \type           v7,  v0,  v1,  v2,  v3
        \type           v16, v0,  v1,  v2,  v3
        \type           v17, v0,  v1,  v2,  v3
        st1             {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
        \type           v18, v0,  v1,  v2,  v3
        \type           v19, v0,  v1,  v2,  v3
        subs            w5,  w5,  #1
        st1             {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
        b.le            0f
        \type           v4, v0,  v1,  v2,  v3
        b               128b
0:
        ret
// Jump table: entry i is reached when clz(w4) - 24 == i, i.e. entry 0 for
// w4 in 128..255 down to entry 5 for w4 in 4..7.
L(\type\()_tbl):
        .hword L(\type\()_tbl) - 1280b
        .hword L(\type\()_tbl) -  640b
        .hword L(\type\()_tbl) -  320b
        .hword L(\type\()_tbl) -   16b
        .hword L(\type\()_tbl) -   80b
        .hword L(\type\()_tbl) -   40b
endfunc
.endm

bidir_fn avg
bidir_fn w_avg
bidir_fn mask
202
203
// w_mask_fn type: emits w_mask_<type>_8bpc_neon for type = 444, 422 or 420.
// The function blends two 16-bit intermediate buffers with a weight derived
// from their per-pixel difference, and also writes that weight out as a
// mask (full resolution for 444, horizontally halved for 422, halved in both
// directions for 420).
//
// Register usage, as read from the code below:
//   x0      output pixel pointer (even rows)
//   x1      output stride in bytes; doubled on entry
//   x2      tmp1, signed 16-bit
//   x3      tmp2, signed 16-bit
//   w4      width; clz(w4) selects the 4, 8 or 16-and-wider path
//   w5      row count
//   x6      mask output pointer
//   w7      subtracted from the rounding constant (422 and 420 only)
//   x12     output pixel pointer (odd rows)
//   x7-x11  scratch once the setup is done
//
// Per pixel, with n = sat_u16(6903 - abs(tmp1 - tmp2)) >> 8  (so n <= 26):
//   pixel = sat_u8(round((tmp1 + (((tmp2 - tmp1) * n) >> 6)) >> 4))
//   444 mask byte = 64 - n
//   422 mask byte = (129 - w7 - (n of 2 horizontal neighbours)) >> 1
//   420 mask byte = (256 - w7 - (n of a 2x2 block) + 2) >> 2
// NOTE(review): the C-level prototype is not visible in this file; the
// argument meaning above is inferred from register use - confirm.
.macro w_mask_fn type
function w_mask_\type\()_8bpc_neon, export=1
        // Resolve the width-specific entry point (see the table at the end).
        clz             w8,  w4
        adr             x9,  L(w_mask_\type\()_tbl)
        sub             w8,  w8,  #24
        ldrh            w8,  [x9,  x8,  lsl #1]
        sub             x9,  x9,  w8,  uxtw
        mov             w10, #6903
        dup             v0.8h,   w10
.if \type == 444
        movi            v1.16b,  #64
.elseif \type == 422
        // v3.8b = 129 - w7
        dup             v2.8b,   w7
        movi            v3.8b,   #129
        sub             v3.8b,   v3.8b,   v2.8b
.elseif \type == 420
        // v3.8h = 256 - w7
        dup             v2.8h,   w7
        movi            v3.8h,   #1, lsl #8
        sub             v3.8h,   v3.8h,   v2.8h
.endif
        // x0 walks even rows, x12 odd rows, x1 becomes two strides
        add             x12,  x0,  x1
        lsl             x1,   x1,  #1
        br              x9
4:
        ld1             {v4.8h,   v5.8h},   [x2],  #32  // tmp1 (four rows at once)
        ld1             {v6.8h,   v7.8h},   [x3],  #32  // tmp2 (four rows at once)
        subs            w5,  w5,  #4
        sub             v16.8h,  v6.8h,   v4.8h
        sub             v17.8h,  v7.8h,   v5.8h
        sabd            v18.8h,  v4.8h,   v6.8h
        sabd            v19.8h,  v5.8h,   v7.8h
        // v18/v19 = n
        uqsub           v18.8h,  v0.8h,   v18.8h
        uqsub           v19.8h,  v0.8h,   v19.8h
        ushr            v18.8h,  v18.8h,  #8
        ushr            v19.8h,  v19.8h,  #8
        // sqdmulh by (n << 9) yields ((tmp2 - tmp1) * n) >> 6
        shl             v20.8h,  v18.8h,  #9
        shl             v21.8h,  v19.8h,  #9
        sqdmulh         v20.8h,  v20.8h,  v16.8h
        sqdmulh         v21.8h,  v21.8h,  v17.8h
        add             v20.8h,  v20.8h,  v4.8h
        add             v21.8h,  v21.8h,  v5.8h
        sqrshrun        v22.8b,  v20.8h,  #4
        sqrshrun        v23.8b,  v21.8h,  #4
.if \type == 444
        xtn             v18.8b,   v18.8h
        xtn2            v18.16b,  v19.8h
        sub             v18.16b,  v1.16b,  v18.16b
        st1             {v18.16b}, [x6],  #16
.elseif \type == 422
        // sum horizontal pairs, then halving subtract from 129 - w7
        addp            v18.8h,   v18.8h,  v19.8h
        xtn             v18.8b,   v18.8h
        uhsub           v18.8b,   v3.8b,   v18.8b
        st1             {v18.8b},  [x6],  #8
.elseif \type == 420
        // regroup so that rows 0+1 and rows 2+3 can be added lane-wise
        trn1            v24.2d,   v18.2d,  v19.2d
        trn2            v25.2d,   v18.2d,  v19.2d
        add             v24.8h,   v24.8h,  v25.8h
        addp            v18.8h,   v24.8h,  v24.8h
        sub             v18.4h,   v3.4h,   v18.4h
        rshrn           v18.8b,   v18.8h,  #2
        // only the first four bytes are meaningful
        st1             {v18.s}[0],  [x6],  #4
.endif
        st1             {v22.s}[0],  [x0],  x1
        st1             {v22.s}[1],  [x12], x1
        st1             {v23.s}[0],  [x0],  x1
        st1             {v23.s}[1],  [x12], x1
        b.gt            4b
        ret
8:
        // Width 8: v4/v6 hold one row, v5/v7 the next; two rows per pass.
        ld1             {v4.8h,   v5.8h},   [x2],  #32
        ld1             {v6.8h,   v7.8h},   [x3],  #32
        subs            w5,  w5,  #2
        sub             v16.8h,  v6.8h,   v4.8h
        sub             v17.8h,  v7.8h,   v5.8h
        sabd            v18.8h,  v4.8h,   v6.8h
        sabd            v19.8h,  v5.8h,   v7.8h
        uqsub           v18.8h,  v0.8h,   v18.8h
        uqsub           v19.8h,  v0.8h,   v19.8h
        ushr            v18.8h,  v18.8h,  #8
        ushr            v19.8h,  v19.8h,  #8
        shl             v20.8h,  v18.8h,  #9
        shl             v21.8h,  v19.8h,  #9
        sqdmulh         v20.8h,  v20.8h,  v16.8h
        sqdmulh         v21.8h,  v21.8h,  v17.8h
        add             v20.8h,  v20.8h,  v4.8h
        add             v21.8h,  v21.8h,  v5.8h
        sqrshrun        v22.8b,  v20.8h,  #4
        sqrshrun        v23.8b,  v21.8h,  #4
.if \type == 444
        xtn             v18.8b,  v18.8h
        xtn2            v18.16b, v19.8h
        sub             v18.16b, v1.16b,  v18.16b
        st1             {v18.16b}, [x6],  #16
.elseif \type == 422
        addp            v18.8h,  v18.8h,  v19.8h
        xtn             v18.8b,  v18.8h
        uhsub           v18.8b,  v3.8b,   v18.8b
        st1             {v18.8b},  [x6],  #8
.elseif \type == 420
        // vertical sum of the two rows, then horizontal pairs
        add             v18.8h,  v18.8h,  v19.8h
        addp            v18.8h,  v18.8h,  v18.8h
        sub             v18.4h,  v3.4h,   v18.4h
        rshrn           v18.8b,  v18.8h,  #2
        st1             {v18.s}[0],  [x6],  #4
.endif
        st1             {v22.8b},  [x0],  x1
        st1             {v23.8b},  [x12], x1
        b.gt            8b
        ret
1280:
640:
320:
160:
        // Width 16 and up: two rows are processed in parallel, 16 pixels of
        // each per inner pass.
        mov             w11, w4
        // x1 = 2 * stride - width: what remains to reach the row after next
        // once the inner loop has advanced x0/x12 by a full row
        sub             x1,  x1,  w4,  uxtw
.if \type == 444
        // x10 = mask pointer for the second row
        add             x10, x6,  w4,  uxtw
.elseif \type == 422
        // x10 = mask pointer for the second row (half as many bytes per row)
        add             x10, x6,  x11, lsr #1
.endif
        // x9 / x7 = second row of tmp2 / tmp1
        add             x9,  x3,  w4,  uxtw #1
        add             x7,  x2,  w4,  uxtw #1
161:
        mov             w8,  w4
16:
        ld1             {v4.8h,   v5.8h},   [x2],  #32
        ld1             {v6.8h,   v7.8h},   [x3],  #32
        ld1             {v16.8h,  v17.8h},  [x7],  #32
        ld1             {v18.8h,  v19.8h},  [x9],  #32
        subs            w8,  w8,  #16
        // tmp2 - tmp1, overwriting the tmp2 registers
        sub             v6.8h,   v6.8h,   v4.8h
        sub             v7.8h,   v7.8h,   v5.8h
        sub             v18.8h,  v18.8h,  v16.8h
        sub             v19.8h,  v19.8h,  v17.8h
        abs             v20.8h,  v6.8h
        abs             v21.8h,  v7.8h
        abs             v22.8h,  v18.8h
        abs             v23.8h,  v19.8h
        // v20-v23 = n
        uqsub           v20.8h,  v0.8h,   v20.8h
        uqsub           v21.8h,  v0.8h,   v21.8h
        uqsub           v22.8h,  v0.8h,   v22.8h
        uqsub           v23.8h,  v0.8h,   v23.8h
        ushr            v20.8h,  v20.8h,  #8
        ushr            v21.8h,  v21.8h,  #8
        ushr            v22.8h,  v22.8h,  #8
        ushr            v23.8h,  v23.8h,  #8
        shl             v24.8h,  v20.8h,  #9
        shl             v25.8h,  v21.8h,  #9
        shl             v26.8h,  v22.8h,  #9
        shl             v27.8h,  v23.8h,  #9
        sqdmulh         v24.8h,  v24.8h,  v6.8h
        sqdmulh         v25.8h,  v25.8h,  v7.8h
        sqdmulh         v26.8h,  v26.8h,  v18.8h
        sqdmulh         v27.8h,  v27.8h,  v19.8h
        add             v24.8h,  v24.8h,  v4.8h
        add             v25.8h,  v25.8h,  v5.8h
        add             v26.8h,  v26.8h,  v16.8h
        add             v27.8h,  v27.8h,  v17.8h
        sqrshrun        v24.8b,  v24.8h,  #4
        sqrshrun        v25.8b,  v25.8h,  #4
        sqrshrun        v26.8b,  v26.8h,  #4
        sqrshrun        v27.8b,  v27.8h,  #4
.if \type == 444
        xtn             v20.8b,  v20.8h
        xtn2            v20.16b, v21.8h
        xtn             v21.8b,  v22.8h
        xtn2            v21.16b, v23.8h
        sub             v20.16b, v1.16b,  v20.16b
        sub             v21.16b, v1.16b,  v21.16b
        st1             {v20.16b}, [x6],  #16
        st1             {v21.16b}, [x10], #16
.elseif \type == 422
        addp            v20.8h,  v20.8h,  v21.8h
        addp            v21.8h,  v22.8h,  v23.8h
        xtn             v20.8b,  v20.8h
        xtn             v21.8b,  v21.8h
        uhsub           v20.8b,  v3.8b,   v20.8b
        uhsub           v21.8b,  v3.8b,   v21.8b
        st1             {v20.8b},  [x6],  #8
        st1             {v21.8b},  [x10], #8
.elseif \type == 420
        // vertical sum of the two rows, then horizontal pairs
        add             v20.8h,  v20.8h,  v22.8h
        add             v21.8h,  v21.8h,  v23.8h
        addp            v20.8h,  v20.8h,  v21.8h
        sub             v20.8h,  v3.8h,   v20.8h
        rshrn           v20.8b,  v20.8h,  #2
        st1             {v20.8b},  [x6],  #8
.endif
        st1             {v24.8b,  v25.8b},  [x0],  #16
        st1             {v26.8b,  v27.8b},  [x12], #16
        b.gt            16b
        subs            w5,  w5,  #2
        // every input pointer skips the row its partner has just consumed
        add             x2,  x2,  w4,  uxtw #1
        add             x3,  x3,  w4,  uxtw #1
        add             x7,  x7,  w4,  uxtw #1
        add             x9,  x9,  w4,  uxtw #1
.if \type == 444
        add             x6,  x6,  w4,  uxtw
        add             x10, x10, w4,  uxtw
.elseif \type == 422
        add             x6,  x6,  x11, lsr #1
        add             x10, x10, x11, lsr #1
.endif
        add             x0,  x0,  x1
        add             x12, x12, x1
        b.gt            161b
        ret
// Jump table of 16-bit distances from the table back to each entry label,
// indexed by clz(w4) - 24: entry 0 for w4 in 128..255 down to entry 5 for
// w4 in 4..7. The first four entries share one code path.
L(w_mask_\type\()_tbl):
        .hword L(w_mask_\type\()_tbl) - 1280b
        .hword L(w_mask_\type\()_tbl) -  640b
        .hword L(w_mask_\type\()_tbl) -  320b
        .hword L(w_mask_\type\()_tbl) -  160b
        .hword L(w_mask_\type\()_tbl) -    8b
        .hword L(w_mask_\type\()_tbl) -    4b
endfunc
.endm

w_mask_fn 444
w_mask_fn 422
w_mask_fn 420
424
425
// blend_8bpc_neon: blends a byte buffer into the destination in place using
// a per-pixel mask:
//     dst = (tmp * m + dst * (64 - m) + 32) >> 6
//
// Register usage, as read from the code below:
//   x0      destination pointer (even rows), read and written
//   x1      destination stride in bytes; doubled on entry
//   x2      tmp pointer (bytes), advanced linearly
//   w3      width; clz(w3) selects the 4, 8, 16 or 32 path
//   w4      row count; two rows are handled per loop pass
//   x5      mask pointer (one byte per pixel), advanced linearly
//   x6      scratch (jump target)
//   x8      destination pointer (odd rows)
//   v4      constant 64 in every byte
// NOTE(review): the C-level prototype is not visible in this file; the
// argument meaning above is inferred from register use - confirm.
function blend_8bpc_neon, export=1
        // Resolve the width-specific entry point (see the table at the end).
        adr             x6,  L(blend_tbl)
        clz             w3,  w3
        sub             w3,  w3,  #26
        ldrh            w3,  [x6,  x3,  lsl #1]
        sub             x6,  x6,  w3,  uxtw
        movi            v4.16b,  #64
        add             x8,  x0,  x1
        lsl             x1,  x1,  #1
        br              x6
4:
        // Width 4: two rows of mask/tmp fit in 8 bytes.
        ld1             {v2.8b},     [x5],  #8
        ld1             {v1.d}[0],   [x2],  #8
        ld1             {v0.s}[0],   [x0]
        subs            w4,  w4,  #2
        ld1             {v0.s}[1],   [x8]
        // v3 = 64 - m
        sub             v3.8b,   v4.8b,   v2.8b
        umull           v5.8h,   v1.8b,   v2.8b
        umlal           v5.8h,   v0.8b,   v3.8b
        rshrn           v6.8b,   v5.8h,   #6
        st1             {v6.s}[0],   [x0],  x1
        st1             {v6.s}[1],   [x8],  x1
        b.gt            4b
        ret
8:
        // Width 8: two rows of mask/tmp fit in 16 bytes.
        ld1             {v2.16b},  [x5],  #16
        ld1             {v1.16b},  [x2],  #16
        ld1             {v0.d}[0],   [x0]
        ld1             {v0.d}[1],   [x8]
        sub             v3.16b,  v4.16b,  v2.16b
        subs            w4,  w4,  #2
        umull           v5.8h,   v1.8b,   v2.8b
        umlal           v5.8h,   v0.8b,   v3.8b
        umull2          v6.8h,   v1.16b,  v2.16b
        umlal2          v6.8h,   v0.16b,  v3.16b
        rshrn           v7.8b,   v5.8h,   #6
        rshrn2          v7.16b,  v6.8h,   #6
        st1             {v7.d}[0],   [x0],  x1
        st1             {v7.d}[1],   [x8],  x1
        b.gt            8b
        ret
16:
        // Width 16: v1/v5/v0 belong to the even row, v2/v6/v3 to the odd row.
        ld1             {v1.16b,  v2.16b},  [x5],  #32
        ld1             {v5.16b,  v6.16b},  [x2],  #32
        ld1             {v0.16b},  [x0]
        subs            w4,  w4,  #2
        sub             v7.16b,  v4.16b,  v1.16b
        sub             v20.16b, v4.16b,  v2.16b
        ld1             {v3.16b},  [x8]
        umull           v16.8h,  v5.8b,   v1.8b
        umlal           v16.8h,  v0.8b,   v7.8b
        umull2          v17.8h,  v5.16b,  v1.16b
        umlal2          v17.8h,  v0.16b,  v7.16b
        umull           v21.8h,  v6.8b,   v2.8b
        umlal           v21.8h,  v3.8b,   v20.8b
        umull2          v22.8h,  v6.16b,  v2.16b
        umlal2          v22.8h,  v3.16b,  v20.16b
        rshrn           v18.8b,  v16.8h,  #6
        rshrn2          v18.16b, v17.8h,  #6
        rshrn           v19.8b,  v21.8h,  #6
        rshrn2          v19.16b, v22.8h,  #6
        st1             {v18.16b}, [x0],  x1
        st1             {v19.16b}, [x8],  x1
        b.gt            16b
        ret
32:
        // Width 32: v0-v3 = mask, v16-v19 = tmp (two rows of 32 each),
        // v20/v21 = even destination row, v22/v23 = odd destination row.
        ld1             {v0.16b,  v1.16b,  v2.16b,  v3.16b},  [x5],  #64
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2],  #64
        ld1             {v20.16b, v21.16b}, [x0]
        subs            w4,  w4,  #2
        ld1             {v22.16b, v23.16b}, [x8]
        // v5, v6, v30, v31 = 64 - m
        sub             v5.16b,  v4.16b,  v0.16b
        sub             v6.16b,  v4.16b,  v1.16b
        sub             v30.16b, v4.16b,  v2.16b
        sub             v31.16b, v4.16b,  v3.16b
        umull           v24.8h,  v16.8b,  v0.8b
        umlal           v24.8h,  v20.8b,  v5.8b
        umull2          v26.8h,  v16.16b, v0.16b
        umlal2          v26.8h,  v20.16b, v5.16b
        umull           v28.8h,  v17.8b,  v1.8b
        umlal           v28.8h,  v21.8b,  v6.8b
        umull2          v7.8h,   v17.16b, v1.16b
        umlal2          v7.8h,   v21.16b, v6.16b
        umull           v27.8h,  v18.8b,  v2.8b
        umlal           v27.8h,  v22.8b,  v30.8b
        // v1 and (below) v21 are reused as accumulators: their previous
        // contents have been consumed by the instructions above
        umull2          v1.8h,   v18.16b, v2.16b
        umlal2          v1.8h,   v22.16b, v30.16b
        umull           v29.8h,  v19.8b,  v3.8b
        umlal           v29.8h,  v23.8b,  v31.8b
        umull2          v21.8h,  v19.16b, v3.16b
        umlal2          v21.8h,  v23.16b, v31.16b
        rshrn           v24.8b,  v24.8h,  #6
        rshrn2          v24.16b, v26.8h,  #6
        rshrn           v25.8b,  v28.8h,  #6
        rshrn2          v25.16b, v7.8h,   #6
        rshrn           v27.8b,  v27.8h,  #6
        rshrn2          v27.16b, v1.8h,   #6
        rshrn           v28.8b,  v29.8h,  #6
        rshrn2          v28.16b, v21.8h,  #6
        st1             {v24.16b, v25.16b}, [x0],  x1
        st1             {v27.16b, v28.16b}, [x8],  x1
        b.gt            32b
        ret
// Jump table of 16-bit distances from the table back to each entry label,
// indexed by clz(w3) - 26: entry 0 for w3 in 32..63 down to entry 3 for
// w3 in 4..7.
L(blend_tbl):
        .hword L(blend_tbl) - 32b
        .hword L(blend_tbl) - 16b
        .hword L(blend_tbl) -  8b
        .hword L(blend_tbl) -  4b
endfunc
535
// blend_h_8bpc_neon: blends a byte buffer into the destination in place
// with one mask value per row, taken from the external obmc_masks table:
//     dst = (tmp * m + dst * (64 - m) + 32) >> 6
//
// Register usage, as read from the code below:
//   x0      destination pointer (even rows), read and written
//   x1      destination stride in bytes; doubled on entry
//   x2      tmp pointer (bytes)
//   w3      width; clz(w3) selects the 2, 4, 8, 16 or 32-and-wider path
//   w4      row count on entry. It is first used as the byte offset into
//           obmc_masks and then reduced to w4 - (w4 >> 2), so only that
//           many rows are blended. Two rows are handled per loop pass.
//   x5      mask pointer = obmc_masks + original w4, advanced 2 bytes/pass
//   x6-x7   scratch
//   x8      destination pointer (odd rows)
//   v4      constant 64 in every byte
// NOTE(review): the C-level prototype and the layout of obmc_masks are not
// visible in this file - confirm against the C side.
function blend_h_8bpc_neon, export=1
        adr             x6,  L(blend_h_tbl)
        movrel          x5,  X(obmc_masks)
        add             x5,  x5,  w4,  uxtw
        sub             w4,  w4,  w4,  lsr #2
        clz             w7,  w3
        movi            v4.16b,  #64
        add             x8,  x0,  x1
        lsl             x1,  x1,  #1
        // Resolve the width-specific entry point (see the table at the end).
        sub             w7,  w7,  #24
        ldrh            w7,  [x6,  x7,  lsl #1]
        sub             x6,  x6,  w7, uxtw
        br              x6
2:
        // Width 2: load the masks of two rows and duplicate each byte so
        // that v0 = m0 m0 m1 m1.
        ld1             {v0.h}[0],   [x5],  #2
        ld1             {v1.s}[0],   [x2],  #4
        subs            w4,  w4,  #2
        ld1             {v2.h}[0],   [x0]
        zip1            v0.8b,   v0.8b,   v0.8b
        sub             v3.8b,   v4.8b,   v0.8b
        ld1             {v2.h}[1],   [x8]
        umull           v5.8h,   v1.8b,   v0.8b
        umlal           v5.8h,   v2.8b,   v3.8b
        rshrn           v5.8b,   v5.8h,   #6
        st1             {v5.h}[0],   [x0],  x1
        st1             {v5.h}[1],   [x8],  x1
        b.gt            2b
        ret
4:
        // Width 4: ld2r broadcasts the two row masks into v0 and v1; ext
        // then builds m0 x4 followed by m1 x4.
        ld2r            {v0.8b,   v1.8b},   [x5],  #2
        ld1             {v2.8b},   [x2],  #8
        subs            w4,  w4,  #2
        ext             v0.8b,   v0.8b,   v1.8b,   #4
        ld1             {v3.s}[0],   [x0]
        sub             v5.8b,   v4.8b,   v0.8b
        ld1             {v3.s}[1],   [x8]
        umull           v6.8h,   v2.8b,   v0.8b
        umlal           v6.8h,   v3.8b,   v5.8b
        rshrn           v6.8b,   v6.8h,   #6
        st1             {v6.s}[0],   [x0],  x1
        st1             {v6.s}[1],   [x8],  x1
        b.gt            4b
        ret
8:
        // Width 8: same scheme with m0 x8 followed by m1 x8.
        ld2r            {v0.16b,  v1.16b},  [x5],  #2
        ld1             {v2.16b},  [x2],  #16
        ld1             {v3.d}[0],   [x0]
        ext             v0.16b,  v0.16b,  v1.16b,  #8
        sub             v5.16b,  v4.16b,  v0.16b
        ld1             {v3.d}[1],   [x8]
        subs            w4,  w4,  #2
        umull           v6.8h,   v0.8b,   v2.8b
        umlal           v6.8h,   v3.8b,   v5.8b
        umull2          v7.8h,   v0.16b,  v2.16b
        umlal2          v7.8h,   v3.16b,  v5.16b
        rshrn           v16.8b,  v6.8h,   #6
        rshrn2          v16.16b, v7.8h,   #6
        st1             {v16.d}[0],  [x0],  x1
        st1             {v16.d}[1],  [x8],  x1
        b.gt            8b
        ret
16:
        // Width 16: v0 is the mask of the even row, v1 of the odd row.
        ld2r            {v0.16b,  v1.16b},  [x5],  #2
        ld1             {v2.16b,  v3.16b},  [x2],  #32
        ld1             {v5.16b},  [x0]
        sub             v7.16b,  v4.16b,  v0.16b
        sub             v16.16b, v4.16b,  v1.16b
        ld1             {v6.16b},  [x8]
        subs            w4,  w4,  #2
        umull           v17.8h,  v0.8b,   v2.8b
        umlal           v17.8h,  v5.8b,   v7.8b
        umull2          v18.8h,  v0.16b,  v2.16b
        umlal2          v18.8h,  v5.16b,  v7.16b
        umull           v19.8h,  v1.8b,   v3.8b
        umlal           v19.8h,  v6.8b,   v16.8b
        umull2          v20.8h,  v1.16b,  v3.16b
        umlal2          v20.8h,  v6.16b,  v16.16b
        rshrn           v21.8b,  v17.8h,  #6
        rshrn2          v21.16b, v18.8h,  #6
        rshrn           v22.8b,  v19.8h,  #6
        rshrn2          v22.16b, v20.8h,  #6
        st1             {v21.16b}, [x0],  x1
        st1             {v22.16b}, [x8],  x1
        b.gt            16b
        ret
1280:
640:
320:
        // Width 32 and up: two rows in parallel, 32 pixels of each per inner
        // pass. x1 = 2 * stride - width, x7 = second row of tmp.
        sub             x1,  x1,  w3,  uxtw
        add             x7,  x2,  w3,  uxtw
321:
        // per row pair: v0/v1 = masks, v20/v21 = 64 - mask, w6 = pixels left
        ld2r            {v0.16b,  v1.16b},  [x5],  #2
        mov             w6,  w3
        sub             v20.16b, v4.16b,  v0.16b
        sub             v21.16b, v4.16b,  v1.16b
32:
        ld1             {v16.16b, v17.16b}, [x2],  #32
        ld1             {v2.16b,  v3.16b},  [x0]
        subs            w6,  w6,  #32
        umull           v23.8h,  v0.8b,   v16.8b
        umlal           v23.8h,  v2.8b,   v20.8b
        ld1             {v18.16b, v19.16b}, [x7],  #32
        umull2          v27.8h,  v0.16b,  v16.16b
        umlal2          v27.8h,  v2.16b,  v20.16b
        ld1             {v6.16b,  v7.16b},  [x8]
        umull           v24.8h,  v0.8b,   v17.8b
        umlal           v24.8h,  v3.8b,   v20.8b
        umull2          v28.8h,  v0.16b,  v17.16b
        umlal2          v28.8h,  v3.16b,  v20.16b
        umull           v25.8h,  v1.8b,   v18.8b
        umlal           v25.8h,  v6.8b,   v21.8b
        umull2          v5.8h,   v1.16b,  v18.16b
        umlal2          v5.8h,   v6.16b,  v21.16b
        rshrn           v29.8b,  v23.8h,  #6
        rshrn2          v29.16b, v27.8h,  #6
        umull           v26.8h,  v1.8b,   v19.8b
        umlal           v26.8h,  v7.8b,   v21.8b
        umull2          v31.8h,  v1.16b,  v19.16b
        umlal2          v31.8h,  v7.16b,  v21.16b
        rshrn           v30.8b,  v24.8h,  #6
        rshrn2          v30.16b, v28.8h,  #6
        rshrn           v23.8b,  v25.8h,  #6
        rshrn2          v23.16b, v5.8h,   #6
        rshrn           v24.8b,  v26.8h,  #6
        st1             {v29.16b, v30.16b}, [x0],  #32
        rshrn2          v24.16b, v31.8h,  #6
        st1             {v23.16b, v24.16b}, [x8],  #32
        b.gt            32b
        subs            w4,  w4,  #2
        add             x0,  x0,  x1
        add             x8,  x8,  x1
        // each tmp pointer skips the row its partner has just consumed
        add             x2,  x2,  w3,  uxtw
        add             x7,  x7,  w3,  uxtw
        b.gt            321b
        ret
// Jump table of 16-bit distances from the table back to each entry label,
// indexed by clz(w3) - 24: entry 0 for w3 in 128..255 down to entry 6 for
// w3 in 2..3. The first three entries share one code path.
L(blend_h_tbl):
        .hword L(blend_h_tbl) - 1280b
        .hword L(blend_h_tbl) -  640b
        .hword L(blend_h_tbl) -  320b
        .hword L(blend_h_tbl) -   16b
        .hword L(blend_h_tbl) -    8b
        .hword L(blend_h_tbl) -    4b
        .hword L(blend_h_tbl) -    2b
endfunc
680
681function blend_v_8bpc_neon, export=1
682        adr             x6,  L(blend_v_tbl)
683        movrel          x5,  X(obmc_masks)
684        add             x5,  x5,  w3,  uxtw
685        clz             w3,  w3
686        movi            v4.16b,  #64
687        add             x8,  x0,  x1
688        lsl             x1,  x1,  #1
689        sub             w3,  w3,  #26
690        ldrh            w3,  [x6,  x3,  lsl #1]
691        sub             x6,  x6,  w3,  uxtw
692        br              x6
69320:
694        ld1r            {v0.8b},   [x5]
695        sub             v1.8b,   v4.8b,   v0.8b
6962:
697        ld1             {v2.h}[0],   [x2],  #2
698        ld1             {v3.b}[0],   [x0]
699        subs            w4,  w4,  #2
700        ld1             {v2.b}[1],   [x2]
701        ld1             {v3.b}[1],   [x8]
702        umull           v5.8h,   v2.8b,   v0.8b
703        umlal           v5.8h,   v3.8b,   v1.8b
704        rshrn           v5.8b,   v5.8h,   #6
705        add             x2,  x2,  #2
706        st1             {v5.b}[0],   [x0],  x1
707        st1             {v5.b}[1],   [x8],  x1
708        b.gt            2b
709        ret
71040:
711        ld1r            {v0.2s},   [x5]
712        sub             x1,  x1,  #2
713        sub             v1.8b,   v4.8b,   v0.8b
7144:
715        ld1             {v2.8b},   [x2],  #8
716        ld1             {v3.s}[0],   [x0]
717        ld1             {v3.s}[1],   [x8]
718        subs            w4,  w4,  #2
719        umull           v5.8h,   v2.8b,   v0.8b
720        umlal           v5.8h,   v3.8b,   v1.8b
721        rshrn           v5.8b,   v5.8h,   #6
722        st1             {v5.h}[0],   [x0],  #2
723        st1             {v5.h}[2],   [x8],  #2
724        st1             {v5.b}[2],   [x0],  x1
725        st1             {v5.b}[6],   [x8],  x1
726        b.gt            4b
727        ret
72880:
729        ld1r            {v0.2d},   [x5]
730        sub             x1,  x1,  #4
731        sub             v1.16b,  v4.16b,  v0.16b
7328:
733        ld1             {v2.16b},  [x2],  #16
734        ld1             {v3.d}[0],   [x0]
735        ld1             {v3.d}[1],   [x8]
736        subs            w4,  w4,  #2
737        umull           v5.8h,  v0.8b,  v2.8b
738        umlal           v5.8h,  v3.8b,  v1.8b
739        umull2          v6.8h,  v0.16b, v2.16b
740        umlal2          v6.8h,  v3.16b, v1.16b
741        rshrn           v7.8b,  v5.8h,  #6
742        rshrn2          v7.16b, v6.8h,  #6
743        st1             {v7.s}[0],   [x0],  #4
744        st1             {v7.s}[2],   [x8],  #4
745        st1             {v7.h}[2],   [x0],  x1
746        st1             {v7.h}[6],   [x8],  x1
747        b.gt            8b
748        ret
749160:
750        ld1             {v0.16b},  [x5]
751        sub             x1,  x1,  #8
752        sub             v2.16b,  v4.16b,  v0.16b
75316:
754        ld1             {v5.16b,  v6.16b},  [x2],  #32
755        ld1             {v7.16b},  [x0]
756        subs            w4,  w4,  #2
757        ld1             {v16.16b}, [x8]
758        umull           v17.8h,  v5.8b,   v0.8b
759        umlal           v17.8h,  v7.8b,   v2.8b
760        umull2          v18.8h,  v5.16b,  v0.16b
761        umlal2          v18.8h,  v7.16b,  v2.16b
762        umull           v20.8h,  v6.8b,   v0.8b
763        umlal           v20.8h,  v16.8b,  v2.8b
764        umull2          v21.8h,  v6.16b,  v0.16b
765        umlal2          v21.8h,  v16.16b, v2.16b
766        rshrn           v19.8b,  v17.8h,  #6
767        rshrn2          v19.16b, v18.8h,  #6
768        rshrn           v22.8b,  v20.8h,  #6
769        rshrn2          v22.16b, v21.8h,  #6
770        st1             {v19.8b},  [x0],  #8
771        st1             {v22.8b},  [x8],  #8
772        st1             {v19.s}[2],  [x0],  x1
773        st1             {v22.s}[2],  [x8],  x1
774        b.gt            16b
775        ret
776320:
777        ld1             {v0.16b,  v1.16b},  [x5]
778        sub             x1,  x1,  #16
779        sub             v2.16b,  v4.16b,  v0.16b
780        sub             v3.8b,   v4.8b,   v1.8b
78132:
782        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2],  #64
783        ld1             {v5.16b,  v6.16b},  [x0]
784        subs            w4,  w4,  #2
785        ld1             {v20.16b, v21.16b}, [x8]
786        umull           v22.8h,  v16.8b,  v0.8b
787        umlal           v22.8h,  v5.8b,   v2.8b
788        umull2          v23.8h,  v16.16b, v0.16b
789        umlal2          v23.8h,  v5.16b,  v2.16b
790        umull           v28.8h,  v17.8b,  v1.8b
791        umlal           v28.8h,  v6.8b,   v3.8b
792        umull           v30.8h,  v18.8b,  v0.8b
793        umlal           v30.8h,  v20.8b,  v2.8b
794        umull2          v31.8h,  v18.16b, v0.16b
795        umlal2          v31.8h,  v20.16b, v2.16b
796        umull           v25.8h,  v19.8b,  v1.8b
797        umlal           v25.8h,  v21.8b,  v3.8b
798        rshrn           v24.8b,  v22.8h,  #6
799        rshrn2          v24.16b, v23.8h,  #6
800        rshrn           v28.8b,  v28.8h,  #6
801        rshrn           v30.8b,  v30.8h,  #6
802        rshrn2          v30.16b, v31.8h,  #6
803        rshrn           v27.8b,  v25.8h,  #6
804        st1             {v24.16b}, [x0],  #16
805        st1             {v30.16b}, [x8],  #16
806        st1             {v28.8b},  [x0],  x1
807        st1             {v27.8b},  [x8],  x1
808        b.gt            32b
809        ret
810L(blend_v_tbl):
811        .hword L(blend_v_tbl) - 320b
812        .hword L(blend_v_tbl) - 160b
813        .hword L(blend_v_tbl) -  80b
814        .hword L(blend_v_tbl) -  40b
815        .hword L(blend_v_tbl) -  20b
816endfunc
817
818
// Plain block copy without any filtering; the 8tap entry point branches
// here when neither the horizontal nor the vertical subpel bits are set.
// This has got the same signature as the put_8tap functions,
// and assumes that x8 is set to (clz(w)-24).
//
// Registers as used below:
//   x0 = dst pointer,  x1 = dst stride
//   x2 = src pointer,  x3 = src stride
//   w5 = number of rows left to copy
//   x8 = index into L(put_tbl) on entry (reused as a pointer afterwards)
// Scratch: x6-x13 and v0-v7, depending on the width specific path.
// The 2, 4, 8 and 16 wide loops copy two rows per iteration; the 32, 64
// and 128 wide loops copy one row per iteration.
// NOTE(review): the two rows per iteration loops presumably rely on the
// row count being even - confirm against the callers.
function put_neon
        // Each table entry is the 16 bit distance from the table back to a
        // label, so the target is (table address - entry).
        adr             x9,  L(put_tbl)
        ldrh            w8,  [x9, x8, lsl #1]
        sub             x9,  x9,  w8, uxtw
        br              x9

        // 2 bytes per row, two rows per iteration.
2:
        ld1             {v0.h}[0], [x2], x3
        ld1             {v1.h}[0], [x2], x3
        subs            w5,  w5,  #2
        st1             {v0.h}[0], [x0], x1
        st1             {v1.h}[0], [x0], x1
        b.gt            2b
        ret
        // 4 bytes per row, two rows per iteration.
4:
        ld1             {v0.s}[0], [x2], x3
        ld1             {v1.s}[0], [x2], x3
        subs            w5,  w5,  #2
        st1             {v0.s}[0], [x0], x1
        st1             {v1.s}[0], [x0], x1
        b.gt            4b
        ret
        // 8 bytes per row, two rows per iteration.
8:
        ld1             {v0.8b}, [x2], x3
        ld1             {v1.8b}, [x2], x3
        subs            w5,  w5,  #2
        st1             {v0.8b}, [x0], x1
        st1             {v1.8b}, [x0], x1
        b.gt            8b
        ret
        // 16 bytes per row: set up a second pair of pointers one row ahead
        // (x8 for dst, x9 for src) and double both strides, so that the two
        // rows of an iteration use independent address registers.
160:
        add             x8,  x0,  x1
        lsl             x1,  x1,  #1
        add             x9,  x2,  x3
        lsl             x3,  x3,  #1
16:
        ld1             {v0.16b}, [x2], x3
        ld1             {v1.16b}, [x9], x3
        subs            w5,  w5,  #2
        st1             {v0.16b}, [x0], x1
        st1             {v1.16b}, [x8], x1
        b.gt            16b
        ret
        // 32 bytes per row, copied through general purpose register pairs,
        // one row per iteration.
32:
        ldp             x6,  x7,  [x2]
        ldp             x8,  x9,  [x2, #16]
        stp             x6,  x7,  [x0]
        subs            w5,  w5,  #1
        stp             x8,  x9,  [x0, #16]
        add             x2,  x2,  x3
        add             x0,  x0,  x1
        b.gt            32b
        ret
        // 64 bytes per row, copied through general purpose register pairs,
        // one row per iteration.
64:
        ldp             x6,  x7,  [x2]
        ldp             x8,  x9,  [x2, #16]
        stp             x6,  x7,  [x0]
        ldp             x10, x11, [x2, #32]
        stp             x8,  x9,  [x0, #16]
        subs            w5,  w5,  #1
        ldp             x12, x13, [x2, #48]
        stp             x10, x11, [x0, #32]
        stp             x12, x13, [x0, #48]
        add             x2,  x2,  x3
        add             x0,  x0,  x1
        b.gt            64b
        ret
        // 128 bytes per row, copied through q register pairs, one row per
        // iteration.
128:
        ldp             q0,  q1,  [x2]
        ldp             q2,  q3,  [x2, #32]
        stp             q0,  q1,  [x0]
        ldp             q4,  q5,  [x2, #64]
        stp             q2,  q3,  [x0, #32]
        ldp             q6,  q7,  [x2, #96]
        subs            w5,  w5,  #1
        stp             q4,  q5,  [x0, #64]
        stp             q6,  q7,  [x0, #96]
        add             x2,  x2,  x3
        add             x0,  x0,  x1
        b.gt            128b
        ret

        // Indexed by clz(w)-24: entry 0 is w=128, entry 6 is w=2.
L(put_tbl):
        .hword L(put_tbl) - 128b
        .hword L(put_tbl) -  64b
        .hword L(put_tbl) -  32b
        .hword L(put_tbl) - 160b
        .hword L(put_tbl) -   8b
        .hword L(put_tbl) -   4b
        .hword L(put_tbl) -   2b
endfunc
912
913
// Unfiltered conversion of 8 bit source pixels to 16 bit intermediates:
// every byte is zero-extended to a halfword and shifted left by 4
// (ushll #4) before being stored at x0.
// This has got the same signature as the prep_8tap functions,
// and assumes that x8 is set to (clz(w)-24), and x7 to w*2.
//
// Registers as used below:
//   x0 = dst pointer (16 bit elements)
//   x1 = src pointer,  x2 = src stride
//   w3 = width (only read by the 32 wide path)
//   w4 = number of rows left to process
//   x7 = dst advance used by the 32 wide path
//   x8 = index into L(prep_tbl) on entry (reused as a pointer afterwards)
// Scratch: x6, x8, x9 and v0-v7, v16-v31, depending on the path.
// The 4, 8, 16 and 32 wide loops handle two rows per iteration; the 64
// and 128 wide loops handle one row per iteration.
function prep_neon
        // Each table entry is the 16 bit distance from the table back to a
        // label, so the target is (table address - entry).
        adr             x9,  L(prep_tbl)
        ldrh            w8,  [x9, x8, lsl #1]
        sub             x9,  x9,  w8, uxtw
        br              x9

        // 4 pixels per row; two rows are written as 16 contiguous bytes.
4:
        ld1             {v0.s}[0], [x1], x2
        ld1             {v1.s}[0], [x1], x2
        subs            w4,  w4,  #2
        ushll           v0.8h, v0.8b, #4
        ushll           v1.8h, v1.8b, #4
        st1             {v0.4h, v1.4h}, [x0], #16
        b.gt            4b
        ret
        // 8 pixels per row; two rows are written as 32 contiguous bytes.
8:
        ld1             {v0.8b}, [x1], x2
        ld1             {v1.8b}, [x1], x2
        subs            w4,  w4,  #2
        ushll           v0.8h, v0.8b, #4
        ushll           v1.8h, v1.8b, #4
        st1             {v0.8h, v1.8h}, [x0], #32
        b.gt            8b
        ret
        // 16 pixels per row: x9 points one source row ahead and the source
        // stride is doubled, so each iteration reads two rows through
        // independent pointers and writes 64 contiguous bytes.
160:
        add             x9,  x1,  x2
        lsl             x2,  x2,  #1
16:
        ld1             {v0.16b}, [x1], x2
        ld1             {v1.16b}, [x9], x2
        subs            w4,  w4,  #2
        ushll           v4.8h, v0.8b,  #4
        ushll2          v5.8h, v0.16b, #4
        ushll           v6.8h, v1.8b,  #4
        ushll2          v7.8h, v1.16b, #4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
        b.gt            16b
        ret
        // 32 pixels per row: x8 = x0 + w3. Each store below writes 32 bytes
        // and x0 and x8 are used alternately, both advancing by x7. With
        // w3 = 32 and x7 = 64 (as the dispatch and the header comment
        // imply) this produces contiguous 64 byte output rows.
320:
        add             x8,  x0,  w3, uxtw
32:
        ld1             {v0.16b, v1.16b},  [x1], x2
        subs            w4,  w4,  #2
        ushll           v4.8h,  v0.8b,  #4
        ushll2          v5.8h,  v0.16b, #4
        ld1             {v2.16b, v3.16b},  [x1], x2
        ushll           v6.8h,  v1.8b,  #4
        ushll2          v7.8h,  v1.16b, #4
        ushll           v16.8h, v2.8b,  #4
        st1             {v4.8h,  v5.8h},  [x0], x7
        ushll2          v17.8h, v2.16b, #4
        st1             {v6.8h,  v7.8h},  [x8], x7
        ushll           v18.8h, v3.8b,  #4
        st1             {v16.8h, v17.8h}, [x0], x7
        ushll2          v19.8h, v3.16b, #4
        st1             {v18.8h, v19.8h}, [x8], x7
        b.gt            32b
        ret
        // 64 pixels per row (128 output bytes): x0 and x8 = x0 + 32 each
        // write 32 bytes and advance by 64, covering offsets 0, 32, 64, 96.
640:
        add             x8,  x0,  #32
        mov             x6,  #64
64:
        ldp             q0,  q1,  [x1]
        subs            w4,  w4,  #1
        ushll           v4.8h,  v0.8b,  #4
        ushll2          v5.8h,  v0.16b, #4
        ldp             q2,  q3,  [x1, #32]
        ushll           v6.8h,  v1.8b,  #4
        ushll2          v7.8h,  v1.16b, #4
        add             x1,  x1,  x2
        ushll           v16.8h, v2.8b,  #4
        st1             {v4.8h,  v5.8h},  [x0], x6
        ushll2          v17.8h, v2.16b, #4
        ushll           v18.8h, v3.8b,  #4
        st1             {v6.8h,  v7.8h},  [x8], x6
        ushll2          v19.8h, v3.16b, #4
        st1             {v16.8h, v17.8h}, [x0], x6
        st1             {v18.8h, v19.8h}, [x8], x6
        b.gt            64b
        ret
        // 128 pixels per row (256 output bytes): x0 and x8 = x0 + 64 each
        // write 64 bytes and advance by 128, covering offsets 0, 64, 128,
        // 192.
1280:
        add             x8,  x0,  #64
        mov             x6,  #128
128:
        ldp             q0,  q1,  [x1]
        ldp             q2,  q3,  [x1, #32]
        ushll           v16.8h,  v0.8b,  #4
        ushll2          v17.8h,  v0.16b, #4
        ushll           v18.8h,  v1.8b,  #4
        ushll2          v19.8h,  v1.16b, #4
        ushll           v20.8h,  v2.8b,  #4
        ushll2          v21.8h,  v2.16b, #4
        ldp             q4,  q5,  [x1, #64]
        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x6
        ushll           v22.8h,  v3.8b,  #4
        ushll2          v23.8h,  v3.16b, #4
        ushll           v24.8h,  v4.8b,  #4
        ushll2          v25.8h,  v4.16b, #4
        ushll           v26.8h,  v5.8b,  #4
        ushll2          v27.8h,  v5.16b, #4
        ldp             q6,  q7,  [x1, #96]
        st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x8], x6
        ushll           v28.8h,  v6.8b,  #4
        ushll2          v29.8h,  v6.16b, #4
        ushll           v30.8h,  v7.8b,  #4
        ushll2          v31.8h,  v7.16b, #4
        subs            w4,  w4,  #1
        add             x1,  x1,  x2
        st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], x6
        st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x6
        b.gt            128b
        ret

        // Indexed by clz(w)-24: entry 0 is w=128, entry 5 is w=4.
L(prep_tbl):
        .hword L(prep_tbl) - 1280b
        .hword L(prep_tbl) -  640b
        .hword L(prep_tbl) -  320b
        .hword L(prep_tbl) -  160b
        .hword L(prep_tbl) -    8b
        .hword L(prep_tbl) -    4b
endfunc
1037
1038
// Load one element of size wd (e.g. .h or .s) into lane 0 of each given
// destination register, reading alternately from the pointers s0 and s1;
// each pointer is post-incremented by strd after every load.
// d0 and d1 are mandatory. d2 and d3 are loaded together when d2 is given.
// d4, d5 and d6 are each loaded only when present.
// Source pointer per destination: d0, d2, d4, d6 use s0; d1, d3, d5 use s1.
.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
        ld1             {\d0\wd}[0], [\s0], \strd
        ld1             {\d1\wd}[0], [\s1], \strd
.ifnb \d2
        ld1             {\d2\wd}[0], [\s0], \strd
        ld1             {\d3\wd}[0], [\s1], \strd
.endif
.ifnb \d4
        ld1             {\d4\wd}[0], [\s0], \strd
.endif
.ifnb \d5
        ld1             {\d5\wd}[0], [\s1], \strd
.endif
.ifnb \d6
        ld1             {\d6\wd}[0], [\s0], \strd
.endif
.endm
// Whole register counterpart of load_slice: load a full vector with
// arrangement wd (e.g. .8b or .16b) into each given destination register,
// reading alternately from the pointers s0 and s1; each pointer is
// post-incremented by strd after every load.
// d0 and d1 are mandatory. d2 and d3 are loaded together when d2 is given.
// d4, d5 and d6 are each loaded only when present.
// Source pointer per destination: d0, d2, d4, d6 use s0; d1, d3, d5 use s1.
.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
        ld1             {\d0\wd}, [\s0], \strd
        ld1             {\d1\wd}, [\s1], \strd
.ifnb \d2
        ld1             {\d2\wd}, [\s0], \strd
        ld1             {\d3\wd}, [\s1], \strd
.endif
.ifnb \d4
        ld1             {\d4\wd}, [\s0], \strd
.endif
.ifnb \d5
        ld1             {\d5\wd}, [\s1], \strd
.endif
.ifnb \d6
        ld1             {\d6\wd}, [\s0], \strd
.endif
.endm
// Width specific front ends for load_slice and load_reg. All take the two
// source pointers, the post-increment stride and 2 to 7 destination
// registers, and forward them unchanged.

// Load one halfword into lane 0 of each destination.
.macro load_h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_slice      \s0, \s1, \strd, .h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
// Load one 32 bit word into lane 0 of each destination.
.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_slice      \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
// Load 8 bytes into each destination.
.macro load_8b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_reg        \s0, \s1, \strd, .8b, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
// Load 16 bytes into each destination.
.macro load_16b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_reg        \s0, \s1, \strd, .16b, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
// Combine each register with its successor using trn1 with arrangement
// wd: afterwards element 1 of r(n) holds what was element 0 of r(n+1),
// while element 0 of r(n) is kept.
// The order of the instructions matters: every register is combined with
// its successor before that successor is overwritten.
// r0-r2 are mandatory; when r3 is given, r2 and r3 are updated as well and
// r4 is read. The last register in the chain is only read, never written.
.macro interleave_1 wd, r0, r1, r2, r3, r4
        trn1            \r0\wd, \r0\wd, \r1\wd
        trn1            \r1\wd, \r1\wd, \r2\wd
.ifnb \r3
        trn1            \r2\wd, \r2\wd, \r3\wd
        trn1            \r3\wd, \r3\wd, \r4\wd
.endif
.endm
// interleave_1 on halfword elements (4h arrangement).
.macro interleave_1_h r0, r1, r2, r3, r4
        interleave_1    .4h, \r0, \r1, \r2, \r3, \r4
.endm
// interleave_1 on 32 bit elements (2s arrangement).
.macro interleave_1_s r0, r1, r2, r3, r4
        interleave_1    .2s, \r0, \r1, \r2, \r3, \r4
.endm
// Like interleave_1, but with a distance of two registers: trn1 places
// element 0 of r(n+2) into element 1 of r(n) for n = 0..3, keeping
// element 0 of r(n). r4 and r5 are only read.
// The order matters: r2 and r3 are used as sources by the first two
// instructions before being overwritten by the last two.
.macro interleave_2 wd, r0, r1, r2, r3, r4, r5
        trn1            \r0\wd,  \r0\wd, \r2\wd
        trn1            \r1\wd,  \r1\wd, \r3\wd
        trn1            \r2\wd,  \r2\wd, \r4\wd
        trn1            \r3\wd,  \r3\wd, \r5\wd
.endm
// interleave_2 on 32 bit elements (2s arrangement).
.macro interleave_2_s r0, r1, r2, r3, r4, r5
        interleave_2    .2s, \r0, \r1, \r2, \r3, \r4, \r5
.endm
// Zero-extend the low 8 bytes of each given register to 8 halfwords, in
// place. Arguments are bare register names without arrangement suffix.
// r0 and r1 are mandatory. r2 and r3 are widened together when r2 is
// given. r4, r5 and r6 are each widened only when present.
.macro uxtl_b r0, r1, r2, r3, r4, r5, r6
        uxtl            \r0\().8h, \r0\().8b
        uxtl            \r1\().8h, \r1\().8b
.ifnb \r2
        uxtl            \r2\().8h, \r2\().8b
        uxtl            \r3\().8h, \r3\().8b
.endif
.ifnb \r4
        uxtl            \r4\().8h, \r4\().8b
.endif
.ifnb \r5
        uxtl            \r5\().8h, \r5\().8b
.endif
.ifnb \r6
        uxtl            \r6\().8h, \r6\().8b
.endif
.endm
// 4 tap multiply-accumulate with arrangement wd (.4h or .8h):
//   d = s0*v0.h[0] + s1*v0.h[1] + s2*v0.h[2] + s3*v0.h[3]
// The multipliers are always taken from v0.h[0..3]; the caller has to
// load them there beforehand. d is overwritten by the first instruction,
// so it may only alias s0.
.macro mul_mla_4 d, s0, s1, s2, s3, wd
        mul             \d\wd,  \s0\wd,  v0.h[0]
        mla             \d\wd,  \s1\wd,  v0.h[1]
        mla             \d\wd,  \s2\wd,  v0.h[2]
        mla             \d\wd,  \s3\wd,  v0.h[3]
.endm
// Two 8 tap multiply-accumulate chains on 8h vectors, with the second
// output using the inputs shifted by one register:
//   d0 = sum of s(i)   * v0.h[i], i = 0..7
//   d1 = sum of s(i+1) * v0.h[i], i = 0..7
// The multipliers are always taken from v0.h[0..7]. d0 is written before
// the second chain reads s1-s8, so d0 must not alias any of those.
// Interleaving the mul/mla chains actually hurts performance
// significantly on Cortex A53, thus keeping mul/mla tightly
// chained like this.
.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
        mul             \d0\().8h, \s0\().8h, v0.h[0]
        mla             \d0\().8h, \s1\().8h, v0.h[1]
        mla             \d0\().8h, \s2\().8h, v0.h[2]
        mla             \d0\().8h, \s3\().8h, v0.h[3]
        mla             \d0\().8h, \s4\().8h, v0.h[4]
        mla             \d0\().8h, \s5\().8h, v0.h[5]
        mla             \d0\().8h, \s6\().8h, v0.h[6]
        mla             \d0\().8h, \s7\().8h, v0.h[7]
        mul             \d1\().8h, \s1\().8h, v0.h[0]
        mla             \d1\().8h, \s2\().8h, v0.h[1]
        mla             \d1\().8h, \s3\().8h, v0.h[2]
        mla             \d1\().8h, \s4\().8h, v0.h[3]
        mla             \d1\().8h, \s5\().8h, v0.h[4]
        mla             \d1\().8h, \s6\().8h, v0.h[5]
        mla             \d1\().8h, \s7\().8h, v0.h[6]
        mla             \d1\().8h, \s8\().8h, v0.h[7]
.endm
// Two 8 tap multiply-accumulate chains on 8h vectors, with the second
// output using the inputs shifted by two registers:
//   d0 = sum of s(i)   * v0.h[i], i = 0..7
//   d1 = sum of s(i+2) * v0.h[i], i = 0..7
// The multipliers are always taken from v0.h[0..7]. d0 is written before
// the second chain reads s2-s9, so d0 must not alias any of those.
// The two chains are kept back to back rather than interleaved.
.macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
        mul             \d0\().8h, \s0\().8h, v0.h[0]
        mla             \d0\().8h, \s1\().8h, v0.h[1]
        mla             \d0\().8h, \s2\().8h, v0.h[2]
        mla             \d0\().8h, \s3\().8h, v0.h[3]
        mla             \d0\().8h, \s4\().8h, v0.h[4]
        mla             \d0\().8h, \s5\().8h, v0.h[5]
        mla             \d0\().8h, \s6\().8h, v0.h[6]
        mla             \d0\().8h, \s7\().8h, v0.h[7]
        mul             \d1\().8h, \s2\().8h, v0.h[0]
        mla             \d1\().8h, \s3\().8h, v0.h[1]
        mla             \d1\().8h, \s4\().8h, v0.h[2]
        mla             \d1\().8h, \s5\().8h, v0.h[3]
        mla             \d1\().8h, \s6\().8h, v0.h[4]
        mla             \d1\().8h, \s7\().8h, v0.h[5]
        mla             \d1\().8h, \s8\().8h, v0.h[6]
        mla             \d1\().8h, \s9\().8h, v0.h[7]
.endm
// Two 8 tap multiply-accumulate chains on 8h vectors, with the second
// output using the inputs shifted by four registers:
//   d0 = sum of s(i)   * v0.h[i], i = 0..7
//   d1 = sum of s(i+4) * v0.h[i], i = 0..7
// The multipliers are always taken from v0.h[0..7]. d0 is written before
// the second chain reads s4-s11, so d0 must not alias any of those.
// The two chains are kept back to back rather than interleaved.
.macro mul_mla_8_4 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11
        mul             \d0\().8h, \s0\().8h,  v0.h[0]
        mla             \d0\().8h, \s1\().8h,  v0.h[1]
        mla             \d0\().8h, \s2\().8h,  v0.h[2]
        mla             \d0\().8h, \s3\().8h,  v0.h[3]
        mla             \d0\().8h, \s4\().8h,  v0.h[4]
        mla             \d0\().8h, \s5\().8h,  v0.h[5]
        mla             \d0\().8h, \s6\().8h,  v0.h[6]
        mla             \d0\().8h, \s7\().8h,  v0.h[7]
        mul             \d1\().8h, \s4\().8h,  v0.h[0]
        mla             \d1\().8h, \s5\().8h,  v0.h[1]
        mla             \d1\().8h, \s6\().8h,  v0.h[2]
        mla             \d1\().8h, \s7\().8h,  v0.h[3]
        mla             \d1\().8h, \s8\().8h,  v0.h[4]
        mla             \d1\().8h, \s9\().8h,  v0.h[5]
        mla             \d1\().8h, \s10\().8h, v0.h[6]
        mla             \d1\().8h, \s11\().8h, v0.h[7]
.endm
// Narrow signed 16 bit elements to unsigned 8 bit, in place: rounding
// shift right by the immediate shift, saturating to the unsigned byte
// range. The 8 results land in the low 8 bytes of the same register.
// r0 is mandatory, r1 is optional, and r2 and r3 are processed together
// when r2 is given.
.macro sqrshrun_b shift, r0, r1, r2, r3
        sqrshrun        \r0\().8b, \r0\().8h,  #\shift
.ifnb \r1
        sqrshrun        \r1\().8b, \r1\().8h,  #\shift
.endif
.ifnb \r2
        sqrshrun        \r2\().8b, \r2\().8h,  #\shift
        sqrshrun        \r3\().8b, \r3\().8h,  #\shift
.endif
.endm
// Signed rounding shift right by the immediate shift on all 8 halfwords
// of each given register, in place (the element size is unchanged).
// r0 is mandatory, r1 is optional, and r2 and r3 are processed together
// when r2 is given.
.macro srshr_h shift, r0, r1, r2, r3
        srshr           \r0\().8h, \r0\().8h,  #\shift
.ifnb \r1
        srshr           \r1\().8h, \r1\().8h,  #\shift
.endif
.ifnb \r2
        srshr           \r2\().8h, \r2\().8h,  #\shift
        srshr           \r3\().8h, \r3\().8h,  #\shift
.endif
.endm
// Store consecutive halfword lanes of reg alternately through the fixed
// pointers x0 and x8 (even lanes to x0, odd lanes to x8), post-incrementing
// the pointer by strd after each store.
// lanes is an assembly time count: lanes 0 and 1 are always stored, lanes
// 2 and 3 only when lanes is greater than 2.
.macro st_h strd, reg, lanes
        st1             {\reg\().h}[0], [x0], \strd
        st1             {\reg\().h}[1], [x8], \strd
.if \lanes > 2
        st1             {\reg\().h}[2], [x0], \strd
        st1             {\reg\().h}[3], [x8], \strd
.endif
.endm
// Store the two low 32 bit lanes of each register through the fixed
// pointers x0 and x8: lane 0 goes to x0 and lane 1 to x8, and each pointer
// is post-incremented by strd after its store. r1 is optional.
.macro st_s strd, r0, r1
        st1             {\r0\().s}[0], [x0], \strd
        st1             {\r0\().s}[1], [x8], \strd
.ifnb \r1
        st1             {\r1\().s}[0], [x0], \strd
        st1             {\r1\().s}[1], [x8], \strd
.endif
.endm
// Store the two 64 bit halves of each register through the fixed pointers
// x0 and x8: the low half goes to x0 and the high half to x8, and each
// pointer is post-incremented by strd after its store. r1 is optional.
.macro st_d strd, r0, r1
        st1             {\r0\().d}[0], [x0], \strd
        st1             {\r0\().d}[1], [x8], \strd
.ifnb \r1
        st1             {\r1\().d}[0], [x0], \strd
        st1             {\r1\().d}[1], [x8], \strd
.endif
.endm
// Final scaling and store for registers that hold two rows of four
// halfword results each. Rows go alternately to x0 and x8 (see st_s and
// st_d). r1 is optional.
//   type = put: round, shift right by 6 and saturate to unsigned bytes,
//               then store 4 bytes per row.
//   otherwise:  round and shift right by 2, keeping 16 bit values, then
//               store 8 bytes per row.
.macro shift_store_4 type, strd, r0, r1
.ifc \type, put
        sqrshrun_b      6,     \r0, \r1
        st_s            \strd, \r0, \r1
.else
        srshr_h         2,     \r0, \r1
        st_d            \strd, \r0, \r1
.endif
.endm
// Store whole registers with arrangement wd alternately through the fixed
// pointers x0 and x8 (r0, r2, r4, r6 to x0; r1, r3, r5, r7 to x8), with a
// post-increment of strd after each store.
// r0 and r1 are mandatory. r2 and r3 are stored together when r2 is given,
// and r4 to r7 are stored together when r4 is given.
.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7
        st1             {\r0\wd}, [x0], \strd
        st1             {\r1\wd}, [x8], \strd
.ifnb \r2
        st1             {\r2\wd}, [x0], \strd
        st1             {\r3\wd}, [x8], \strd
.endif
.ifnb \r4
        st1             {\r4\wd}, [x0], \strd
        st1             {\r5\wd}, [x8], \strd
        st1             {\r6\wd}, [x0], \strd
        st1             {\r7\wd}, [x8], \strd
.endif
.endm
// st_reg storing the low 8 bytes of each register.
.macro st_8b strd, r0, r1, r2, r3, r4, r5, r6, r7
        st_reg          \strd, .8b,  \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
.endm
// st_reg storing all 16 bytes of each register.
.macro st_16b strd, r0, r1, r2, r3, r4, r5, r6, r7
        st_reg          \strd, .16b, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
.endm
// Final scaling and store for registers that each hold one row of eight
// halfword results. Rows go alternately to x0 and x8 (see st_reg).
//   type = put: round, shift right by 6 and saturate to unsigned bytes,
//               then store 8 bytes per register.
//   otherwise:  round and shift right by 2, keeping 16 bit values, then
//               store 16 bytes per register.
// r1 is optional for the shift helpers, but the store helpers always
// store at least r0 and r1; r2 and r3 are handled together.
.macro shift_store_8 type, strd, r0, r1, r2, r3
.ifc \type, put
        sqrshrun_b      6,     \r0, \r1, \r2, \r3
        st_8b           \strd, \r0, \r1, \r2, \r3
.else
        srshr_h         2,     \r0, \r1, \r2, \r3
        st_16b          \strd, \r0, \r1, \r2, \r3
.endif
.endm
// Final scaling and store for two rows of sixteen halfword results, held
// as the register pairs r0:r1 (stored through x0) and r2:r3 (stored
// through x8). All four registers are required.
//   type = put: round, shift right by 6 and saturate to unsigned bytes,
//               packing r0:r1 into r0 and r2:r3 into r2, then store 16
//               bytes per row.
//   otherwise:  round and shift right by 2, keeping 16 bit values, then
//               store 32 bytes per row.
.macro shift_store_16 type, strd, r0, r1, r2, r3
.ifc \type, put
        sqrshrun        \r0\().8b,  \r0\().8h, #6
        sqrshrun2       \r0\().16b, \r1\().8h, #6
        sqrshrun        \r2\().8b,  \r2\().8h, #6
        sqrshrun2       \r2\().16b, \r3\().8h, #6
        st_16b          \strd, \r0, \r2
.else
        srshr_h         2,     \r0, \r1, \r2, \r3
        st1             {\r0\().8h, \r1\().8h}, [x0], \strd
        st1             {\r2\().8h, \r3\().8h}, [x8], \strd
.endif
.endm
1283
// Emit one exported entry point named (op)_8tap_(type)_8bpc_neon.
// The entry point only places the horizontal filter selector type_h in x8
// and the vertical filter selector type_v in x9, then tail-branches to
// the shared (op)_8tap_neon implementation, leaving all argument
// registers untouched.
.macro make_8tap_fn op, type, type_h, type_v
function \op\()_8tap_\type\()_8bpc_neon, export=1
        mov             x8,  \type_h
        mov             x9,  \type_v
        b               \op\()_8tap_neon
endfunc
.endm
1291
// Filter selectors passed in x8/x9 by the make_8tap_fn entry points.
// Each value packs two 7 bit fields: bits 7-13 hold the offset used for
// the 8 tap filter and bits 0-6 the offset used for the 4 tap filter; the
// 8tap code adds the subpel position to both fields and then picks one.
// REGULAR and SHARP share the same 4 tap offset (3*15).
// NOTE(review): the factor 15 presumably is the number of subpel entries
// per filter type in mc_subpel_filters - confirm against the table.
// No spaces in these expressions, due to gas-preprocessor.
#define REGULAR ((0*15<<7)|3*15)
#define SMOOTH  ((1*15<<7)|4*15)
#define SHARP   ((2*15<<7)|3*15)
1296
1297.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv
1298make_8tap_fn \type, regular,        REGULAR, REGULAR
1299make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
1300make_8tap_fn \type, regular_sharp,  REGULAR, SHARP
1301make_8tap_fn \type, smooth,         SMOOTH,  SMOOTH
1302make_8tap_fn \type, smooth_regular, SMOOTH,  REGULAR
1303make_8tap_fn \type, smooth_sharp,   SMOOTH,  SHARP
1304make_8tap_fn \type, sharp,          SHARP,   SHARP
1305make_8tap_fn \type, sharp_regular,  SHARP,   REGULAR
1306make_8tap_fn \type, sharp_smooth,   SHARP,   SMOOTH
1307
1308function \type\()_8tap_neon
1309        mov             w10,  #0x4081  // (1 << 14) | (1 << 7) | (1 << 0)
1310        mul             \mx,  \mx, w10
1311        mul             \my,  \my, w10
1312        add             \mx,  \mx, w8 // mx, 8tap_h, 4tap_h
1313        add             \my,  \my, w9 // my, 8tap_v, 4tap_v
1314.ifc \type, prep
1315        uxtw            \d_strd, \w
1316        lsl             \d_strd, \d_strd, #1
1317.endif
1318
1319        clz             w8,  \w
1320        tst             \mx, #(0x7f << 14)
1321        sub             w8,  w8,  #24
1322        movrel          x10, X(mc_subpel_filters), -8
1323        b.ne            L(\type\()_8tap_h)
1324        tst             \my, #(0x7f << 14)
1325        b.ne            L(\type\()_8tap_v)
1326        b               \type\()_neon
1327
1328L(\type\()_8tap_h):
1329        cmp             \w,  #4
1330        ubfx            w9,  \mx, #7, #7
1331        and             \mx, \mx, #0x7f
1332        b.le            4f
1333        mov             \mx,  w9
13344:
1335        tst             \my,  #(0x7f << 14)
1336        add             \xmx, x10, \mx, uxtw #3
1337        b.ne            L(\type\()_8tap_hv)
1338
1339        adr             x9,  L(\type\()_8tap_h_tbl)
1340        ldrh            w8,  [x9, x8, lsl #1]
1341        sub             x9,  x9,  w8, uxtw
1342        br              x9
1343
134420:     // 2xN h
1345.ifc \type, put
1346        add             \xmx,  \xmx,  #2
1347        ld1             {v0.s}[0], [\xmx]
1348        sub             \src,  \src,  #1
1349        add             \ds2,  \dst,  \d_strd
1350        add             \sr2,  \src,  \s_strd
1351        lsl             \d_strd,  \d_strd,  #1
1352        lsl             \s_strd,  \s_strd,  #1
1353        sxtl            v0.8h,  v0.8b
13542:
1355        ld1             {v4.8b},  [\src], \s_strd
1356        ld1             {v6.8b},  [\sr2], \s_strd
1357        uxtl            v4.8h,  v4.8b
1358        uxtl            v6.8h,  v6.8b
1359        ext             v5.16b, v4.16b, v4.16b, #2
1360        ext             v7.16b, v6.16b, v6.16b, #2
1361        subs            \h,  \h,  #2
1362        trn1            v3.2s,  v4.2s,  v6.2s
1363        trn2            v6.2s,  v4.2s,  v6.2s
1364        trn1            v4.2s,  v5.2s,  v7.2s
1365        trn2            v7.2s,  v5.2s,  v7.2s
1366        mul             v3.4h,  v3.4h,  v0.h[0]
1367        mla             v3.4h,  v4.4h,  v0.h[1]
1368        mla             v3.4h,  v6.4h,  v0.h[2]
1369        mla             v3.4h,  v7.4h,  v0.h[3]
1370        srshr           v3.4h,  v3.4h,  #2
1371        sqrshrun        v3.8b,  v3.8h,  #4
1372        st1             {v3.h}[0], [\dst], \d_strd
1373        st1             {v3.h}[1], [\ds2], \d_strd
1374        b.gt            2b
1375        ret
1376.endif
1377
137840:     // 4xN h
1379        add             \xmx,  \xmx,  #2
1380        ld1             {v0.s}[0], [\xmx]
1381        sub             \src,  \src,  #1
1382        add             \ds2,  \dst,  \d_strd
1383        add             \sr2,  \src,  \s_strd
1384        lsl             \d_strd,  \d_strd,  #1
1385        lsl             \s_strd,  \s_strd,  #1
1386        sxtl            v0.8h,  v0.8b
13874:
1388        ld1             {v16.8b}, [\src], \s_strd
1389        ld1             {v20.8b}, [\sr2], \s_strd
1390        uxtl            v16.8h,  v16.8b
1391        uxtl            v20.8h,  v20.8b
1392        ext             v17.16b, v16.16b, v16.16b, #2
1393        ext             v18.16b, v16.16b, v16.16b, #4
1394        ext             v19.16b, v16.16b, v16.16b, #6
1395        ext             v21.16b, v20.16b, v20.16b, #2
1396        ext             v22.16b, v20.16b, v20.16b, #4
1397        ext             v23.16b, v20.16b, v20.16b, #6
1398        subs            \h,  \h,  #2
1399        mul             v16.4h,  v16.4h,  v0.h[0]
1400        mla             v16.4h,  v17.4h,  v0.h[1]
1401        mla             v16.4h,  v18.4h,  v0.h[2]
1402        mla             v16.4h,  v19.4h,  v0.h[3]
1403        mul             v20.4h,  v20.4h,  v0.h[0]
1404        mla             v20.4h,  v21.4h,  v0.h[1]
1405        mla             v20.4h,  v22.4h,  v0.h[2]
1406        mla             v20.4h,  v23.4h,  v0.h[3]
1407        srshr           v16.4h,  v16.4h,  #2
1408        srshr           v20.4h,  v20.4h,  #2
1409.ifc \type, put
1410        sqrshrun        v16.8b,  v16.8h,  #4
1411        sqrshrun        v20.8b,  v20.8h,  #4
1412        st1             {v16.s}[0], [\dst], \d_strd
1413        st1             {v20.s}[0], [\ds2], \d_strd
1414.else
1415        st1             {v16.4h}, [\dst], \d_strd
1416        st1             {v20.4h}, [\ds2], \d_strd
1417.endif
1418        b.gt            4b
1419        ret
1420
142180:     // 8xN h
1422        ld1             {v0.8b}, [\xmx]
1423        sub             \src,  \src,  #3
1424        add             \ds2,  \dst,  \d_strd
1425        add             \sr2,  \src,  \s_strd
1426        lsl             \d_strd,  \d_strd,  #1
1427        lsl             \s_strd,  \s_strd,  #1
1428        sxtl            v0.8h, v0.8b
14298:
1430        ld1             {v16.8b, v17.8b},  [\src], \s_strd
1431        ld1             {v20.8b, v21.8b},  [\sr2], \s_strd
1432        uxtl            v16.8h,  v16.8b
1433        uxtl            v17.8h,  v17.8b
1434        uxtl            v20.8h,  v20.8b
1435        uxtl            v21.8h,  v21.8b
1436
1437        mul             v18.8h,  v16.8h,  v0.h[0]
1438        mul             v22.8h,  v20.8h,  v0.h[0]
1439.irpc i, 1234567
1440        ext             v19.16b, v16.16b, v17.16b, #(2*\i)
1441        ext             v23.16b, v20.16b, v21.16b, #(2*\i)
1442        mla             v18.8h,  v19.8h,  v0.h[\i]
1443        mla             v22.8h,  v23.8h,  v0.h[\i]
1444.endr
1445        subs            \h,  \h,  #2
1446        srshr           v18.8h,  v18.8h, #2
1447        srshr           v22.8h,  v22.8h, #2
1448.ifc \type, put
1449        sqrshrun        v18.8b,  v18.8h, #4
1450        sqrshrun        v22.8b,  v22.8h, #4
1451        st1             {v18.8b}, [\dst], \d_strd
1452        st1             {v22.8b}, [\ds2], \d_strd
1453.else
1454        st1             {v18.8h}, [\dst], \d_strd
1455        st1             {v22.8h}, [\ds2], \d_strd
1456.endif
1457        b.gt            8b
1458        ret
1459160:
1460320:
1461640:
14621280:   // 16xN, 32xN, ... h
1463        ld1             {v0.8b}, [\xmx]
1464        sub             \src,  \src,  #3
1465        add             \ds2,  \dst,  \d_strd
1466        add             \sr2,  \src,  \s_strd
1467        lsl             \s_strd,  \s_strd,  #1
1468        sxtl            v0.8h, v0.8b
1469
1470        sub             \s_strd,  \s_strd,  \w, uxtw
1471        sub             \s_strd,  \s_strd,  #8
1472.ifc \type, put
1473        lsl             \d_strd,  \d_strd,  #1
1474        sub             \d_strd,  \d_strd,  \w, uxtw
1475.endif
1476161:
1477        ld1             {v16.8b, v17.8b, v18.8b},  [\src], #24
1478        ld1             {v20.8b, v21.8b, v22.8b},  [\sr2], #24
1479        mov             \mx, \w
1480        uxtl            v16.8h,  v16.8b
1481        uxtl            v17.8h,  v17.8b
1482        uxtl            v18.8h,  v18.8b
1483        uxtl            v20.8h,  v20.8b
1484        uxtl            v21.8h,  v21.8b
1485        uxtl            v22.8h,  v22.8b
1486
148716:
1488        mul             v24.8h,  v16.8h,  v0.h[0]
1489        mul             v25.8h,  v17.8h,  v0.h[0]
1490        mul             v26.8h,  v20.8h,  v0.h[0]
1491        mul             v27.8h,  v21.8h,  v0.h[0]
1492.irpc i, 1234567
1493        ext             v28.16b, v16.16b, v17.16b, #(2*\i)
1494        ext             v29.16b, v17.16b, v18.16b, #(2*\i)
1495        ext             v30.16b, v20.16b, v21.16b, #(2*\i)
1496        ext             v31.16b, v21.16b, v22.16b, #(2*\i)
1497        mla             v24.8h,  v28.8h,  v0.h[\i]
1498        mla             v25.8h,  v29.8h,  v0.h[\i]
1499        mla             v26.8h,  v30.8h,  v0.h[\i]
1500        mla             v27.8h,  v31.8h,  v0.h[\i]
1501.endr
1502        srshr           v24.8h,  v24.8h, #2
1503        srshr           v25.8h,  v25.8h, #2
1504        srshr           v26.8h,  v26.8h, #2
1505        srshr           v27.8h,  v27.8h, #2
1506        subs            \mx, \mx, #16
1507.ifc \type, put
1508        sqrshrun        v24.8b,  v24.8h, #4
1509        sqrshrun2       v24.16b, v25.8h, #4
1510        sqrshrun        v26.8b,  v26.8h, #4
1511        sqrshrun2       v26.16b, v27.8h, #4
1512        st1             {v24.16b}, [\dst], #16
1513        st1             {v26.16b}, [\ds2], #16
1514.else
1515        st1             {v24.8h, v25.8h}, [\dst], #32
1516        st1             {v26.8h, v27.8h}, [\ds2], #32
1517.endif
1518        b.le            9f
1519
1520        mov             v16.16b, v18.16b
1521        mov             v20.16b, v22.16b
1522        ld1             {v17.8b, v18.8b}, [\src], #16
1523        ld1             {v21.8b, v22.8b}, [\sr2], #16
1524        uxtl            v17.8h,  v17.8b
1525        uxtl            v18.8h,  v18.8b
1526        uxtl            v21.8h,  v21.8b
1527        uxtl            v22.8h,  v22.8b
1528        b               16b
1529
15309:
1531        add             \dst,  \dst,  \d_strd
1532        add             \ds2,  \ds2,  \d_strd
1533        add             \src,  \src,  \s_strd
1534        add             \sr2,  \sr2,  \s_strd
1535
1536        subs            \h,  \h,  #2
1537        b.gt            161b
1538        ret
1539
1540L(\type\()_8tap_h_tbl):
        // Branch-offset table for the horizontal-only code above. Every entry
        // is the 16-bit distance from this label back to one of the numeric
        // local labels 1280/640/320/160/80/40/20.
        // NOTE(review): the code that indexes this table lies outside this
        // chunk; presumably it mirrors the ldrh/sub/br sequence used with the
        // _8tap_v and _8tap_hv tables further down - confirm.
1541        .hword L(\type\()_8tap_h_tbl) - 1280b
1542        .hword L(\type\()_8tap_h_tbl) -  640b
1543        .hword L(\type\()_8tap_h_tbl) -  320b
1544        .hword L(\type\()_8tap_h_tbl) -  160b
1545        .hword L(\type\()_8tap_h_tbl) -   80b
1546        .hword L(\type\()_8tap_h_tbl) -   40b
1547        .hword L(\type\()_8tap_h_tbl) -   20b
        // The trailing zero is not an offset to any label; presumably it only
        // pads the table to an even number of halfwords - TODO confirm.
1548        .hword 0
1549
1550
1551L(\type\()_8tap_v):
        // Entry of the vertical-only case: pick the vertical filter entry and
        // branch to a handler through _8tap_v_tbl.
        // \my carries two 7-bit fields: bits 0-6 are used when h <= 4 and
        // bits 7-13 otherwise.
        // The flags produced by the cmp below are still valid at the table
        // targets (no instruction in between writes flags); the handlers at
        // 20:, 40:, 80: and 160: start with a b.gt that tests h > 4.
        // x8 and x10 are set up outside this chunk.
1552        cmp             \h,  #4
1553        ubfx            w9,  \my, #7, #7   // w9 = bits 7-13 of \my
1554        and             \my, \my, #0x7f   // \my = bits 0-6 of \my
1555        b.le            4f   // h <= 4: keep the low field
1556        mov             \my, w9   // h > 4: use the high field
15574:
1558        add             \xmy, x10, \my, uxtw #3   // \xmy = x10 + 8 * \my (x10 presumably the filter table base - confirm)
1559
1560        adr             x9,  L(\type\()_8tap_v_tbl)
1561        ldrh            w8,  [x9, x8, lsl #1]   // fetch 16-bit table entry number x8
1562        sub             x9,  x9,  w8, uxtw   // entries are distances back from the table label
1563        br              x9
1564
156520:     // 2xN v
        // Vertical-only filtering of 2 pixel wide blocks. The section is only
        // assembled when \type is "put". It is reached through _8tap_v_tbl
        // with the flags of the dispatcher's "cmp \h, #4" still valid.
        // load_h, interleave_1_h, interleave_2_s, uxtl_b, mul_mla_4,
        // mul_mla_8_4, sqrshrun_b and st_h are macros defined outside this
        // chunk; nothing is asserted here about their internals.
1566.ifc \type, put
1567        b.gt            28f   // h > 4: path that loads all 8 coefficients
1568
1569        cmp             \h,  #2   // consumed by "b.gt 24f" below
1570        add             \xmy, \xmy, #2
1571        ld1             {v0.s}[0], [\xmy]   // 4 coefficient bytes from offset 2 of the 8-byte filter entry
1572        sub             \src,  \src,  \s_strd   // \src moved up by one row
1573        add             \ds2,  \dst,  \d_strd   // \ds2 = output row after \dst
1574        add             \sr2,  \src,  \s_strd   // \sr2 = input row after \src
1575        lsl             \s_strd,  \s_strd,  #1   // strides doubled: each pointer now steps two rows
1576        lsl             \d_strd,  \d_strd,  #1
1577        sxtl            v0.8h, v0.8b   // sign-extend coefficients to 16 bit
1578
1579        // 2x2 v
        // Five input rows go into v1-v5. The branch below still uses the
        // flags of "cmp \h, #2", which assumes the two macros invoked in
        // between do not modify the flags - TODO confirm.
1580        load_h          \src, \sr2, \s_strd, v1, v2, v3, v4, v5
1581        interleave_1_h  v1, v2, v3, v4, v5
1582        b.gt            24f   // h > 2: 2x4 variant
1583        uxtl_b          v1, v2, v3, v4
1584        mul_mla_4       v6, v1, v2, v3, v4, .4h
1585        sqrshrun_b      6,  v6
1586        st_h            \d_strd, v6, 2
1587        ret
1588
158924:     // 2x4 v
        // Two further rows (v6, v7) are loaded, this time starting with \sr2.
1590        load_h          \sr2, \src, \s_strd, v6, v7
1591        interleave_1_h  v5, v6, v7
1592        interleave_2_s  v1, v2, v3, v4, v5, v6
1593        uxtl_b          v1, v2, v3, v4
1594        mul_mla_4       v6, v1, v2, v3, v4, .8h
1595        sqrshrun_b      6,  v6
1596        st_h            \d_strd, v6, 4
1597        ret
1598
159928:     // 2x8, 2x16 v
1600        ld1             {v0.8b}, [\xmy]   // all 8 coefficient bytes of the filter entry
1601        sub             \sr2,  \src,  \s_strd, lsl #1   // \sr2 = two rows above the original \src
1602        add             \ds2,  \dst,  \d_strd
1603        sub             \src,  \sr2,  \s_strd   // \src = three rows above the original \src
1604        lsl             \d_strd,  \d_strd,  #1
1605        lsl             \s_strd,  \s_strd,  #1
1606        sxtl            v0.8h, v0.8b
1607
        // Preload seven rows into v1-v7 before entering the loop.
1608        load_h          \src, \sr2, \s_strd, v1,  v2,  v3,  v4, v5, v6, v7
1609        interleave_1_h  v1,  v2,  v3,  v4,  v5
1610        interleave_1_h  v5,  v6,  v7
1611        interleave_2_s  v1,  v2,  v3,  v4,  v5,  v6
1612        uxtl_b          v1,  v2,  v3,  v4
1613216:
        // Loop body: the row counter drops by 8 per pass and eight new rows
        // are loaded into v16-v23.
1614        subs            \h,  \h,  #8
1615        load_h          \sr2, \src, \s_strd, v16, v17, v18, v19
1616        load_h          \sr2, \src, \s_strd, v20, v21, v22, v23
1617        interleave_1_h  v7,  v16, v17, v18, v19
1618        interleave_1_h  v19, v20, v21, v22, v23
1619        interleave_2_s  v5,  v6,  v7,  v16, v17, v18
1620        interleave_2_s  v17, v18, v19, v20, v21, v22
1621        uxtl_b          v5,  v6,  v7,  v16
1622        uxtl_b          v17, v18, v19, v20
1623        mul_mla_8_4     v30, v31, v1,  v2,  v3,  v4,  v5,  v6,  v7,  v16, v17, v18, v19, v20
1624        sqrshrun_b      6,   v30, v31
1625        st_h            \d_strd, v30, 4
1626        st_h            \d_strd, v31, 4
1627        b.le            0f   // no rows left (flags from the subs above, assuming the macros keep them)
        // Carry v17-v23 over into v1-v7 for the next pass.
1628        mov             v1.16b,  v17.16b
1629        mov             v2.16b,  v18.16b
1630        mov             v3.16b,  v19.16b
1631        mov             v4.16b,  v20.16b
1632        mov             v5.16b,  v21.16b
1633        mov             v6.16b,  v22.16b
1634        mov             v7.16b,  v23.16b
1635        b               216b
16360:
1637        ret
1638.endif
1639
164040:
        // Vertical-only filtering of 4 pixel wide blocks, reached through
        // _8tap_v_tbl with the flags of the dispatcher's "cmp \h, #4" valid.
        // load_s, interleave_1_s, uxtl_b, mul_mla_4, mul_mla_8_2 and
        // shift_store_4 are macros defined outside this chunk.
1641        b.gt            480f   // h > 4: path that loads all 8 coefficients
1642
1643        // 4x2, 4x4 v
1644        cmp             \h,  #2   // consumed by the "b.le 0f" after the first store
1645        add             \xmy, \xmy, #2
1646        ld1             {v0.s}[0], [\xmy]   // 4 coefficient bytes from offset 2 of the 8-byte filter entry
1647        sub             \src, \src, \s_strd   // \src moved up by one row
1648        add             \ds2, \dst, \d_strd
1649        add             \sr2, \src, \s_strd
1650        lsl             \s_strd, \s_strd, #1   // strides doubled: each pointer now steps two rows
1651        lsl             \d_strd, \d_strd, #1
1652        sxtl            v0.8h, v0.8b   // sign-extend coefficients to 16 bit
1653
1654        load_s          \src, \sr2, \s_strd, v1, v2, v3, v4, v5
1655        interleave_1_s  v1, v2, v3, v4, v5
1656        uxtl_b          v1, v2, v3, v4
1657        mul_mla_4       v6, v1, v2, v3, v4, .8h
1658        shift_store_4   \type, \d_strd, v6
1659        b.le            0f   // h <= 2: done (assumes the macros above preserve the flags - confirm)
        // h > 2: two more input rows, second batch of output.
1660        load_s          \sr2, \src, \s_strd, v6, v7
1661        interleave_1_s  v5, v6, v7
1662        uxtl_b          v5, v6
1663        mul_mla_4       v7, v3, v4, v5, v6, .8h
1664        shift_store_4   \type, \d_strd, v7
16650:
1666        ret
1667
1668480:    // 4x8, 4x16 v
1669        ld1             {v0.8b}, [\xmy]   // all 8 coefficient bytes of the filter entry
1670        sub             \sr2, \src, \s_strd, lsl #1   // \sr2 = two rows above the original \src
1671        add             \ds2, \dst, \d_strd
1672        sub             \src, \sr2, \s_strd   // \src = three rows above the original \src
1673        lsl             \s_strd, \s_strd, #1
1674        lsl             \d_strd, \d_strd, #1
1675        sxtl            v0.8h, v0.8b
1676
        // Preload seven rows into v16-v22.
1677        load_s          \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
1678        interleave_1_s  v16, v17, v18
1679        interleave_1_s  v18, v19, v20, v21, v22
1680        uxtl_b          v16, v17
1681        uxtl_b          v18, v19, v20, v21
1682
168348:
        // The loop is unrolled three times; each step consumes 4 rows and the
        // register window over v16-v27 rotates so that no register-to-register
        // copies are needed between steps.
1684        subs            \h,  \h,  #4
1685        load_s          \sr2, \src, \s_strd, v23, v24, v25, v26
1686        interleave_1_s  v22, v23, v24, v25, v26
1687        uxtl_b          v22, v23, v24, v25
1688        mul_mla_8_2     v1,  v2,  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
1689        shift_store_4   \type, \d_strd, v1, v2
1690        b.le            0f
1691        subs            \h,  \h,  #4
1692        load_s          \sr2,  \src, \s_strd, v27, v16, v17, v18
1693        interleave_1_s  v26, v27, v16, v17, v18
1694        uxtl_b          v26, v27, v16, v17
1695        mul_mla_8_2     v1,  v2,  v20, v21, v22, v23, v24, v25, v26, v27, v16, v17
1696        shift_store_4   \type, \d_strd, v1, v2
1697        b.le            0f
1698        subs            \h,  \h,  #4
1699        load_s          \sr2, \src, \s_strd, v19, v20, v21, v22
1700        interleave_1_s  v18, v19, v20, v21, v22
1701        uxtl_b          v18, v19, v20, v21
1702        mul_mla_8_2     v1,  v2,  v24, v25, v26, v27, v16, v17, v18, v19, v20, v21
1703        shift_store_4   \type, \d_strd, v1, v2
1704        b.gt            48b   // rows remain: window is back in its initial v16-v22 layout
17050:
1706        ret
1707
170880:
        // Vertical-only filtering of blocks that are 8 pixels wide or wider,
        // reached through _8tap_v_tbl with the flags of the dispatcher's
        // "cmp \h, #4" valid. load_8b, uxtl_b, mul_mla_4, mul_mla_8_1 and
        // shift_store_8 are macros defined outside this chunk.
1709        b.gt            880f   // h > 4: path that loads all 8 coefficients
1710
1711        // 8x2, 8x4 v
1712        cmp             \h,  #2   // consumed by the "b.le 0f" after the first store
1713        add             \xmy, \xmy, #2
1714        ld1             {v0.s}[0], [\xmy]   // 4 coefficient bytes from offset 2 of the 8-byte filter entry
1715        sub             \src, \src, \s_strd   // \src moved up by one row
1716        add             \ds2, \dst, \d_strd
1717        add             \sr2, \src, \s_strd
1718        lsl             \s_strd, \s_strd, #1   // strides doubled: each pointer now steps two rows
1719        lsl             \d_strd, \d_strd, #1
1720        sxtl            v0.8h, v0.8b
1721
1722        load_8b         \src, \sr2, \s_strd, v1, v2, v3, v4, v5
1723        uxtl_b          v1, v2, v3, v4, v5
1724        mul_mla_4       v6, v1, v2, v3, v4, .8h
1725        mul_mla_4       v7, v2, v3, v4, v5, .8h
1726        shift_store_8   \type, \d_strd, v6, v7
1727        b.le            0f   // h <= 2: done (assumes the macros above preserve the flags - confirm)
1728        load_8b         \sr2, \src, \s_strd, v6, v7
1729        uxtl_b          v6, v7
1730        mul_mla_4       v1, v3, v4, v5, v6, .8h
1731        mul_mla_4       v2, v4, v5, v6, v7, .8h
1732        shift_store_8   \type, \d_strd, v1, v2
17330:
1734        ret
1735
1736880:    // 8x6, 8x8, 8x16, 8x32 v
17371680:   // 16x8, 16x16, ...
1738320:    // 32x8, 32x16, ...
1739640:
17401280:
        // Common code for all wider/taller cases: the block is processed in
        // 8 pixel wide column strips, each strip top to bottom.
1741        ld1             {v0.8b}, [\xmy]   // all 8 coefficient bytes of the filter entry
1742        sub             \src, \src, \s_strd
1743        sub             \src, \src, \s_strd, lsl #1   // together: \src moved up by three rows
1744        sxtl            v0.8h, v0.8b
1745        mov             \my,  \h   // keep the full height for the strip loop (the filter pointer is no longer needed)
1746168:
        // Start of one strip: derive the second row pointers and double the
        // strides (undone again at 9: before the next strip).
1747        add             \ds2, \dst, \d_strd
1748        add             \sr2, \src, \s_strd
1749        lsl             \s_strd, \s_strd, #1
1750        lsl             \d_strd, \d_strd, #1
1751
        // Preload seven rows into v16-v22.
1752        load_8b         \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
1753        uxtl_b          v16, v17, v18, v19, v20, v21, v22
1754
175588:
        // Unrolled row loop: four steps of 2 rows and one step of 4 rows.
        // The register window over v16-v27 rotates from step to step and is
        // back in its initial v16-v22 layout when the loop repeats.
1756        subs            \h,  \h,  #2
1757        load_8b         \sr2, \src, \s_strd, v23, v24
1758        uxtl_b          v23, v24
1759        mul_mla_8_1     v1,  v2,  v16, v17, v18, v19, v20, v21, v22, v23, v24
1760        shift_store_8   \type, \d_strd, v1, v2
1761        b.le            9f
1762        subs            \h,  \h,  #2
1763        load_8b         \sr2, \src, \s_strd, v25, v26
1764        uxtl_b          v25, v26
1765        mul_mla_8_1     v3,  v4,  v18, v19, v20, v21, v22, v23, v24, v25, v26
1766        shift_store_8   \type, \d_strd, v3, v4
1767        b.le            9f
1768        subs            \h,  \h,  #2
1769        load_8b         \sr2, \src, \s_strd, v27, v16
1770        uxtl_b          v27, v16
1771        mul_mla_8_1     v1,  v2,  v20, v21, v22, v23, v24, v25, v26, v27, v16
1772        shift_store_8   \type, \d_strd, v1, v2
1773        b.le            9f
1774        subs            \h,  \h,  #2
1775        load_8b         \sr2, \src, \s_strd, v17, v18
1776        uxtl_b          v17, v18
1777        mul_mla_8_1     v3,  v4,  v22, v23, v24, v25, v26, v27, v16, v17, v18
1778        shift_store_8   \type, \d_strd, v3, v4
1779        b.le            9f
1780        subs            \h,  \h,  #4
1781        load_8b         \sr2, \src, \s_strd, v19, v20, v21, v22
1782        uxtl_b          v19, v20, v21, v22
1783        mul_mla_8_1     v1,  v2,  v24, v25, v26, v27, v16, v17, v18, v19, v20
1784        mul_mla_8_1     v3,  v4,  v26, v27, v16, v17, v18, v19, v20, v21, v22
1785        shift_store_8   \type, \d_strd, v1, v2, v3, v4
1786        b.gt            88b
17879:
        // Strip finished: advance to the next 8 columns, if any.
1788        subs            \w,  \w,  #8
1789        b.le            0f
1790        asr             \s_strd, \s_strd, #1   // back to single-row strides
1791        asr             \d_strd, \d_strd, #1
1792        msub            \src, \s_strd, \xmy, \src   // rewind by the saved height (\xmy appears to be the 64-bit view of \my)
1793        msub            \dst, \d_strd, \xmy, \dst
1794        sub             \src, \src, \s_strd, lsl #3   // plus the 8 rows \src advanced during the preload
1795        mov             \h,  \my   // restore the row counter
1796        add             \src, \src, #8   // next 8 input columns
        // Output advance per strip: 8 bytes for put, 16 bytes otherwise.
1797.ifc \type, put
1798        add             \dst, \dst, #8
1799.else
1800        add             \dst, \dst, #16
1801.endif
1802        b               168b
18030:
1804        ret
1805
1806160:
        // Vertical-only filtering of 16 pixel wide blocks, reached through
        // _8tap_v_tbl with the flags of the dispatcher's "cmp \h, #4" valid.
        // Taller blocks share the generic strip code at 1680:. load_16b,
        // mul_mla_4 and shift_store_16 are macros defined outside this chunk.
1807        b.gt            1680b   // h > 4
1808
1809        // 16x2, 16x4 v
1810        add             \xmy, \xmy, #2
1811        ld1             {v0.s}[0], [\xmy]   // 4 coefficient bytes from offset 2 of the 8-byte filter entry
1812        sub             \src, \src, \s_strd   // \src moved up by one row
1813        add             \ds2, \dst, \d_strd
1814        add             \sr2, \src, \s_strd
1815        lsl             \s_strd, \s_strd, #1   // strides doubled: each pointer now steps two rows
1816        lsl             \d_strd, \d_strd, #1
1817        sxtl            v0.8h, v0.8b
1818
1819        cmp             \h,  #2   // consumed by the "b.le 0f" after the first store
1820        load_16b        \src, \sr2, \s_strd, v1,  v2,  v3,  v4,  v5
        // Widen the five rows to 16 bit: low 8 pixels go to v16-v20, high
        // 8 pixels to v23-v27.
1821        uxtl            v16.8h, v1.8b
1822        uxtl            v17.8h, v2.8b
1823        uxtl            v18.8h, v3.8b
1824        uxtl            v19.8h, v4.8b
1825        uxtl            v20.8h, v5.8b
1826        uxtl2           v23.8h, v1.16b
1827        uxtl2           v24.8h, v2.16b
1828        uxtl2           v25.8h, v3.16b
1829        uxtl2           v26.8h, v4.16b
1830        uxtl2           v27.8h, v5.16b
        // The second and fourth invocation name v16/v17 as destination while
        // also passing them as first source operand.
1831        mul_mla_4       v1,  v16, v17, v18, v19, .8h
1832        mul_mla_4       v16, v17, v18, v19, v20, .8h
1833        mul_mla_4       v2,  v23, v24, v25, v26, .8h
1834        mul_mla_4       v17, v24, v25, v26, v27, .8h
1835        shift_store_16  \type, \d_strd, v1, v2, v16, v17
1836        b.le            0f   // h <= 2: done (assumes the macros above preserve the flags - confirm)
        // h > 2: two more rows, widened into v21/v22 (low) and v28/v29 (high).
1837        load_16b        \sr2, \src, \s_strd, v6,  v7
1838        uxtl            v21.8h, v6.8b
1839        uxtl            v22.8h, v7.8b
1840        uxtl2           v28.8h, v6.16b
1841        uxtl2           v29.8h, v7.16b
1842        mul_mla_4       v1,  v18, v19, v20, v21, .8h
1843        mul_mla_4       v3,  v19, v20, v21, v22, .8h
1844        mul_mla_4       v2,  v25, v26, v27, v28, .8h
1845        mul_mla_4       v4,  v26, v27, v28, v29, .8h
1846        shift_store_16  \type, \d_strd, v1, v2, v3, v4
18470:
1848        ret
1849
1850L(\type\()_8tap_v_tbl):
        // Branch-offset table used by the _8tap_v dispatcher, which loads the
        // halfword at index x8 and subtracts it from the table address.
        // Every entry is the distance from this label back to the nearest
        // preceding numeric local label of that name (the vertical handlers).
1851        .hword L(\type\()_8tap_v_tbl) - 1280b
1852        .hword L(\type\()_8tap_v_tbl) -  640b
1853        .hword L(\type\()_8tap_v_tbl) -  320b
1854        .hword L(\type\()_8tap_v_tbl) -  160b
1855        .hword L(\type\()_8tap_v_tbl) -   80b
1856        .hword L(\type\()_8tap_v_tbl) -   40b
1857        .hword L(\type\()_8tap_v_tbl) -   20b
        // The trailing zero is not an offset to any label; presumably it only
        // pads the table to an even number of halfwords - TODO confirm.
1858        .hword 0
1859
1860L(\type\()_8tap_hv):
        // Entry of the combined horizontal + vertical case: pick the vertical
        // filter entry and branch to a handler through _8tap_hv_tbl (defined
        // after this chunk).
        // \my carries two 7-bit fields: bits 0-6 are used when h <= 4 and
        // bits 7-13 otherwise.
        // The flags produced by the cmp below are still valid at the table
        // targets (no instruction in between writes flags); the handlers test
        // h > 4 with b.gt. x8, x10 and \xmx are set up outside this chunk.
1861        cmp             \h,  #4
1862        ubfx            w9,  \my, #7, #7   // w9 = bits 7-13 of \my
1863        and             \my, \my, #0x7f   // \my = bits 0-6 of \my
1864        b.le            4f   // h <= 4: keep the low field
1865        mov             \my,  w9   // h > 4: use the high field
18664:
1867        add             \xmy,  x10, \my, uxtw #3   // \xmy = x10 + 8 * \my (x10 presumably the filter table base - confirm)
1868
1869        adr             x9,  L(\type\()_8tap_hv_tbl)
1870        ldrh            w8,  [x9, x8, lsl #1]   // fetch 16-bit table entry number x8
1871        sub             x9,  x9,  w8, uxtw   // entries are distances back from the table label
1872        br              x9
1873
187420:
        // Horizontal + vertical filtering of 2 pixel wide blocks, only
        // assembled when \type is "put". Reached through the _8tap_hv
        // dispatcher with the flags of its "cmp \h, #4" still valid.
        // The horizontal pass always uses 4 coefficients (v0.h[0-3]); the
        // vertical pass uses 4 (h <= 4) or 8 (h > 4) coefficients in v1.
        // x30 is copied to x15 because the bl calls to _8tap_filter_2
        // overwrite the link register.
1875.ifc \type, put
1876        add             \xmx,  \xmx,  #2
1877        ld1             {v0.s}[0],  [\xmx]   // 4 horizontal coefficient bytes from offset 2 of the entry
1878        b.gt            280f   // h > 4
1879        add             \xmy,  \xmy,  #2
1880        ld1             {v1.s}[0],  [\xmy]   // 4 vertical coefficient bytes from offset 2 of the entry
1881
1882        // 2x2, 2x4 hv
1883        sub             \sr2, \src, #1   // \sr2 = current row, one pixel to the left
1884        sub             \src, \sr2, \s_strd   // \src = row above it
1885        add             \ds2, \dst, \d_strd
1886        lsl             \s_strd, \s_strd, #1   // strides doubled: each pointer now steps two rows
1887        lsl             \d_strd, \d_strd, #1
1888        sxtl            v0.8h,  v0.8b
1889        sxtl            v1.8h,  v1.8b
1890        mov             x15, x30   // save the return address
1891
        // First (single) row: lane-wise products of the row and of the row
        // shifted by one pixel, reduced with two pairwise adds so that v16
        // lanes 0 and 1 hold the sums for output pixels 0 and 1.
1892        ld1             {v28.8b}, [\src], \s_strd
1893        uxtl            v28.8h,  v28.8b
1894        ext             v29.16b, v28.16b, v28.16b, #2
1895        mul             v28.4h,  v28.4h,  v0.4h
1896        mul             v29.4h,  v29.4h,  v0.4h
1897        addp            v28.4h,  v28.4h,  v29.4h
1898        addp            v16.4h,  v28.4h,  v28.4h
1899        srshr           v16.4h,  v16.4h,  #2
1900        bl              L(\type\()_8tap_filter_2)
1901
        // Each vector holds two consecutive filtered rows (2 lanes per row).
1902        trn1            v16.2s, v16.2s, v28.2s   // v16 = first row | first row from filter_2
1903        mov             v17.8b, v28.8b
1904
19052:
1906        bl              L(\type\()_8tap_filter_2)
1907
1908        ext             v18.8b, v17.8b, v28.8b, #4   // second row of v17 | first row of v28
        // 4-tap vertical filter; lanes 0-1 and 2-3 yield two output rows.
1909        smull           v2.4s,  v16.4h, v1.h[0]
1910        smlal           v2.4s,  v17.4h, v1.h[1]
1911        smlal           v2.4s,  v18.4h, v1.h[2]
1912        smlal           v2.4s,  v28.4h, v1.h[3]
1913
1914        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
1915        sqxtun          v2.8b,  v2.8h   // saturate to unsigned 8 bit
1916        subs            \h,  \h,  #2
1917        st1             {v2.h}[0], [\dst], \d_strd   // 2 pixels of the first output row
1918        st1             {v2.h}[1], [\ds2], \d_strd   // 2 pixels of the second output row
1919        b.le            0f
1920        mov             v16.8b, v18.8b   // slide the row window down by two rows
1921        mov             v17.8b, v28.8b
1922        b               2b
1923
1924280:    // 2x8, 2x16, 2x32 hv
1925        ld1             {v1.8b},  [\xmy]   // all 8 vertical coefficient bytes
1926        sub             \src, \src, #1   // one pixel to the left
1927        sub             \sr2, \src, \s_strd, lsl #1   // \sr2 = two rows above
1928        sub             \src, \sr2, \s_strd   // \src = three rows above
1929        add             \ds2, \dst, \d_strd
1930        lsl             \s_strd, \s_strd, #1
1931        lsl             \d_strd, \d_strd, #1
1932        sxtl            v0.8h,  v0.8b
1933        sxtl            v1.8h,  v1.8b
1934        mov             x15, x30   // save the return address
1935
        // First (single) row, same computation as in the 2x2/2x4 path.
1936        ld1             {v28.8b}, [\src], \s_strd
1937        uxtl            v28.8h,  v28.8b
1938        ext             v29.16b, v28.16b, v28.16b, #2
1939        mul             v28.4h,  v28.4h,  v0.4h
1940        mul             v29.4h,  v29.4h,  v0.4h
1941        addp            v28.4h,  v28.4h,  v29.4h
1942        addp            v16.4h,  v28.4h,  v28.4h
1943        srshr           v16.4h,  v16.4h,  #2
1944
        // Build the initial window v16-v21 of row pairs from three calls.
1945        bl              L(\type\()_8tap_filter_2)
1946        trn1            v16.2s, v16.2s, v28.2s
1947        mov             v17.8b, v28.8b
1948        bl              L(\type\()_8tap_filter_2)
1949        ext             v18.8b, v17.8b, v28.8b, #4
1950        mov             v19.8b, v28.8b
1951        bl              L(\type\()_8tap_filter_2)
1952        ext             v20.8b, v19.8b, v28.8b, #4
1953        mov             v21.8b, v28.8b
1954
195528:
1956        bl              L(\type\()_8tap_filter_2)
1957        ext             v22.8b, v21.8b, v28.8b, #4
        // 8-tap vertical filter over v16-v22 and the newest pair in v28.
1958        smull           v2.4s,  v16.4h, v1.h[0]
1959        smlal           v2.4s,  v17.4h, v1.h[1]
1960        smlal           v2.4s,  v18.4h, v1.h[2]
1961        smlal           v2.4s,  v19.4h, v1.h[3]
1962        smlal           v2.4s,  v20.4h, v1.h[4]
1963        smlal           v2.4s,  v21.4h, v1.h[5]
1964        smlal           v2.4s,  v22.4h, v1.h[6]
1965        smlal           v2.4s,  v28.4h, v1.h[7]
1966
1967        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
1968        sqxtun          v2.8b,  v2.8h
1969        subs            \h,  \h,  #2
1970        st1             {v2.h}[0], [\dst], \d_strd
1971        st1             {v2.h}[1], [\ds2], \d_strd
1972        b.le            0f
        // Slide the row window down by two rows.
1973        mov             v16.8b, v18.8b
1974        mov             v17.8b, v19.8b
1975        mov             v18.8b, v20.8b
1976        mov             v19.8b, v21.8b
1977        mov             v20.8b, v22.8b
1978        mov             v21.8b, v28.8b
1979        b               28b
1980
19810:
        // Return through the saved link register. "ret x15" is used instead
        // of "br x15" so that the branch is encoded as a function return,
        // matching the bl that entered this function.
1982        ret             x15
1983
1984L(\type\()_8tap_filter_2):
        // Horizontal 4-tap filter for two rows of 2 pixels each.
        // In:  \sr2, \src = row pointers (8 bytes are read from each; both
        //      are advanced by \s_strd), v0.h[0-3] = coefficients.
        // Out: v28.4h = rounded (>> 2) results; lanes 0-1 belong to the
        //      \sr2 row, lanes 2-3 to the \src row.
        // Overwrites v27, v29, v30, v31.
1985        ld1             {v28.8b},  [\sr2], \s_strd
1986        ld1             {v30.8b},  [\src], \s_strd
1987        uxtl            v28.8h,  v28.8b
1988        uxtl            v30.8h,  v30.8b
1989        ext             v29.16b, v28.16b, v28.16b, #2   // \sr2 row shifted by one pixel
1990        ext             v31.16b, v30.16b, v30.16b, #2   // \src row shifted by one pixel
        // Gather, for each tap, the pixels of both rows into one 4h vector.
1991        trn1            v27.2s,  v28.2s,  v30.2s   // pixels 0,1 of each row
1992        trn2            v30.2s,  v28.2s,  v30.2s   // pixels 2,3 of each row
1993        trn1            v28.2s,  v29.2s,  v31.2s   // pixels 1,2 of each row
1994        trn2            v31.2s,  v29.2s,  v31.2s   // pixels 3,4 of each row
1995        mul             v27.4h,  v27.4h,  v0.h[0]
1996        mla             v27.4h,  v28.4h,  v0.h[1]
1997        mla             v27.4h,  v30.4h,  v0.h[2]
1998        mla             v27.4h,  v31.4h,  v0.h[3]
1999        srshr           v28.4h,  v27.4h,  #2
2000        ret
2001.endif
2002
200340:
        // Horizontal + vertical filtering of 4 pixel wide blocks. Reached
        // through the _8tap_hv dispatcher with the flags of its
        // "cmp \h, #4" still valid.
        // The horizontal pass always uses 4 coefficients (v0.h[0-3]); the
        // vertical pass uses 4 (h <= 4) or 8 (h > 4) coefficients in v1.
        // x30 is copied to x15 because the bl calls to _8tap_filter_4
        // overwrite the link register.
2004        add             \xmx, \xmx, #2
2005        ld1             {v0.s}[0],  [\xmx]   // 4 horizontal coefficient bytes from offset 2 of the entry
2006        b.gt            480f   // h > 4
2007        add             \xmy, \xmy,  #2
2008        ld1             {v1.s}[0],  [\xmy]   // 4 vertical coefficient bytes from offset 2 of the entry
2009        sub             \sr2, \src, #1   // \sr2 = current row, one pixel to the left
2010        sub             \src, \sr2, \s_strd   // \src = row above it
2011        add             \ds2, \dst, \d_strd
2012        lsl             \s_strd, \s_strd, #1   // strides doubled: each pointer now steps two rows
2013        lsl             \d_strd, \d_strd, #1
2014        sxtl            v0.8h,  v0.8b
2015        sxtl            v1.8h,  v1.8b
2016        mov             x15, x30   // save the return address
2017
2018        // 4x2, 4x4 hv
        // First (single) row, filtered horizontally into v16.
2019        ld1             {v26.8b}, [\src], \s_strd
2020        uxtl            v26.8h,  v26.8b
2021        ext             v28.16b, v26.16b, v26.16b, #2
2022        ext             v29.16b, v26.16b, v26.16b, #4
2023        ext             v30.16b, v26.16b, v26.16b, #6
2024        mul             v31.4h,  v26.4h,  v0.h[0]
2025        mla             v31.4h,  v28.4h,  v0.h[1]
2026        mla             v31.4h,  v29.4h,  v0.h[2]
2027        mla             v31.4h,  v30.4h,  v0.h[3]
2028        srshr           v16.4h,  v31.4h,  #2
2029
        // Two more rows complete the initial window v16-v18.
2030        bl              L(\type\()_8tap_filter_4)
2031        mov             v17.8b, v28.8b
2032        mov             v18.8b, v29.8b
2033
20344:
2035        bl              L(\type\()_8tap_filter_4)
2036        // Interleaving the mul/mla chains actually hurts performance
2037        // significantly on Cortex A53, thus keeping mul/mla tightly
2038        // chained like this.
        // v2 = first output row, v3 = second output row (one row lower).
2039        smull           v2.4s,  v16.4h, v1.h[0]
2040        smlal           v2.4s,  v17.4h, v1.h[1]
2041        smlal           v2.4s,  v18.4h, v1.h[2]
2042        smlal           v2.4s,  v28.4h, v1.h[3]
2043        smull           v3.4s,  v17.4h, v1.h[0]
2044        smlal           v3.4s,  v18.4h, v1.h[1]
2045        smlal           v3.4s,  v28.4h, v1.h[2]
2046        smlal           v3.4s,  v29.4h, v1.h[3]
2047        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
2048        sqrshrn         v3.4h,  v3.4s,  #\shift_hv
2049        subs            \h,  \h,  #2
        // put: saturate to unsigned 8 bit and store 4 bytes per row;
        // otherwise: store the four 16-bit values per row unchanged.
2050.ifc \type, put
2051        sqxtun          v2.8b,  v2.8h
2052        sqxtun          v3.8b,  v3.8h
2053        st1             {v2.s}[0], [\dst], \d_strd
2054        st1             {v3.s}[0], [\ds2], \d_strd
2055.else
2056        st1             {v2.4h}, [\dst], \d_strd
2057        st1             {v3.4h}, [\ds2], \d_strd
2058.endif
2059        b.le            0f
        // Slide the row window down by two rows.
2060        mov             v16.8b,  v18.8b
2061        mov             v17.8b,  v28.8b
2062        mov             v18.8b,  v29.8b
2063        b               4b
2064
2065480:    // 4x8, 4x16, 4x32 hv
2066        ld1             {v1.8b},  [\xmy]   // all 8 vertical coefficient bytes
2067        sub             \src, \src, #1   // one pixel to the left
2068        sub             \sr2, \src, \s_strd, lsl #1   // \sr2 = two rows above
2069        sub             \src, \sr2, \s_strd   // \src = three rows above
2070        add             \ds2, \dst, \d_strd
2071        lsl             \s_strd, \s_strd, #1
2072        lsl             \d_strd, \d_strd, #1
2073        sxtl            v0.8h,  v0.8b
2074        sxtl            v1.8h,  v1.8b
2075        mov             x15, x30   // save the return address
2076
        // First (single) row, filtered horizontally into v16.
2077        ld1             {v26.8b}, [\src], \s_strd
2078        uxtl            v26.8h,  v26.8b
2079        ext             v28.16b, v26.16b, v26.16b, #2
2080        ext             v29.16b, v26.16b, v26.16b, #4
2081        ext             v30.16b, v26.16b, v26.16b, #6
2082        mul             v31.4h,  v26.4h,  v0.h[0]
2083        mla             v31.4h,  v28.4h,  v0.h[1]
2084        mla             v31.4h,  v29.4h,  v0.h[2]
2085        mla             v31.4h,  v30.4h,  v0.h[3]
2086        srshr           v16.4h,  v31.4h,  #2
2087
        // Six more rows complete the initial window v16-v22.
2088        bl              L(\type\()_8tap_filter_4)
2089        mov             v17.8b, v28.8b
2090        mov             v18.8b, v29.8b
2091        bl              L(\type\()_8tap_filter_4)
2092        mov             v19.8b, v28.8b
2093        mov             v20.8b, v29.8b
2094        bl              L(\type\()_8tap_filter_4)
2095        mov             v21.8b, v28.8b
2096        mov             v22.8b, v29.8b
2097
209848:
2099        bl              L(\type\()_8tap_filter_4)
        // 8-tap vertical filter: v2 = first output row, v3 = second one.
2100        smull           v2.4s,  v16.4h, v1.h[0]
2101        smlal           v2.4s,  v17.4h, v1.h[1]
2102        smlal           v2.4s,  v18.4h, v1.h[2]
2103        smlal           v2.4s,  v19.4h, v1.h[3]
2104        smlal           v2.4s,  v20.4h, v1.h[4]
2105        smlal           v2.4s,  v21.4h, v1.h[5]
2106        smlal           v2.4s,  v22.4h, v1.h[6]
2107        smlal           v2.4s,  v28.4h, v1.h[7]
2108        smull           v3.4s,  v17.4h, v1.h[0]
2109        smlal           v3.4s,  v18.4h, v1.h[1]
2110        smlal           v3.4s,  v19.4h, v1.h[2]
2111        smlal           v3.4s,  v20.4h, v1.h[3]
2112        smlal           v3.4s,  v21.4h, v1.h[4]
2113        smlal           v3.4s,  v22.4h, v1.h[5]
2114        smlal           v3.4s,  v28.4h, v1.h[6]
2115        smlal           v3.4s,  v29.4h, v1.h[7]
2116        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
2117        sqrshrn         v3.4h,  v3.4s,  #\shift_hv
2118        subs            \h,  \h,  #2
2119.ifc \type, put
2120        sqxtun          v2.8b,  v2.8h
2121        sqxtun          v3.8b,  v3.8h
2122        st1             {v2.s}[0], [\dst], \d_strd
2123        st1             {v3.s}[0], [\ds2], \d_strd
2124.else
2125        st1             {v2.4h}, [\dst], \d_strd
2126        st1             {v3.4h}, [\ds2], \d_strd
2127.endif
2128        b.le            0f
        // Slide the row window down by two rows.
2129        mov             v16.8b,  v18.8b
2130        mov             v17.8b,  v19.8b
2131        mov             v18.8b,  v20.8b
2132        mov             v19.8b,  v21.8b
2133        mov             v20.8b,  v22.8b
2134        mov             v21.8b,  v28.8b
2135        mov             v22.8b,  v29.8b
2136        b               48b
21370:
        // Return through the saved link register. "ret x15" is used instead
        // of "br x15" so that the branch is encoded as a function return,
        // matching the bl that entered this function.
2138        ret             x15
2139
2140L(\type\()_8tap_filter_4):
        // Horizontal 4-tap filter for two rows of 4 pixels each.
        // In:  \sr2, \src = row pointers (8 bytes are read from each; both
        //      are advanced by \s_strd), v0.h[0-3] = coefficients.
        // Out: v28.4h = rounded (>> 2) result for the \sr2 row,
        //      v29.4h = rounded (>> 2) result for the \src row.
        // Overwrites v26, v27, v30, v31.
2141        ld1             {v26.8b}, [\sr2], \s_strd
2142        ld1             {v27.8b}, [\src], \s_strd
2143        uxtl            v26.8h,  v26.8b
2144        uxtl            v27.8h,  v27.8b
2145        ext             v28.16b, v26.16b, v26.16b, #2   // \sr2 row shifted by 1, 2 and 3 pixels
2146        ext             v29.16b, v26.16b, v26.16b, #4
2147        ext             v30.16b, v26.16b, v26.16b, #6
2148        mul             v31.4h,  v26.4h,  v0.h[0]
2149        mla             v31.4h,  v28.4h,  v0.h[1]
2150        mla             v31.4h,  v29.4h,  v0.h[2]
2151        mla             v31.4h,  v30.4h,  v0.h[3]
2152        ext             v28.16b, v27.16b, v27.16b, #2   // \src row shifted by 1, 2 and 3 pixels
2153        ext             v29.16b, v27.16b, v27.16b, #4
2154        ext             v30.16b, v27.16b, v27.16b, #6
2155        mul             v27.4h,  v27.4h,  v0.h[0]
2156        mla             v27.4h,  v28.4h,  v0.h[1]
2157        mla             v27.4h,  v29.4h,  v0.h[2]
2158        mla             v27.4h,  v30.4h,  v0.h[3]
2159        srshr           v28.4h,  v31.4h,  #2
2160        srshr           v29.4h,  v27.4h,  #2
2161        ret
2162
216380:
2164160:
2165320:
        // Horizontal + vertical filtering of blocks that are 8, 16 or 32
        // pixels wide. Reached through the _8tap_hv dispatcher with the flags
        // of its "cmp \h, #4" still valid. This part handles h <= 4 with
        // 8 horizontal and 4 vertical coefficients, in 8 pixel wide strips.
        // _8tap_filter_8 is defined after this chunk; as used here it leaves
        // two newly filtered rows in v24 and v25. _8tap_filter_8_first
        // produces the first row in v16.
2166        b.gt            880f   // h > 4
2167        add             \xmy,  \xmy,  #2
2168        ld1             {v0.8b},  [\xmx]   // all 8 horizontal coefficient bytes
2169        ld1             {v1.s}[0],  [\xmy]   // 4 vertical coefficient bytes from offset 2 of the entry
2170        sub             \src,  \src,  #3   // three pixels to the left
2171        sub             \src,  \src,  \s_strd   // one row up
2172        sxtl            v0.8h,  v0.8b
2173        sxtl            v1.8h,  v1.8b
2174        mov             x15, x30   // save the return address: the bl calls below overwrite x30
2175        mov             \my,  \h   // keep the full height for the strip loop
2176
2177164:    // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
        // Start of one strip: derive the second row pointers and double the
        // strides (undone again at 9: before the next strip).
2178        add             \ds2,  \dst,  \d_strd
2179        add             \sr2,  \src,  \s_strd
2180        lsl             \d_strd, \d_strd, #1
2181        lsl             \s_strd, \s_strd, #1
2182
        // Initial window of three filtered rows: v16, v17, v18.
2183        bl              L(\type\()_8tap_filter_8_first)
2184        bl              L(\type\()_8tap_filter_8)
2185        mov             v17.16b, v24.16b
2186        mov             v18.16b, v25.16b
2187
21888:
        // 4-tap vertical filter for two output rows: v2/v3 accumulate the low
        // and high halves of the first row, v4/v5 those of the second row.
        // The first two products are issued before the call; this relies on
        // _8tap_filter_8 leaving v2, v3 and v16-v18 intact - TODO confirm.
2189        smull           v2.4s,  v16.4h, v1.h[0]
2190        smull2          v3.4s,  v16.8h, v1.h[0]
2191        bl              L(\type\()_8tap_filter_8)
2192        smull           v4.4s,  v17.4h, v1.h[0]
2193        smull2          v5.4s,  v17.8h, v1.h[0]
2194        smlal           v2.4s,  v17.4h, v1.h[1]
2195        smlal2          v3.4s,  v17.8h, v1.h[1]
2196        smlal           v4.4s,  v18.4h, v1.h[1]
2197        smlal2          v5.4s,  v18.8h, v1.h[1]
2198        smlal           v2.4s,  v18.4h, v1.h[2]
2199        smlal2          v3.4s,  v18.8h, v1.h[2]
2200        smlal           v4.4s,  v24.4h, v1.h[2]
2201        smlal2          v5.4s,  v24.8h, v1.h[2]
2202        smlal           v2.4s,  v24.4h, v1.h[3]
2203        smlal2          v3.4s,  v24.8h, v1.h[3]
2204        smlal           v4.4s,  v25.4h, v1.h[3]
2205        smlal2          v5.4s,  v25.8h, v1.h[3]
2206        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
2207        sqrshrn2        v2.8h,  v3.4s,  #\shift_hv
2208        sqrshrn         v4.4h,  v4.4s,  #\shift_hv
2209        sqrshrn2        v4.8h,  v5.4s,  #\shift_hv
2210        subs            \h,  \h,  #2
        // put: saturate to unsigned 8 bit and store 8 bytes per row;
        // otherwise: store the eight 16-bit values per row unchanged.
2211.ifc \type, put
2212        sqxtun          v2.8b,  v2.8h
2213        sqxtun          v4.8b,  v4.8h
2214        st1             {v2.8b}, [\dst], \d_strd
2215        st1             {v4.8b}, [\ds2], \d_strd
2216.else
2217        st1             {v2.8h}, [\dst], \d_strd
2218        st1             {v4.8h}, [\ds2], \d_strd
2219.endif
2220        b.le            9f
        // Slide the row window down by two rows.
2221        mov             v16.16b, v18.16b
2222        mov             v17.16b, v24.16b
2223        mov             v18.16b, v25.16b
2224        b               8b
22259:
        // Strip finished: advance to the next 8 columns, if any. The "0:"
        // exit label is the next one further down in the file.
2226        subs            \w,  \w,  #8
2227        b.le            0f
2228        asr             \s_strd,  \s_strd,  #1   // back to single-row strides
2229        asr             \d_strd,  \d_strd,  #1
2230        msub            \src,  \s_strd,  \xmy,  \src   // rewind by the saved height (\xmy appears to be the 64-bit view of \my)
2231        msub            \dst,  \d_strd,  \xmy,  \dst
2232        sub             \src,  \src,  \s_strd,  lsl #2   // plus the 4 rows \src advanced while building the window
2233        mov             \h,  \my   // restore the row counter
2234        add             \src,  \src,  #8   // next 8 input columns
        // Output advance per strip: 8 bytes for put, 16 bytes otherwise.
2235.ifc \type, put
2236        add             \dst,  \dst,  #8
2237.else
2238        add             \dst,  \dst,  #16
2239.endif
2240        b               164b
2241
2242880:    // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
2243640:
22441280:
        // Horizontal + vertical filtering with 8 horizontal and 8 vertical
        // coefficients, processed in 8 pixel wide strips.
        // _8tap_filter_8 is defined after this chunk; as used here it leaves
        // two newly filtered rows in v24 and v25. _8tap_filter_8_first
        // produces the first row in v16.
2245        ld1             {v0.8b},  [\xmx]   // all 8 horizontal coefficient bytes
2246        ld1             {v1.8b},  [\xmy]   // all 8 vertical coefficient bytes
2247        sub             \src,  \src,  #3   // three pixels to the left
2248        sub             \src,  \src,  \s_strd
2249        sub             \src,  \src,  \s_strd, lsl #1   // together: three rows up
2250        sxtl            v0.8h,  v0.8b
2251        sxtl            v1.8h,  v1.8b
2252        mov             x15, x30   // save the return address: the bl calls below overwrite x30
2253        mov             \my,  \h   // keep the full height for the strip loop
2254
2255168:
        // Start of one strip: derive the second row pointers and double the
        // strides (undone again at 9: before the next strip).
2256        add             \ds2,  \dst,  \d_strd
2257        add             \sr2,  \src,  \s_strd
2258        lsl             \d_strd, \d_strd, #1
2259        lsl             \s_strd, \s_strd, #1
2260
        // Initial window of seven filtered rows: v16-v22.
2261        bl              L(\type\()_8tap_filter_8_first)
2262        bl              L(\type\()_8tap_filter_8)
2263        mov             v17.16b, v24.16b
2264        mov             v18.16b, v25.16b
2265        bl              L(\type\()_8tap_filter_8)
2266        mov             v19.16b, v24.16b
2267        mov             v20.16b, v25.16b
2268        bl              L(\type\()_8tap_filter_8)
2269        mov             v21.16b, v24.16b
2270        mov             v22.16b, v25.16b
2271
227288:
        // 8-tap vertical filter for two output rows: v2/v3 accumulate the low
        // and high halves of the first row, v4/v5 those of the second row.
        // The first two products are issued before the call; this relies on
        // _8tap_filter_8 leaving v2, v3 and v16-v22 intact - TODO confirm.
2273        smull           v2.4s,  v16.4h, v1.h[0]
2274        smull2          v3.4s,  v16.8h, v1.h[0]
2275        bl              L(\type\()_8tap_filter_8)
2276        smull           v4.4s,  v17.4h, v1.h[0]
2277        smull2          v5.4s,  v17.8h, v1.h[0]
2278        smlal           v2.4s,  v17.4h, v1.h[1]
2279        smlal2          v3.4s,  v17.8h, v1.h[1]
2280        smlal           v4.4s,  v18.4h, v1.h[1]
2281        smlal2          v5.4s,  v18.8h, v1.h[1]
2282        smlal           v2.4s,  v18.4h, v1.h[2]
2283        smlal2          v3.4s,  v18.8h, v1.h[2]
2284        smlal           v4.4s,  v19.4h, v1.h[2]
2285        smlal2          v5.4s,  v19.8h, v1.h[2]
2286        smlal           v2.4s,  v19.4h, v1.h[3]
2287        smlal2          v3.4s,  v19.8h, v1.h[3]
2288        smlal           v4.4s,  v20.4h, v1.h[3]
2289        smlal2          v5.4s,  v20.8h, v1.h[3]
2290        smlal           v2.4s,  v20.4h, v1.h[4]
2291        smlal2          v3.4s,  v20.8h, v1.h[4]
2292        smlal           v4.4s,  v21.4h, v1.h[4]
2293        smlal2          v5.4s,  v21.8h, v1.h[4]
2294        smlal           v2.4s,  v21.4h, v1.h[5]
2295        smlal2          v3.4s,  v21.8h, v1.h[5]
2296        smlal           v4.4s,  v22.4h, v1.h[5]
2297        smlal2          v5.4s,  v22.8h, v1.h[5]
2298        smlal           v2.4s,  v22.4h, v1.h[6]
2299        smlal2          v3.4s,  v22.8h, v1.h[6]
2300        smlal           v4.4s,  v24.4h, v1.h[6]
2301        smlal2          v5.4s,  v24.8h, v1.h[6]
2302        smlal           v2.4s,  v24.4h, v1.h[7]
2303        smlal2          v3.4s,  v24.8h, v1.h[7]
2304        smlal           v4.4s,  v25.4h, v1.h[7]
2305        smlal2          v5.4s,  v25.8h, v1.h[7]
2306        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
2307        sqrshrn2        v2.8h,  v3.4s,  #\shift_hv
2308        sqrshrn         v4.4h,  v4.4s,  #\shift_hv
2309        sqrshrn2        v4.8h,  v5.4s,  #\shift_hv
2310        subs            \h,  \h,  #2
        // put: saturate to unsigned 8 bit and store 8 bytes per row;
        // otherwise: store the eight 16-bit values per row unchanged.
2311.ifc \type, put
2312        sqxtun          v2.8b,  v2.8h
2313        sqxtun          v4.8b,  v4.8h
2314        st1             {v2.8b}, [\dst], \d_strd
2315        st1             {v4.8b}, [\ds2], \d_strd
2316.else
2317        st1             {v2.8h}, [\dst], \d_strd
2318        st1             {v4.8h}, [\ds2], \d_strd
2319.endif
2320        b.le            9f
        // Slide the row window down by two rows.
2321        mov             v16.16b, v18.16b
2322        mov             v17.16b, v19.16b
2323        mov             v18.16b, v20.16b
2324        mov             v19.16b, v21.16b
2325        mov             v20.16b, v22.16b
2326        mov             v21.16b, v24.16b
2327        mov             v22.16b, v25.16b
2328        b               88b
23299:
        // Strip finished: advance to the next 8 columns, if any.
2330        subs            \w,  \w,  #8
2331        b.le            0f
2332        asr             \s_strd,  \s_strd,  #1   // back to single-row strides
2333        asr             \d_strd,  \d_strd,  #1
2334        msub            \src,  \s_strd,  \xmy,  \src   // rewind by the saved height (\xmy appears to be the 64-bit view of \my)
2335        msub            \dst,  \d_strd,  \xmy,  \dst
2336        sub             \src,  \src,  \s_strd,  lsl #3   // plus the 8 rows \src advanced while building the window
2337        mov             \h,  \my   // restore the row counter
2338        add             \src,  \src,  #8   // next 8 input columns
        // Output advance per strip: 8 bytes for put, 16 bytes otherwise.
2339.ifc \type, put
2340        add             \dst,  \dst,  #8
2341.else
2342        add             \dst,  \dst,  #16
2343.endif
2344        b               168b
23450:
        // Shared exit of the 8-wide hv strip loops (also the target of the
        // "b.le 0f" in the h <= 4 variant above). Return through the saved
        // link register; "ret x15" is used instead of "br x15" so that the
        // branch is encoded as a function return, matching the bl that
        // entered this function.
2346        ret             x15
2347
// Horizontal 8-tap filter of ONE row of 8 pixels.
//   In:   src      row pointer, post-incremented by s_strd
//         v0.h[k]  tap k (k = 0..7); v0 is set up outside this routine,
//                  presumably with the horizontal filter coefficients
//   Out:  v16.8h   filtered row, rounded right shift by 2
//   Clobbers: v24-v29
// NOTE(review): the callers are above this excerpt; by its name this
// variant produces the first row before the two-row routine takes over.
L(\type\()_8tap_filter_8_first):
        // 16 source bytes cover the 8 + 7 pixels needed for 8 outputs
        ld1             {v28.8b, v29.8b},  [\src], \s_strd
        // widen to 16 bit so the taps can be applied with mul/mla
        uxtl            v28.8h,  v28.8b
        uxtl            v29.8h,  v29.8b
        mul             v16.8h,  v28.8h,  v0.h[0]
        // ext by 2*k bytes = window starting at pixel k
        ext             v24.16b, v28.16b, v29.16b, #(2*1)
        ext             v25.16b, v28.16b, v29.16b, #(2*2)
        ext             v26.16b, v28.16b, v29.16b, #(2*3)
        ext             v27.16b, v28.16b, v29.16b, #(2*4)
        mla             v16.8h,  v24.8h,  v0.h[1]
        mla             v16.8h,  v25.8h,  v0.h[2]
        mla             v16.8h,  v26.8h,  v0.h[3]
        mla             v16.8h,  v27.8h,  v0.h[4]
        // v24-v26 are reused for the remaining three windows
        ext             v24.16b, v28.16b, v29.16b, #(2*5)
        ext             v25.16b, v28.16b, v29.16b, #(2*6)
        ext             v26.16b, v28.16b, v29.16b, #(2*7)
        mla             v16.8h,  v24.8h,  v0.h[5]
        mla             v16.8h,  v25.8h,  v0.h[6]
        mla             v16.8h,  v26.8h,  v0.h[7]
        // signed rounding shift: (sum + 2) >> 2
        srshr           v16.8h,  v16.8h,  #2
        ret
2369
// Horizontal 8-tap filter of TWO rows of 8 pixels each.
//   In:   sr2, src  row pointers, each post-incremented by s_strd
//         v0.h[k]   tap k (k = 0..7); set up outside this routine
//   Out:  v24.8h    filtered row read from sr2
//         v25.8h    filtered row read from src
//         both with a rounding right shift by 2
//   Clobbers: v26-v31
L(\type\()_8tap_filter_8):
        ld1             {v28.8b, v29.8b},  [\sr2], \s_strd
        ld1             {v30.8b, v31.8b},  [\src], \s_strd
        // widen both rows to 16 bit lanes
        uxtl            v28.8h,  v28.8b
        uxtl            v29.8h,  v29.8b
        uxtl            v30.8h,  v30.8b
        uxtl            v31.8h,  v31.8b
        // tap 0 uses the unshifted rows
        mul             v24.8h,  v28.8h,  v0.h[0]
        mul             v25.8h,  v30.8h,  v0.h[0]
        // taps 1..7: window starting at pixel i, accumulated per row
.irpc i, 1234567
        ext             v26.16b, v28.16b, v29.16b, #(2*\i)
        ext             v27.16b, v30.16b, v31.16b, #(2*\i)
        mla             v24.8h,  v26.8h,  v0.h[\i]
        mla             v25.8h,  v27.8h,  v0.h[\i]
.endr
        // signed rounding shift: (sum + 2) >> 2
        srshr           v24.8h,  v24.8h, #2
        srshr           v25.8h,  v25.8h, #2
        ret
2388
2389L(\type\()_8tap_hv_tbl):
2390        .hword L(\type\()_8tap_hv_tbl) - 1280b
2391        .hword L(\type\()_8tap_hv_tbl) -  640b
2392        .hword L(\type\()_8tap_hv_tbl) -  320b
2393        .hword L(\type\()_8tap_hv_tbl) -  160b
2394        .hword L(\type\()_8tap_hv_tbl) -   80b
2395        .hword L(\type\()_8tap_hv_tbl) -   40b
2396        .hword L(\type\()_8tap_hv_tbl) -   20b
2397        .hword 0
2398endfunc
2399
2400
// Bilinear put/prep, 8 bpc. Emitted once per type by the enclosing macro.
//
// Weights set up at entry (replicated into every byte lane):
//   v0 = 16 - mx, v1 = mx     horizontal
//   v2 = 16 - my, v3 = my     vertical
// For prep, d_strd is overwritten with 2*w: the output is written as
// densely packed rows of 16 bit values.
//
// Dispatch:
//   mx != 0, my == 0  ->  h   (horizontal only)
//   mx == 0, my != 0  ->  v   (vertical only)
//   mx != 0, my != 0  ->  hv
//   mx == 0, my == 0  ->  tail branch to the plain function of this type
// x8 = clz(w) - 24 indexes the per-width jump tables, whose entries are
// ordered 128, 64, 32, 16, 8, 4, 2; this assumes w is one of those values.
//
// Output scaling as implemented below:
//   put:   h or v sum is rounded >> 4, hv sum is rounded >> 8, then
//          saturated to 8 bit.
//   prep:  h or v sum is stored unshifted, hv sum is rounded >> 4;
//          stored as 16 bit.
//
// All loops produce two rows per iteration (dst/ds2 and src/sr2 walk the
// even and odd rows, with the strides doubled).
function \type\()_bilin_8bpc_neon, export=1
        dup             v1.16b, \mx
        dup             v3.16b, \my
        mov             w9,  #16
        sub             w8, w9, \mx
        sub             w9, w9, \my
        dup             v0.16b, w8
        dup             v2.16b, w9
.ifc \type, prep
        // output stride in bytes = w * sizeof(int16_t)
        uxtw            \d_strd, \w
        lsl             \d_strd, \d_strd, #1
.endif

        // jump table index from the width
        clz             w8,  \w
        sub             w8,  w8,  #24
        cbnz            \mx, L(\type\()_bilin_h)
        cbnz            \my, L(\type\()_bilin_v)
        b               \type\()_neon

L(\type\()_bilin_h):
        cbnz            \my, L(\type\()_bilin_hv)

        // table entries are distances back from the table to each handler
        adr             x9,  L(\type\()_bilin_h_tbl)
        ldrh            w8,  [x9, x8, lsl #1]
        sub             x9,  x9,  w8, uxtw
        br              x9

20:     // 2xN h
        // Only assembled for put; for prep this label falls through into
        // the 4xN code below.
.ifc \type, put
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \d_strd,  \d_strd,  #1
        lsl             \s_strd,  \s_strd,  #1
2:
        ld1             {v4.s}[0],  [\src], \s_strd
        ld1             {v6.s}[0],  [\sr2], \s_strd
        // v5/v7 = the same rows shifted left by one pixel
        ext             v5.8b,  v4.8b,  v4.8b, #1
        ext             v7.8b,  v6.8b,  v6.8b, #1
        // pack two pixels of each row side by side: row0 in h[0], row1 in h[1]
        trn1            v4.4h,  v4.4h,  v6.4h
        trn1            v5.4h,  v5.4h,  v7.4h
        subs            \h,  \h,  #2
        umull           v4.8h,  v4.8b,  v0.8b
        umlal           v4.8h,  v5.8b,  v1.8b
        uqrshrn         v4.8b,  v4.8h,  #4
        st1             {v4.h}[0], [\dst], \d_strd
        st1             {v4.h}[1], [\ds2], \d_strd
        b.gt            2b
        ret
.endif

40:     // 4xN h
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \d_strd,  \d_strd,  #1
        lsl             \s_strd,  \s_strd,  #1
4:
        ld1             {v4.8b}, [\src], \s_strd
        ld1             {v6.8b}, [\sr2], \s_strd
        ext             v5.8b,  v4.8b,  v4.8b, #1
        ext             v7.8b,  v6.8b,  v6.8b, #1
        // row0 in the low 4 bytes, row1 in the high 4 bytes
        trn1            v4.2s,  v4.2s,  v6.2s
        trn1            v5.2s,  v5.2s,  v7.2s
        subs            \h,  \h,  #2
        umull           v4.8h,  v4.8b,  v0.8b
        umlal           v4.8h,  v5.8b,  v1.8b
.ifc \type, put
        uqrshrn         v4.8b,  v4.8h,  #4
        st1             {v4.s}[0], [\dst], \d_strd
        st1             {v4.s}[1], [\ds2], \d_strd
.else
        st1             {v4.d}[0], [\dst], \d_strd
        st1             {v4.d}[1], [\ds2], \d_strd
.endif
        b.gt            4b
        ret

80:     // 8xN h
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \d_strd,  \d_strd,  #1
        lsl             \s_strd,  \s_strd,  #1
8:
        // 16 bytes are loaded per row although only 9 are used
        ld1             {v4.16b}, [\src], \s_strd
        ld1             {v6.16b}, [\sr2], \s_strd
        ext             v5.16b, v4.16b, v4.16b, #1
        ext             v7.16b, v6.16b, v6.16b, #1
        subs            \h,  \h,  #2
        umull           v4.8h,  v4.8b,  v0.8b
        umull           v6.8h,  v6.8b,  v0.8b
        umlal           v4.8h,  v5.8b,  v1.8b
        umlal           v6.8h,  v7.8b,  v1.8b
.ifc \type, put
        uqrshrn         v4.8b,  v4.8h,  #4
        uqrshrn         v6.8b,  v6.8h,  #4
        st1             {v4.8b}, [\dst], \d_strd
        st1             {v6.8b}, [\ds2], \d_strd
.else
        st1             {v4.8h}, [\dst], \d_strd
        st1             {v6.8h}, [\ds2], \d_strd
.endif
        b.gt            8b
        ret
160:
320:
640:
1280:   // 16xN, 32xN, ... h
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \s_strd,  \s_strd,  #1

        // The inner loop advances the pointers itself, so the strides are
        // reduced to what is left after one pass over a row: 8 + w source
        // bytes are consumed per row.
        sub             \s_strd,  \s_strd,  \w, uxtw
        sub             \s_strd,  \s_strd,  #8
.ifc \type, put
        lsl             \d_strd,  \d_strd,  #1
        sub             \d_strd,  \d_strd,  \w, uxtw
.endif
        // For prep, d_strd stays 2*w bytes, which is exactly the gap from
        // the end of one written row to the start of the row after next.
161:
        // first 8 bytes of each row go into the upper half of v16/v20
        ld1             {v16.d}[1],  [\src], #8
        ld1             {v20.d}[1],  [\sr2], #8
        // the weights already live in v0/v1, so mx is reused as the
        // remaining-columns counter
        mov             \mx, \w

16:
        ld1             {v18.16b},  [\src], #16
        ld1             {v22.16b},  [\sr2], #16
        // v17/v21 = current 16 pixels, v19/v23 = the same shifted by one
        ext             v17.16b, v16.16b, v18.16b, #8
        ext             v19.16b, v16.16b, v18.16b, #9
        ext             v21.16b, v20.16b, v22.16b, #8
        ext             v23.16b, v20.16b, v22.16b, #9
        umull           v16.8h,  v17.8b,  v0.8b
        umull2          v17.8h,  v17.16b, v0.16b
        umull           v20.8h,  v21.8b,  v0.8b
        umull2          v21.8h,  v21.16b, v0.16b
        umlal           v16.8h,  v19.8b,  v1.8b
        umlal2          v17.8h,  v19.16b, v1.16b
        umlal           v20.8h,  v23.8b,  v1.8b
        umlal2          v21.8h,  v23.16b, v1.16b
        subs            \mx, \mx, #16
.ifc \type, put
        uqrshrn         v16.8b,  v16.8h, #4
        uqrshrn2        v16.16b, v17.8h, #4
        uqrshrn         v20.8b,  v20.8h, #4
        uqrshrn2        v20.16b, v21.8h, #4
        st1             {v16.16b}, [\dst], #16
        st1             {v20.16b}, [\ds2], #16
.else
        st1             {v16.8h, v17.8h}, [\dst], #32
        st1             {v20.8h, v21.8h}, [\ds2], #32
.endif
        b.le            9f

        // carry the last loaded block over as the start of the next one
        mov             v16.16b, v18.16b
        mov             v20.16b, v22.16b
        b               16b

9:
        // step to the next pair of rows
        add             \dst,  \dst,  \d_strd
        add             \ds2,  \ds2,  \d_strd
        add             \src,  \src,  \s_strd
        add             \sr2,  \sr2,  \s_strd

        subs            \h,  \h,  #2
        b.gt            161b
        ret

L(\type\()_bilin_h_tbl):
        .hword L(\type\()_bilin_h_tbl) - 1280b
        .hword L(\type\()_bilin_h_tbl) -  640b
        .hword L(\type\()_bilin_h_tbl) -  320b
        .hword L(\type\()_bilin_h_tbl) -  160b
        .hword L(\type\()_bilin_h_tbl) -   80b
        .hword L(\type\()_bilin_h_tbl) -   40b
        .hword L(\type\()_bilin_h_tbl) -   20b
        .hword 0


L(\type\()_bilin_v):
        // No flags need to be set up before the table jump: every width
        // handler below sets them itself (cmp or subs) before its first
        // conditional branch.
        adr             x9,  L(\type\()_bilin_v_tbl)
        ldrh            w8,  [x9, x8, lsl #1]
        sub             x9,  x9,  w8, uxtw
        br              x9

20:     // 2xN v
        // Only assembled for put; for prep this label falls through into
        // the 4xN code below.
.ifc \type, put
        cmp             \h,  #2
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \s_strd,  \s_strd,  #1
        lsl             \d_strd,  \d_strd,  #1

        // 2x2 v
        ld1             {v16.h}[0], [\src], \s_strd
        b.gt            24f
        ld1             {v17.h}[0], [\sr2], \s_strd
        ld1             {v18.h}[0], [\src], \s_strd
        // v16 = rows 0,1 and v17 = rows 1,2, two pixels per row
        trn1            v16.4h, v16.4h, v17.4h
        trn1            v17.4h, v17.4h, v18.4h
        umull           v4.8h,  v16.8b,  v2.8b
        umlal           v4.8h,  v17.8b,  v3.8b
        uqrshrn         v4.8b,  v4.8h,  #4
        st1             {v4.h}[0], [\dst]
        st1             {v4.h}[1], [\ds2]
        ret
24:     // 2x4, 2x8, ... v
        ld1             {v17.h}[0], [\sr2], \s_strd
        ld1             {v18.h}[0], [\src], \s_strd
        ld1             {v19.h}[0], [\sr2], \s_strd
        ld1             {v20.h}[0], [\src], \s_strd
        // gather rows 0-3 into v16 and rows 1-4 into v17
        trn1            v16.4h, v16.4h, v17.4h
        trn1            v17.4h, v17.4h, v18.4h
        trn1            v18.4h, v18.4h, v19.4h
        trn1            v19.4h, v19.4h, v20.4h
        trn1            v16.2s, v16.2s, v18.2s
        trn1            v17.2s, v17.2s, v19.2s
        umull           v4.8h,  v16.8b,  v2.8b
        umlal           v4.8h,  v17.8b,  v3.8b
        subs            \h,  \h,  #4
        uqrshrn         v4.8b,  v4.8h,  #4
        st1             {v4.h}[0], [\dst], \d_strd
        st1             {v4.h}[1], [\ds2], \d_strd
        st1             {v4.h}[2], [\dst], \d_strd
        st1             {v4.h}[3], [\ds2], \d_strd
        b.le            0f
        // the last loaded row becomes the top row of the next group
        mov             v16.8b, v20.8b
        b               24b
0:
        ret
.endif

40:     // 4xN v
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \s_strd,  \s_strd,  #1
        lsl             \d_strd,  \d_strd,  #1
        ld1             {v16.s}[0], [\src], \s_strd
4:
        ld1             {v17.s}[0], [\sr2], \s_strd
        ld1             {v18.s}[0], [\src], \s_strd
        // v16 = rows n,n+1 and v17 = rows n+1,n+2
        trn1            v16.2s, v16.2s, v17.2s
        trn1            v17.2s, v17.2s, v18.2s
        umull           v4.8h,  v16.8b,  v2.8b
        umlal           v4.8h,  v17.8b,  v3.8b
        subs            \h,  \h,  #2
.ifc \type, put
        uqrshrn         v4.8b,  v4.8h,  #4
        st1             {v4.s}[0], [\dst], \d_strd
        st1             {v4.s}[1], [\ds2], \d_strd
.else
        st1             {v4.d}[0], [\dst], \d_strd
        st1             {v4.d}[1], [\ds2], \d_strd
.endif
        b.le            0f
        mov             v16.8b, v18.8b
        b               4b
0:
        ret

80:     // 8xN v
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \s_strd,  \s_strd,  #1
        lsl             \d_strd,  \d_strd,  #1
        ld1             {v16.8b}, [\src], \s_strd
8:
        ld1             {v17.8b}, [\sr2], \s_strd
        ld1             {v18.8b}, [\src], \s_strd
        // v4 blends rows n/n+1, v5 blends rows n+1/n+2
        umull           v4.8h,  v16.8b,  v2.8b
        umull           v5.8h,  v17.8b,  v2.8b
        umlal           v4.8h,  v17.8b,  v3.8b
        umlal           v5.8h,  v18.8b,  v3.8b
        subs            \h,  \h,  #2
.ifc \type, put
        uqrshrn         v4.8b,  v4.8h,  #4
        uqrshrn         v5.8b,  v5.8h,  #4
        st1             {v4.8b}, [\dst], \d_strd
        st1             {v5.8b}, [\ds2], \d_strd
.else
        st1             {v4.8h}, [\dst], \d_strd
        st1             {v5.8h}, [\ds2], \d_strd
.endif
        b.le            0f
        mov             v16.8b, v18.8b
        b               8b
0:
        ret

160:    // 16xN, 32xN, ...
320:
640:
1280:
        // Processed as 16 pixel wide columns; my is free (the weights are
        // in v2/v3) and keeps the full height for restarting each column.
        mov             \my,  \h
1:
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        ld1             {v16.16b}, [\src], \s_strd
2:
        ld1             {v17.16b}, [\sr2], \s_strd
        ld1             {v18.16b}, [\src], \s_strd
        umull           v4.8h,  v16.8b,  v2.8b
        umull2          v5.8h,  v16.16b, v2.16b
        umull           v6.8h,  v17.8b,  v2.8b
        umull2          v7.8h,  v17.16b, v2.16b
        umlal           v4.8h,  v17.8b,  v3.8b
        umlal2          v5.8h,  v17.16b, v3.16b
        umlal           v6.8h,  v18.8b,  v3.8b
        umlal2          v7.8h,  v18.16b, v3.16b
        subs            \h,  \h,  #2
.ifc \type, put
        uqrshrn         v4.8b,  v4.8h,  #4
        uqrshrn2        v4.16b, v5.8h,  #4
        uqrshrn         v6.8b,  v6.8h,  #4
        uqrshrn2        v6.16b, v7.8h,  #4
        st1             {v4.16b}, [\dst], \d_strd
        st1             {v6.16b}, [\ds2], \d_strd
.else
        st1             {v4.8h, v5.8h}, [\dst], \d_strd
        st1             {v6.8h, v7.8h}, [\ds2], \d_strd
.endif
        b.le            9f
        mov             v16.16b, v18.16b
        b               2b
9:
        subs            \w,  \w,  #16
        b.le            0f
        // Next column: undo the stride doubling, rewind dst by h rows and
        // src by h + 2 rows (the row loaded before the loop plus the
        // final look-ahead), then step 16 pixels to the right.
        asr             \s_strd, \s_strd, #1
        asr             \d_strd, \d_strd, #1
        msub            \src, \s_strd, \xmy, \src
        msub            \dst, \d_strd, \xmy, \dst
        sub             \src, \src, \s_strd, lsl #1
        mov             \h,  \my
        add             \src, \src, #16
.ifc \type, put
        add             \dst, \dst, #16
.else
        add             \dst, \dst, #32
.endif
        b               1b
0:
        ret

L(\type\()_bilin_v_tbl):
        .hword L(\type\()_bilin_v_tbl) - 1280b
        .hword L(\type\()_bilin_v_tbl) -  640b
        .hword L(\type\()_bilin_v_tbl) -  320b
        .hword L(\type\()_bilin_v_tbl) -  160b
        .hword L(\type\()_bilin_v_tbl) -   80b
        .hword L(\type\()_bilin_v_tbl) -   40b
        .hword L(\type\()_bilin_v_tbl) -   20b
        .hword 0

L(\type\()_bilin_hv):
        // The vertical pass multiplies 16 bit horizontal results, so the
        // vertical weights are widened to 16 bit lanes. The horizontal
        // result is at most 255 * 16 and the two vertical weights sum to
        // 16, so the products still fit in unsigned 16 bit.
        uxtl            v2.8h, v2.8b
        uxtl            v3.8h, v3.8b
        adr             x9,  L(\type\()_bilin_hv_tbl)
        ldrh            w8,  [x9, x8, lsl #1]
        sub             x9,  x9,  w8, uxtw
        br              x9

20:     // 2xN hv
        // Only assembled for put; for prep this label falls through into
        // the 4xN code below.
.ifc \type, put
        add             \sr2, \src, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        // horizontal pass for the very first row
        ld1             {v28.s}[0],  [\src], \s_strd
        ext             v29.8b, v28.8b, v28.8b, #1
        umull           v16.8h, v28.8b, v0.8b
        umlal           v16.8h, v29.8b, v1.8b

2:
        ld1             {v28.s}[0],  [\sr2], \s_strd
        ld1             {v30.s}[0],  [\src], \s_strd
        ext             v29.8b, v28.8b, v28.8b, #1
        ext             v31.8b, v30.8b, v30.8b, #1
        trn1            v28.4h, v28.4h, v30.4h
        trn1            v29.4h, v29.4h, v31.4h
        // v17 = horizontal results of rows n+1 and n+2
        umull           v17.8h, v28.8b, v0.8b
        umlal           v17.8h, v29.8b, v1.8b

        // v16 = horizontal results of rows n and n+1
        trn1            v16.2s, v16.2s, v17.2s

        mul             v4.4h,  v16.4h, v2.4h
        mla             v4.4h,  v17.4h, v3.4h
        uqrshrn         v4.8b,  v4.8h,  #8
        subs            \h,  \h,  #2
        st1             {v4.h}[0], [\dst], \d_strd
        st1             {v4.h}[1], [\ds2], \d_strd
        b.le            0f
        // row n+2 becomes row n of the next iteration
        trn2            v16.2s, v17.2s, v17.2s
        b               2b
0:
        ret
.endif

40:     // 4xN hv
        add             \sr2, \src, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        // horizontal pass for the very first row
        ld1             {v28.8b},  [\src], \s_strd
        ext             v29.8b, v28.8b, v28.8b, #1
        umull           v16.8h, v28.8b, v0.8b
        umlal           v16.8h, v29.8b, v1.8b

4:
        ld1             {v28.8b},  [\sr2], \s_strd
        ld1             {v30.8b},  [\src], \s_strd
        ext             v29.8b, v28.8b, v28.8b, #1
        ext             v31.8b, v30.8b, v30.8b, #1
        trn1            v28.2s, v28.2s, v30.2s
        trn1            v29.2s, v29.2s, v31.2s
        // v17 = horizontal results of rows n+1 and n+2
        umull           v17.8h, v28.8b, v0.8b
        umlal           v17.8h, v29.8b, v1.8b

        // v16 = horizontal results of rows n and n+1
        trn1            v16.2d, v16.2d, v17.2d

        mul             v4.8h,  v16.8h, v2.8h
        mla             v4.8h,  v17.8h, v3.8h
        subs            \h,  \h,  #2
.ifc \type, put
        uqrshrn         v4.8b,  v4.8h,  #8
        st1             {v4.s}[0], [\dst], \d_strd
        st1             {v4.s}[1], [\ds2], \d_strd
.else
        urshr           v4.8h,  v4.8h,  #4
        st1             {v4.d}[0], [\dst], \d_strd
        st1             {v4.d}[1], [\ds2], \d_strd
.endif
        b.le            0f
        // row n+2 becomes row n of the next iteration
        trn2            v16.2d, v17.2d, v17.2d
        b               4b
0:
        ret

80:     // 8xN, 16xN, ... hv
160:
320:
640:
1280:
        // Processed as 8 pixel wide columns; my keeps the full height for
        // restarting each column.
        mov             \my,  \h

1:
        add             \sr2, \src, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        // horizontal pass for the first row of this column
        ld1             {v28.16b},  [\src], \s_strd
        ext             v29.16b, v28.16b, v28.16b, #1
        umull           v16.8h, v28.8b, v0.8b
        umlal           v16.8h, v29.8b, v1.8b

2:
        ld1             {v28.16b},  [\sr2], \s_strd
        ld1             {v30.16b},  [\src], \s_strd
        ext             v29.16b, v28.16b, v28.16b, #1
        ext             v31.16b, v30.16b, v30.16b, #1
        // v17/v18 = horizontal results of rows n+1 and n+2
        umull           v17.8h, v28.8b, v0.8b
        umlal           v17.8h, v29.8b, v1.8b
        umull           v18.8h, v30.8b, v0.8b
        umlal           v18.8h, v31.8b, v1.8b

        // vertical pass: v4 from rows n/n+1, v5 from rows n+1/n+2
        mul             v4.8h,  v16.8h, v2.8h
        mla             v4.8h,  v17.8h, v3.8h
        mul             v5.8h,  v17.8h, v2.8h
        mla             v5.8h,  v18.8h, v3.8h
        subs            \h,  \h,  #2
.ifc \type, put
        uqrshrn         v4.8b,  v4.8h,  #8
        uqrshrn         v5.8b,  v5.8h,  #8
        st1             {v4.8b}, [\dst], \d_strd
        st1             {v5.8b}, [\ds2], \d_strd
.else
        urshr           v4.8h,  v4.8h,  #4
        urshr           v5.8h,  v5.8h,  #4
        st1             {v4.8h}, [\dst], \d_strd
        st1             {v5.8h}, [\ds2], \d_strd
.endif
        b.le            9f
        mov             v16.16b, v18.16b
        b               2b
9:
        subs            \w,  \w,  #8
        b.le            0f
        // Next column: undo the stride doubling, rewind dst by h rows and
        // src by h + 2 rows, then step 8 pixels to the right.
        asr             \s_strd,  \s_strd,  #1
        asr             \d_strd,  \d_strd,  #1
        msub            \src,  \s_strd,  \xmy,  \src
        msub            \dst,  \d_strd,  \xmy,  \dst
        sub             \src,  \src,  \s_strd,  lsl #1
        mov             \h,  \my
        add             \src,  \src,  #8
.ifc \type, put
        add             \dst,  \dst,  #8
.else
        add             \dst,  \dst,  #16
.endif
        b               1b
0:
        ret

L(\type\()_bilin_hv_tbl):
        .hword L(\type\()_bilin_hv_tbl) - 1280b
        .hword L(\type\()_bilin_hv_tbl) -  640b
        .hword L(\type\()_bilin_hv_tbl) -  320b
        .hword L(\type\()_bilin_hv_tbl) -  160b
        .hword L(\type\()_bilin_hv_tbl) -   80b
        .hword L(\type\()_bilin_hv_tbl) -   40b
        .hword L(\type\()_bilin_hv_tbl) -   20b
        .hword 0
endfunc
2916.endm
2917
// Instantiate the filter functions once per output type. The first argument
// selects the type (put or prep); the following ones are the registers
// substituted for the macro parameters, and the trailing number is the
// immediate used for the hv shift.
// NOTE(review): the parameter list of filter_fn is declared earlier in the
// file; confirm the positional mapping there before changing any register.
filter_fn put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10
filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6
2920
// Load one 8 byte filter row and advance the filter position.
//   dst  destination register for the 8 loaded bytes (callers pass d0-d7)
//   src  32 bit position register; the table index is src >> 10
//        (arithmetic shift, so the index may be negative)
//   inc  32 bit increment added to src
// Uses x11 as the table base and clobbers w13.
.macro load_filter_row dst, src, inc
        asr             w13, \src, #10
        // The position update does not depend on w13 and sits between the
        // index computation and the load that consumes it.
        add             \src, \src, \inc
        // 8 bytes per table entry: base + sign-extended index * 8
        ldr             \dst, [x11, w13, sxtw #3]
.endm
2926
// Horizontal 8-tap warp filter producing one row of 8 sums.
//   In:   x2   source row pointer, post-incremented by x3
//         x3   source stride
//         w5   filter position for this row, advanced by w8 on return
//         w7   position step from one output pixel to the next
//         w8   position step from one row to the next
//         x11  filter table base (consumed by load_filter_row)
//         v22  value XORed into every source byte (callers pass 0x80)
//   Out:  v0.8h  the eight filter sums, not yet rounded or shifted
//   Clobbers: w12, w13, v1-v7, v16-v20
function warp_filter_horz_neon
        // per-pixel position, starting from the row position plus 512
        add             w12, w5,  #512

        ld1             {v16.8b, v17.8b}, [x2], x3

        // filter rows for output pixels 0-6 go to d0-d6
.irpc i, 0123456
        load_filter_row d\i, w12, w7
.endr
        // flipping the top bit turns the unsigned pixels into signed
        // (pixel - 128) values, so that smull can be used
        eor             v16.8b,  v16.8b,  v22.8b
        eor             v17.8b,  v17.8b,  v22.8b
        // filter row for output pixel 7
        load_filter_row d7, w12, w7

        // Output pixel k multiplies its own 8 taps with the 8 source
        // pixels starting at offset k; ext builds each shifted window.
        ext             v18.8b,  v16.8b,  v17.8b,  #1
        ext             v19.8b,  v16.8b,  v17.8b,  #2
        smull           v0.8h,   v0.8b,   v16.8b
        smull           v1.8h,   v1.8b,   v18.8b
        ext             v18.8b,  v16.8b,  v17.8b,  #3
        ext             v20.8b,  v16.8b,  v17.8b,  #4
        smull           v2.8h,   v2.8b,   v19.8b
        smull           v3.8h,   v3.8b,   v18.8b
        ext             v18.8b,  v16.8b,  v17.8b,  #5
        ext             v19.8b,  v16.8b,  v17.8b,  #6
        smull           v4.8h,   v4.8b,   v20.8b
        smull           v5.8h,   v5.8b,   v18.8b
        ext             v18.8b,  v16.8b,  v17.8b,  #7
        smull           v6.8h,   v6.8b,   v19.8b
        smull           v7.8h,   v7.8b,   v18.8b

        // Three rounds of pairwise adds fold the 8 products of each
        // output pixel into one lane: 8x8 -> 4x8 -> 2x8 -> 1x8.
        addp            v0.8h,   v0.8h,   v1.8h
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v6.8h,   v6.8h,   v7.8h

        addp            v0.8h,   v0.8h,   v2.8h
        addp            v4.8h,   v4.8h,   v6.8h

        addp            v0.8h,   v0.8h,   v4.8h

        // advance the row position for the next call
        add             w5,  w5,  w8

        ret
endfunc
2974
// void dav1d_warp_affine_8x8_8bpc_neon(
//         pixel *dst, const ptrdiff_t dst_stride,
//         const pixel *src, const ptrdiff_t src_stride,
//         const int16_t *const abcd, int mx, int my)
//
// The macro takes a name suffix t and the final vertical shift. With an
// empty suffix the function narrows to 8 bit and stores 8 bytes per row.
// With a non-empty suffix it stores 8 halfwords per row and doubles x1 on
// entry, i.e. the stride argument is then counted in 16 bit elements.
//
// Register use inside the function:
//   x0/x1    output pointer / output stride
//   x2/x3    source pointer / source stride; x2 is first moved back by
//            three rows and three pixels
//   x7, x8, x9, x4   abcd[0], abcd[1], abcd[2], abcd[3], sign-extended from
//            the single 64 bit load of the array
//   w5, w6   mx, my
//   w10      remaining output rows (8)
//   x11      mc_warp_filter base with the 64*8 argument passed to movrel
//            (NOTE(review): movrel is defined elsewhere; presumably the
//            third argument is a byte offset so that negative filter
//            indices stay inside the table - confirm)
//   x15      copy of the return address, since bl overwrites x30
//   v22      0x80 per byte, used by warp_filter_horz_neon
//   v23      constant added to every output value after the final shift
//            (128 for the 8 bit variant, 8 << 8 for the t variant)
//   v24-v31  the eight most recent horizontally filtered rows
.macro warp t, shift
function warp_affine_8x8\t\()_8bpc_neon, export=1
        ldr             x4,  [x4]
        sbfx            x7,  x4, #0,  #16
        sbfx            x8,  x4, #16, #16
        sbfx            x9,  x4, #32, #16
        sbfx            x4,  x4, #48, #16
        mov             w10, #8
        // src -= 3 * src_stride + 3
        sub             x2,  x2,  x3, lsl #1
        sub             x2,  x2,  x3
        sub             x2,  x2,  #3
        movrel          x11, X(mc_warp_filter), 64*8
        mov             x15, x30
.ifnb \t
        lsl             x1,  x1,  #1
.endif

        movi            v22.8b,  #128
.ifb \t
        movi            v23.8h,  #128
.else
        movi            v23.8h,  #8, lsl #8
.endif

        // Prime the vertical filter with the first seven rows; each
        // horizontal result is rounded and shifted right by 3.
        bl              warp_filter_horz_neon
        srshr           v24.8h,  v0.8h,  #3
        bl              warp_filter_horz_neon
        srshr           v25.8h,  v0.8h,  #3
        bl              warp_filter_horz_neon
        srshr           v26.8h,  v0.8h,  #3
        bl              warp_filter_horz_neon
        srshr           v27.8h,  v0.8h,  #3
        bl              warp_filter_horz_neon
        srshr           v28.8h,  v0.8h,  #3
        bl              warp_filter_horz_neon
        srshr           v29.8h,  v0.8h,  #3
        bl              warp_filter_horz_neon
        srshr           v30.8h,  v0.8h,  #3

1:
        // per-column vertical position for this row, stepped by abcd[2]
        add             w14, w6,  #512
        bl              warp_filter_horz_neon
        srshr           v31.8h,  v0.8h,  #3

        // one 8 tap filter per output column
        load_filter_row d0, w14, w9
        load_filter_row d1, w14, w9
        load_filter_row d2, w14, w9
        load_filter_row d3, w14, w9
        load_filter_row d4, w14, w9
        load_filter_row d5, w14, w9
        load_filter_row d6, w14, w9
        load_filter_row d7, w14, w9
        // NOTE(review): transpose_8x8b_xtl is defined elsewhere; from its
        // use below it leaves tap k of all eight columns in vk as 16 bit.
        transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl

        // This ordering of smull/smlal/smull2/smlal2 is highly
        // beneficial for Cortex A53 here.
        smull           v16.4s,  v24.4h,  v0.4h
        smlal           v16.4s,  v25.4h,  v1.4h
        smlal           v16.4s,  v26.4h,  v2.4h
        smlal           v16.4s,  v27.4h,  v3.4h
        smlal           v16.4s,  v28.4h,  v4.4h
        smlal           v16.4s,  v29.4h,  v5.4h
        smlal           v16.4s,  v30.4h,  v6.4h
        smlal           v16.4s,  v31.4h,  v7.4h
        smull2          v17.4s,  v24.8h,  v0.8h
        smlal2          v17.4s,  v25.8h,  v1.8h
        smlal2          v17.4s,  v26.8h,  v2.8h
        smlal2          v17.4s,  v27.8h,  v3.8h
        smlal2          v17.4s,  v28.8h,  v4.8h
        smlal2          v17.4s,  v29.8h,  v5.8h
        smlal2          v17.4s,  v30.8h,  v6.8h
        smlal2          v17.4s,  v31.8h,  v7.8h

        // Slide the row window up by one, interleaved with the narrowing
        // and store of the finished row.
        mov             v24.16b, v25.16b
        mov             v25.16b, v26.16b
        sqrshrn         v16.4h,  v16.4s,  #\shift
        mov             v26.16b, v27.16b
        sqrshrn2        v16.8h,  v17.4s,  #\shift
        mov             v27.16b, v28.16b
        mov             v28.16b, v29.16b
        add             v16.8h,  v16.8h,  v23.8h
.ifb \t
        sqxtun          v16.8b,  v16.8h
.endif
        mov             v29.16b, v30.16b
        mov             v30.16b, v31.16b
        subs            w10, w10, #1
.ifnb \t
        st1             {v16.8h}, [x0], x1
.else
        st1             {v16.8b}, [x0], x1
.endif

        // my += abcd[3]; add and st1 leave the flags from subs untouched
        add             w6,  w6,  w4
        b.gt            1b

        // Return through the saved link register. ret branches to the same
        // address as br would, but is encoded as a subroutine return, so
        // it is treated as one by return prediction and by branch target
        // checks.
        ret             x15
endfunc
.endm
3078
// Emit both warp functions:
//   empty suffix, shift 11: warp_affine_8x8_8bpc_neon, 8 bit output
//   suffix t,     shift 7:  warp_affine_8x8t_8bpc_neon, 16 bit output
warp  , 11
warp t, 7
3081
3082// void dav1d_emu_edge_8bpc_neon(
3083//         const intptr_t bw, const intptr_t bh,
3084//         const intptr_t iw, const intptr_t ih,
3085//         const intptr_t x, const intptr_t y,
3086//         pixel *dst, const ptrdiff_t dst_stride,
3087//         const pixel *ref, const ptrdiff_t ref_stride)
function emu_edge_8bpc_neon, export=1
        // Builds a bw x bh block at dst from the picture at ref, replicating
        // edge pixels wherever the requested block lies outside the picture.
        //
        // Register roles, as used by the arithmetic below:
        //   x0 = bw      x1 = bh      x2 = iw      x3 = ih
        //   x4 = x       x5 = y       x6 = dst     x7 = dst_stride
        //   [sp] = ref   [sp, #8] = ref_stride
        ldp             x8,  x9,  [sp]         // x8 = ref, x9 = ref_stride

        // Move ref to the clamped source position:
        //   ref += iclip(y, 0, ih - 1) * ref_stride + iclip(x, 0, iw - 1)
        sub             x12, x3,  #1           // ih - 1
        cmp             x5,  x3
        csel            x12, x5,  x12, lt      // y < ih ? y : ih - 1
        bic             x12, x12, x12, asr #63 // negative -> 0
        sub             x13, x2,  #1           // iw - 1
        cmp             x4,  x2
        csel            x13, x4,  x13, lt      // x < iw ? x : iw - 1
        bic             x13, x13, x13, asr #63 // negative -> 0
        madd            x8,  x12, x9,  x8      // ref += row * ref_stride
        add             x8,  x8,  x13          // ref += column

        // Vertical padding, each limited to [0, bh - 1]:
        //   x10 = bottom_ext = iclip(y + bh - ih, 0, bh - 1)
        //   x5  = top_ext    = iclip(-y, 0, bh - 1)
        sub             x12, x1,  #1           // bh - 1
        add             x10, x5,  x1           // y + bh
        sub             x10, x10, x3           // y + bh - ih
        cmp             x10, x1
        csel            x10, x12, x10, ge      // cap at bh - 1
        bic             x10, x10, x10, asr #63 // negative -> 0
        neg             x5,  x5                // -y
        bic             x5,  x5,  x5,  asr #63 // max(-y, 0)
        cmp             x5,  x1
        csel            x5,  x12, x5,  ge      // cap at bh - 1

        // Horizontal padding, each limited to [0, bw - 1]:
        //   x11 = right_ext = iclip(x + bw - iw, 0, bw - 1)
        //   x4  = left_ext  = iclip(-x, 0, bw - 1)
        sub             x13, x0,  #1           // bw - 1
        add             x11, x4,  x0           // x + bw
        sub             x11, x11, x2           // x + bw - iw
        cmp             x11, x0
        csel            x11, x13, x11, ge      // cap at bw - 1
        bic             x11, x11, x11, asr #63 // negative -> 0
        neg             x4,  x4                // -x
        bic             x4,  x4,  x4,  asr #63 // max(-x, 0)
        cmp             x4,  x0
        csel            x4,  x13, x4,  ge      // cap at bw - 1

        // Size of the region that is copied straight from ref, and the
        // address of its first row in dst.
        madd            x6,  x5,  x7,  x6      // dst += top_ext * dst_stride
        add             x12, x5,  x10          // top_ext + bottom_ext
        add             x13, x4,  x11          // left_ext + right_ext
        sub             x1,  x1,  x12          // center_h = bh - top_ext - bottom_ext
        sub             x2,  x0,  x13          // center_w = bw - left_ext - right_ext
        mov             x14, x6                // keep the first center row for the top fill

        // One pass over the center rows. For each row: optionally replicate
        // the first source pixel over the left padding, copy the center, and
        // optionally replicate the last center pixel over the right padding.
        // The fills store 16 bytes and the copy moves 32 bytes per iteration,
        // so each step may run past its nominal width; the following step
        // rewrites the overlap.
        // NOTE(review): the overrun of the last step presumably relies on
        // slack in the buffers - confirm against the callers.
        // Clobbers x3, x12, x13, v0, v1; consumes x1 and advances x6/x8 by
        // one row per iteration. The body always runs at least once.
.macro v_loop left, right
0:
.if \left
        ld1r            {v0.16b}, [x8]         // splat in[0]
        mov             x3,  x4                // bytes left = left_ext
        mov             x12, x6                // out = dst
1:
        subs            x3,  x3,  #16
        st1             {v0.16b}, [x12], #16
        b.gt            1b
.endif
        add             x12, x6,  x4           // out = dst + left_ext
        mov             x13, x8                // in = ref
        mov             x3,  x2                // bytes left = center_w
1:
        ld1             {v0.16b, v1.16b}, [x13], #32
        subs            x3,  x3,  #32
        st1             {v0.16b, v1.16b}, [x12], #32
        b.gt            1b
.if \right
        sub             x3,  x2,  #1           // center_w - 1
        add             x12, x6,  x4           // dst + left_ext
        add             x3,  x8,  x3           // &in[center_w - 1]
        ld1r            {v0.16b}, [x3]         // splat the last center pixel
        add             x12, x12, x2           // out = dst + left_ext + center_w
        mov             x3,  x11               // bytes left = right_ext
1:
        subs            x3,  x3,  #16
        st1             {v0.16b}, [x12], #16
        b.gt            1b
.endif

        subs            x1,  x1,  #1           // one center row done
        add             x6,  x6,  x7           // next dst row
        add             x8,  x8,  x9           // next ref row
        b.gt            0b
.endm

        // Select the row-loop variant that matches the required padding.
        cbnz            x4,  7f
        cbnz            x11, 6f
        // no left padding, no right padding
        v_loop          0,   0
        b               9f

6:
        // right padding only
        v_loop          0,   1
        b               9f

7:
        cbnz            x11, 8f
        // left padding only
        v_loop          1,   0
        b               9f

8:
        // left and right padding
        v_loop          1,   1

9:
        // x6 now points at the row right after the center region.
        cbz             x10, 4f
        // Bottom padding: replicate the last center row downwards, handling
        // the block in strips of 32 columns.
        sub             x8,  x6,  x7           // in = last center row
        mov             x4,  x0                // columns left = bw
1:
        ld1             {v0.16b, v1.16b}, [x8], #32
        mov             x12, x6                // out = first bottom row of this strip
        mov             x3,  x10               // rows left = bottom_ext
2:
        subs            x3,  x3,  #1
        st1             {v0.16b, v1.16b}, [x12], x7
        b.gt            2b
        subs            x4,  x4,  #32          // 32 columns done
        add             x6,  x6,  #32          // next strip
        b.gt            1b

4:
        cbz             x5,  5f
        // Top padding: replicate the first center row upwards, again in
        // strips of 32 columns.
        msub            x6,  x7,  x5,  x14     // first top row = saved dst - top_ext * dst_stride
1:
        ld1             {v0.16b, v1.16b}, [x14], #32
        mov             x12, x6                // out = first top row of this strip
        mov             x3,  x5                // rows left = top_ext
2:
        subs            x3,  x3,  #1
        st1             {v0.16b, v1.16b}, [x12], x7
        b.gt            2b
        subs            x0,  x0,  #32          // 32 columns done
        add             x6,  x6,  #32          // next strip
        b.gt            1b

5:
        ret
endfunc
3239