1/*
2 * Copyright © 2018, VideoLAN and dav1d authors
3 * Copyright © 2018, Janne Grunau
4 * Copyright © 2020, Martin Storsjo
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice, this
11 *    list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 *    this list of conditions and the following disclaimer in the documentation
15 *    and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
21 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#include "src/arm/asm.S"
30#include "util.S"
31
32#define PREP_BIAS 8192
33
34.macro avg d0, d1, t0, t1, t2, t3
35        ld1             {\t0\().8h,\t1\().8h},  [x2],  32
36        ld1             {\t2\().8h,\t3\().8h},  [x3],  32
37        sqadd           \t0\().8h,  \t0\().8h,  \t2\().8h
38        sqadd           \t1\().8h,  \t1\().8h,  \t3\().8h
39        smax            \t0\().8h,  \t0\().8h,  v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
40        smax            \t1\().8h,  \t1\().8h,  v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
41        sqsub           \t0\().8h,  \t0\().8h,  v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
42        sqsub           \t1\().8h,  \t1\().8h,  v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
43        sshl            \d0\().8h,  \t0\().8h,  v29.8h // -(intermediate_bits+1)
44        sshl            \d1\().8h,  \t1\().8h,  v29.8h // -(intermediate_bits+1)
45.endm
46
47.macro w_avg d0, d1, t0, t1, t2, t3
48        ld1             {\t0\().8h,\t1\().8h},  [x2],  32
49        ld1             {\t2\().8h,\t3\().8h},  [x3],  32
50        // This difference requires a 17 bit range, and all bits are
51        // significant for the following multiplication.
52        ssubl           \d0\().4s,  \t2\().4h,  \t0\().4h
53        ssubl2          \t0\().4s,  \t2\().8h,  \t0\().8h
54        ssubl           \d1\().4s,  \t3\().4h,  \t1\().4h
55        ssubl2          \t1\().4s,  \t3\().8h,  \t1\().8h
56        mul             \d0\().4s,  \d0\().4s,  v27.4s
57        mul             \t0\().4s,  \t0\().4s,  v27.4s
58        mul             \d1\().4s,  \d1\().4s,  v27.4s
59        mul             \t1\().4s,  \t1\().4s,  v27.4s
60        sshr            \d0\().4s,  \d0\().4s,  #4
61        sshr            \t0\().4s,  \t0\().4s,  #4
62        sshr            \d1\().4s,  \d1\().4s,  #4
63        sshr            \t1\().4s,  \t1\().4s,  #4
64        saddw           \d0\().4s,  \d0\().4s,  \t2\().4h
65        saddw2          \t0\().4s,  \t0\().4s,  \t2\().8h
66        saddw           \d1\().4s,  \d1\().4s,  \t3\().4h
67        saddw2          \t1\().4s,  \t1\().4s,  \t3\().8h
68        xtn             \d0\().4h,  \d0\().4s
69        xtn2            \d0\().8h,  \t0\().4s
70        xtn             \d1\().4h,  \d1\().4s
71        xtn2            \d1\().8h,  \t1\().4s
72        srshl           \d0\().8h,  \d0\().8h,  v29.8h // -intermediate_bits
73        srshl           \d1\().8h,  \d1\().8h,  v29.8h // -intermediate_bits
74        add             \d0\().8h,  \d0\().8h,  v28.8h // PREP_BIAS >> intermediate_bits
75        add             \d1\().8h,  \d1\().8h,  v28.8h // PREP_BIAS >> intermediate_bits
76        smin            \d0\().8h,  \d0\().8h,  v31.8h // bitdepth_max
77        smin            \d1\().8h,  \d1\().8h,  v31.8h // bitdepth_max
78        smax            \d0\().8h,  \d0\().8h,  v30.8h // 0
79        smax            \d1\().8h,  \d1\().8h,  v30.8h // 0
80.endm
81
82.macro mask d0, d1, t0, t1, t2, t3
83        ld1             {v27.16b}, [x6],  16
84        ld1             {\t0\().8h,\t1\().8h},  [x2],  32
85        neg             v27.16b, v27.16b
86        ld1             {\t2\().8h,\t3\().8h},  [x3],  32
87        sxtl            v26.8h,  v27.8b
88        sxtl2           v27.8h,  v27.16b
89        sxtl            v24.4s,  v26.4h
90        sxtl2           v25.4s,  v26.8h
91        sxtl            v26.4s,  v27.4h
92        sxtl2           v27.4s,  v27.8h
93        ssubl           \d0\().4s,  \t2\().4h,  \t0\().4h
94        ssubl2          \t0\().4s,  \t2\().8h,  \t0\().8h
95        ssubl           \d1\().4s,  \t3\().4h,  \t1\().4h
96        ssubl2          \t1\().4s,  \t3\().8h,  \t1\().8h
97        mul             \d0\().4s,  \d0\().4s,  v24.4s
98        mul             \t0\().4s,  \t0\().4s,  v25.4s
99        mul             \d1\().4s,  \d1\().4s,  v26.4s
100        mul             \t1\().4s,  \t1\().4s,  v27.4s
101        sshr            \d0\().4s,  \d0\().4s,  #6
102        sshr            \t0\().4s,  \t0\().4s,  #6
103        sshr            \d1\().4s,  \d1\().4s,  #6
104        sshr            \t1\().4s,  \t1\().4s,  #6
105        saddw           \d0\().4s,  \d0\().4s,  \t2\().4h
106        saddw2          \t0\().4s,  \t0\().4s,  \t2\().8h
107        saddw           \d1\().4s,  \d1\().4s,  \t3\().4h
108        saddw2          \t1\().4s,  \t1\().4s,  \t3\().8h
109        xtn             \d0\().4h,  \d0\().4s
110        xtn2            \d0\().8h,  \t0\().4s
111        xtn             \d1\().4h,  \d1\().4s
112        xtn2            \d1\().8h,  \t1\().4s
113        srshl           \d0\().8h,  \d0\().8h,  v29.8h // -intermediate_bits
114        srshl           \d1\().8h,  \d1\().8h,  v29.8h // -intermediate_bits
115        add             \d0\().8h,  \d0\().8h,  v28.8h // PREP_BIAS >> intermediate_bits
116        add             \d1\().8h,  \d1\().8h,  v28.8h // PREP_BIAS >> intermediate_bits
117        smin            \d0\().8h,  \d0\().8h,  v31.8h // bitdepth_max
118        smin            \d1\().8h,  \d1\().8h,  v31.8h // bitdepth_max
119        smax            \d0\().8h,  \d0\().8h,  v30.8h // 0
120        smax            \d1\().8h,  \d1\().8h,  v30.8h // 0
121.endm
122
123.macro bidir_fn type, bdmax
124function \type\()_16bpc_neon, export=1
125        clz             w4,  w4
126.ifnc \type, avg
127        dup             v31.8h,  \bdmax // bitdepth_max
128        movi            v30.8h,  #0
129.endif
130        clz             w7,  \bdmax
131        sub             w7,  w7,  #18   // intermediate_bits = clz(bitdepth_max) - 18
132.ifc \type, avg
133        mov             w9,  #1
134        mov             w8,  #-2*PREP_BIAS
135        lsl             w9,  w9,  w7    // 1 << intermediate_bits
136        add             w7,  w7,  #1
137        sub             w8,  w8,  w9    // -2*PREP_BIAS - 1 << intermediate_bits
138        neg             w7,  w7         // -(intermediate_bits+1)
139        dup             v28.8h,   w8    // -2*PREP_BIAS - 1 << intermediate_bits
140        dup             v29.8h,   w7    // -(intermediate_bits+1)
141.else
142        mov             w8,  #PREP_BIAS
143        lsr             w8,  w8,  w7    // PREP_BIAS >> intermediate_bits
144        neg             w7,  w7         // -intermediate_bits
145        dup             v28.8h,  w8     // PREP_BIAS >> intermediate_bits
146        dup             v29.8h,  w7     // -intermediate_bits
147.endif
148.ifc \type, w_avg
149        dup             v27.4s,  w6
150        neg             v27.4s,  v27.4s
151.endif
152        adr             x7,  L(\type\()_tbl)
153        sub             w4,  w4,  #24
154        \type           v4,  v5,  v0,  v1,  v2,  v3
155        ldrh            w4,  [x7, x4, lsl #1]
156        sub             x7,  x7,  w4, uxtw
157        br              x7
15840:
159        add             x7,  x0,  x1
160        lsl             x1,  x1,  #1
1614:
162        subs            w5,  w5,  #4
163        st1             {v4.d}[0],  [x0], x1
164        st1             {v4.d}[1],  [x7], x1
165        st1             {v5.d}[0],  [x0], x1
166        st1             {v5.d}[1],  [x7], x1
167        b.le            0f
168        \type           v4,  v5,  v0,  v1,  v2,  v3
169        b               4b
17080:
171        add             x7,  x0,  x1
172        lsl             x1,  x1,  #1
1738:
174        st1             {v4.8h},  [x0], x1
175        subs            w5,  w5,  #2
176        st1             {v5.8h},  [x7], x1
177        b.le            0f
178        \type           v4,  v5,  v0,  v1,  v2,  v3
179        b               8b
18016:
181        \type           v6,  v7,  v0,  v1,  v2,  v3
182        st1             {v4.8h, v5.8h}, [x0], x1
183        subs            w5,  w5,  #2
184        st1             {v6.8h, v7.8h}, [x0], x1
185        b.le            0f
186        \type           v4,  v5,  v0,  v1,  v2,  v3
187        b               16b
18832:
189        \type           v6,  v7,  v0,  v1,  v2,  v3
190        subs            w5,  w5,  #1
191        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x1
192        b.le            0f
193        \type           v4,  v5,  v0,  v1,  v2,  v3
194        b               32b
195640:
196        add             x7,  x0,  #64
19764:
198        \type           v6,  v7,  v0,  v1,  v2,  v3
199        \type           v16, v17, v0,  v1,  v2,  v3
200        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x1
201        \type           v18, v19, v0,  v1,  v2,  v3
202        subs            w5,  w5,  #1
203        st1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1
204        b.le            0f
205        \type           v4,  v5,  v0,  v1,  v2,  v3
206        b               64b
2071280:
208        add             x7,  x0,  #64
209        mov             x8,  #128
210        sub             x1,  x1,  #128
211128:
212        \type           v6,  v7,  v0,  v1,  v2,  v3
213        \type           v16, v17, v0,  v1,  v2,  v3
214        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x8
215        \type           v18, v19, v0,  v1,  v2,  v3
216        st1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x8
217        \type           v4,  v5,  v0,  v1,  v2,  v3
218        \type           v6,  v7,  v0,  v1,  v2,  v3
219        \type           v16, v17, v0,  v1,  v2,  v3
220        subs            w5,  w5,  #1
221        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x1
222        \type           v18, v19, v0,  v1,  v2,  v3
223        st1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1
224        b.le            0f
225        \type           v4,  v5,  v0,  v1,  v2,  v3
226        b               128b
2270:
228        ret
229L(\type\()_tbl):
230        .hword L(\type\()_tbl) - 1280b
231        .hword L(\type\()_tbl) -  640b
232        .hword L(\type\()_tbl) -   32b
233        .hword L(\type\()_tbl) -   16b
234        .hword L(\type\()_tbl) -   80b
235        .hword L(\type\()_tbl) -   40b
236endfunc
237.endm
238
239bidir_fn avg, w6
240bidir_fn w_avg, w7
241bidir_fn mask, w7
242
243
244.macro w_mask_fn type
245function w_mask_\type\()_16bpc_neon, export=1
246        ldr             w8,  [sp]
247        clz             w9,  w4
248        adr             x10, L(w_mask_\type\()_tbl)
249        dup             v31.8h,  w8   // bitdepth_max
250        sub             w9,  w9,  #24
251        clz             w8,  w8       // clz(bitdepth_max)
252        ldrh            w9,  [x10,  x9,  lsl #1]
253        sub             x10, x10, w9,  uxtw
254        sub             w8,  w8,  #12 // sh = intermediate_bits + 6 = clz(bitdepth_max) - 12
255        mov             w9,  #PREP_BIAS*64
256        neg             w8,  w8       // -sh
257        mov             w11, #27615   // (64 + 1 - 38)<<mask_sh - 1 - mask_rnd
258        dup             v30.4s,  w9   // PREP_BIAS*64
259        dup             v29.4s,  w8   // -sh
260        dup             v0.8h,   w11
261.if \type == 444
262        movi            v1.16b,  #64
263.elseif \type == 422
264        dup             v2.8b,   w7
265        movi            v3.8b,   #129
266        sub             v3.8b,   v3.8b,   v2.8b
267.elseif \type == 420
268        dup             v2.8h,   w7
269        movi            v3.8h,   #1, lsl #8
270        sub             v3.8h,   v3.8h,   v2.8h
271.endif
272        add             x12,  x0,  x1
273        lsl             x1,   x1,  #1
274        br              x10
2754:
276        ld1             {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once)
277        ld1             {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once)
278        subs            w5,  w5,  #4
279        sabd            v20.8h,  v4.8h,   v6.8h   // abs(tmp1 - tmp2)
280        sabd            v21.8h,  v5.8h,   v7.8h
281        ssubl           v16.4s,  v6.4h,   v4.4h   // tmp2 - tmp1 (requires 17 bit)
282        ssubl2          v17.4s,  v6.8h,   v4.8h
283        ssubl           v18.4s,  v7.4h,   v5.4h
284        ssubl2          v19.4s,  v7.8h,   v5.8h
285        uqsub           v20.8h,  v0.8h,   v20.8h  // 27615 - abs()
286        uqsub           v21.8h,  v0.8h,   v21.8h
287        sshll2          v7.4s,   v5.8h,   #6      // tmp1 << 6
288        sshll           v6.4s,   v5.4h,   #6
289        sshll2          v5.4s,   v4.8h,   #6
290        sshll           v4.4s,   v4.4h,   #6
291        ushr            v20.8h,  v20.8h,  #10     // 64-m = (27615 - abs()) >> mask_sh
292        ushr            v21.8h,  v21.8h,  #10
293        add             v4.4s,   v4.4s,   v30.4s  // += PREP_BIAS*64
294        add             v5.4s,   v5.4s,   v30.4s
295        add             v6.4s,   v6.4s,   v30.4s
296        add             v7.4s,   v7.4s,   v30.4s
297        uxtl            v22.4s,  v20.4h
298        uxtl2           v23.4s,  v20.8h
299        uxtl            v24.4s,  v21.4h
300        uxtl2           v25.4s,  v21.8h
301        mla             v4.4s,   v16.4s,  v22.4s  // (tmp2-tmp1)*(64-m)
302        mla             v5.4s,   v17.4s,  v23.4s
303        mla             v6.4s,   v18.4s,  v24.4s
304        mla             v7.4s,   v19.4s,  v25.4s
305        srshl           v4.4s,   v4.4s,   v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
306        srshl           v5.4s,   v5.4s,   v29.4s
307        srshl           v6.4s,   v6.4s,   v29.4s
308        srshl           v7.4s,   v7.4s,   v29.4s
309        sqxtun          v4.4h,   v4.4s            // iclip_pixel
310        sqxtun2         v4.8h,   v5.4s
311        sqxtun          v5.4h,   v6.4s
312        sqxtun2         v5.8h,   v7.4s
313        umin            v4.8h,   v4.8h,   v31.8h  // iclip_pixel
314        umin            v5.8h,   v5.8h,   v31.8h
315.if \type == 444
316        xtn             v20.8b,  v20.8h           // 64 - m
317        xtn2            v20.16b, v21.8h
318        sub             v20.16b, v1.16b,  v20.16b // m
319        st1             {v20.16b}, [x6], #16
320.elseif \type == 422
321        addp            v20.8h,  v20.8h,  v21.8h  // (64 - m) + (64 - n) (column wise addition)
322        xtn             v20.8b,  v20.8h
323        uhsub           v20.8b,  v3.8b,   v20.8b  // ((129 - sign) - ((64 - m) + (64 - n)) >> 1
324        st1             {v20.8b}, [x6], #8
325.elseif \type == 420
326        trn1            v24.2d,  v20.2d,  v21.2d
327        trn2            v25.2d,  v20.2d,  v21.2d
328        add             v24.8h,  v24.8h,  v25.8h  // (64 - my1) + (64 - my2) (row wise addition)
329        addp            v20.8h,  v24.8h,  v24.8h  // (128 - m) + (128 - n) (column wise addition)
330        sub             v20.4h,  v3.4h,   v20.4h  // (256 - sign) - ((128 - m) + (128 - n))
331        rshrn           v20.8b,  v20.8h,  #2      // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
332        st1             {v20.s}[0], [x6], #4
333.endif
334        st1             {v4.d}[0],  [x0],  x1
335        st1             {v4.d}[1],  [x12], x1
336        st1             {v5.d}[0],  [x0],  x1
337        st1             {v5.d}[1],  [x12], x1
338        b.gt            4b
339        ret
3408:
341        ld1             {v4.8h, v5.8h}, [x2], #32 // tmp1
342        ld1             {v6.8h, v7.8h}, [x3], #32 // tmp2
343        subs            w5,  w5,  #2
344        sabd            v20.8h,  v4.8h,   v6.8h   // abs(tmp1 - tmp2)
345        sabd            v21.8h,  v5.8h,   v7.8h
346        ssubl           v16.4s,  v6.4h,   v4.4h   // tmp2 - tmp1 (requires 17 bit)
347        ssubl2          v17.4s,  v6.8h,   v4.8h
348        ssubl           v18.4s,  v7.4h,   v5.4h
349        ssubl2          v19.4s,  v7.8h,   v5.8h
350        uqsub           v20.8h,  v0.8h,   v20.8h  // 27615 - abs()
351        uqsub           v21.8h,  v0.8h,   v21.8h
352        sshll2          v7.4s,   v5.8h,   #6      // tmp1 << 6
353        sshll           v6.4s,   v5.4h,   #6
354        sshll2          v5.4s,   v4.8h,   #6
355        sshll           v4.4s,   v4.4h,   #6
356        ushr            v20.8h,  v20.8h,  #10     // 64-m = (27615 - abs()) >> mask_sh
357        ushr            v21.8h,  v21.8h,  #10
358        add             v4.4s,   v4.4s,   v30.4s  // += PREP_BIAS*64
359        add             v5.4s,   v5.4s,   v30.4s
360        add             v6.4s,   v6.4s,   v30.4s
361        add             v7.4s,   v7.4s,   v30.4s
362        uxtl            v22.4s,  v20.4h
363        uxtl2           v23.4s,  v20.8h
364        uxtl            v24.4s,  v21.4h
365        uxtl2           v25.4s,  v21.8h
366        mla             v4.4s,   v16.4s,  v22.4s  // (tmp2-tmp1)*(64-m)
367        mla             v5.4s,   v17.4s,  v23.4s
368        mla             v6.4s,   v18.4s,  v24.4s
369        mla             v7.4s,   v19.4s,  v25.4s
370        srshl           v4.4s,   v4.4s,   v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
371        srshl           v5.4s,   v5.4s,   v29.4s
372        srshl           v6.4s,   v6.4s,   v29.4s
373        srshl           v7.4s,   v7.4s,   v29.4s
374        sqxtun          v4.4h,   v4.4s            // iclip_pixel
375        sqxtun2         v4.8h,   v5.4s
376        sqxtun          v5.4h,   v6.4s
377        sqxtun2         v5.8h,   v7.4s
378        umin            v4.8h,   v4.8h,   v31.8h  // iclip_pixel
379        umin            v5.8h,   v5.8h,   v31.8h
380.if \type == 444
381        xtn             v20.8b,  v20.8h           // 64 - m
382        xtn2            v20.16b, v21.8h
383        sub             v20.16b, v1.16b,  v20.16b // m
384        st1             {v20.16b}, [x6], #16
385.elseif \type == 422
386        addp            v20.8h,  v20.8h,  v21.8h  // (64 - m) + (64 - n) (column wise addition)
387        xtn             v20.8b,  v20.8h
388        uhsub           v20.8b,  v3.8b,   v20.8b  // ((129 - sign) - ((64 - m) + (64 - n)) >> 1
389        st1             {v20.8b}, [x6], #8
390.elseif \type == 420
391        add             v20.8h,  v20.8h,  v21.8h  // (64 - my1) + (64 - my2) (row wise addition)
392        addp            v20.8h,  v20.8h,  v20.8h  // (128 - m) + (128 - n) (column wise addition)
393        sub             v20.4h,  v3.4h,   v20.4h  // (256 - sign) - ((128 - m) + (128 - n))
394        rshrn           v20.8b,  v20.8h,  #2      // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
395        st1             {v20.s}[0], [x6], #4
396.endif
397        st1             {v4.8h}, [x0],  x1
398        st1             {v5.8h}, [x12], x1
399        b.gt            8b
400        ret
4011280:
402640:
403320:
404160:
405        mov             w11, w4
406        sub             x1,  x1,  w4,  uxtw #1
407.if \type == 444
408        add             x10, x6,  w4,  uxtw
409.elseif \type == 422
410        add             x10, x6,  x11, lsr #1
411.endif
412        add             x9,  x3,  w4,  uxtw #1
413        add             x7,  x2,  w4,  uxtw #1
414161:
415        mov             w8,  w4
41616:
417        ld1             {v4.8h,   v5.8h},  [x2], #32 // tmp1
418        ld1             {v16.8h,  v17.8h}, [x3], #32 // tmp2
419        ld1             {v6.8h,   v7.8h},  [x7], #32
420        ld1             {v18.8h,  v19.8h}, [x9], #32
421        subs            w8,  w8,  #16
422        sabd            v20.8h,  v4.8h,   v16.8h  // abs(tmp1 - tmp2)
423        sabd            v21.8h,  v5.8h,   v17.8h
424        ssubl           v22.4s,  v16.4h,  v4.4h   // tmp2 - tmp1 (requires 17 bit)
425        ssubl2          v23.4s,  v16.8h,  v4.8h
426        ssubl           v24.4s,  v17.4h,  v5.4h
427        ssubl2          v25.4s,  v17.8h,  v5.8h
428        uqsub           v20.8h,  v0.8h,   v20.8h  // 27615 - abs()
429        uqsub           v21.8h,  v0.8h,   v21.8h
430        sshll2          v27.4s,  v5.8h,   #6      // tmp1 << 6
431        sshll           v26.4s,  v5.4h,   #6
432        sshll2          v5.4s,   v4.8h,   #6
433        sshll           v4.4s,   v4.4h,   #6
434        ushr            v20.8h,  v20.8h,  #10     // 64-m = (27615 - abs()) >> mask_sh
435        ushr            v21.8h,  v21.8h,  #10
436        add             v4.4s,   v4.4s,   v30.4s  // += PREP_BIAS*64
437        add             v5.4s,   v5.4s,   v30.4s
438        add             v26.4s,  v26.4s,  v30.4s
439        add             v27.4s,  v27.4s,  v30.4s
440        uxtl            v16.4s,  v20.4h
441        uxtl2           v17.4s,  v20.8h
442        uxtl            v28.4s,  v21.4h
443        mla             v4.4s,   v22.4s,  v16.4s  // (tmp2-tmp1)*(64-m)
444        uxtl2           v16.4s,  v21.8h
445        mla             v5.4s,   v23.4s,  v17.4s
446        mla             v26.4s,  v24.4s,  v28.4s
447        mla             v27.4s,  v25.4s,  v16.4s
448        srshl           v4.4s,   v4.4s,   v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
449        srshl           v5.4s,   v5.4s,   v29.4s
450        srshl           v26.4s,  v26.4s,  v29.4s
451        srshl           v27.4s,  v27.4s,  v29.4s
452        sqxtun          v4.4h,   v4.4s            // iclip_pixel
453        sqxtun2         v4.8h,   v5.4s
454        sqxtun          v5.4h,   v26.4s
455        sqxtun2         v5.8h,   v27.4s
456
457        // Start of other half
458        sabd            v22.8h,  v6.8h,   v18.8h  // abs(tmp1 - tmp2)
459        sabd            v23.8h,  v7.8h,   v19.8h
460
461        umin            v4.8h,   v4.8h,   v31.8h  // iclip_pixel
462        umin            v5.8h,   v5.8h,   v31.8h
463
464        ssubl           v16.4s,  v18.4h,  v6.4h   // tmp2 - tmp1 (requires 17 bit)
465        ssubl2          v17.4s,  v18.8h,  v6.8h
466        ssubl           v18.4s,  v19.4h,  v7.4h
467        ssubl2          v19.4s,  v19.8h,  v7.8h
468        uqsub           v22.8h,  v0.8h,   v22.8h  // 27615 - abs()
469        uqsub           v23.8h,  v0.8h,   v23.8h
470        sshll           v24.4s,  v6.4h,   #6      // tmp1 << 6
471        sshll2          v25.4s,  v6.8h,   #6
472        sshll           v26.4s,  v7.4h,   #6
473        sshll2          v27.4s,  v7.8h,   #6
474        ushr            v22.8h,  v22.8h,  #10     // 64-m = (27615 - abs()) >> mask_sh
475        ushr            v23.8h,  v23.8h,  #10
476        add             v24.4s,  v24.4s,  v30.4s  // += PREP_BIAS*64
477        add             v25.4s,  v25.4s,  v30.4s
478        add             v26.4s,  v26.4s,  v30.4s
479        add             v27.4s,  v27.4s,  v30.4s
480        uxtl            v6.4s,   v22.4h
481        uxtl2           v7.4s,   v22.8h
482        uxtl            v28.4s,  v23.4h
483        mla             v24.4s,  v16.4s,  v6.4s   // (tmp2-tmp1)*(64-m)
484        uxtl2           v6.4s,   v23.8h
485        mla             v25.4s,  v17.4s,  v7.4s
486        mla             v26.4s,  v18.4s,  v28.4s
487        mla             v27.4s,  v19.4s,  v6.4s
488        srshl           v24.4s,  v24.4s,  v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
489        srshl           v25.4s,  v25.4s,  v29.4s
490        srshl           v26.4s,  v26.4s,  v29.4s
491        srshl           v27.4s,  v27.4s,  v29.4s
492        sqxtun          v6.4h,   v24.4s           // iclip_pixel
493        sqxtun2         v6.8h,   v25.4s
494        sqxtun          v7.4h,   v26.4s
495        sqxtun2         v7.8h,   v27.4s
496        umin            v6.8h,   v6.8h,   v31.8h  // iclip_pixel
497        umin            v7.8h,   v7.8h,   v31.8h
498.if \type == 444
499        xtn             v20.8b,  v20.8h           // 64 - m
500        xtn2            v20.16b, v21.8h
501        xtn             v21.8b,  v22.8h
502        xtn2            v21.16b, v23.8h
503        sub             v20.16b, v1.16b,  v20.16b // m
504        sub             v21.16b, v1.16b,  v21.16b
505        st1             {v20.16b}, [x6],  #16
506        st1             {v21.16b}, [x10], #16
507.elseif \type == 422
508        addp            v20.8h,  v20.8h,  v21.8h  // (64 - m) + (64 - n) (column wise addition)
509        addp            v21.8h,  v22.8h,  v23.8h
510        xtn             v20.8b,  v20.8h
511        xtn             v21.8b,  v21.8h
512        uhsub           v20.8b,  v3.8b,   v20.8b  // ((129 - sign) - ((64 - m) + (64 - n)) >> 1
513        uhsub           v21.8b,  v3.8b,   v21.8b
514        st1             {v20.8b}, [x6],  #8
515        st1             {v21.8b}, [x10], #8
516.elseif \type == 420
517        add             v20.8h,  v20.8h,  v22.8h  // (64 - my1) + (64 - my2) (row wise addition)
518        add             v21.8h,  v21.8h,  v23.8h
519        addp            v20.8h,  v20.8h,  v21.8h  // (128 - m) + (128 - n) (column wise addition)
520        sub             v20.8h,  v3.8h,   v20.8h  // (256 - sign) - ((128 - m) + (128 - n))
521        rshrn           v20.8b,  v20.8h,  #2      // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
522        st1             {v20.8b}, [x6], #8
523.endif
524        st1             {v4.8h, v5.8h}, [x0],  #32
525        st1             {v6.8h, v7.8h}, [x12], #32
526        b.gt            16b
527        subs            w5,  w5,  #2
528        add             x2,  x2,  w4,  uxtw #1
529        add             x3,  x3,  w4,  uxtw #1
530        add             x7,  x7,  w4,  uxtw #1
531        add             x9,  x9,  w4,  uxtw #1
532.if \type == 444
533        add             x6,  x6,  w4,  uxtw
534        add             x10, x10, w4,  uxtw
535.elseif \type == 422
536        add             x6,  x6,  x11, lsr #1
537        add             x10, x10, x11, lsr #1
538.endif
539        add             x0,  x0,  x1
540        add             x12, x12, x1
541        b.gt            161b
542        ret
543L(w_mask_\type\()_tbl):
544        .hword L(w_mask_\type\()_tbl) - 1280b
545        .hword L(w_mask_\type\()_tbl) -  640b
546        .hword L(w_mask_\type\()_tbl) -  320b
547        .hword L(w_mask_\type\()_tbl) -  160b
548        .hword L(w_mask_\type\()_tbl) -    8b
549        .hword L(w_mask_\type\()_tbl) -    4b
550endfunc
551.endm
552
553w_mask_fn 444
554w_mask_fn 422
555w_mask_fn 420
556
557
558function blend_16bpc_neon, export=1
559        adr             x6,  L(blend_tbl)
560        clz             w3,  w3
561        sub             w3,  w3,  #26
562        ldrh            w3,  [x6,  x3,  lsl #1]
563        sub             x6,  x6,  w3,  uxtw
564        add             x8,  x0,  x1
565        br              x6
56640:
567        lsl             x1,  x1,  #1
5684:
569        ld1             {v2.8b},   [x5], #8
570        ld1             {v1.8h},   [x2], #16
571        ld1             {v0.d}[0], [x0]
572        neg             v2.8b,   v2.8b            // -m
573        subs            w4,  w4,  #2
574        ld1             {v0.d}[1], [x8]
575        sxtl            v2.8h,   v2.8b
576        shl             v2.8h,   v2.8h,   #9      // -m << 9
577        sub             v1.8h,   v0.8h,   v1.8h   // a - b
578        sqrdmulh        v1.8h,   v1.8h,   v2.8h   // ((a-b)*-m + 32) >> 6
579        add             v0.8h,   v0.8h,   v1.8h
580        st1             {v0.d}[0], [x0], x1
581        st1             {v0.d}[1], [x8], x1
582        b.gt            4b
583        ret
58480:
585        lsl             x1,  x1,  #1
5868:
587        ld1             {v4.16b},       [x5], #16
588        ld1             {v2.8h, v3.8h}, [x2], #32
589        neg             v5.16b,  v4.16b           // -m
590        ld1             {v0.8h},   [x0]
591        ld1             {v1.8h},   [x8]
592        sxtl            v4.8h,   v5.8b
593        sxtl2           v5.8h,   v5.16b
594        shl             v4.8h,   v4.8h,   #9      // -m << 9
595        shl             v5.8h,   v5.8h,   #9
596        sub             v2.8h,   v0.8h,   v2.8h   // a - b
597        sub             v3.8h,   v1.8h,   v3.8h
598        subs            w4,  w4,  #2
599        sqrdmulh        v2.8h,   v2.8h,   v4.8h   // ((a-b)*-m + 32) >> 6
600        sqrdmulh        v3.8h,   v3.8h,   v5.8h
601        add             v0.8h,   v0.8h,   v2.8h
602        add             v1.8h,   v1.8h,   v3.8h
603        st1             {v0.8h}, [x0], x1
604        st1             {v1.8h}, [x8], x1
605        b.gt            8b
606        ret
607160:
608        lsl             x1,  x1,  #1
60916:
610        ld1             {v16.16b, v17.16b},           [x5], #32
611        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
612        subs            w4,  w4,  #2
613        neg             v18.16b, v16.16b          // -m
614        neg             v19.16b, v17.16b
615        ld1             {v0.8h, v1.8h}, [x0]
616        sxtl            v16.8h,  v18.8b
617        sxtl2           v17.8h,  v18.16b
618        sxtl            v18.8h,  v19.8b
619        sxtl2           v19.8h,  v19.16b
620        ld1             {v2.8h, v3.8h}, [x8]
621        shl             v16.8h,  v16.8h,  #9      // -m << 9
622        shl             v17.8h,  v17.8h,  #9
623        shl             v18.8h,  v18.8h,  #9
624        shl             v19.8h,  v19.8h,  #9
625        sub             v4.8h,   v0.8h,   v4.8h   // a - b
626        sub             v5.8h,   v1.8h,   v5.8h
627        sub             v6.8h,   v2.8h,   v6.8h
628        sub             v7.8h,   v3.8h,   v7.8h
629        sqrdmulh        v4.8h,   v4.8h,   v16.8h  // ((a-b)*-m + 32) >> 6
630        sqrdmulh        v5.8h,   v5.8h,   v17.8h
631        sqrdmulh        v6.8h,   v6.8h,   v18.8h
632        sqrdmulh        v7.8h,   v7.8h,   v19.8h
633        add             v0.8h,   v0.8h,   v4.8h
634        add             v1.8h,   v1.8h,   v5.8h
635        add             v2.8h,   v2.8h,   v6.8h
636        add             v3.8h,   v3.8h,   v7.8h
637        st1             {v0.8h, v1.8h}, [x0], x1
638        st1             {v2.8h, v3.8h}, [x8], x1
639        b.gt            16b
640        ret
64132:
642        ld1             {v16.16b, v17.16b},           [x5], #32
643        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
644        subs            w4,  w4,  #1
645        neg             v18.16b, v16.16b          // -m
646        neg             v19.16b, v17.16b
647        sxtl            v16.8h,  v18.8b
648        sxtl2           v17.8h,  v18.16b
649        sxtl            v18.8h,  v19.8b
650        sxtl2           v19.8h,  v19.16b
651        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
652        shl             v16.8h,  v16.8h,  #9      // -m << 9
653        shl             v17.8h,  v17.8h,  #9
654        shl             v18.8h,  v18.8h,  #9
655        shl             v19.8h,  v19.8h,  #9
656        sub             v4.8h,   v0.8h,   v4.8h   // a - b
657        sub             v5.8h,   v1.8h,   v5.8h
658        sub             v6.8h,   v2.8h,   v6.8h
659        sub             v7.8h,   v3.8h,   v7.8h
660        sqrdmulh        v4.8h,   v4.8h,   v16.8h  // ((a-b)*-m + 32) >> 6
661        sqrdmulh        v5.8h,   v5.8h,   v17.8h
662        sqrdmulh        v6.8h,   v6.8h,   v18.8h
663        sqrdmulh        v7.8h,   v7.8h,   v19.8h
664        add             v0.8h,   v0.8h,   v4.8h
665        add             v1.8h,   v1.8h,   v5.8h
666        add             v2.8h,   v2.8h,   v6.8h
667        add             v3.8h,   v3.8h,   v7.8h
668        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
669        b.gt            32b
670        ret
671L(blend_tbl):
672        .hword L(blend_tbl) -  32b
673        .hword L(blend_tbl) - 160b
674        .hword L(blend_tbl) -  80b
675        .hword L(blend_tbl) -  40b
676endfunc
677
678function blend_h_16bpc_neon, export=1
679        adr             x6,  L(blend_h_tbl)
680        movrel          x5,  X(obmc_masks)
681        add             x5,  x5,  w4,  uxtw
682        sub             w4,  w4,  w4,  lsr #2
683        clz             w7,  w3
684        add             x8,  x0,  x1
685        lsl             x1,  x1,  #1
686        sub             w7,  w7,  #24
687        ldrh            w7,  [x6,  x7,  lsl #1]
688        sub             x6,  x6,  w7, uxtw
689        br              x6
6902:
691        ld2r            {v2.8b, v3.8b}, [x5], #2
692        ld1             {v1.4h},        [x2], #8
693        ext             v2.8b,   v2.8b,   v3.8b,   #6
694        subs            w4,  w4,  #2
695        neg             v2.8b,   v2.8b            // -m
696        ld1             {v0.s}[0], [x0]
697        ld1             {v0.s}[1], [x8]
698        sxtl            v2.8h,   v2.8b
699        shl             v2.4h,   v2.4h,   #9      // -m << 9
700        sub             v1.4h,   v0.4h,   v1.4h   // a - b
701        sqrdmulh        v1.4h,   v1.4h,   v2.4h   // ((a-b)*-m + 32) >> 6
702        add             v0.4h,   v0.4h,   v1.4h
703        st1             {v0.s}[0], [x0], x1
704        st1             {v0.s}[1], [x8], x1
705        b.gt            2b
706        ret
7074:
708        ld2r            {v2.8b, v3.8b}, [x5], #2
709        ld1             {v1.8h},        [x2], #16
710        ext             v2.8b,   v2.8b,   v3.8b,   #4
711        subs            w4,  w4,  #2
712        neg             v2.8b,   v2.8b            // -m
713        ld1             {v0.d}[0],   [x0]
714        ld1             {v0.d}[1],   [x8]
715        sxtl            v2.8h,   v2.8b
716        shl             v2.8h,   v2.8h,   #9      // -m << 9
717        sub             v1.8h,   v0.8h,   v1.8h   // a - b
718        sqrdmulh        v1.8h,   v1.8h,   v2.8h   // ((a-b)*-m + 32) >> 6
719        add             v0.8h,   v0.8h,   v1.8h
720        st1             {v0.d}[0], [x0], x1
721        st1             {v0.d}[1], [x8], x1
722        b.gt            4b
723        ret
7248:
725        ld2r            {v4.8b, v5.8b}, [x5], #2
726        ld1             {v2.8h, v3.8h}, [x2], #32
727        neg             v4.8b,   v4.8b            // -m
728        neg             v5.8b,   v5.8b
729        ld1             {v0.8h}, [x0]
730        subs            w4,  w4,  #2
731        sxtl            v4.8h,   v4.8b
732        sxtl            v5.8h,   v5.8b
733        ld1             {v1.8h}, [x8]
734        shl             v4.8h,   v4.8h,   #9      // -m << 9
735        shl             v5.8h,   v5.8h,   #9
736        sub             v2.8h,   v0.8h,   v2.8h   // a - b
737        sub             v3.8h,   v1.8h,   v3.8h
738        sqrdmulh        v2.8h,   v2.8h,   v4.8h   // ((a-b)*-m + 32) >> 6
739        sqrdmulh        v3.8h,   v3.8h,   v5.8h
740        add             v0.8h,   v0.8h,   v2.8h
741        add             v1.8h,   v1.8h,   v3.8h
742        st1             {v0.8h}, [x0], x1
743        st1             {v1.8h}, [x8], x1
744        b.gt            8b
745        ret
74616:
747        ld2r            {v16.8b, v17.8b}, [x5], #2
748        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
749        neg             v16.8b,  v16.8b           // -m
750        neg             v17.8b,  v17.8b
751        ld1             {v0.8h, v1.8h},  [x0]
752        ld1             {v2.8h, v3.8h},  [x8]
753        subs            w4,  w4,  #2
754        sxtl            v16.8h,  v16.8b
755        sxtl            v17.8h,  v17.8b
756        shl             v16.8h,  v16.8h,  #9      // -m << 9
757        shl             v17.8h,  v17.8h,  #9
758        sub             v4.8h,   v0.8h,   v4.8h   // a - b
759        sub             v5.8h,   v1.8h,   v5.8h
760        sub             v6.8h,   v2.8h,   v6.8h
761        sub             v7.8h,   v3.8h,   v7.8h
762        sqrdmulh        v4.8h,   v4.8h,   v16.8h  // ((a-b)*-m + 32) >> 6
763        sqrdmulh        v5.8h,   v5.8h,   v16.8h
764        sqrdmulh        v6.8h,   v6.8h,   v17.8h
765        sqrdmulh        v7.8h,   v7.8h,   v17.8h
766        add             v0.8h,   v0.8h,   v4.8h
767        add             v1.8h,   v1.8h,   v5.8h
768        add             v2.8h,   v2.8h,   v6.8h
769        add             v3.8h,   v3.8h,   v7.8h
770        st1             {v0.8h, v1.8h}, [x0], x1
771        st1             {v2.8h, v3.8h}, [x8], x1
772        b.gt            16b
773        ret
7741280:
775640:
776320:
777        sub             x1,  x1,  w3,  uxtw #1
778        add             x7,  x2,  w3,  uxtw #1
779321:
780        ld2r            {v24.8b, v25.8b}, [x5], #2
781        mov             w6,  w3
782        neg             v24.8b,  v24.8b           // -m
783        neg             v25.8b,  v25.8b
784        sxtl            v24.8h,  v24.8b
785        sxtl            v25.8h,  v25.8b
786        shl             v24.8h,  v24.8h,  #9      // -m << 9
787        shl             v25.8h,  v25.8h,  #9
78832:
789        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64
790        ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x0]
791        subs            w6,  w6,  #32
792        sub             v16.8h,  v0.8h,   v16.8h  // a - b
793        sub             v17.8h,  v1.8h,   v17.8h
794        sub             v18.8h,  v2.8h,   v18.8h
795        sub             v19.8h,  v3.8h,   v19.8h
796        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
797        ld1             {v4.8h,  v5.8h,  v6.8h,  v7.8h},  [x8]
798        sqrdmulh        v16.8h,  v16.8h,  v24.8h  // ((a-b)*-m + 32) >> 6
799        sqrdmulh        v17.8h,  v17.8h,  v24.8h
800        sqrdmulh        v18.8h,  v18.8h,  v24.8h
801        sqrdmulh        v19.8h,  v19.8h,  v24.8h
802        sub             v20.8h,  v4.8h,   v20.8h  // a - b
803        sub             v21.8h,  v5.8h,   v21.8h
804        sub             v22.8h,  v6.8h,   v22.8h
805        sub             v23.8h,  v7.8h,   v23.8h
806        add             v0.8h,   v0.8h,   v16.8h
807        add             v1.8h,   v1.8h,   v17.8h
808        add             v2.8h,   v2.8h,   v18.8h
809        add             v3.8h,   v3.8h,   v19.8h
810        sqrdmulh        v20.8h,  v20.8h,  v25.8h  // ((a-b)*-m + 32) >> 6
811        sqrdmulh        v21.8h,  v21.8h,  v25.8h
812        sqrdmulh        v22.8h,  v22.8h,  v25.8h
813        sqrdmulh        v23.8h,  v23.8h,  v25.8h
814        st1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x0], #64
815        add             v4.8h,   v4.8h,   v20.8h
816        add             v5.8h,   v5.8h,   v21.8h
817        add             v6.8h,   v6.8h,   v22.8h
818        add             v7.8h,   v7.8h,   v23.8h
819        st1             {v4.8h,  v5.8h,  v6.8h,  v7.8h},  [x8], #64
820        b.gt            32b
821        subs            w4,  w4,  #2
822        add             x0,  x0,  x1
823        add             x8,  x8,  x1
824        add             x2,  x2,  w3,  uxtw #1
825        add             x7,  x7,  w3,  uxtw #1
826        b.gt            321b
827        ret
828L(blend_h_tbl):
829        .hword L(blend_h_tbl) - 1280b
830        .hword L(blend_h_tbl) -  640b
831        .hword L(blend_h_tbl) -  320b
832        .hword L(blend_h_tbl) -   16b
833        .hword L(blend_h_tbl) -    8b
834        .hword L(blend_h_tbl) -    4b
835        .hword L(blend_h_tbl) -    2b
836endfunc
837
838function blend_v_16bpc_neon, export=1
839        adr             x6,  L(blend_v_tbl)
840        movrel          x5,  X(obmc_masks)
841        add             x5,  x5,  w3,  uxtw
842        clz             w3,  w3
843        add             x8,  x0,  x1
844        lsl             x1,  x1,  #1
845        sub             w3,  w3,  #26
846        ldrh            w3,  [x6,  x3,  lsl #1]
847        sub             x6,  x6,  w3,  uxtw
848        br              x6
84920:
850        ld1r            {v2.8b}, [x5]
851        neg             v2.8b,   v2.8b            // -m
852        sxtl            v2.8h,   v2.8b
853        shl             v2.4h,   v2.4h,   #9      // -m << 9
8542:
855        ld1             {v1.s}[0], [x2], #4
856        ld1             {v0.h}[0], [x0]
857        subs            w4,  w4,  #2
858        ld1             {v1.h}[1], [x2]
859        ld1             {v0.h}[1], [x8]
860        add             x2,  x2,  #4
861        sub             v1.4h,   v0.4h,   v1.4h   // a - b
862        sqrdmulh        v1.4h,   v1.4h,   v2.4h   // ((a-b)*-m + 32) >> 6
863        add             v0.4h,   v0.4h,   v1.4h
864        st1             {v0.h}[0], [x0],  x1
865        st1             {v0.h}[1], [x8],  x1
866        b.gt            2b
867        ret
86840:
869        ld1r            {v2.2s}, [x5]
870        sub             x1,  x1,  #4
871        neg             v2.8b,   v2.8b            // -m
872        sxtl            v2.8h,   v2.8b
873        shl             v2.8h,   v2.8h,   #9      // -m << 9
8744:
875        ld1             {v1.8h},   [x2], #16
876        ld1             {v0.d}[0], [x0]
877        ld1             {v0.d}[1], [x8]
878        subs            w4,  w4,  #2
879        sub             v1.8h,   v0.8h,   v1.8h   // a - b
880        sqrdmulh        v1.8h,   v1.8h,   v2.8h   // ((a-b)*-m + 32) >> 6
881        add             v0.8h,   v0.8h,   v1.8h
882        st1             {v0.s}[0], [x0], #4
883        st1             {v0.s}[2], [x8], #4
884        st1             {v0.h}[2], [x0], x1
885        st1             {v0.h}[6], [x8], x1
886        b.gt            4b
887        ret
88880:
889        ld1             {v4.8b}, [x5]
890        sub             x1,  x1,  #8
891        neg             v4.8b,   v4.8b            // -m
892        sxtl            v4.8h,   v4.8b
893        shl             v4.8h,   v4.8h,   #9      // -m << 9
8948:
895        ld1             {v2.8h, v3.8h}, [x2], #32
896        ld1             {v0.8h}, [x0]
897        ld1             {v1.8h}, [x8]
898        subs            w4,  w4,  #2
899        sub             v2.8h,   v0.8h,   v2.8h   // a - b
900        sub             v3.8h,   v1.8h,   v3.8h
901        sqrdmulh        v2.8h,   v2.8h,   v4.8h   // ((a-b)*-m + 32) >> 6
902        sqrdmulh        v3.8h,   v3.8h,   v4.8h
903        add             v0.8h,   v0.8h,   v2.8h
904        add             v1.8h,   v1.8h,   v3.8h
905        st1             {v0.d}[0], [x0], #8
906        st1             {v1.d}[0], [x8], #8
907        st1             {v0.s}[2], [x0], x1
908        st1             {v1.s}[2], [x8], x1
909        b.gt            8b
910        ret
911160:
912        ld1             {v16.16b}, [x5]
913        sub             x1,  x1,  #16
914        neg             v17.16b, v16.16b          // -m
915        sxtl            v16.8h,  v17.8b
916        sxtl2           v17.8h,  v17.16b
917        shl             v16.8h,  v16.8h,  #9      // -m << 9
918        shl             v17.4h,  v17.4h,  #9
91916:
920        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
921        ld1             {v0.8h, v1.8h}, [x0]
922        subs            w4,  w4,  #2
923        ld1             {v2.8h, v3.8h}, [x8]
924        sub             v4.8h,   v0.8h,   v4.8h   // a - b
925        sub             v5.4h,   v1.4h,   v5.4h
926        sub             v6.8h,   v2.8h,   v6.8h
927        sub             v7.4h,   v3.4h,   v7.4h
928        sqrdmulh        v4.8h,   v4.8h,   v16.8h  // ((a-b)*-m + 32) >> 6
929        sqrdmulh        v5.4h,   v5.4h,   v17.4h
930        sqrdmulh        v6.8h,   v6.8h,   v16.8h
931        sqrdmulh        v7.4h,   v7.4h,   v17.4h
932        add             v0.8h,   v0.8h,   v4.8h
933        add             v1.4h,   v1.4h,   v5.4h
934        add             v2.8h,   v2.8h,   v6.8h
935        add             v3.4h,   v3.4h,   v7.4h
936        st1             {v0.8h}, [x0], #16
937        st1             {v2.8h}, [x8], #16
938        st1             {v1.4h}, [x0], x1
939        st1             {v3.4h}, [x8], x1
940        b.gt            16b
941        ret
942320:
943        ld1             {v24.16b, v25.16b},  [x5]
944        neg             v26.16b, v24.16b          // -m
945        neg             v27.8b,  v25.8b
946        sxtl            v24.8h,  v26.8b
947        sxtl2           v25.8h,  v26.16b
948        sxtl            v26.8h,  v27.8b
949        shl             v24.8h,  v24.8h,  #9      // -m << 9
950        shl             v25.8h,  v25.8h,  #9
951        shl             v26.8h,  v26.8h,  #9
95232:
953        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64
954        ld1             {v0.8h, v1.8h, v2.8h}, [x0]
955        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], #64
956        ld1             {v4.8h, v5.8h, v6.8h}, [x8]
957        subs            w4,  w4,  #2
958        sub             v16.8h,  v0.8h,   v16.8h  // a - b
959        sub             v17.8h,  v1.8h,   v17.8h
960        sub             v18.8h,  v2.8h,   v18.8h
961        sub             v20.8h,  v4.8h,   v20.8h
962        sub             v21.8h,  v5.8h,   v21.8h
963        sub             v22.8h,  v6.8h,   v22.8h
964        sqrdmulh        v16.8h,  v16.8h,  v24.8h  // ((a-b)*-m + 32) >> 6
965        sqrdmulh        v17.8h,  v17.8h,  v25.8h
966        sqrdmulh        v18.8h,  v18.8h,  v26.8h
967        sqrdmulh        v20.8h,  v20.8h,  v24.8h
968        sqrdmulh        v21.8h,  v21.8h,  v25.8h
969        sqrdmulh        v22.8h,  v22.8h,  v26.8h
970        add             v0.8h,   v0.8h,   v16.8h
971        add             v1.8h,   v1.8h,   v17.8h
972        add             v2.8h,   v2.8h,   v18.8h
973        add             v4.8h,   v4.8h,   v20.8h
974        add             v5.8h,   v5.8h,   v21.8h
975        add             v6.8h,   v6.8h,   v22.8h
976        st1             {v0.8h, v1.8h, v2.8h}, [x0], x1
977        st1             {v4.8h, v5.8h, v6.8h}, [x8], x1
978        b.gt            32b
979        ret
980L(blend_v_tbl):
981        .hword L(blend_v_tbl) - 320b
982        .hword L(blend_v_tbl) - 160b
983        .hword L(blend_v_tbl) -  80b
984        .hword L(blend_v_tbl) -  40b
985        .hword L(blend_v_tbl) -  20b
986endfunc
987
988
989// This has got the same signature as the put_8tap functions,
990// and assumes that x9 is set to (clz(w)-24).
991function put_neon
992        adr             x10, L(put_tbl)
993        ldrh            w9, [x10, x9, lsl #1]
994        sub             x10, x10, w9, uxtw
995        br              x10
996
9972:
998        ld1             {v0.s}[0], [x2], x3
999        ld1             {v1.s}[0], [x2], x3
1000        subs            w5,  w5,  #2
1001        st1             {v0.s}[0], [x0], x1
1002        st1             {v1.s}[0], [x0], x1
1003        b.gt            2b
1004        ret
10054:
1006        ld1             {v0.4h}, [x2], x3
1007        ld1             {v1.4h}, [x2], x3
1008        subs            w5,  w5,  #2
1009        st1             {v0.4h}, [x0], x1
1010        st1             {v1.4h}, [x0], x1
1011        b.gt            4b
1012        ret
101380:
1014        add             x8,  x0,  x1
1015        lsl             x1,  x1,  #1
1016        add             x9,  x2,  x3
1017        lsl             x3,  x3,  #1
10188:
1019        ld1             {v0.8h}, [x2], x3
1020        ld1             {v1.8h}, [x9], x3
1021        subs            w5,  w5,  #2
1022        st1             {v0.8h}, [x0], x1
1023        st1             {v1.8h}, [x8], x1
1024        b.gt            8b
1025        ret
102616:
1027        ldp             x6,  x7,  [x2]
1028        ldp             x8,  x9,  [x2, #16]
1029        stp             x6,  x7,  [x0]
1030        subs            w5,  w5,  #1
1031        stp             x8,  x9,  [x0, #16]
1032        add             x2,  x2,  x3
1033        add             x0,  x0,  x1
1034        b.gt            16b
1035        ret
103632:
1037        ldp             x6,  x7,  [x2]
1038        ldp             x8,  x9,  [x2, #16]
1039        stp             x6,  x7,  [x0]
1040        ldp             x10, x11, [x2, #32]
1041        stp             x8,  x9,  [x0, #16]
1042        subs            w5,  w5,  #1
1043        ldp             x12, x13, [x2, #48]
1044        stp             x10, x11, [x0, #32]
1045        stp             x12, x13, [x0, #48]
1046        add             x2,  x2,  x3
1047        add             x0,  x0,  x1
1048        b.gt            32b
1049        ret
105064:
1051        ldp             q0,  q1,  [x2]
1052        ldp             q2,  q3,  [x2, #32]
1053        stp             q0,  q1,  [x0]
1054        ldp             q4,  q5,  [x2, #64]
1055        stp             q2,  q3,  [x0, #32]
1056        ldp             q6,  q7,  [x2, #96]
1057        subs            w5,  w5,  #1
1058        stp             q4,  q5,  [x0, #64]
1059        stp             q6,  q7,  [x0, #96]
1060        add             x2,  x2,  x3
1061        add             x0,  x0,  x1
1062        b.gt            64b
1063        ret
1064128:
1065        ldp             q0,  q1,  [x2]
1066        ldp             q2,  q3,  [x2, #32]
1067        stp             q0,  q1,  [x0]
1068        ldp             q4,  q5,  [x2, #64]
1069        stp             q2,  q3,  [x0, #32]
1070        ldp             q6,  q7,  [x2, #96]
1071        subs            w5,  w5,  #1
1072        stp             q4,  q5,  [x0, #64]
1073        ldp             q16, q17, [x2, #128]
1074        stp             q6,  q7,  [x0, #96]
1075        ldp             q18, q19, [x2, #160]
1076        stp             q16, q17, [x0, #128]
1077        ldp             q20, q21, [x2, #192]
1078        stp             q18, q19, [x0, #160]
1079        ldp             q22, q23, [x2, #224]
1080        stp             q20, q21, [x0, #192]
1081        stp             q22, q23, [x0, #224]
1082        add             x2,  x2,  x3
1083        add             x0,  x0,  x1
1084        b.gt            128b
1085        ret
1086
1087L(put_tbl):
1088        .hword L(put_tbl) - 128b
1089        .hword L(put_tbl) -  64b
1090        .hword L(put_tbl) -  32b
1091        .hword L(put_tbl) -  16b
1092        .hword L(put_tbl) -  80b
1093        .hword L(put_tbl) -   4b
1094        .hword L(put_tbl) -   2b
1095endfunc
1096
1097
1098// This has got the same signature as the prep_8tap functions,
1099// and assumes that x9 is set to (clz(w)-24), w7 to intermediate_bits and
1100// x8 to w*2.
1101function prep_neon
1102        adr             x10, L(prep_tbl)
1103        ldrh            w9, [x10, x9, lsl #1]
1104        dup             v31.8h,  w7   // intermediate_bits
1105        movi            v30.8h,  #(PREP_BIAS >> 8), lsl #8
1106        sub             x10, x10, w9, uxtw
1107        br              x10
1108
110940:
1110        add             x9,  x1,  x2
1111        lsl             x2,  x2,  #1
11124:
1113        ld1             {v0.d}[0], [x1], x2
1114        ld1             {v0.d}[1], [x9], x2
1115        subs            w4,  w4,  #2
1116        sshl            v0.8h,   v0.8h,   v31.8h
1117        sub             v0.8h,   v0.8h,   v30.8h
1118        st1             {v0.8h}, [x0], #16
1119        b.gt            4b
1120        ret
112180:
1122        add             x9,  x1,  x2
1123        lsl             x2,  x2,  #1
11248:
1125        ld1             {v0.8h}, [x1], x2
1126        ld1             {v1.8h}, [x9], x2
1127        subs            w4,  w4,  #2
1128        sshl            v0.8h,   v0.8h,   v31.8h
1129        sshl            v1.8h,   v1.8h,   v31.8h
1130        sub             v0.8h,   v0.8h,   v30.8h
1131        sub             v1.8h,   v1.8h,   v30.8h
1132        st1             {v0.8h, v1.8h}, [x0], #32
1133        b.gt            8b
1134        ret
113516:
1136        ldp             q0,  q1,  [x1]
1137        add             x1,  x1,  x2
1138        sshl            v0.8h,   v0.8h,   v31.8h
1139        ldp             q2,  q3,  [x1]
1140        add             x1,  x1,  x2
1141        subs            w4,  w4,  #2
1142        sshl            v1.8h,   v1.8h,   v31.8h
1143        sshl            v2.8h,   v2.8h,   v31.8h
1144        sshl            v3.8h,   v3.8h,   v31.8h
1145        sub             v0.8h,   v0.8h,   v30.8h
1146        sub             v1.8h,   v1.8h,   v30.8h
1147        sub             v2.8h,   v2.8h,   v30.8h
1148        sub             v3.8h,   v3.8h,   v30.8h
1149        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
1150        b.gt            16b
1151        ret
115232:
1153        ldp             q0,  q1,  [x1]
1154        sshl            v0.8h,   v0.8h,   v31.8h
1155        ldp             q2,  q3,  [x1, #32]
1156        add             x1,  x1,  x2
1157        sshl            v1.8h,   v1.8h,   v31.8h
1158        sshl            v2.8h,   v2.8h,   v31.8h
1159        sshl            v3.8h,   v3.8h,   v31.8h
1160        subs            w4,  w4,  #1
1161        sub             v0.8h,   v0.8h,   v30.8h
1162        sub             v1.8h,   v1.8h,   v30.8h
1163        sub             v2.8h,   v2.8h,   v30.8h
1164        sub             v3.8h,   v3.8h,   v30.8h
1165        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
1166        b.gt            32b
1167        ret
116864:
1169        ldp             q0,  q1,  [x1]
1170        subs            w4,  w4,  #1
1171        sshl            v0.8h,   v0.8h,   v31.8h
1172        ldp             q2,  q3,  [x1, #32]
1173        sshl            v1.8h,   v1.8h,   v31.8h
1174        ldp             q4,  q5,  [x1, #64]
1175        sshl            v2.8h,   v2.8h,   v31.8h
1176        sshl            v3.8h,   v3.8h,   v31.8h
1177        ldp             q6,  q7,  [x1, #96]
1178        add             x1,  x1,  x2
1179        sshl            v4.8h,   v4.8h,   v31.8h
1180        sshl            v5.8h,   v5.8h,   v31.8h
1181        sshl            v6.8h,   v6.8h,   v31.8h
1182        sshl            v7.8h,   v7.8h,   v31.8h
1183        sub             v0.8h,   v0.8h,   v30.8h
1184        sub             v1.8h,   v1.8h,   v30.8h
1185        sub             v2.8h,   v2.8h,   v30.8h
1186        sub             v3.8h,   v3.8h,   v30.8h
1187        stp             q0,  q1,  [x0]
1188        sub             v4.8h,   v4.8h,   v30.8h
1189        sub             v5.8h,   v5.8h,   v30.8h
1190        stp             q2,  q3,  [x0, #32]
1191        sub             v6.8h,   v6.8h,   v30.8h
1192        sub             v7.8h,   v7.8h,   v30.8h
1193        stp             q4,  q5,  [x0, #64]
1194        stp             q6,  q7,  [x0, #96]
1195        add             x0,  x0,  x8
1196        b.gt            64b
1197        ret
1198128:
1199        ldp             q0,  q1,  [x1]
1200        subs            w4,  w4,  #1
1201        sshl            v0.8h,   v0.8h,   v31.8h
1202        ldp             q2,  q3,  [x1, #32]
1203        sshl            v1.8h,   v1.8h,   v31.8h
1204        ldp             q4,  q5,  [x1, #64]
1205        sshl            v2.8h,   v2.8h,   v31.8h
1206        sshl            v3.8h,   v3.8h,   v31.8h
1207        ldp             q6,  q7,  [x1, #96]
1208        sshl            v4.8h,   v4.8h,   v31.8h
1209        sshl            v5.8h,   v5.8h,   v31.8h
1210        ldp             q16, q17, [x1, #128]
1211        sshl            v6.8h,   v6.8h,   v31.8h
1212        sshl            v7.8h,   v7.8h,   v31.8h
1213        ldp             q18, q19, [x1, #160]
1214        sshl            v16.8h,  v16.8h,  v31.8h
1215        sshl            v17.8h,  v17.8h,  v31.8h
1216        ldp             q20, q21, [x1, #192]
1217        sshl            v18.8h,  v18.8h,  v31.8h
1218        sshl            v19.8h,  v19.8h,  v31.8h
1219        ldp             q22, q23, [x1, #224]
1220        add             x1,  x1,  x2
1221        sshl            v20.8h,  v20.8h,  v31.8h
1222        sshl            v21.8h,  v21.8h,  v31.8h
1223        sshl            v22.8h,  v22.8h,  v31.8h
1224        sshl            v23.8h,  v23.8h,  v31.8h
1225        sub             v0.8h,   v0.8h,   v30.8h
1226        sub             v1.8h,   v1.8h,   v30.8h
1227        sub             v2.8h,   v2.8h,   v30.8h
1228        sub             v3.8h,   v3.8h,   v30.8h
1229        stp             q0,  q1,  [x0]
1230        sub             v4.8h,   v4.8h,   v30.8h
1231        sub             v5.8h,   v5.8h,   v30.8h
1232        stp             q2,  q3,  [x0, #32]
1233        sub             v6.8h,   v6.8h,   v30.8h
1234        sub             v7.8h,   v7.8h,   v30.8h
1235        stp             q4,  q5,  [x0, #64]
1236        sub             v16.8h,  v16.8h,  v30.8h
1237        sub             v17.8h,  v17.8h,  v30.8h
1238        stp             q6,  q7,  [x0, #96]
1239        sub             v18.8h,  v18.8h,  v30.8h
1240        sub             v19.8h,  v19.8h,  v30.8h
1241        stp             q16, q17, [x0, #128]
1242        sub             v20.8h,  v20.8h,  v30.8h
1243        sub             v21.8h,  v21.8h,  v30.8h
1244        stp             q18, q19, [x0, #160]
1245        sub             v22.8h,  v22.8h,  v30.8h
1246        sub             v23.8h,  v23.8h,  v30.8h
1247        stp             q20, q21, [x0, #192]
1248        stp             q22, q23, [x0, #224]
1249        add             x0,  x0,  x8
1250        b.gt            128b
1251        ret
1252
1253L(prep_tbl):
1254        .hword L(prep_tbl) - 128b
1255        .hword L(prep_tbl) -  64b
1256        .hword L(prep_tbl) -  32b
1257        .hword L(prep_tbl) -  16b
1258        .hword L(prep_tbl) -  80b
1259        .hword L(prep_tbl) -  40b
1260endfunc
1261
1262
1263.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
1264        ld1             {\d0\wd}[0], [\s0], \strd
1265        ld1             {\d1\wd}[0], [\s1], \strd
1266.ifnb \d2
1267        ld1             {\d2\wd}[0], [\s0], \strd
1268        ld1             {\d3\wd}[0], [\s1], \strd
1269.endif
1270.ifnb \d4
1271        ld1             {\d4\wd}[0], [\s0], \strd
1272.endif
1273.ifnb \d5
1274        ld1             {\d5\wd}[0], [\s1], \strd
1275.endif
1276.ifnb \d6
1277        ld1             {\d6\wd}[0], [\s0], \strd
1278.endif
1279.endm
1280.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
1281        ld1             {\d0\wd}, [\s0], \strd
1282        ld1             {\d1\wd}, [\s1], \strd
1283.ifnb \d2
1284        ld1             {\d2\wd}, [\s0], \strd
1285        ld1             {\d3\wd}, [\s1], \strd
1286.endif
1287.ifnb \d4
1288        ld1             {\d4\wd}, [\s0], \strd
1289.endif
1290.ifnb \d5
1291        ld1             {\d5\wd}, [\s1], \strd
1292.endif
1293.ifnb \d6
1294        ld1             {\d6\wd}, [\s0], \strd
1295.endif
1296.endm
1297.macro load_regpair s0, s1, strd, wd, d0, d1, d2, d3, d4, d5
1298        ld1             {\d0\wd, \d1\wd}, [\s0], \strd
1299.ifnb \d2
1300        ld1             {\d2\wd, \d3\wd}, [\s1], \strd
1301.endif
1302.ifnb \d4
1303        ld1             {\d4\wd, \d5\wd}, [\s0], \strd
1304.endif
1305.endm
1306.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
1307        load_slice      \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6
1308.endm
1309.macro load_4h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
1310        load_reg        \s0, \s1, \strd, .4h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
1311.endm
1312.macro load_8h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
1313        load_reg        \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
1314.endm
1315.macro load_16h s0, s1, strd, d0, d1, d2, d3, d4, d5
1316        load_regpair    \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5
1317.endm
1318.macro interleave_1 wd, r0, r1, r2, r3, r4
1319        trn1            \r0\wd, \r0\wd, \r1\wd
1320        trn1            \r1\wd, \r1\wd, \r2\wd
1321.ifnb \r3
1322        trn1            \r2\wd, \r2\wd, \r3\wd
1323        trn1            \r3\wd, \r3\wd, \r4\wd
1324.endif
1325.endm
1326.macro interleave_1_s r0, r1, r2, r3, r4
1327        interleave_1    .2s, \r0, \r1, \r2, \r3, \r4
1328.endm
1329.macro umin_h c, wd, r0, r1, r2, r3
1330        umin            \r0\wd,  \r0\wd,  \c\wd
1331.ifnb \r1
1332        umin            \r1\wd,  \r1\wd,  \c\wd
1333.endif
1334.ifnb \r2
1335        umin            \r2\wd,  \r2\wd,  \c\wd
1336        umin            \r3\wd,  \r3\wd,  \c\wd
1337.endif
1338.endm
1339.macro sub_h c, wd, r0, r1, r2, r3
1340        sub             \r0\wd,  \r0\wd,  \c\wd
1341.ifnb \r1
1342        sub             \r1\wd,  \r1\wd,  \c\wd
1343.endif
1344.ifnb \r2
1345        sub             \r2\wd,  \r2\wd,  \c\wd
1346        sub             \r3\wd,  \r3\wd,  \c\wd
1347.endif
1348.endm
1349.macro smull_smlal_4 d, s0, s1, s2, s3
1350        smull           \d\().4s,  \s0\().4h,  v0.h[0]
1351        smlal           \d\().4s,  \s1\().4h,  v0.h[1]
1352        smlal           \d\().4s,  \s2\().4h,  v0.h[2]
1353        smlal           \d\().4s,  \s3\().4h,  v0.h[3]
1354.endm
1355.macro smull2_smlal2_4 d, s0, s1, s2, s3
1356        smull2          \d\().4s,  \s0\().8h,  v0.h[0]
1357        smlal2          \d\().4s,  \s1\().8h,  v0.h[1]
1358        smlal2          \d\().4s,  \s2\().8h,  v0.h[2]
1359        smlal2          \d\().4s,  \s3\().8h,  v0.h[3]
1360.endm
1361.macro smull_smlal_8 d, s0, s1, s2, s3, s4, s5, s6, s7
1362        smull           \d\().4s,  \s0\().4h,  v0.h[0]
1363        smlal           \d\().4s,  \s1\().4h,  v0.h[1]
1364        smlal           \d\().4s,  \s2\().4h,  v0.h[2]
1365        smlal           \d\().4s,  \s3\().4h,  v0.h[3]
1366        smlal           \d\().4s,  \s4\().4h,  v0.h[4]
1367        smlal           \d\().4s,  \s5\().4h,  v0.h[5]
1368        smlal           \d\().4s,  \s6\().4h,  v0.h[6]
1369        smlal           \d\().4s,  \s7\().4h,  v0.h[7]
1370.endm
1371.macro smull2_smlal2_8 d, s0, s1, s2, s3, s4, s5, s6, s7
1372        smull2          \d\().4s,  \s0\().8h,  v0.h[0]
1373        smlal2          \d\().4s,  \s1\().8h,  v0.h[1]
1374        smlal2          \d\().4s,  \s2\().8h,  v0.h[2]
1375        smlal2          \d\().4s,  \s3\().8h,  v0.h[3]
1376        smlal2          \d\().4s,  \s4\().8h,  v0.h[4]
1377        smlal2          \d\().4s,  \s5\().8h,  v0.h[5]
1378        smlal2          \d\().4s,  \s6\().8h,  v0.h[6]
1379        smlal2          \d\().4s,  \s7\().8h,  v0.h[7]
1380.endm
1381.macro sqrshrun_h shift, r0, r1, r2, r3
1382        sqrshrun        \r0\().4h, \r0\().4s,  #\shift
1383.ifnb \r1
1384        sqrshrun2       \r0\().8h, \r1\().4s,  #\shift
1385.endif
1386.ifnb \r2
1387        sqrshrun        \r2\().4h, \r2\().4s,  #\shift
1388        sqrshrun2       \r2\().8h, \r3\().4s,  #\shift
1389.endif
1390.endm
1391.macro xtn_h r0, r1, r2, r3
1392        xtn             \r0\().4h,  \r0\().4s
1393        xtn2            \r0\().8h,  \r1\().4s
1394.ifnb \r2
1395        xtn             \r2\().4h,  \r2\().4s
1396        xtn2            \r2\().8h,  \r3\().4s
1397.endif
1398.endm
1399.macro srshl_s shift, r0, r1, r2, r3
1400        srshl           \r0\().4s,  \r0\().4s,  \shift\().4s
1401        srshl           \r1\().4s,  \r1\().4s,  \shift\().4s
1402.ifnb \r2
1403        srshl           \r2\().4s,  \r2\().4s,  \shift\().4s
1404        srshl           \r3\().4s,  \r3\().4s,  \shift\().4s
1405.endif
1406.endm
1407.macro st_s strd, reg, lanes
1408        st1             {\reg\().s}[0], [x0], \strd
1409        st1             {\reg\().s}[1], [x9], \strd
1410.if \lanes > 2
1411        st1             {\reg\().s}[2], [x0], \strd
1412        st1             {\reg\().s}[3], [x9], \strd
1413.endif
1414.endm
1415.macro st_d strd, r0, r1
1416        st1             {\r0\().d}[0], [x0], \strd
1417        st1             {\r0\().d}[1], [x9], \strd
1418.ifnb \r1
1419        st1             {\r1\().d}[0], [x0], \strd
1420        st1             {\r1\().d}[1], [x9], \strd
1421.endif
1422.endm
1423.macro shift_store_4 type, strd, r0, r1, r2, r3
1424.ifc \type, put
1425        sqrshrun_h      6,   \r0, \r1, \r2, \r3
1426        umin_h          v31, .8h, \r0, \r2
1427.else
1428        srshl_s         v30, \r0, \r1, \r2, \r3  // -(6-intermediate_bits)
1429        xtn_h           \r0, \r1, \r2, \r3
1430        sub_h           v29, .8h, \r0, \r2       // PREP_BIAS
1431.endif
1432        st_d            \strd, \r0, \r2
1433.endm
1434.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7
1435        st1             {\r0\wd}, [x0], \strd
1436        st1             {\r1\wd}, [x9], \strd
1437.ifnb \r2
1438        st1             {\r2\wd}, [x0], \strd
1439        st1             {\r3\wd}, [x9], \strd
1440.endif
1441.ifnb \r4
1442        st1             {\r4\wd}, [x0], \strd
1443        st1             {\r5\wd}, [x9], \strd
1444        st1             {\r6\wd}, [x0], \strd
1445        st1             {\r7\wd}, [x9], \strd
1446.endif
1447.endm
1448.macro st_8h strd, r0, r1, r2, r3, r4, r5, r6, r7
1449        st_reg          \strd, .8h, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
1450.endm
1451.macro shift_store_8 type, strd, r0, r1, r2, r3
1452.ifc \type, put
1453        sqrshrun_h      6,   \r0, \r1, \r2, \r3
1454        umin_h          v31, .8h, \r0, \r2
1455.else
1456        srshl_s         v30, \r0, \r1, \r2, \r3  // -(6-intermediate_bits)
1457        xtn_h           \r0, \r1, \r2, \r3
1458        sub_h           v29, .8h, \r0, \r2       // PREP_BIAS
1459.endif
1460        st_8h           \strd, \r0, \r2
1461.endm
1462.macro shift_store_16 type, strd, dst, r0, r1, r2, r3
1463.ifc \type, put
1464        sqrshrun_h      6,   \r0, \r1, \r2, \r3
1465        umin            \r0\().8h, \r0\().8h, v31.8h
1466        umin            \r1\().8h, \r2\().8h, v31.8h
1467.else
1468        srshl_s         v30, \r0, \r1, \r2, \r3  // -(6-intermediate_bits)
1469        xtn_h           \r0, \r1, \r2, \r3
1470        sub             \r0\().8h, \r0\().8h, v29.8h
1471        sub             \r1\().8h, \r2\().8h, v29.8h
1472.endif
1473        st1             {\r0\().8h, \r1\().8h}, [\dst], \strd
1474.endm
1475
1476.macro make_8tap_fn op, type, type_h, type_v
1477function \op\()_8tap_\type\()_16bpc_neon, export=1
1478        mov             w9,  \type_h
1479        mov             w10, \type_v
1480        b               \op\()_8tap_neon
1481endfunc
1482.endm
1483
1484// No spaces in these expressions, due to gas-preprocessor.
1485#define REGULAR ((0*15<<7)|3*15)
1486#define SMOOTH  ((1*15<<7)|4*15)
1487#define SHARP   ((2*15<<7)|3*15)
1488
1489.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2
1490make_8tap_fn \type, regular,        REGULAR, REGULAR
1491make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
1492make_8tap_fn \type, regular_sharp,  REGULAR, SHARP
1493make_8tap_fn \type, smooth,         SMOOTH,  SMOOTH
1494make_8tap_fn \type, smooth_regular, SMOOTH,  REGULAR
1495make_8tap_fn \type, smooth_sharp,   SMOOTH,  SHARP
1496make_8tap_fn \type, sharp,          SHARP,   SHARP
1497make_8tap_fn \type, sharp_regular,  SHARP,   REGULAR
1498make_8tap_fn \type, sharp_smooth,   SHARP,   SMOOTH
1499
1500function \type\()_8tap_neon
1501.ifc \bdmax, w8
1502        ldr             w8,  [sp]
1503.endif
1504        mov             w11,  #0x4081  // (1 << 14) | (1 << 7) | (1 << 0)
1505        mul             \mx,  \mx, w11
1506        mul             \my,  \my, w11
1507        add             \mx,  \mx, w9  // mx, 8tap_h, 4tap_h
1508        add             \my,  \my, w10 // my, 8tap_v, 4tap_v
1509.ifc \type, prep
1510        uxtw            \d_strd, \w
1511        lsl             \d_strd, \d_strd, #1
1512.endif
1513
1514        dup             v31.8h,  \bdmax        // bitdepth_max
1515        clz             \bdmax,  \bdmax
1516        clz             w9,  \w
1517        sub             \bdmax,  \bdmax,  #18  // intermediate_bits = clz(bitdepth_max) - 18
1518        mov             w12, #6
1519        tst             \mx, #(0x7f << 14)
1520        sub             w9,  w9,  #24
1521        add             w13, w12, \bdmax       // 6 + intermediate_bits
1522        sub             w12, w12, \bdmax       // 6 - intermediate_bits
1523        movrel          x11, X(mc_subpel_filters), -8
1524        b.ne            L(\type\()_8tap_h)
1525        tst             \my, #(0x7f << 14)
1526        b.ne            L(\type\()_8tap_v)
1527        b               \type\()_neon
1528
1529L(\type\()_8tap_h):
1530        cmp             \w,   #4
1531        ubfx            w10,  \mx, #7, #7
1532        and             \mx,  \mx, #0x7f
1533        b.le            4f
1534        mov             \mx,  w10
15354:
1536        tst             \my,  #(0x7f << 14)
1537        add             \xmx, x11, \mx, uxtw #3
1538        b.ne            L(\type\()_8tap_hv)
1539
1540        adr             x10, L(\type\()_8tap_h_tbl)
1541        dup             v30.4s,  w12           // 6 - intermediate_bits
1542        ldrh            w9,  [x10, x9, lsl #1]
1543        neg             v30.4s,  v30.4s        // -(6-intermediate_bits)
1544.ifc \type, put
1545        dup             v29.8h,  \bdmax        // intermediate_bits
1546.else
1547        movi            v28.8h,  #(PREP_BIAS >> 8), lsl #8
1548.endif
1549        sub             x10, x10, w9, uxtw
1550.ifc \type, put
1551        neg             v29.8h,  v29.8h        // -intermediate_bits
1552.endif
1553        br              x10
1554
155520:     // 2xN h
1556.ifc \type, put
1557        add             \xmx,  \xmx,  #2
1558        ld1             {v0.s}[0], [\xmx]
1559        sub             \src,  \src,  #2
1560        add             \ds2,  \dst,  \d_strd
1561        add             \sr2,  \src,  \s_strd
1562        lsl             \d_strd,  \d_strd,  #1
1563        lsl             \s_strd,  \s_strd,  #1
1564        sxtl            v0.8h,   v0.8b
15652:
1566        ld1             {v4.8h},  [\src], \s_strd
1567        ld1             {v6.8h},  [\sr2], \s_strd
1568        ext             v5.16b,  v4.16b,  v4.16b,  #2
1569        ext             v7.16b,  v6.16b,  v6.16b,  #2
1570        subs            \h,  \h,  #2
1571        trn1            v3.2s,   v4.2s,   v6.2s
1572        trn2            v6.2s,   v4.2s,   v6.2s
1573        trn1            v4.2s,   v5.2s,   v7.2s
1574        trn2            v7.2s,   v5.2s,   v7.2s
1575        smull           v3.4s,   v3.4h,   v0.h[0]
1576        smlal           v3.4s,   v4.4h,   v0.h[1]
1577        smlal           v3.4s,   v6.4h,   v0.h[2]
1578        smlal           v3.4s,   v7.4h,   v0.h[3]
1579        srshl           v3.4s,   v3.4s,   v30.4s // -(6-intermediate_bits)
1580        sqxtun          v3.4h,   v3.4s
1581        srshl           v3.4h,   v3.4h,   v29.4h // -intermediate_bits
1582        umin            v3.4h,   v3.4h,   v31.4h
1583        st1             {v3.s}[0], [\dst], \d_strd
1584        st1             {v3.s}[1], [\ds2], \d_strd
1585        b.gt            2b
1586        ret
1587.endif
1588
158940:     // 4xN h
1590        add             \xmx,  \xmx,  #2
1591        ld1             {v0.s}[0], [\xmx]
1592        sub             \src,  \src,  #2
1593        add             \ds2,  \dst,  \d_strd
1594        add             \sr2,  \src,  \s_strd
1595        lsl             \d_strd,  \d_strd,  #1
1596        lsl             \s_strd,  \s_strd,  #1
1597        sxtl            v0.8h,   v0.8b
15984:
1599        ld1             {v16.8h}, [\src], \s_strd
1600        ld1             {v20.8h}, [\sr2], \s_strd
1601        ext             v17.16b, v16.16b, v16.16b, #2
1602        ext             v18.16b, v16.16b, v16.16b, #4
1603        ext             v19.16b, v16.16b, v16.16b, #6
1604        ext             v21.16b, v20.16b, v20.16b, #2
1605        ext             v22.16b, v20.16b, v20.16b, #4
1606        ext             v23.16b, v20.16b, v20.16b, #6
1607        subs            \h,  \h,  #2
1608        smull           v16.4s,  v16.4h,  v0.h[0]
1609        smlal           v16.4s,  v17.4h,  v0.h[1]
1610        smlal           v16.4s,  v18.4h,  v0.h[2]
1611        smlal           v16.4s,  v19.4h,  v0.h[3]
1612        smull           v20.4s,  v20.4h,  v0.h[0]
1613        smlal           v20.4s,  v21.4h,  v0.h[1]
1614        smlal           v20.4s,  v22.4h,  v0.h[2]
1615        smlal           v20.4s,  v23.4h,  v0.h[3]
1616        srshl           v16.4s,  v16.4s,  v30.4s // -(6-intermediate_bits)
1617        srshl           v20.4s,  v20.4s,  v30.4s // -(6-intermediate_bits)
1618.ifc \type, put
1619        sqxtun          v16.4h,  v16.4s
1620        sqxtun2         v16.8h,  v20.4s
1621        srshl           v16.8h,  v16.8h,  v29.8h // -intermediate_bits
1622        umin            v16.8h,  v16.8h,  v31.8h
1623.else
1624        xtn             v16.4h,  v16.4s
1625        xtn2            v16.8h,  v20.4s
1626        sub             v16.8h,  v16.8h,  v28.8h // PREP_BIAS
1627.endif
1628        st1             {v16.d}[0], [\dst], \d_strd
1629        st1             {v16.d}[1], [\ds2], \d_strd
1630        b.gt            4b
1631        ret
1632
163380:
1634160:
1635320:
1636640:
16371280:   // 8xN, 16xN, 32xN, ... h
1638        ld1             {v0.8b}, [\xmx]
1639        sub             \src,  \src,  #6
1640        add             \ds2,  \dst,  \d_strd
1641        add             \sr2,  \src,  \s_strd
1642        lsl             \s_strd,  \s_strd,  #1
1643        sxtl            v0.8h,   v0.8b
1644
1645        sub             \s_strd,  \s_strd,  \w, uxtw #1
1646        sub             \s_strd,  \s_strd,  #16
1647.ifc \type, put
1648        lsl             \d_strd,  \d_strd,  #1
1649        sub             \d_strd,  \d_strd,  \w, uxtw #1
1650.endif
165181:
1652        ld1             {v16.8h, v17.8h},  [\src], #32
1653        ld1             {v20.8h, v21.8h},  [\sr2], #32
1654        mov             \mx, \w
1655
16568:
1657        smull           v18.4s,  v16.4h,  v0.h[0]
1658        smull2          v19.4s,  v16.8h,  v0.h[0]
1659        smull           v22.4s,  v20.4h,  v0.h[0]
1660        smull2          v23.4s,  v20.8h,  v0.h[0]
1661.irpc i, 1234567
1662        ext             v24.16b, v16.16b, v17.16b, #(2*\i)
1663        ext             v25.16b, v20.16b, v21.16b, #(2*\i)
1664        smlal           v18.4s,  v24.4h,  v0.h[\i]
1665        smlal2          v19.4s,  v24.8h,  v0.h[\i]
1666        smlal           v22.4s,  v25.4h,  v0.h[\i]
1667        smlal2          v23.4s,  v25.8h,  v0.h[\i]
1668.endr
1669        subs            \mx, \mx, #8
1670        srshl           v18.4s,  v18.4s,  v30.4s // -(6-intermediate_bits)
1671        srshl           v19.4s,  v19.4s,  v30.4s // -(6-intermediate_bits)
1672        srshl           v22.4s,  v22.4s,  v30.4s // -(6-intermediate_bits)
1673        srshl           v23.4s,  v23.4s,  v30.4s // -(6-intermediate_bits)
1674.ifc \type, put
1675        sqxtun          v18.4h,  v18.4s
1676        sqxtun2         v18.8h,  v19.4s
1677        sqxtun          v22.4h,  v22.4s
1678        sqxtun2         v22.8h,  v23.4s
1679        srshl           v18.8h,  v18.8h,  v29.8h // -intermediate_bits
1680        srshl           v22.8h,  v22.8h,  v29.8h // -intermediate_bits
1681        umin            v18.8h,  v18.8h,  v31.8h
1682        umin            v22.8h,  v22.8h,  v31.8h
1683.else
1684        xtn             v18.4h,  v18.4s
1685        xtn2            v18.8h,  v19.4s
1686        xtn             v22.4h,  v22.4s
1687        xtn2            v22.8h,  v23.4s
1688        sub             v18.8h,  v18.8h,  v28.8h // PREP_BIAS
1689        sub             v22.8h,  v22.8h,  v28.8h // PREP_BIAS
1690.endif
1691        st1             {v18.8h}, [\dst], #16
1692        st1             {v22.8h}, [\ds2], #16
1693        b.le            9f
1694
1695        mov             v16.16b, v17.16b
1696        mov             v20.16b, v21.16b
1697        ld1             {v17.8h}, [\src], #16
1698        ld1             {v21.8h}, [\sr2], #16
1699        b               8b
1700
17019:
1702        add             \dst,  \dst,  \d_strd
1703        add             \ds2,  \ds2,  \d_strd
1704        add             \src,  \src,  \s_strd
1705        add             \sr2,  \sr2,  \s_strd
1706
1707        subs            \h,  \h,  #2
1708        b.gt            81b
1709        ret
1710
1711L(\type\()_8tap_h_tbl):
1712        .hword L(\type\()_8tap_h_tbl) - 1280b
1713        .hword L(\type\()_8tap_h_tbl) -  640b
1714        .hword L(\type\()_8tap_h_tbl) -  320b
1715        .hword L(\type\()_8tap_h_tbl) -  160b
1716        .hword L(\type\()_8tap_h_tbl) -   80b
1717        .hword L(\type\()_8tap_h_tbl) -   40b
1718        .hword L(\type\()_8tap_h_tbl) -   20b
1719        .hword 0
1720
1721
1722L(\type\()_8tap_v):
1723        cmp             \h,  #4
1724        ubfx            w10, \my, #7, #7
1725        and             \my, \my, #0x7f
1726        b.le            4f
1727        mov             \my, w10
17284:
1729        add             \xmy, x11, \my, uxtw #3
1730
1731.ifc \type, prep
1732        dup             v30.4s,  w12           // 6 - intermediate_bits
1733        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
1734.endif
1735        adr             x10, L(\type\()_8tap_v_tbl)
1736        ldrh            w9,  [x10, x9, lsl #1]
1737.ifc \type, prep
1738        neg             v30.4s,  v30.4s        // -(6-intermediate_bits)
1739.endif
1740        sub             x10, x10, w9, uxtw
1741        br              x10
1742
174320:     // 2xN v
1744.ifc \type, put
1745        b.gt            28f
1746
1747        cmp             \h,  #2
1748        add             \xmy, \xmy, #2
1749        ld1             {v0.s}[0], [\xmy]
1750        sub             \src,  \src,  \s_strd
1751        add             \ds2,  \dst,  \d_strd
1752        add             \sr2,  \src,  \s_strd
1753        lsl             \s_strd,  \s_strd,  #1
1754        lsl             \d_strd,  \d_strd,  #1
1755        sxtl            v0.8h,   v0.8b
1756
1757        // 2x2 v
1758        load_s          \src, \sr2, \s_strd, v1, v2, v3, v4, v5
1759        interleave_1_s  v1,  v2,  v3,  v4,  v5
1760        b.gt            24f
1761        smull_smlal_4   v6,  v1,  v2,  v3,  v4
1762        sqrshrun_h      6,   v6
1763        umin_h          v31, .8h, v6
1764        st_s            \d_strd, v6, 2
1765        ret
1766
176724:     // 2x4 v
1768        load_s          \sr2, \src, \s_strd, v6, v7
1769        interleave_1_s  v5,  v6,  v7
1770        smull_smlal_4   v16, v1,  v2,  v3,  v4
1771        smull_smlal_4   v17, v3,  v4,  v5,  v6
1772        sqrshrun_h      6,   v16, v17
1773        umin_h          v31, .8h, v16
1774        st_s            \d_strd, v16, 4
1775        ret
1776
177728:     // 2x8, 2x16 v
1778        ld1             {v0.8b}, [\xmy]
1779        sub             \sr2,  \src,  \s_strd, lsl #1
1780        add             \ds2,  \dst,  \d_strd
1781        sub             \src,  \sr2,  \s_strd
1782        lsl             \d_strd,  \d_strd,  #1
1783        lsl             \s_strd,  \s_strd,  #1
1784        sxtl            v0.8h,   v0.8b
1785
1786        load_s          \src, \sr2, \s_strd, v1,  v2,  v3,  v4, v5, v6, v7
1787        interleave_1_s  v1,  v2,  v3,  v4,  v5
1788        interleave_1_s  v5,  v6,  v7
1789216:
1790        subs            \h,  \h,  #8
1791        load_s          \sr2, \src, \s_strd, v16, v17, v18, v19
1792        load_s          \sr2, \src, \s_strd, v20, v21, v22, v23
1793        interleave_1_s  v7,  v16, v17, v18, v19
1794        interleave_1_s  v19, v20, v21, v22, v23
1795        smull_smlal_8   v24, v1,  v2,  v3,  v4,  v5,  v6,  v7,  v16
1796        smull_smlal_8   v25, v3,  v4,  v5,  v6,  v7,  v16, v17, v18
1797        smull_smlal_8   v26, v5,  v6,  v7,  v16, v17, v18, v19, v20
1798        smull_smlal_8   v27, v7,  v16, v17, v18, v19, v20, v21, v22
1799        sqrshrun_h      6,   v24, v25, v26, v27
1800        umin_h          v31, .8h, v24, v26
1801        st_s            \d_strd, v24, 4
1802        st_s            \d_strd, v26, 4
1803        b.le            0f
1804        mov             v1.16b,  v17.16b
1805        mov             v2.16b,  v18.16b
1806        mov             v3.16b,  v19.16b
1807        mov             v4.16b,  v20.16b
1808        mov             v5.16b,  v21.16b
1809        mov             v6.16b,  v22.16b
1810        mov             v7.16b,  v23.16b
1811        b               216b
18120:
1813        ret
1814.endif
1815
181640:
1817        b.gt            480f
1818
1819        // 4x2, 4x4 v
1820        cmp             \h,  #2
1821        add             \xmy, \xmy, #2
1822        ld1             {v0.s}[0], [\xmy]
1823        sub             \src, \src, \s_strd
1824        add             \ds2, \dst, \d_strd
1825        add             \sr2, \src, \s_strd
1826        lsl             \s_strd, \s_strd, #1
1827        lsl             \d_strd, \d_strd, #1
1828        sxtl            v0.8h,   v0.8b
1829
1830        load_4h         \src, \sr2, \s_strd, v1, v2, v3, v4, v5
1831        smull_smlal_4   v6,  v1,  v2,  v3,  v4
1832        smull_smlal_4   v7,  v2,  v3,  v4,  v5
1833        shift_store_4   \type, \d_strd, v6, v7
1834        b.le            0f
1835        load_4h         \sr2, \src, \s_strd, v6, v7
1836        smull_smlal_4   v1,  v3,  v4,  v5,  v6
1837        smull_smlal_4   v2,  v4,  v5,  v6,  v7
1838        shift_store_4   \type, \d_strd, v1, v2
18390:
1840        ret
1841
1842480:    // 4x8, 4x16 v
1843        ld1             {v0.8b}, [\xmy]
1844        sub             \sr2, \src, \s_strd, lsl #1
1845        add             \ds2, \dst, \d_strd
1846        sub             \src, \sr2, \s_strd
1847        lsl             \s_strd, \s_strd, #1
1848        lsl             \d_strd, \d_strd, #1
1849        sxtl            v0.8h,   v0.8b
1850
1851        load_4h         \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
1852
185348:
1854        subs            \h,  \h,  #4
1855        load_4h         \sr2, \src, \s_strd, v23, v24, v25, v26
1856        smull_smlal_8   v1,  v16, v17, v18, v19, v20, v21, v22, v23
1857        smull_smlal_8   v2,  v17, v18, v19, v20, v21, v22, v23, v24
1858        smull_smlal_8   v3,  v18, v19, v20, v21, v22, v23, v24, v25
1859        smull_smlal_8   v4,  v19, v20, v21, v22, v23, v24, v25, v26
1860        shift_store_4   \type, \d_strd, v1, v2, v3, v4
1861        b.le            0f
1862        mov             v16.8b,  v20.8b
1863        mov             v17.8b,  v21.8b
1864        mov             v18.8b,  v22.8b
1865        mov             v19.8b,  v23.8b
1866        mov             v20.8b,  v24.8b
1867        mov             v21.8b,  v25.8b
1868        mov             v22.8b,  v26.8b
1869        b               48b
18700:
1871        ret
1872
187380:
1874        b.gt            880f
1875
1876        // 8x2, 8x4 v
1877        cmp             \h,  #2
1878        add             \xmy, \xmy, #2
1879        ld1             {v0.s}[0], [\xmy]
1880        sub             \src, \src, \s_strd
1881        add             \ds2, \dst, \d_strd
1882        add             \sr2, \src, \s_strd
1883        lsl             \s_strd, \s_strd, #1
1884        lsl             \d_strd, \d_strd, #1
1885        sxtl            v0.8h,   v0.8b
1886
1887        load_8h         \src, \sr2, \s_strd, v1, v2, v3, v4, v5
1888        smull_smlal_4   v16, v1,  v2,  v3,  v4
1889        smull2_smlal2_4 v17, v1,  v2,  v3,  v4
1890        smull_smlal_4   v18, v2,  v3,  v4,  v5
1891        smull2_smlal2_4 v19, v2,  v3,  v4,  v5
1892        shift_store_8   \type, \d_strd, v16, v17, v18, v19
1893        b.le            0f
1894        load_8h         \sr2, \src, \s_strd, v6, v7
1895        smull_smlal_4   v16, v3,  v4,  v5,  v6
1896        smull2_smlal2_4 v17, v3,  v4,  v5,  v6
1897        smull_smlal_4   v18, v4,  v5,  v6,  v7
1898        smull2_smlal2_4 v19, v4,  v5,  v6,  v7
1899        shift_store_8   \type, \d_strd, v16, v17, v18, v19
19000:
1901        ret
1902
1903880:    // 8x6, 8x8, 8x16, 8x32 v
19041680:   // 16x8, 16x16, ...
1905320:    // 32x8, 32x16, ...
1906640:
19071280:
1908        ld1             {v0.8b}, [\xmy]
1909        sub             \src, \src, \s_strd
1910        sub             \src, \src, \s_strd, lsl #1
1911        sxtl            v0.8h,   v0.8b
1912        mov             \my,  \h
1913168:
1914        add             \ds2, \dst, \d_strd
1915        add             \sr2, \src, \s_strd
1916        lsl             \s_strd, \s_strd, #1
1917        lsl             \d_strd, \d_strd, #1
1918
1919        load_8h         \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
1920
192188:
1922        subs            \h,  \h,  #2
1923        load_8h         \sr2, \src, \s_strd, v23, v24
1924        smull_smlal_8   v1,  v16, v17, v18, v19, v20, v21, v22, v23
1925        smull2_smlal2_8 v2,  v16, v17, v18, v19, v20, v21, v22, v23
1926        smull_smlal_8   v3,  v17, v18, v19, v20, v21, v22, v23, v24
1927        smull2_smlal2_8 v4,  v17, v18, v19, v20, v21, v22, v23, v24
1928        shift_store_8   \type, \d_strd, v1, v2, v3, v4
1929        b.le            9f
1930        subs            \h,  \h,  #2
1931        load_8h         \sr2, \src, \s_strd, v25, v26
1932        smull_smlal_8   v1,  v18, v19, v20, v21, v22, v23, v24, v25
1933        smull2_smlal2_8 v2,  v18, v19, v20, v21, v22, v23, v24, v25
1934        smull_smlal_8   v3,  v19, v20, v21, v22, v23, v24, v25, v26
1935        smull2_smlal2_8 v4,  v19, v20, v21, v22, v23, v24, v25, v26
1936        shift_store_8   \type, \d_strd, v1, v2, v3, v4
1937        b.le            9f
1938        mov             v16.16b, v20.16b
1939        mov             v17.16b, v21.16b
1940        mov             v18.16b, v22.16b
1941        mov             v19.16b, v23.16b
1942        mov             v20.16b, v24.16b
1943        mov             v21.16b, v25.16b
1944        mov             v22.16b, v26.16b
1945        b               88b
19469:
1947        subs            \w,  \w,  #8
1948        b.le            0f
1949        asr             \s_strd, \s_strd, #1
1950        asr             \d_strd, \d_strd, #1
1951        msub            \src, \s_strd, \xmy, \src
1952        msub            \dst, \d_strd, \xmy, \dst
1953        sub             \src, \src, \s_strd, lsl #3
1954        mov             \h,  \my
1955        add             \src, \src, #16
1956        add             \dst, \dst, #16
1957        b               168b
19580:
1959        ret
1960
1961160:
1962        b.gt            1680b
1963
1964        // 16x2, 16x4 v
1965        add             \xmy, \xmy, #2
1966        ld1             {v0.s}[0], [\xmy]
1967        sub             \src, \src, \s_strd
1968        sxtl            v0.8h,   v0.8b
1969
1970        load_16h        \src, \src, \s_strd, v16, v17, v18, v19, v20, v21
197116:
1972        load_16h        \src, \src, \s_strd, v22, v23
1973        subs            \h,  \h,  #1
1974        smull_smlal_4   v1,  v16, v18, v20, v22
1975        smull2_smlal2_4 v2,  v16, v18, v20, v22
1976        smull_smlal_4   v3,  v17, v19, v21, v23
1977        smull2_smlal2_4 v4,  v17, v19, v21, v23
1978        shift_store_16  \type, \d_strd, x0, v1, v2, v3, v4
1979        b.le            0f
1980        mov             v16.16b, v18.16b
1981        mov             v17.16b, v19.16b
1982        mov             v18.16b, v20.16b
1983        mov             v19.16b, v21.16b
1984        mov             v20.16b, v22.16b
1985        mov             v21.16b, v23.16b
1986        b               16b
19870:
1988        ret
1989
1990L(\type\()_8tap_v_tbl):
1991        .hword L(\type\()_8tap_v_tbl) - 1280b
1992        .hword L(\type\()_8tap_v_tbl) -  640b
1993        .hword L(\type\()_8tap_v_tbl) -  320b
1994        .hword L(\type\()_8tap_v_tbl) -  160b
1995        .hword L(\type\()_8tap_v_tbl) -   80b
1996        .hword L(\type\()_8tap_v_tbl) -   40b
1997        .hword L(\type\()_8tap_v_tbl) -   20b
1998        .hword 0
1999
2000L(\type\()_8tap_hv):
2001        cmp             \h,  #4
2002        ubfx            w10, \my, #7, #7
2003        and             \my, \my, #0x7f
2004        b.le            4f
2005        mov             \my,  w10
20064:
2007        add             \xmy, x11, \my, uxtw #3
2008
2009        adr             x10, L(\type\()_8tap_hv_tbl)
2010        dup             v30.4s,  w12           // 6 - intermediate_bits
2011        ldrh            w9,  [x10, x9, lsl #1]
2012        neg             v30.4s,  v30.4s        // -(6-intermediate_bits)
2013.ifc \type, put
2014        dup             v29.4s,  w13           // 6 + intermediate_bits
2015.else
2016        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
2017.endif
2018        sub             x10, x10, w9, uxtw
2019.ifc \type, put
2020        neg             v29.4s,  v29.4s        // -(6+intermediate_bits)
2021.endif
2022        br              x10
2023
202420:
2025.ifc \type, put
2026        add             \xmx,  \xmx,  #2
2027        ld1             {v0.s}[0],  [\xmx]
2028        b.gt            280f
2029        add             \xmy,  \xmy,  #2
2030        ld1             {v1.s}[0],  [\xmy]
2031
2032        // 2x2, 2x4 hv
2033        sub             \sr2, \src, #2
2034        sub             \src, \sr2, \s_strd
2035        add             \ds2, \dst, \d_strd
2036        lsl             \s_strd, \s_strd, #1
2037        lsl             \d_strd, \d_strd, #1
2038        sxtl            v0.8h,   v0.8b
2039        sxtl            v1.8h,   v1.8b
2040        mov             x15, x30
2041
2042        ld1             {v27.8h}, [\src], \s_strd
2043        ext             v28.16b, v27.16b, v27.16b, #2
2044        smull           v27.4s,  v27.4h,  v0.4h
2045        smull           v28.4s,  v28.4h,  v0.4h
2046        addp            v27.4s,  v27.4s,  v28.4s
2047        addp            v16.4s,  v27.4s,  v27.4s
2048        srshl           v16.2s,  v16.2s,  v30.2s // -(6-intermediate_bits)
2049        bl              L(\type\()_8tap_filter_2)
2050        // The intermediates from the horizontal pass fit in 16 bit without
2051        // any bias; we could just as well keep them as .4s, but narrowing
2052        // them to .4h gives a significant speedup on out of order cores
2053        // (at the cost of a smaller slowdown on in-order cores such as A53).
2054        xtn             v16.4h,  v16.4s
2055
2056        trn1            v16.2s,  v16.2s,  v24.2s
2057        mov             v17.8b,  v24.8b
2058
20592:
2060        bl              L(\type\()_8tap_filter_2)
2061
2062        ext             v18.8b,  v17.8b,  v24.8b,  #4
2063        smull           v2.4s,   v16.4h,  v1.h[0]
2064        smlal           v2.4s,   v17.4h,  v1.h[1]
2065        smlal           v2.4s,   v18.4h,  v1.h[2]
2066        smlal           v2.4s,   v24.4h,  v1.h[3]
2067
2068        srshl           v2.4s,   v2.4s,   v29.4s // -(6+intermediate_bits)
2069        sqxtun          v2.4h,   v2.4s
2070        umin            v2.4h,   v2.4h,   v31.4h
2071        subs            \h,  \h,  #2
2072        st1             {v2.s}[0], [\dst], \d_strd
2073        st1             {v2.s}[1], [\ds2], \d_strd
2074        b.le            0f
2075        mov             v16.8b,  v18.8b
2076        mov             v17.8b,  v24.8b
2077        b               2b
2078
2079280:    // 2x8, 2x16, 2x32 hv
2080        ld1             {v1.8b},  [\xmy]
2081        sub             \src, \src, #2
2082        sub             \sr2, \src, \s_strd, lsl #1
2083        sub             \src, \sr2, \s_strd
2084        add             \ds2, \dst, \d_strd
2085        lsl             \s_strd, \s_strd, #1
2086        lsl             \d_strd, \d_strd, #1
2087        sxtl            v0.8h,   v0.8b
2088        sxtl            v1.8h,   v1.8b
2089        mov             x15, x30
2090
2091        ld1             {v27.8h}, [\src], \s_strd
2092        ext             v28.16b, v27.16b, v27.16b, #2
2093        smull           v27.4s,  v27.4h,  v0.4h
2094        smull           v28.4s,  v28.4h,  v0.4h
2095        addp            v27.4s,  v27.4s,  v28.4s
2096        addp            v16.4s,  v27.4s,  v27.4s
2097        srshl           v16.2s,  v16.2s,  v30.2s // -(6-intermediate_bits)
2098        // The intermediates from the horizontal pass fit in 16 bit without
2099        // any bias; we could just as well keep them as .4s, but narrowing
2100        // them to .4h gives a significant speedup on out of order cores
2101        // (at the cost of a smaller slowdown on in-order cores such as A53).
2102
2103        bl              L(\type\()_8tap_filter_2)
2104        xtn             v16.4h,  v16.4s
2105        trn1            v16.2s,  v16.2s,  v24.2s
2106        mov             v17.8b,  v24.8b
2107        bl              L(\type\()_8tap_filter_2)
2108        ext             v18.8b,  v17.8b,  v24.8b,  #4
2109        mov             v19.8b,  v24.8b
2110        bl              L(\type\()_8tap_filter_2)
2111        ext             v20.8b,  v19.8b,  v24.8b,  #4
2112        mov             v21.8b,  v24.8b
2113
211428:
2115        bl              L(\type\()_8tap_filter_2)
2116        ext             v22.8b,  v21.8b,  v24.8b,  #4
2117        smull           v3.4s,   v16.4h,  v1.h[0]
2118        smlal           v3.4s,   v17.4h,  v1.h[1]
2119        smlal           v3.4s,   v18.4h,  v1.h[2]
2120        smlal           v3.4s,   v19.4h,  v1.h[3]
2121        smlal           v3.4s,   v20.4h,  v1.h[4]
2122        smlal           v3.4s,   v21.4h,  v1.h[5]
2123        smlal           v3.4s,   v22.4h,  v1.h[6]
2124        smlal           v3.4s,   v24.4h,  v1.h[7]
2125
2126        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
2127        sqxtun          v3.4h,   v3.4s
2128        umin            v3.4h,   v3.4h,   v31.4h
2129        subs            \h,  \h,  #2
2130        st1             {v3.s}[0], [\dst], \d_strd
2131        st1             {v3.s}[1], [\ds2], \d_strd
2132        b.le            0f
2133        mov             v16.8b,  v18.8b
2134        mov             v17.8b,  v19.8b
2135        mov             v18.8b,  v20.8b
2136        mov             v19.8b,  v21.8b
2137        mov             v20.8b,  v22.8b
2138        mov             v21.8b,  v24.8b
2139        b               28b
2140
21410:
2142        br              x15
2143
2144L(\type\()_8tap_filter_2):
2145        ld1             {v25.8h},  [\sr2], \s_strd
2146        ld1             {v27.8h},  [\src], \s_strd
2147        ext             v26.16b, v25.16b, v25.16b, #2
2148        ext             v28.16b, v27.16b, v27.16b, #2
2149        trn1            v24.2s,  v25.2s,  v27.2s
2150        trn2            v27.2s,  v25.2s,  v27.2s
2151        trn1            v25.2s,  v26.2s,  v28.2s
2152        trn2            v28.2s,  v26.2s,  v28.2s
2153        smull           v24.4s,  v24.4h,  v0.h[0]
2154        smlal           v24.4s,  v25.4h,  v0.h[1]
2155        smlal           v24.4s,  v27.4h,  v0.h[2]
2156        smlal           v24.4s,  v28.4h,  v0.h[3]
2157        srshl           v24.4s,  v24.4s,  v30.4s // -(6-intermediate_bits)
2158        xtn             v24.4h,  v24.4s
2159        ret
2160.endif
2161
216240:
2163        add             \xmx, \xmx, #2
2164        ld1             {v0.s}[0],  [\xmx]
2165        b.gt            480f
2166        add             \xmy, \xmy,  #2
2167        ld1             {v1.s}[0],  [\xmy]
2168        sub             \sr2, \src, #2
2169        sub             \src, \sr2, \s_strd
2170        add             \ds2, \dst, \d_strd
2171        lsl             \s_strd, \s_strd, #1
2172        lsl             \d_strd, \d_strd, #1
2173        sxtl            v0.8h,   v0.8b
2174        sxtl            v1.8h,   v1.8b
2175        mov             x15, x30
2176
2177        // 4x2, 4x4 hv
2178        ld1             {v25.8h}, [\src], \s_strd
2179        ext             v26.16b, v25.16b, v25.16b, #2
2180        ext             v27.16b, v25.16b, v25.16b, #4
2181        ext             v28.16b, v25.16b, v25.16b, #6
2182        smull           v25.4s,  v25.4h,  v0.h[0]
2183        smlal           v25.4s,  v26.4h,  v0.h[1]
2184        smlal           v25.4s,  v27.4h,  v0.h[2]
2185        smlal           v25.4s,  v28.4h,  v0.h[3]
2186        srshl           v16.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
2187        // The intermediates from the horizontal pass fit in 16 bit without
2188        // any bias; we could just as well keep them as .4s, but narrowing
2189        // them to .4h gives a significant speedup on out of order cores
2190        // (at the cost of a smaller slowdown on in-order cores such as A53).
2191        xtn             v16.4h,  v16.4s
2192
2193        bl              L(\type\()_8tap_filter_4)
2194        mov             v17.8b,  v24.8b
2195        mov             v18.8b,  v25.8b
2196
21974:
2198        bl              L(\type\()_8tap_filter_4)
2199        smull           v2.4s,   v16.4h,  v1.h[0]
2200        smlal           v2.4s,   v17.4h,  v1.h[1]
2201        smlal           v2.4s,   v18.4h,  v1.h[2]
2202        smlal           v2.4s,   v24.4h,  v1.h[3]
2203        smull           v3.4s,   v17.4h,  v1.h[0]
2204        smlal           v3.4s,   v18.4h,  v1.h[1]
2205        smlal           v3.4s,   v24.4h,  v1.h[2]
2206        smlal           v3.4s,   v25.4h,  v1.h[3]
2207.ifc \type, put
2208        srshl           v2.4s,   v2.4s,   v29.4s // -(6+intermediate_bits)
2209        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
2210        sqxtun          v2.4h,   v2.4s
2211        sqxtun2         v2.8h,   v3.4s
2212        umin            v2.8h,   v2.8h,   v31.8h
2213.else
2214        rshrn           v2.4h,   v2.4s,   #6
2215        rshrn2          v2.8h,   v3.4s,   #6
2216        sub             v2.8h,   v2.8h,   v29.8h // PREP_BIAS
2217.endif
2218        subs            \h,  \h,  #2
2219
2220        st1             {v2.d}[0], [\dst], \d_strd
2221        st1             {v2.d}[1], [\ds2], \d_strd
2222        b.le            0f
2223        mov             v16.8b,  v18.8b
2224        mov             v17.8b,  v24.8b
2225        mov             v18.8b,  v25.8b
2226        b               4b
2227
2228480:    // 4x8, 4x16, 4x32 hv
2229        ld1             {v1.8b},  [\xmy]
2230        sub             \src, \src, #2
2231        sub             \sr2, \src, \s_strd, lsl #1
2232        sub             \src, \sr2, \s_strd
2233        add             \ds2, \dst, \d_strd
2234        lsl             \s_strd, \s_strd, #1
2235        lsl             \d_strd, \d_strd, #1
2236        sxtl            v0.8h,   v0.8b
2237        sxtl            v1.8h,   v1.8b
2238        mov             x15, x30
2239
2240        ld1             {v25.8h}, [\src], \s_strd
2241        ext             v26.16b, v25.16b, v25.16b, #2
2242        ext             v27.16b, v25.16b, v25.16b, #4
2243        ext             v28.16b, v25.16b, v25.16b, #6
2244        smull           v25.4s,  v25.4h,  v0.h[0]
2245        smlal           v25.4s,  v26.4h,  v0.h[1]
2246        smlal           v25.4s,  v27.4h,  v0.h[2]
2247        smlal           v25.4s,  v28.4h,  v0.h[3]
2248        srshl           v16.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
2249        // The intermediates from the horizontal pass fit in 16 bit without
2250        // any bias; we could just as well keep them as .4s, but narrowing
2251        // them to .4h gives a significant speedup on out of order cores
2252        // (at the cost of a smaller slowdown on in-order cores such as A53).
2253        xtn             v16.4h,  v16.4s
2254
2255        bl              L(\type\()_8tap_filter_4)
2256        mov             v17.8b,  v24.8b
2257        mov             v18.8b,  v25.8b
2258        bl              L(\type\()_8tap_filter_4)
2259        mov             v19.8b,  v24.8b
2260        mov             v20.8b,  v25.8b
2261        bl              L(\type\()_8tap_filter_4)
2262        mov             v21.8b,  v24.8b
2263        mov             v22.8b,  v25.8b
2264
226548:
2266        bl              L(\type\()_8tap_filter_4)
2267        smull           v3.4s,   v16.4h,  v1.h[0]
2268        smlal           v3.4s,   v17.4h,  v1.h[1]
2269        smlal           v3.4s,   v18.4h,  v1.h[2]
2270        smlal           v3.4s,   v19.4h,  v1.h[3]
2271        smlal           v3.4s,   v20.4h,  v1.h[4]
2272        smlal           v3.4s,   v21.4h,  v1.h[5]
2273        smlal           v3.4s,   v22.4h,  v1.h[6]
2274        smlal           v3.4s,   v24.4h,  v1.h[7]
2275        smull           v4.4s,   v17.4h,  v1.h[0]
2276        smlal           v4.4s,   v18.4h,  v1.h[1]
2277        smlal           v4.4s,   v19.4h,  v1.h[2]
2278        smlal           v4.4s,   v20.4h,  v1.h[3]
2279        smlal           v4.4s,   v21.4h,  v1.h[4]
2280        smlal           v4.4s,   v22.4h,  v1.h[5]
2281        smlal           v4.4s,   v24.4h,  v1.h[6]
2282        smlal           v4.4s,   v25.4h,  v1.h[7]
2283.ifc \type, put
2284        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
2285        srshl           v4.4s,   v4.4s,   v29.4s // -(6+intermediate_bits)
2286        sqxtun          v3.4h,   v3.4s
2287        sqxtun2         v3.8h,   v4.4s
2288        umin            v3.8h,   v3.8h,   v31.8h
2289.else
2290        rshrn           v3.4h,   v3.4s,   #6
2291        rshrn2          v3.8h,   v4.4s,   #6
2292        sub             v3.8h,   v3.8h,   v29.8h // PREP_BIAS
2293.endif
2294        subs            \h,  \h,  #2
2295        st1             {v3.d}[0], [\dst], \d_strd
2296        st1             {v3.d}[1], [\ds2], \d_strd
2297        b.le            0f
2298        mov             v16.8b,  v18.8b
2299        mov             v17.8b,  v19.8b
2300        mov             v18.8b,  v20.8b
2301        mov             v19.8b,  v21.8b
2302        mov             v20.8b,  v22.8b
2303        mov             v21.8b,  v24.8b
2304        mov             v22.8b,  v25.8b
2305        b               48b
23060:
2307        br              x15
2308
2309L(\type\()_8tap_filter_4):
2310        ld1             {v24.8h}, [\sr2], \s_strd
2311        ld1             {v25.8h}, [\src], \s_strd
2312        ext             v26.16b, v24.16b, v24.16b, #2
2313        ext             v27.16b, v24.16b, v24.16b, #4
2314        ext             v28.16b, v24.16b, v24.16b, #6
2315        smull           v24.4s,  v24.4h,  v0.h[0]
2316        smlal           v24.4s,  v26.4h,  v0.h[1]
2317        smlal           v24.4s,  v27.4h,  v0.h[2]
2318        smlal           v24.4s,  v28.4h,  v0.h[3]
2319        ext             v26.16b, v25.16b, v25.16b, #2
2320        ext             v27.16b, v25.16b, v25.16b, #4
2321        ext             v28.16b, v25.16b, v25.16b, #6
2322        smull           v25.4s,  v25.4h,  v0.h[0]
2323        smlal           v25.4s,  v26.4h,  v0.h[1]
2324        smlal           v25.4s,  v27.4h,  v0.h[2]
2325        smlal           v25.4s,  v28.4h,  v0.h[3]
2326        srshl           v24.4s,  v24.4s,  v30.4s // -(6-intermediate_bits)
2327        srshl           v25.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
2328        xtn             v24.4h,  v24.4s
2329        xtn             v25.4h,  v25.4s
2330        ret
2331
233280:
2333160:
2334320:
2335        b.gt            880f
2336        add             \xmy,  \xmy,  #2
2337        ld1             {v0.8b},  [\xmx]
2338        ld1             {v1.s}[0],  [\xmy]
2339        sub             \src,  \src,  #6
2340        sub             \src,  \src,  \s_strd
2341        sxtl            v0.8h,   v0.8b
2342        sxtl            v1.8h,   v1.8b
2343        mov             x15, x30
2344        mov             \my, \h
2345
2346164:    // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
2347        add             \ds2,  \dst,  \d_strd
2348        add             \sr2,  \src,  \s_strd
2349        lsl             \d_strd, \d_strd, #1
2350        lsl             \s_strd, \s_strd, #1
2351
2352        ld1             {v27.8h, v28.8h},  [\src], \s_strd
2353        smull           v24.4s,  v27.4h,  v0.h[0]
2354        smull2          v25.4s,  v27.8h,  v0.h[0]
2355.irpc i, 1234567
2356        ext             v26.16b, v27.16b, v28.16b, #(2*\i)
2357        smlal           v24.4s,  v26.4h,  v0.h[\i]
2358        smlal2          v25.4s,  v26.8h,  v0.h[\i]
2359.endr
2360        srshl           v24.4s,  v24.4s,  v30.4s // -(6-intermediate_bits)
2361        srshl           v25.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
2362        // The intermediates from the horizontal pass fit in 16 bit without
2363        // any bias; we could just as well keep them as .4s, but narrowing
2364        // them to .4h gives a significant speedup on out of order cores
2365        // (at the cost of a smaller slowdown on in-order cores such as A53),
2366        // and conserves register space (no need to clobber v8-v15).
2367        xtn             v16.4h,  v24.4s
2368        xtn2            v16.8h,  v25.4s
2369
2370        bl              L(\type\()_8tap_filter_8)
2371        mov             v17.16b, v23.16b
2372        mov             v18.16b, v24.16b
2373
23748:
2375        smull           v2.4s,   v16.4h,  v1.h[0]
2376        smull2          v3.4s,   v16.8h,  v1.h[0]
2377        bl              L(\type\()_8tap_filter_8)
2378        smull           v4.4s,   v17.4h,  v1.h[0]
2379        smull2          v5.4s,   v17.8h,  v1.h[0]
2380        smlal           v2.4s,   v17.4h,  v1.h[1]
2381        smlal2          v3.4s,   v17.8h,  v1.h[1]
2382        smlal           v4.4s,   v18.4h,  v1.h[1]
2383        smlal2          v5.4s,   v18.8h,  v1.h[1]
2384        smlal           v2.4s,   v18.4h,  v1.h[2]
2385        smlal2          v3.4s,   v18.8h,  v1.h[2]
2386        smlal           v4.4s,   v23.4h,  v1.h[2]
2387        smlal2          v5.4s,   v23.8h,  v1.h[2]
2388        smlal           v2.4s,   v23.4h,  v1.h[3]
2389        smlal2          v3.4s,   v23.8h,  v1.h[3]
2390        smlal           v4.4s,   v24.4h,  v1.h[3]
2391        smlal2          v5.4s,   v24.8h,  v1.h[3]
2392.ifc \type, put
2393        srshl           v2.4s,   v2.4s,   v29.4s // -(6+intermediate_bits)
2394        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
2395        srshl           v4.4s,   v4.4s,   v29.4s // -(6+intermediate_bits)
2396        srshl           v5.4s,   v5.4s,   v29.4s // -(6+intermediate_bits)
2397        sqxtun          v2.4h,   v2.4s
2398        sqxtun2         v2.8h,   v3.4s
2399        sqxtun          v3.4h,   v4.4s
2400        sqxtun2         v3.8h,   v5.4s
2401        umin            v2.8h,   v2.8h,   v31.8h
2402        umin            v3.8h,   v3.8h,   v31.8h
2403.else
2404        rshrn           v2.4h,   v2.4s,   #6
2405        rshrn2          v2.8h,   v3.4s,   #6
2406        rshrn           v3.4h,   v4.4s,   #6
2407        rshrn2          v3.8h,   v5.4s,   #6
2408        sub             v2.8h,   v2.8h,   v29.8h // PREP_BIAS
2409        sub             v3.8h,   v3.8h,   v29.8h // PREP_BIAS
2410.endif
2411        subs            \h,  \h,  #2
2412        st1             {v2.8h}, [\dst], \d_strd
2413        st1             {v3.8h}, [\ds2], \d_strd
2414        b.le            9f
2415        mov             v16.16b, v18.16b
2416        mov             v17.16b, v23.16b
2417        mov             v18.16b, v24.16b
2418        b               8b
24199:
2420        subs            \w,  \w,  #8
2421        b.le            0f
2422        asr             \s_strd,  \s_strd,  #1
2423        asr             \d_strd,  \d_strd,  #1
2424        msub            \src,  \s_strd,  \xmy,  \src
2425        msub            \dst,  \d_strd,  \xmy,  \dst
2426        sub             \src,  \src,  \s_strd,  lsl #2
2427        mov             \h,  \my
2428        add             \src,  \src,  #16
2429        add             \dst,  \dst,  #16
2430        b               164b
2431
2432880:    // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
2433640:
24341280:
2435        ld1             {v0.8b},  [\xmx]
2436        ld1             {v1.8b},  [\xmy]
2437        sub             \src,  \src,  #6
2438        sub             \src,  \src,  \s_strd
2439        sub             \src,  \src,  \s_strd, lsl #1
2440        sxtl            v0.8h,   v0.8b
2441        sxtl            v1.8h,   v1.8b
2442        mov             x15, x30
2443        mov             \my, \h
2444
2445168:
2446        add             \ds2,  \dst,  \d_strd
2447        add             \sr2,  \src,  \s_strd
2448        lsl             \d_strd, \d_strd, #1
2449        lsl             \s_strd, \s_strd, #1
2450
2451        ld1             {v27.8h, v28.8h},  [\src], \s_strd
2452        smull           v24.4s,  v27.4h,  v0.h[0]
2453        smull2          v25.4s,  v27.8h,  v0.h[0]
2454.irpc i, 1234567
2455        ext             v26.16b, v27.16b, v28.16b, #(2*\i)
2456        smlal           v24.4s,  v26.4h,  v0.h[\i]
2457        smlal2          v25.4s,  v26.8h,  v0.h[\i]
2458.endr
2459        srshl           v24.4s,  v24.4s,  v30.4s // -(6-intermediate_bits)
2460        srshl           v25.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
2461        // The intermediates from the horizontal pass fit in 16 bit without
2462        // any bias; we could just as well keep them as .4s, but narrowing
2463        // them to .4h gives a significant speedup on out of order cores
2464        // (at the cost of a smaller slowdown on in-order cores such as A53),
2465        // and conserves register space (no need to clobber v8-v15).
2466        xtn             v16.4h,  v24.4s
2467        xtn2            v16.8h,  v25.4s
2468
2469        bl              L(\type\()_8tap_filter_8)
2470        mov             v17.16b, v23.16b
2471        mov             v18.16b, v24.16b
2472        bl              L(\type\()_8tap_filter_8)
2473        mov             v19.16b, v23.16b
2474        mov             v20.16b, v24.16b
2475        bl              L(\type\()_8tap_filter_8)
2476        mov             v21.16b, v23.16b
2477        mov             v22.16b, v24.16b
2478
247988:
2480        smull           v2.4s,   v16.4h,  v1.h[0]
2481        smull2          v3.4s,   v16.8h,  v1.h[0]
2482        bl              L(\type\()_8tap_filter_8)
2483        smull           v4.4s,   v17.4h,  v1.h[0]
2484        smull2          v5.4s,   v17.8h,  v1.h[0]
2485        smlal           v2.4s,   v17.4h,  v1.h[1]
2486        smlal2          v3.4s,   v17.8h,  v1.h[1]
2487        smlal           v4.4s,   v18.4h,  v1.h[1]
2488        smlal2          v5.4s,   v18.8h,  v1.h[1]
2489        smlal           v2.4s,   v18.4h,  v1.h[2]
2490        smlal2          v3.4s,   v18.8h,  v1.h[2]
2491        smlal           v4.4s,   v19.4h,  v1.h[2]
2492        smlal2          v5.4s,   v19.8h,  v1.h[2]
2493        smlal           v2.4s,   v19.4h,  v1.h[3]
2494        smlal2          v3.4s,   v19.8h,  v1.h[3]
2495        smlal           v4.4s,   v20.4h,  v1.h[3]
2496        smlal2          v5.4s,   v20.8h,  v1.h[3]
2497        smlal           v2.4s,   v20.4h,  v1.h[4]
2498        smlal2          v3.4s,   v20.8h,  v1.h[4]
2499        smlal           v4.4s,   v21.4h,  v1.h[4]
2500        smlal2          v5.4s,   v21.8h,  v1.h[4]
2501        smlal           v2.4s,   v21.4h,  v1.h[5]
2502        smlal2          v3.4s,   v21.8h,  v1.h[5]
2503        smlal           v4.4s,   v22.4h,  v1.h[5]
2504        smlal2          v5.4s,   v22.8h,  v1.h[5]
2505        smlal           v2.4s,   v22.4h,  v1.h[6]
2506        smlal2          v3.4s,   v22.8h,  v1.h[6]
2507        smlal           v4.4s,   v23.4h,  v1.h[6]
2508        smlal2          v5.4s,   v23.8h,  v1.h[6]
2509        smlal           v2.4s,   v23.4h,  v1.h[7]
2510        smlal2          v3.4s,   v23.8h,  v1.h[7]
2511        smlal           v4.4s,   v24.4h,  v1.h[7]
2512        smlal2          v5.4s,   v24.8h,  v1.h[7]
2513.ifc \type, put
2514        srshl           v2.4s,   v2.4s,   v29.4s // -(6+intermediate_bits)
2515        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
2516        srshl           v4.4s,   v4.4s,   v29.4s // -(6+intermediate_bits)
2517        srshl           v5.4s,   v5.4s,   v29.4s // -(6+intermediate_bits)
2518        sqxtun          v2.4h,   v2.4s
2519        sqxtun2         v2.8h,   v3.4s
2520        sqxtun          v3.4h,   v4.4s
2521        sqxtun2         v3.8h,   v5.4s
2522        umin            v2.8h,   v2.8h,   v31.8h
2523        umin            v3.8h,   v3.8h,   v31.8h
2524.else
2525        rshrn           v2.4h,   v2.4s,   #6
2526        rshrn2          v2.8h,   v3.4s,   #6
2527        rshrn           v3.4h,   v4.4s,   #6
2528        rshrn2          v3.8h,   v5.4s,   #6
2529        sub             v2.8h,   v2.8h,   v29.8h // PREP_BIAS
2530        sub             v3.8h,   v3.8h,   v29.8h // PREP_BIAS
2531.endif
2532        subs            \h,  \h,  #2
2533        st1             {v2.8h}, [\dst], \d_strd
2534        st1             {v3.8h}, [\ds2], \d_strd
2535        b.le            9f
2536        mov             v16.16b, v18.16b
2537        mov             v17.16b, v19.16b
2538        mov             v18.16b, v20.16b
2539        mov             v19.16b, v21.16b
2540        mov             v20.16b, v22.16b
2541        mov             v21.16b, v23.16b
2542        mov             v22.16b, v24.16b
2543        b               88b
25449:
2545        subs            \w,  \w,  #8
2546        b.le            0f
2547        asr             \s_strd,  \s_strd,  #1
2548        asr             \d_strd,  \d_strd,  #1
2549        msub            \src,  \s_strd,  \xmy,  \src
2550        msub            \dst,  \d_strd,  \xmy,  \dst
2551        sub             \src,  \src,  \s_strd,  lsl #3
2552        mov             \h,  \my
2553        add             \src,  \src,  #16
2554        add             \dst,  \dst,  #16
2555        b               168b
25560:
2557        br              x15
2558
2559L(\type\()_8tap_filter_8):
2560        ld1             {v4.8h, v5.8h},  [\sr2], \s_strd
2561        ld1             {v6.8h, v7.8h},  [\src], \s_strd
2562        smull           v25.4s,  v4.4h,   v0.h[0]
2563        smull2          v26.4s,  v4.8h,   v0.h[0]
2564        smull           v27.4s,  v6.4h,   v0.h[0]
2565        smull2          v28.4s,  v6.8h,   v0.h[0]
2566.irpc i, 1234567
2567        ext             v23.16b, v4.16b,  v5.16b,  #(2*\i)
2568        ext             v24.16b, v6.16b,  v7.16b,  #(2*\i)
2569        smlal           v25.4s,  v23.4h,  v0.h[\i]
2570        smlal2          v26.4s,  v23.8h,  v0.h[\i]
2571        smlal           v27.4s,  v24.4h,  v0.h[\i]
2572        smlal2          v28.4s,  v24.8h,  v0.h[\i]
2573.endr
2574        srshl           v25.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
2575        srshl           v26.4s,  v26.4s,  v30.4s // -(6-intermediate_bits)
2576        srshl           v27.4s,  v27.4s,  v30.4s // -(6-intermediate_bits)
2577        srshl           v28.4s,  v28.4s,  v30.4s // -(6-intermediate_bits)
2578        xtn             v23.4h,  v25.4s
2579        xtn2            v23.8h,  v26.4s
2580        xtn             v24.4h,  v27.4s
2581        xtn2            v24.8h,  v28.4s
2582        ret
2583
2584L(\type\()_8tap_hv_tbl):
2585        .hword L(\type\()_8tap_hv_tbl) - 1280b
2586        .hword L(\type\()_8tap_hv_tbl) -  640b
2587        .hword L(\type\()_8tap_hv_tbl) -  320b
2588        .hword L(\type\()_8tap_hv_tbl) -  160b
2589        .hword L(\type\()_8tap_hv_tbl) -   80b
2590        .hword L(\type\()_8tap_hv_tbl) -   40b
2591        .hword L(\type\()_8tap_hv_tbl) -   20b
2592        .hword 0
2593endfunc
2594
2595
2596function \type\()_bilin_16bpc_neon, export=1
2597.ifc \bdmax, w8
2598        ldr             w8,  [sp]
2599.endif
2600        dup             v1.8h,   \mx
2601        dup             v3.8h,   \my
2602        mov             w10, #16
2603        sub             w9,  w10, \mx
2604        sub             w10, w10, \my
2605        dup             v0.8h,   w9
2606        dup             v2.8h,   w10
2607.ifc \type, prep
2608        uxtw            \d_strd, \w
2609        lsl             \d_strd, \d_strd, #1
2610.endif
2611
2612        clz             \bdmax,   \bdmax       // bitdepth_max
2613        clz             w9,  \w
2614        sub             \bdmax,   \bdmax,  #18 // intermediate_bits = clz(bitdepth_max) - 18
2615        mov             w11, #4
2616        sub             w9,  w9,  #24
2617        sub             w11, w11, \bdmax  // 4 - intermediate_bits
2618        add             w12, \bdmax, #4   // 4 + intermediate_bits
2619        cbnz            \mx, L(\type\()_bilin_h)
2620        cbnz            \my, L(\type\()_bilin_v)
2621        b               \type\()_neon
2622
2623L(\type\()_bilin_h):
2624        cbnz            \my, L(\type\()_bilin_hv)
2625
2626        adr             x10, L(\type\()_bilin_h_tbl)
2627        dup             v31.8h,  w11      // 4 - intermediate_bits
2628        ldrh            w9,  [x10, x9, lsl #1]
2629        neg             v31.8h,  v31.8h   // -(4-intermediate_bits)
2630.ifc \type, put
2631        dup             v30.8h,  \bdmax   // intermediate_bits
2632.else
2633        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
2634.endif
2635        sub             x10, x10, w9, uxtw
2636.ifc \type, put
2637        neg             v30.8h,  v30.8h   // -intermediate_bits
2638.endif
2639        br              x10
2640
264120:     // 2xN h
2642.ifc \type, put
2643        add             \ds2,  \dst,  \d_strd
2644        add             \sr2,  \src,  \s_strd
2645        lsl             \d_strd,  \d_strd,  #1
2646        lsl             \s_strd,  \s_strd,  #1
26472:
2648        ld1             {v4.4h},  [\src], \s_strd
2649        ld1             {v6.4h},  [\sr2], \s_strd
2650        ext             v5.8b,   v4.8b,   v4.8b,   #2
2651        ext             v7.8b,   v6.8b,   v6.8b,   #2
2652        trn1            v4.2s,   v4.2s,   v6.2s
2653        trn1            v5.2s,   v5.2s,   v7.2s
2654        subs            \h,  \h,  #2
2655        mul             v4.4h,   v4.4h,   v0.4h
2656        mla             v4.4h,   v5.4h,   v1.4h
2657        urshl           v4.4h,   v4.4h,   v31.4h
2658        urshl           v4.4h,   v4.4h,   v30.4h
2659        st1             {v4.s}[0], [\dst], \d_strd
2660        st1             {v4.s}[1], [\ds2], \d_strd
2661        b.gt            2b
2662        ret
2663.endif
2664
266540:     // 4xN h
2666        add             \ds2,  \dst,  \d_strd
2667        add             \sr2,  \src,  \s_strd
2668        lsl             \d_strd,  \d_strd,  #1
2669        lsl             \s_strd,  \s_strd,  #1
26704:
2671        ld1             {v4.8h}, [\src], \s_strd
2672        ld1             {v6.8h}, [\sr2], \s_strd
2673        ext             v5.16b,  v4.16b,  v4.16b,  #2
2674        ext             v7.16b,  v6.16b,  v6.16b,  #2
2675        trn1            v4.2d,   v4.2d,   v6.2d
2676        trn1            v5.2d,   v5.2d,   v7.2d
2677        subs            \h,  \h,  #2
2678        mul             v4.8h,   v4.8h,   v0.8h
2679        mla             v4.8h,   v5.8h,   v1.8h
2680        urshl           v4.8h,   v4.8h,   v31.8h
2681.ifc \type, put
2682        urshl           v4.8h,   v4.8h,   v30.8h
2683.else
2684        sub             v4.8h,   v4.8h,   v29.8h
2685.endif
2686        st1             {v4.d}[0], [\dst], \d_strd
2687        st1             {v4.d}[1], [\ds2], \d_strd
2688        b.gt            4b
2689        ret
2690
269180:     // 8xN h
2692        add             \ds2,  \dst,  \d_strd
2693        add             \sr2,  \src,  \s_strd
2694        lsl             \d_strd,  \d_strd,  #1
2695        lsl             \s_strd,  \s_strd,  #1
26968:
2697        ldr             h5,  [\src, #16]
2698        ldr             h7,  [\sr2, #16]
2699        ld1             {v4.8h}, [\src], \s_strd
2700        ld1             {v6.8h}, [\sr2], \s_strd
2701        ext             v5.16b,  v4.16b,  v5.16b,  #2
2702        ext             v7.16b,  v6.16b,  v7.16b,  #2
2703        subs            \h,  \h,  #2
2704        mul             v4.8h,   v4.8h,   v0.8h
2705        mla             v4.8h,   v5.8h,   v1.8h
2706        mul             v6.8h,   v6.8h,   v0.8h
2707        mla             v6.8h,   v7.8h,   v1.8h
2708        urshl           v4.8h,   v4.8h,   v31.8h
2709        urshl           v6.8h,   v6.8h,   v31.8h
2710.ifc \type, put
2711        urshl           v4.8h,   v4.8h,   v30.8h
2712        urshl           v6.8h,   v6.8h,   v30.8h
2713.else
2714        sub             v4.8h,   v4.8h,   v29.8h
2715        sub             v6.8h,   v6.8h,   v29.8h
2716.endif
2717        st1             {v4.8h}, [\dst], \d_strd
2718        st1             {v6.8h}, [\ds2], \d_strd
2719        b.gt            8b
2720        ret
2721160:
2722320:
2723640:
27241280:   // 16xN, 32xN, ... h
2725        add             \ds2,  \dst,  \d_strd
2726        add             \sr2,  \src,  \s_strd
2727        lsl             \s_strd,  \s_strd,  #1
2728
2729        sub             \s_strd,  \s_strd,  \w, uxtw #1
2730        sub             \s_strd,  \s_strd,  #16
2731.ifc \type, put
2732        lsl             \d_strd,  \d_strd,  #1
2733        sub             \d_strd,  \d_strd,  \w, uxtw #1
2734.endif
2735161:
2736        ld1             {v16.8h},  [\src], #16
2737        ld1             {v21.8h},  [\sr2], #16
2738        mov             \mx, \w
2739
274016:
2741        ld1             {v17.8h, v18.8h},  [\src], #32
2742        ld1             {v22.8h, v23.8h},  [\sr2], #32
2743        ext             v19.16b, v16.16b, v17.16b, #2
2744        ext             v20.16b, v17.16b, v18.16b, #2
2745        ext             v24.16b, v21.16b, v22.16b, #2
2746        ext             v25.16b, v22.16b, v23.16b, #2
2747        mul             v16.8h,  v16.8h,  v0.8h
2748        mla             v16.8h,  v19.8h,  v1.8h
2749        mul             v17.8h,  v17.8h,  v0.8h
2750        mla             v17.8h,  v20.8h,  v1.8h
2751        mul             v21.8h,  v21.8h,  v0.8h
2752        mla             v21.8h,  v24.8h,  v1.8h
2753        mul             v22.8h,  v22.8h,  v0.8h
2754        mla             v22.8h,  v25.8h,  v1.8h
2755        urshl           v16.8h,  v16.8h,  v31.8h
2756        urshl           v17.8h,  v17.8h,  v31.8h
2757        urshl           v21.8h,  v21.8h,  v31.8h
2758        urshl           v22.8h,  v22.8h,  v31.8h
2759        subs            \mx, \mx, #16
2760.ifc \type, put
2761        urshl           v16.8h,  v16.8h,  v30.8h
2762        urshl           v17.8h,  v17.8h,  v30.8h
2763        urshl           v21.8h,  v21.8h,  v30.8h
2764        urshl           v22.8h,  v22.8h,  v30.8h
2765.else
2766        sub             v16.8h,  v16.8h,  v29.8h
2767        sub             v17.8h,  v17.8h,  v29.8h
2768        sub             v21.8h,  v21.8h,  v29.8h
2769        sub             v22.8h,  v22.8h,  v29.8h
2770.endif
2771        st1             {v16.8h, v17.8h}, [\dst], #32
2772        st1             {v21.8h, v22.8h}, [\ds2], #32
2773        b.le            9f
2774
2775        mov             v16.16b, v18.16b
2776        mov             v21.16b, v23.16b
2777        b               16b
2778
27799:
2780        add             \dst,  \dst,  \d_strd
2781        add             \ds2,  \ds2,  \d_strd
2782        add             \src,  \src,  \s_strd
2783        add             \sr2,  \sr2,  \s_strd
2784
2785        subs            \h,  \h,  #2
2786        b.gt            161b
2787        ret
2788
2789L(\type\()_bilin_h_tbl):
2790        .hword L(\type\()_bilin_h_tbl) - 1280b
2791        .hword L(\type\()_bilin_h_tbl) -  640b
2792        .hword L(\type\()_bilin_h_tbl) -  320b
2793        .hword L(\type\()_bilin_h_tbl) -  160b
2794        .hword L(\type\()_bilin_h_tbl) -   80b
2795        .hword L(\type\()_bilin_h_tbl) -   40b
2796        .hword L(\type\()_bilin_h_tbl) -   20b
2797        .hword 0
2798
2799
2800L(\type\()_bilin_v):
2801        cmp             \h,  #4
2802        adr             x10, L(\type\()_bilin_v_tbl)
2803.ifc \type, prep
2804        dup             v31.8h,  w11      // 4 - intermediate_bits
2805.endif
2806        ldrh            w9,  [x10, x9, lsl #1]
2807.ifc \type, prep
2808        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
2809        neg             v31.8h,  v31.8h   // -(4-intermediate_bits)
2810.endif
2811        sub             x10, x10, w9, uxtw
2812        br              x10
2813
281420:     // 2xN v
2815.ifc \type, put
2816        cmp             \h,  #2
2817        add             \ds2,  \dst,  \d_strd
2818        add             \sr2,  \src,  \s_strd
2819        lsl             \s_strd,  \s_strd,  #1
2820        lsl             \d_strd,  \d_strd,  #1
2821
2822        // 2x2 v
2823        ld1             {v16.s}[0], [\src], \s_strd
2824        b.gt            24f
2825        ld1             {v17.s}[0], [\sr2], \s_strd
2826        ld1             {v18.s}[0], [\src], \s_strd
2827        trn1            v16.2s,  v16.2s,  v17.2s
2828        trn1            v17.2s,  v17.2s,  v18.2s
2829        mul             v4.4h,   v16.4h,  v2.4h
2830        mla             v4.4h,   v17.4h,  v3.4h
2831        urshr           v4.8h,   v4.8h,   #4
2832        st1             {v4.s}[0], [\dst]
2833        st1             {v4.s}[1], [\ds2]
2834        ret
283524:     // 2x4, 2x8, ... v
2836        ld1             {v17.s}[0], [\sr2], \s_strd
2837        ld1             {v18.s}[0], [\src], \s_strd
2838        ld1             {v19.s}[0], [\sr2], \s_strd
2839        ld1             {v20.s}[0], [\src], \s_strd
2840        trn1            v16.2s,  v16.2s,  v17.2s
2841        trn1            v17.2s,  v17.2s,  v18.2s
2842        trn1            v18.2s,  v18.2s,  v19.2s
2843        trn1            v19.2s,  v19.2s,  v20.2s
2844        trn1            v16.2d,  v16.2d,  v18.2d
2845        trn1            v17.2d,  v17.2d,  v19.2d
2846        mul             v4.8h,   v16.8h,  v2.8h
2847        mla             v4.8h,   v17.8h,  v3.8h
2848        subs            \h,  \h,  #4
2849        urshr           v4.8h,   v4.8h,   #4
2850        st1             {v4.s}[0], [\dst], \d_strd
2851        st1             {v4.s}[1], [\ds2], \d_strd
2852        st1             {v4.s}[2], [\dst], \d_strd
2853        st1             {v4.s}[3], [\ds2], \d_strd
2854        b.le            0f
2855        mov             v16.8b,  v20.8b
2856        b               24b
28570:
2858        ret
2859.endif
2860
286140:     // 4xN v
2862        add             \ds2,  \dst,  \d_strd
2863        add             \sr2,  \src,  \s_strd
2864        lsl             \s_strd,  \s_strd,  #1
2865        lsl             \d_strd,  \d_strd,  #1
2866        ld1             {v16.4h}, [\src], \s_strd
28674:
2868        ld1             {v17.4h}, [\sr2], \s_strd
2869        ld1             {v18.4h}, [\src], \s_strd
2870        trn1            v16.2d,  v16.2d,  v17.2d
2871        trn1            v17.2d,  v17.2d,  v18.2d
2872        mul             v4.8h,   v16.8h,  v2.8h
2873        mla             v4.8h,   v17.8h,  v3.8h
2874        subs            \h,  \h,  #2
2875.ifc \type, put
2876        urshr           v4.8h,   v4.8h,   #4
2877.else
2878        urshl           v4.8h,   v4.8h,   v31.8h
2879        sub             v4.8h,   v4.8h,   v29.8h
2880.endif
2881        st1             {v4.d}[0], [\dst], \d_strd
2882        st1             {v4.d}[1], [\ds2], \d_strd
2883        b.le            0f
2884        mov             v16.8b,  v18.8b
2885        b               4b
28860:
2887        ret
2888
288980:     // 8xN v
2890        add             \ds2,  \dst,  \d_strd
2891        add             \sr2,  \src,  \s_strd
2892        lsl             \s_strd,  \s_strd,  #1
2893        lsl             \d_strd,  \d_strd,  #1
2894        ld1             {v16.8h}, [\src], \s_strd
28958:
2896        ld1             {v17.8h}, [\sr2], \s_strd
2897        ld1             {v18.8h}, [\src], \s_strd
2898        mul             v4.8h,   v16.8h,  v2.8h
2899        mla             v4.8h,   v17.8h,  v3.8h
2900        mul             v5.8h,   v17.8h,  v2.8h
2901        mla             v5.8h,   v18.8h,  v3.8h
2902        subs            \h,  \h,  #2
2903.ifc \type, put
2904        urshr           v4.8h,   v4.8h,   #4
2905        urshr           v5.8h,   v5.8h,   #4
2906.else
2907        urshl           v4.8h,   v4.8h,   v31.8h
2908        urshl           v5.8h,   v5.8h,   v31.8h
2909        sub             v4.8h,   v4.8h,   v29.8h
2910        sub             v5.8h,   v5.8h,   v29.8h
2911.endif
2912        st1             {v4.8h}, [\dst], \d_strd
2913        st1             {v5.8h}, [\ds2], \d_strd
2914        b.le            0f
2915        mov             v16.16b, v18.16b
2916        b               8b
29170:
2918        ret
2919
2920160:    // 16xN, 32xN, ...
2921320:
2922640:
29231280:
2924        mov             \my, \h
29251:
2926        add             \ds2, \dst, \d_strd
2927        add             \sr2, \src, \s_strd
2928        lsl             \s_strd, \s_strd, #1
2929        lsl             \d_strd, \d_strd, #1
2930
2931        ld1             {v16.8h, v17.8h}, [\src], \s_strd
29322:
2933        ld1             {v18.8h, v19.8h}, [\sr2], \s_strd
2934        ld1             {v20.8h, v21.8h}, [\src], \s_strd
2935        mul             v4.8h,   v16.8h,  v2.8h
2936        mla             v4.8h,   v18.8h,  v3.8h
2937        mul             v5.8h,   v17.8h,  v2.8h
2938        mla             v5.8h,   v19.8h,  v3.8h
2939        mul             v6.8h,   v18.8h,  v2.8h
2940        mla             v6.8h,   v20.8h,  v3.8h
2941        mul             v7.8h,   v19.8h,  v2.8h
2942        mla             v7.8h,   v21.8h,  v3.8h
2943        subs            \h,  \h,  #2
2944.ifc \type, put
2945        urshr           v4.8h,   v4.8h,   #4
2946        urshr           v5.8h,   v5.8h,   #4
2947        urshr           v6.8h,   v6.8h,   #4
2948        urshr           v7.8h,   v7.8h,   #4
2949.else
2950        urshl           v4.8h,   v4.8h,   v31.8h
2951        urshl           v5.8h,   v5.8h,   v31.8h
2952        urshl           v6.8h,   v6.8h,   v31.8h
2953        urshl           v7.8h,   v7.8h,   v31.8h
2954        sub             v4.8h,   v4.8h,   v29.8h
2955        sub             v5.8h,   v5.8h,   v29.8h
2956        sub             v6.8h,   v6.8h,   v29.8h
2957        sub             v7.8h,   v7.8h,   v29.8h
2958.endif
2959        st1             {v4.8h, v5.8h}, [\dst], \d_strd
2960        st1             {v6.8h, v7.8h}, [\ds2], \d_strd
2961        b.le            9f
2962        mov             v16.16b, v20.16b
2963        mov             v17.16b, v21.16b
2964        b               2b
29659:
2966        subs            \w,  \w,  #16
2967        b.le            0f
2968        asr             \s_strd, \s_strd, #1
2969        asr             \d_strd, \d_strd, #1
2970        msub            \src, \s_strd, \xmy, \src
2971        msub            \dst, \d_strd, \xmy, \dst
2972        sub             \src, \src, \s_strd, lsl #1
2973        mov             \h,  \my
2974        add             \src, \src, #32
2975        add             \dst, \dst, #32
2976        b               1b
29770:
2978        ret
2979
2980L(\type\()_bilin_v_tbl):
2981        .hword L(\type\()_bilin_v_tbl) - 1280b
2982        .hword L(\type\()_bilin_v_tbl) -  640b
2983        .hword L(\type\()_bilin_v_tbl) -  320b
2984        .hword L(\type\()_bilin_v_tbl) -  160b
2985        .hword L(\type\()_bilin_v_tbl) -   80b
2986        .hword L(\type\()_bilin_v_tbl) -   40b
2987        .hword L(\type\()_bilin_v_tbl) -   20b
2988        .hword 0
2989
2990L(\type\()_bilin_hv):
2991        adr             x10, L(\type\()_bilin_hv_tbl)
2992        dup             v31.8h,  w11      // 4 - intermediate_bits
2993        ldrh            w9,  [x10, x9, lsl #1]
2994        neg             v31.8h,  v31.8h   // -(4-intermediate_bits)
2995.ifc \type, put
2996        dup             v30.4s,  w12      // 4 + intermediate_bits
2997.else
2998        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
2999.endif
3000        sub             x10, x10, w9, uxtw
3001.ifc \type, put
3002        neg             v30.4s,  v30.4s   // -(4+intermediate_bits)
3003.endif
3004        br              x10
3005
300620:     // 2xN hv
3007.ifc \type, put
3008        add             \sr2, \src, \s_strd
3009        add             \ds2, \dst, \d_strd
3010        lsl             \s_strd, \s_strd, #1
3011        lsl             \d_strd, \d_strd, #1
3012
3013        ld1             {v20.4h},  [\src], \s_strd
3014        ext             v21.8b,  v20.8b,  v20.8b,  #2
3015        mul             v16.4h,  v20.4h,  v0.4h
3016        mla             v16.4h,  v21.4h,  v1.4h
3017        urshl           v16.4h,  v16.4h,  v31.4h
3018
30192:
3020        ld1             {v22.4h},  [\sr2], \s_strd
3021        ld1             {v24.4h},  [\src], \s_strd
3022        ext             v23.8b,  v22.8b,  v22.8b,  #2
3023        ext             v25.8b,  v24.8b,  v24.8b,  #2
3024        trn1            v22.2s,  v22.2s,  v24.2s
3025        trn1            v23.2s,  v23.2s,  v25.2s
3026        mul             v17.4h,  v22.4h,  v0.4h
3027        mla             v17.4h,  v23.4h,  v1.4h
3028        urshl           v17.4h,  v17.4h,  v31.4h
3029
3030        trn1            v16.2s,  v16.2s,  v17.2s
3031
3032        umull           v4.4s,   v16.4h,  v2.4h
3033        umlal           v4.4s,   v17.4h,  v3.4h
3034        urshl           v4.4s,   v4.4s,   v30.4s
3035        xtn             v4.4h,   v4.4s
3036        subs            \h,  \h,  #2
3037        st1             {v4.s}[0], [\dst], \d_strd
3038        st1             {v4.s}[1], [\ds2], \d_strd
3039        b.le            0f
3040        trn2            v16.2s,  v17.2s,  v17.2s
3041        b               2b
30420:
3043        ret
3044.endif
3045
304640:     // 4xN hv
3047        add             \sr2, \src, \s_strd
3048        add             \ds2, \dst, \d_strd
3049        lsl             \s_strd, \s_strd, #1
3050        lsl             \d_strd, \d_strd, #1
3051
3052        ld1             {v20.8h},  [\src], \s_strd
3053        ext             v21.16b, v20.16b, v20.16b, #2
3054        mul             v16.4h,  v20.4h,  v0.4h
3055        mla             v16.4h,  v21.4h,  v1.4h
3056        urshl           v16.4h,  v16.4h,  v31.4h
3057
30584:
3059        ld1             {v22.8h},  [\sr2], \s_strd
3060        ld1             {v24.8h},  [\src], \s_strd
3061        ext             v23.16b, v22.16b, v22.16b, #2
3062        ext             v25.16b, v24.16b, v24.16b, #2
3063        trn1            v22.2d,  v22.2d,  v24.2d
3064        trn1            v23.2d,  v23.2d,  v25.2d
3065        mul             v17.8h,  v22.8h,  v0.8h
3066        mla             v17.8h,  v23.8h,  v1.8h
3067        urshl           v17.8h,  v17.8h,  v31.8h
3068
3069        trn1            v16.2d,  v16.2d,  v17.2d
3070
3071        umull           v4.4s,   v16.4h,  v2.4h
3072        umlal           v4.4s,   v17.4h,  v3.4h
3073        umull2          v5.4s,   v16.8h,  v2.8h
3074        umlal2          v5.4s,   v17.8h,  v3.8h
3075.ifc \type, put
3076        urshl           v4.4s,   v4.4s,   v30.4s
3077        urshl           v5.4s,   v5.4s,   v30.4s
3078        xtn             v4.4h,   v4.4s
3079        xtn2            v4.8h,   v5.4s
3080.else
3081        rshrn           v4.4h,   v4.4s,   #4
3082        rshrn2          v4.8h,   v5.4s,   #4
3083        sub             v4.8h,   v4.8h,   v29.8h
3084.endif
3085        subs            \h,  \h,  #2
3086        st1             {v4.d}[0], [\dst], \d_strd
3087        st1             {v4.d}[1], [\ds2], \d_strd
3088        b.le            0f
3089        trn2            v16.2d,  v17.2d,  v17.2d
3090        b               4b
30910:
3092        ret
3093
309480:     // 8xN, 16xN, ... hv
3095160:
3096320:
3097640:
30981280:
3099        mov             \my, \h
3100
31011:
3102        add             \sr2, \src, \s_strd
3103        add             \ds2, \dst, \d_strd
3104        lsl             \s_strd, \s_strd, #1
3105        lsl             \d_strd, \d_strd, #1
3106
3107        ldr             h21, [\src, #16]
3108        ld1             {v20.8h},  [\src], \s_strd
3109        ext             v21.16b, v20.16b, v21.16b, #2
3110        mul             v16.8h,  v20.8h,  v0.8h
3111        mla             v16.8h,  v21.8h,  v1.8h
3112        urshl           v16.8h,  v16.8h,  v31.8h
3113
31142:
3115        ldr             h23, [\sr2, #16]
3116        ld1             {v22.8h},  [\sr2], \s_strd
3117        ldr             h25, [\src, #16]
3118        ld1             {v24.8h},  [\src], \s_strd
3119        ext             v23.16b, v22.16b, v23.16b, #2
3120        ext             v25.16b, v24.16b, v25.16b, #2
3121        mul             v17.8h,  v22.8h,  v0.8h
3122        mla             v17.8h,  v23.8h,  v1.8h
3123        mul             v18.8h,  v24.8h,  v0.8h
3124        mla             v18.8h,  v25.8h,  v1.8h
3125        urshl           v17.8h,  v17.8h,  v31.8h
3126        urshl           v18.8h,  v18.8h,  v31.8h
3127
3128        umull           v4.4s,   v16.4h,  v2.4h
3129        umlal           v4.4s,   v17.4h,  v3.4h
3130        umull2          v5.4s,   v16.8h,  v2.8h
3131        umlal2          v5.4s,   v17.8h,  v3.8h
3132        umull           v6.4s,   v17.4h,  v2.4h
3133        umlal           v6.4s,   v18.4h,  v3.4h
3134        umull2          v7.4s,   v17.8h,  v2.8h
3135        umlal2          v7.4s,   v18.8h,  v3.8h
3136.ifc \type, put
3137        urshl           v4.4s,   v4.4s,   v30.4s
3138        urshl           v5.4s,   v5.4s,   v30.4s
3139        urshl           v6.4s,   v6.4s,   v30.4s
3140        urshl           v7.4s,   v7.4s,   v30.4s
3141        xtn             v4.4h,   v4.4s
3142        xtn2            v4.8h,   v5.4s
3143        xtn             v5.4h,   v6.4s
3144        xtn2            v5.8h,   v7.4s
3145.else
3146        rshrn           v4.4h,   v4.4s,   #4
3147        rshrn2          v4.8h,   v5.4s,   #4
3148        rshrn           v5.4h,   v6.4s,   #4
3149        rshrn2          v5.8h,   v7.4s,   #4
3150        sub             v4.8h,   v4.8h,   v29.8h
3151        sub             v5.8h,   v5.8h,   v29.8h
3152.endif
3153        subs            \h,  \h,  #2
3154        st1             {v4.8h}, [\dst], \d_strd
3155        st1             {v5.8h}, [\ds2], \d_strd
3156        b.le            9f
3157        mov             v16.16b, v18.16b
3158        b               2b
31599:
3160        subs            \w,  \w,  #8
3161        b.le            0f
3162        asr             \s_strd,  \s_strd,  #1
3163        asr             \d_strd,  \d_strd,  #1
3164        msub            \src,  \s_strd,  \xmy,  \src
3165        msub            \dst,  \d_strd,  \xmy,  \dst
3166        sub             \src,  \src,  \s_strd,  lsl #1
3167        mov             \h,  \my
3168        add             \src,  \src,  #16
3169        add             \dst,  \dst,  #16
3170        b               1b
31710:
3172        ret
3173
3174L(\type\()_bilin_hv_tbl):
3175        .hword L(\type\()_bilin_hv_tbl) - 1280b
3176        .hword L(\type\()_bilin_hv_tbl) -  640b
3177        .hword L(\type\()_bilin_hv_tbl) -  320b
3178        .hword L(\type\()_bilin_hv_tbl) -  160b
3179        .hword L(\type\()_bilin_hv_tbl) -   80b
3180        .hword L(\type\()_bilin_hv_tbl) -   40b
3181        .hword L(\type\()_bilin_hv_tbl) -   20b
3182        .hword 0
3183endfunc
3184.endm
3185
3186filter_fn put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10
3187filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10
3188
3189.macro load_filter_row dst, src, inc
3190        asr             w13, \src, #10
3191        add             \src, \src, \inc
3192        ldr             \dst, [x11, w13, sxtw #3]
3193.endm
3194
3195function warp_filter_horz_neon
3196        add             w12, w5,  #512
3197
3198        ld1             {v16.8h, v17.8h}, [x2], x3
3199
3200        load_filter_row d0, w12, w7
3201        load_filter_row d1, w12, w7
3202        load_filter_row d2, w12, w7
3203        sxtl            v0.8h,   v0.8b
3204        load_filter_row d3, w12, w7
3205        sxtl            v1.8h,   v1.8b
3206        load_filter_row d4, w12, w7
3207        sxtl            v2.8h,   v2.8b
3208        load_filter_row d5, w12, w7
3209        sxtl            v3.8h,   v3.8b
3210        load_filter_row d6, w12, w7
3211        sxtl            v4.8h,   v4.8b
3212        load_filter_row d7, w12, w7
3213        sxtl            v5.8h,   v5.8b
3214        ext             v18.16b, v16.16b, v17.16b, #2*1
3215        smull           v8.4s,   v16.4h,  v0.4h
3216        smull2          v9.4s,   v16.8h,  v0.8h
3217        sxtl            v6.8h,   v6.8b
3218        ext             v19.16b, v16.16b, v17.16b, #2*2
3219        smull           v10.4s,  v18.4h,  v1.4h
3220        smull2          v11.4s,  v18.8h,  v1.8h
3221        sxtl            v7.8h,   v7.8b
3222        ext             v20.16b, v16.16b, v17.16b, #2*3
3223        smull           v0.4s,   v19.4h,  v2.4h
3224        smull2          v1.4s,   v19.8h,  v2.8h
3225        ext             v21.16b, v16.16b, v17.16b, #2*4
3226        addp            v8.4s,   v8.4s,   v9.4s
3227        smull           v2.4s,   v20.4h,  v3.4h
3228        smull2          v3.4s,   v20.8h,  v3.8h
3229        ext             v22.16b, v16.16b, v17.16b, #2*5
3230        addp            v9.4s,   v10.4s,  v11.4s
3231        smull           v10.4s,  v21.4h,  v4.4h
3232        smull2          v11.4s,  v21.8h,  v4.8h
3233        ext             v23.16b, v16.16b, v17.16b, #2*6
3234        addp            v0.4s,   v0.4s,   v1.4s
3235        smull           v18.4s,  v22.4h,  v5.4h
3236        smull2          v19.4s,  v22.8h,  v5.8h
3237        ext             v16.16b, v16.16b, v17.16b, #2*7
3238        addp            v1.4s,   v2.4s,   v3.4s
3239        addp            v2.4s,   v10.4s,  v11.4s
3240        smull           v20.4s,  v23.4h,  v6.4h
3241        smull2          v21.4s,  v23.8h,  v6.8h
3242        addp            v3.4s,   v18.4s,  v19.4s
3243        smull           v22.4s,  v16.4h,  v7.4h
3244        smull2          v23.4s,  v16.8h,  v7.8h
3245        addp            v4.4s,   v20.4s,  v21.4s
3246        addp            v5.4s,   v22.4s,  v23.4s
3247
3248        addp            v8.4s,   v8.4s,   v9.4s
3249        addp            v0.4s,   v0.4s,   v1.4s
3250        addp            v2.4s,   v2.4s,   v3.4s
3251        addp            v4.4s,   v4.4s,   v5.4s
3252
3253        addp            v16.4s,  v8.4s,   v0.4s
3254        addp            v17.4s,  v2.4s,   v4.4s
3255
3256        add             w5,  w5,  w8
3257
3258        srshl           v16.4s,  v16.4s,  v14.4s // -(7 - intermediate_bits)
3259        srshl           v17.4s,  v17.4s,  v14.4s // -(7 - intermediate_bits)
3260
3261        ret
3262endfunc
3263
3264// void dav1d_warp_affine_8x8_16bpc_neon(
3265//         pixel *dst, const ptrdiff_t dst_stride,
3266//         const pixel *src, const ptrdiff_t src_stride,
3267//         const int16_t *const abcd, int mx, int my,
3268//         const int bitdepth_max)
3269.macro warp t
3270function warp_affine_8x8\t\()_16bpc_neon, export=1
3271        stp             d8,  d9,  [sp, #-0x40]!
3272        stp             d10, d11, [sp, #0x10]
3273        stp             d12, d13, [sp, #0x20]
3274        stp             d14, d15, [sp, #0x30]
3275
3276.ifb \t
3277        dup             v15.8h,  w7        // bitdepth_max
3278.else
3279        movi            v15.8h,  #(PREP_BIAS >> 8), lsl #8
3280.endif
3281        clz             w7,  w7
3282                                           // intermediate_bits = clz(bitdepth_max) - 18
3283.ifb \t
3284        sub             w8,  w7,  #11      // 7 + intermediate_bits = clz(bitdepth_max) - 18 + 7
3285.endif
3286        sub             w7,  w7,  #25      // -(7 - intermediate_bits)
3287.ifb \t
3288        neg             w8,  w8            // -(7 + intermediate_bits)
3289.endif
3290        dup             v14.4s,  w7        // -(7 - intermediate_bits)
3291.ifb \t
3292        dup             v13.4s,  w8        // -(7 + intermediate_bits)
3293.endif
3294
3295        ldr             x4,  [x4]
3296        sbfx            x7,  x4, #0,  #16
3297        sbfx            x8,  x4, #16, #16
3298        sbfx            x9,  x4, #32, #16
3299        sbfx            x4,  x4, #48, #16
3300        mov             w10, #8
3301        sub             x2,  x2,  x3, lsl #1
3302        sub             x2,  x2,  x3
3303        sub             x2,  x2,  #6
3304        movrel          x11, X(mc_warp_filter), 64*8
3305        mov             x15, x30
3306.ifnb \t
3307        lsl             x1,  x1,  #1
3308.endif
3309
3310        bl              warp_filter_horz_neon
3311        xtn             v24.4h,  v16.4s
3312        xtn2            v24.8h,  v17.4s
3313        bl              warp_filter_horz_neon
3314        xtn             v25.4h,  v16.4s
3315        xtn2            v25.8h,  v17.4s
3316        bl              warp_filter_horz_neon
3317        xtn             v26.4h,  v16.4s
3318        xtn2            v26.8h,  v17.4s
3319        bl              warp_filter_horz_neon
3320        xtn             v27.4h,  v16.4s
3321        xtn2            v27.8h,  v17.4s
3322        bl              warp_filter_horz_neon
3323        xtn             v28.4h,  v16.4s
3324        xtn2            v28.8h,  v17.4s
3325        bl              warp_filter_horz_neon
3326        xtn             v29.4h,  v16.4s
3327        xtn2            v29.8h,  v17.4s
3328        bl              warp_filter_horz_neon
3329        xtn             v30.4h,  v16.4s
3330        xtn2            v30.8h,  v17.4s
3331
33321:
3333        add             w14, w6,  #512
3334        bl              warp_filter_horz_neon
3335        xtn             v31.4h,  v16.4s
3336        xtn2            v31.8h,  v17.4s
3337
3338        load_filter_row d0, w14, w9
3339        load_filter_row d1, w14, w9
3340        load_filter_row d2, w14, w9
3341        load_filter_row d3, w14, w9
3342        load_filter_row d4, w14, w9
3343        load_filter_row d5, w14, w9
3344        load_filter_row d6, w14, w9
3345        load_filter_row d7, w14, w9
3346        transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl
3347
3348        // This ordering of smull/smlal/smull2/smlal2 is highly
3349        // beneficial for Cortex A53 here.
3350        smull           v16.4s,  v24.4h,  v0.4h
3351        smlal           v16.4s,  v25.4h,  v1.4h
3352        smlal           v16.4s,  v26.4h,  v2.4h
3353        smlal           v16.4s,  v27.4h,  v3.4h
3354        smlal           v16.4s,  v28.4h,  v4.4h
3355        smlal           v16.4s,  v29.4h,  v5.4h
3356        smlal           v16.4s,  v30.4h,  v6.4h
3357        smlal           v16.4s,  v31.4h,  v7.4h
3358        smull2          v17.4s,  v24.8h,  v0.8h
3359        smlal2          v17.4s,  v25.8h,  v1.8h
3360        smlal2          v17.4s,  v26.8h,  v2.8h
3361        smlal2          v17.4s,  v27.8h,  v3.8h
3362        smlal2          v17.4s,  v28.8h,  v4.8h
3363        smlal2          v17.4s,  v29.8h,  v5.8h
3364        smlal2          v17.4s,  v30.8h,  v6.8h
3365        smlal2          v17.4s,  v31.8h,  v7.8h
3366
3367        mov             v24.16b, v25.16b
3368        mov             v25.16b, v26.16b
3369.ifb \t
3370        srshl           v16.4s,  v16.4s,  v13.4s // -(7 + intermediate_bits)
3371        srshl           v17.4s,  v17.4s,  v13.4s // -(7 + intermediate_bits)
3372.else
3373        rshrn           v16.4h,  v16.4s,  #7
3374        rshrn2          v16.8h,  v17.4s,  #7
3375.endif
3376        mov             v26.16b, v27.16b
3377.ifb \t
3378        sqxtun          v16.4h,  v16.4s
3379        sqxtun2         v16.8h,  v17.4s
3380.else
3381        sub             v16.8h,  v16.8h,  v15.8h // PREP_BIAS
3382.endif
3383        mov             v27.16b, v28.16b
3384        mov             v28.16b, v29.16b
3385.ifb \t
3386        umin            v16.8h,  v16.8h,  v15.8h // bitdepth_max
3387.endif
3388        mov             v29.16b, v30.16b
3389        mov             v30.16b, v31.16b
3390        subs            w10, w10, #1
3391        st1             {v16.8h}, [x0], x1
3392
3393        add             w6,  w6,  w4
3394        b.gt            1b
3395
3396        ldp             d14, d15, [sp, #0x30]
3397        ldp             d12, d13, [sp, #0x20]
3398        ldp             d10, d11, [sp, #0x10]
3399        ldp             d8,  d9,  [sp], 0x40
3400
3401        br              x15
3402endfunc
3403.endm
3404
3405warp
3406warp t
3407
3408// void dav1d_emu_edge_16bpc_neon(
3409//         const intptr_t bw, const intptr_t bh,
3410//         const intptr_t iw, const intptr_t ih,
3411//         const intptr_t x, const intptr_t y,
3412//         pixel *dst, const ptrdiff_t dst_stride,
3413//         const pixel *ref, const ptrdiff_t ref_stride)
3414function emu_edge_16bpc_neon, export=1
3415        ldp             x8,  x9,  [sp]
3416
3417        // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
3418        // ref += iclip(x, 0, iw - 1)
3419        sub             x12, x3,  #1           // ih - 1
3420        cmp             x5,  x3
3421        sub             x13, x2,  #1           // iw - 1
3422        csel            x12, x12, x5,  ge      // min(y, ih - 1)
3423        cmp             x4,  x2
3424        bic             x12, x12, x12, asr #63 // max(min(y, ih - 1), 0)
3425        csel            x13, x13, x4,  ge      // min(x, iw - 1)
3426        bic             x13, x13, x13, asr #63 // max(min(x, iw - 1), 0)
3427        madd            x8,  x12, x9,  x8      // ref += iclip() * stride
3428        add             x8,  x8,  x13, lsl #1  // ref += iclip()
3429
3430        // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
3431        // top_ext = iclip(-y, 0, bh - 1)
3432        add             x10, x5,  x1           // y + bh
3433        neg             x5,  x5                // -y
3434        sub             x10, x10, x3           // y + bh - ih
3435        sub             x12, x1,  #1           // bh - 1
3436        cmp             x10, x1
3437        bic             x5,  x5,  x5,  asr #63 // max(-y, 0)
3438        csel            x10, x10, x12, lt      // min(y + bh - ih, bh-1)
3439        cmp             x5,  x1
3440        bic             x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0)
3441        csel            x5,  x5,  x12, lt      // min(max(-y, 0), bh-1)
3442
3443        // right_ext = iclip(x + bw - iw, 0, bw - 1)
3444        // left_ext = iclip(-x, 0, bw - 1)
3445        add             x11, x4,  x0           // x + bw
3446        neg             x4,  x4                // -x
3447        sub             x11, x11, x2           // x + bw - iw
3448        sub             x13, x0,  #1           // bw - 1
3449        cmp             x11, x0
3450        bic             x4,  x4,  x4,  asr #63 // max(-x, 0)
3451        csel            x11, x11, x13, lt      // min(x + bw - iw, bw-1)
3452        cmp             x4,  x0
3453        bic             x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0)
3454        csel            x4,  x4,  x13, lt      // min(max(-x, 0), bw - 1)
3455
3456        // center_h = bh - top_ext - bottom_ext
3457        // dst += top_ext * PXSTRIDE(dst_stride)
3458        // center_w = bw - left_ext - right_ext
3459        sub             x1,  x1,  x5           // bh - top_ext
3460        madd            x6,  x5,  x7,  x6
3461        sub             x2,  x0,  x4           // bw - left_ext
3462        sub             x1,  x1,  x10          // center_h = bh - top_ext - bottom_ext
3463        sub             x2,  x2,  x11          // center_w = bw - left_ext - right_ext
3464
3465        mov             x14, x6                // backup of dst
3466
3467.macro v_loop need_left, need_right
34680:
3469.if \need_left
3470        ld1r            {v0.8h}, [x8]
3471        mov             x12, x6                // out = dst
3472        mov             x3,  x4
3473        mov             v1.16b,  v0.16b
34741:
3475        subs            x3,  x3,  #16
3476        st1             {v0.8h, v1.8h}, [x12], #32
3477        b.gt            1b
3478.endif
3479        mov             x13, x8
3480        add             x12, x6,  x4, lsl #1   // out = dst + left_ext
3481        mov             x3,  x2
34821:
3483        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x13], #64
3484        subs            x3,  x3,  #32
3485        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x12], #64
3486        b.gt            1b
3487.if \need_right
3488        add             x3,  x8,  x2, lsl #1   // in + center_w
3489        sub             x3,  x3,  #2           // in + center_w - 1
3490        add             x12, x6,  x4, lsl #1   // dst + left_ext
3491        ld1r            {v0.8h}, [x3]
3492        add             x12, x12, x2, lsl #1   // out = dst + left_ext + center_w
3493        mov             x3,  x11
3494        mov             v1.16b,  v0.16b
34951:
3496        subs            x3,  x3,  #16
3497        st1             {v0.8h, v1.8h}, [x12], #32
3498        b.gt            1b
3499.endif
3500
3501        subs            x1,  x1,  #1           // center_h--
3502        add             x6,  x6,  x7
3503        add             x8,  x8,  x9
3504        b.gt            0b
3505.endm
3506
3507        cbz             x4,  2f
3508        // need_left
3509        cbz             x11, 3f
3510        // need_left + need_right
3511        v_loop          1,   1
3512        b               5f
3513
35142:
3515        // !need_left
3516        cbz             x11, 4f
3517        // !need_left + need_right
3518        v_loop          0,   1
3519        b               5f
3520
35213:
3522        // need_left + !need_right
3523        v_loop          1,   0
3524        b               5f
3525
35264:
3527        // !need_left + !need_right
3528        v_loop          0,   0
3529
35305:
3531
3532        cbz             x10, 3f
3533        // need_bottom
3534        sub             x8,  x6,  x7           // ref = dst - stride
3535        mov             x4,  x0
35361:
3537        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x8], #64
3538        mov             x3,  x10
35392:
3540        subs            x3,  x3,  #1
3541        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7
3542        b.gt            2b
3543        msub            x6,  x7,  x10,  x6     // dst -= bottom_ext * stride
3544        subs            x4,  x4,  #32          // bw -= 32
3545        add             x6,  x6,  #64          // dst += 32
3546        b.gt            1b
3547
35483:
3549        cbz             x5,  3f
3550        // need_top
3551        msub            x6,  x7,  x5,  x14     // dst = stored_dst - top_ext * stride
35521:
3553        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x14], #64
3554        mov             x3,  x5
35552:
3556        subs            x3,  x3,  #1
3557        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7
3558        b.gt            2b
3559        msub            x6,  x7,  x5,  x6      // dst -= top_ext * stride
3560        subs            x0,  x0,  #32          // bw -= 32
3561        add             x6,  x6,  #64          // dst += 32
3562        b.gt            1b
3563
35643:
3565        ret
3566endfunc
3567