1/*
2 * Copyright © 2018, VideoLAN and dav1d authors
3 * Copyright © 2018, Janne Grunau
4 * Copyright © 2020, Martin Storsjo
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice, this
11 *    list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 *    this list of conditions and the following disclaimer in the documentation
15 *    and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
21 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#include "src/arm/asm.S"
30#include "util.S"
31
32#define PREP_BIAS 8192
33
34.macro avg d0, d1, t0, t1, t2, t3
35        ld1             {\t0\().8h,\t1\().8h},  [x2],  32
36        ld1             {\t2\().8h,\t3\().8h},  [x3],  32
37        sqadd           \t0\().8h,  \t0\().8h,  \t2\().8h
38        sqadd           \t1\().8h,  \t1\().8h,  \t3\().8h
39        smax            \t0\().8h,  \t0\().8h,  v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
40        smax            \t1\().8h,  \t1\().8h,  v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
41        sqsub           \t0\().8h,  \t0\().8h,  v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
42        sqsub           \t1\().8h,  \t1\().8h,  v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
43        sshl            \d0\().8h,  \t0\().8h,  v29.8h // -(intermediate_bits+1)
44        sshl            \d1\().8h,  \t1\().8h,  v29.8h // -(intermediate_bits+1)
45.endm
46
47.macro w_avg d0, d1, t0, t1, t2, t3
48        ld1             {\t0\().8h,\t1\().8h},  [x2],  32
49        ld1             {\t2\().8h,\t3\().8h},  [x3],  32
50        // This difference requires a 17 bit range, and all bits are
51        // significant for the following multiplication.
52        ssubl           \d0\().4s,  \t2\().4h,  \t0\().4h
53        ssubl2          \t0\().4s,  \t2\().8h,  \t0\().8h
54        ssubl           \d1\().4s,  \t3\().4h,  \t1\().4h
55        ssubl2          \t1\().4s,  \t3\().8h,  \t1\().8h
56        mul             \d0\().4s,  \d0\().4s,  v27.4s
57        mul             \t0\().4s,  \t0\().4s,  v27.4s
58        mul             \d1\().4s,  \d1\().4s,  v27.4s
59        mul             \t1\().4s,  \t1\().4s,  v27.4s
60        sshr            \d0\().4s,  \d0\().4s,  #4
61        sshr            \t0\().4s,  \t0\().4s,  #4
62        sshr            \d1\().4s,  \d1\().4s,  #4
63        sshr            \t1\().4s,  \t1\().4s,  #4
64        saddw           \d0\().4s,  \d0\().4s,  \t2\().4h
65        saddw2          \t0\().4s,  \t0\().4s,  \t2\().8h
66        saddw           \d1\().4s,  \d1\().4s,  \t3\().4h
67        saddw2          \t1\().4s,  \t1\().4s,  \t3\().8h
68        xtn             \d0\().4h,  \d0\().4s
69        xtn2            \d0\().8h,  \t0\().4s
70        xtn             \d1\().4h,  \d1\().4s
71        xtn2            \d1\().8h,  \t1\().4s
72        srshl           \d0\().8h,  \d0\().8h,  v29.8h // -intermediate_bits
73        srshl           \d1\().8h,  \d1\().8h,  v29.8h // -intermediate_bits
74        add             \d0\().8h,  \d0\().8h,  v28.8h // PREP_BIAS >> intermediate_bits
75        add             \d1\().8h,  \d1\().8h,  v28.8h // PREP_BIAS >> intermediate_bits
76        smin            \d0\().8h,  \d0\().8h,  v31.8h // bitdepth_max
77        smin            \d1\().8h,  \d1\().8h,  v31.8h // bitdepth_max
78        smax            \d0\().8h,  \d0\().8h,  v30.8h // 0
79        smax            \d1\().8h,  \d1\().8h,  v30.8h // 0
80.endm
81
82.macro mask d0, d1, t0, t1, t2, t3
83        ld1             {v27.16b}, [x6],  16
84        ld1             {\t0\().8h,\t1\().8h},  [x2],  32
85        neg             v27.16b, v27.16b
86        ld1             {\t2\().8h,\t3\().8h},  [x3],  32
87        sxtl            v26.8h,  v27.8b
88        sxtl2           v27.8h,  v27.16b
89        sxtl            v24.4s,  v26.4h
90        sxtl2           v25.4s,  v26.8h
91        sxtl            v26.4s,  v27.4h
92        sxtl2           v27.4s,  v27.8h
93        ssubl           \d0\().4s,  \t2\().4h,  \t0\().4h
94        ssubl2          \t0\().4s,  \t2\().8h,  \t0\().8h
95        ssubl           \d1\().4s,  \t3\().4h,  \t1\().4h
96        ssubl2          \t1\().4s,  \t3\().8h,  \t1\().8h
97        mul             \d0\().4s,  \d0\().4s,  v24.4s
98        mul             \t0\().4s,  \t0\().4s,  v25.4s
99        mul             \d1\().4s,  \d1\().4s,  v26.4s
100        mul             \t1\().4s,  \t1\().4s,  v27.4s
101        sshr            \d0\().4s,  \d0\().4s,  #6
102        sshr            \t0\().4s,  \t0\().4s,  #6
103        sshr            \d1\().4s,  \d1\().4s,  #6
104        sshr            \t1\().4s,  \t1\().4s,  #6
105        saddw           \d0\().4s,  \d0\().4s,  \t2\().4h
106        saddw2          \t0\().4s,  \t0\().4s,  \t2\().8h
107        saddw           \d1\().4s,  \d1\().4s,  \t3\().4h
108        saddw2          \t1\().4s,  \t1\().4s,  \t3\().8h
109        xtn             \d0\().4h,  \d0\().4s
110        xtn2            \d0\().8h,  \t0\().4s
111        xtn             \d1\().4h,  \d1\().4s
112        xtn2            \d1\().8h,  \t1\().4s
113        srshl           \d0\().8h,  \d0\().8h,  v29.8h // -intermediate_bits
114        srshl           \d1\().8h,  \d1\().8h,  v29.8h // -intermediate_bits
115        add             \d0\().8h,  \d0\().8h,  v28.8h // PREP_BIAS >> intermediate_bits
116        add             \d1\().8h,  \d1\().8h,  v28.8h // PREP_BIAS >> intermediate_bits
117        smin            \d0\().8h,  \d0\().8h,  v31.8h // bitdepth_max
118        smin            \d1\().8h,  \d1\().8h,  v31.8h // bitdepth_max
119        smax            \d0\().8h,  \d0\().8h,  v30.8h // 0
120        smax            \d1\().8h,  \d1\().8h,  v30.8h // 0
121.endm
122
123.macro bidir_fn type, bdmax
124function \type\()_16bpc_neon, export=1
125        clz             w4,  w4
126.ifnc \type, avg
127        dup             v31.8h,  \bdmax // bitdepth_max
128        movi            v30.8h,  #0
129.endif
130        clz             w7,  \bdmax
131        sub             w7,  w7,  #18   // intermediate_bits = clz(bitdepth_max) - 18
132.ifc \type, avg
133        mov             w9,  #1
134        mov             w8,  #-2*PREP_BIAS
135        lsl             w9,  w9,  w7    // 1 << intermediate_bits
136        add             w7,  w7,  #1
137        sub             w8,  w8,  w9    // -2*PREP_BIAS - 1 << intermediate_bits
138        neg             w7,  w7         // -(intermediate_bits+1)
139        dup             v28.8h,   w8    // -2*PREP_BIAS - 1 << intermediate_bits
140        dup             v29.8h,   w7    // -(intermediate_bits+1)
141.else
142        mov             w8,  #PREP_BIAS
143        lsr             w8,  w8,  w7    // PREP_BIAS >> intermediate_bits
144        neg             w7,  w7         // -intermediate_bits
145        dup             v28.8h,  w8     // PREP_BIAS >> intermediate_bits
146        dup             v29.8h,  w7     // -intermediate_bits
147.endif
148.ifc \type, w_avg
149        dup             v27.4s,  w6
150        neg             v27.4s,  v27.4s
151.endif
152        adr             x7,  L(\type\()_tbl)
153        sub             w4,  w4,  #24
154        \type           v4,  v5,  v0,  v1,  v2,  v3
155        ldrh            w4,  [x7, x4, lsl #1]
156        sub             x7,  x7,  w4, uxtw
157        br              x7
15840:
159        add             x7,  x0,  x1
160        lsl             x1,  x1,  #1
1614:
162        subs            w5,  w5,  #4
163        st1             {v4.d}[0],  [x0], x1
164        st1             {v4.d}[1],  [x7], x1
165        st1             {v5.d}[0],  [x0], x1
166        st1             {v5.d}[1],  [x7], x1
167        b.le            0f
168        \type           v4,  v5,  v0,  v1,  v2,  v3
169        b               4b
17080:
171        add             x7,  x0,  x1
172        lsl             x1,  x1,  #1
1738:
174        st1             {v4.8h},  [x0], x1
175        subs            w5,  w5,  #2
176        st1             {v5.8h},  [x7], x1
177        b.le            0f
178        \type           v4,  v5,  v0,  v1,  v2,  v3
179        b               8b
18016:
181        \type           v6,  v7,  v0,  v1,  v2,  v3
182        st1             {v4.8h, v5.8h}, [x0], x1
183        subs            w5,  w5,  #2
184        st1             {v6.8h, v7.8h}, [x0], x1
185        b.le            0f
186        \type           v4,  v5,  v0,  v1,  v2,  v3
187        b               16b
18832:
189        \type           v6,  v7,  v0,  v1,  v2,  v3
190        subs            w5,  w5,  #1
191        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x1
192        b.le            0f
193        \type           v4,  v5,  v0,  v1,  v2,  v3
194        b               32b
195640:
196        add             x7,  x0,  #64
19764:
198        \type           v6,  v7,  v0,  v1,  v2,  v3
199        \type           v16, v17, v0,  v1,  v2,  v3
200        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x1
201        \type           v18, v19, v0,  v1,  v2,  v3
202        subs            w5,  w5,  #1
203        st1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1
204        b.le            0f
205        \type           v4,  v5,  v0,  v1,  v2,  v3
206        b               64b
2071280:
208        add             x7,  x0,  #64
209        mov             x8,  #128
210        sub             x1,  x1,  #128
211128:
212        \type           v6,  v7,  v0,  v1,  v2,  v3
213        \type           v16, v17, v0,  v1,  v2,  v3
214        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x8
215        \type           v18, v19, v0,  v1,  v2,  v3
216        st1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x8
217        \type           v4,  v5,  v0,  v1,  v2,  v3
218        \type           v6,  v7,  v0,  v1,  v2,  v3
219        \type           v16, v17, v0,  v1,  v2,  v3
220        subs            w5,  w5,  #1
221        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x1
222        \type           v18, v19, v0,  v1,  v2,  v3
223        st1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1
224        b.le            0f
225        \type           v4,  v5,  v0,  v1,  v2,  v3
226        b               128b
2270:
228        ret
229L(\type\()_tbl):
230        .hword L(\type\()_tbl) - 1280b
231        .hword L(\type\()_tbl) -  640b
232        .hword L(\type\()_tbl) -   32b
233        .hword L(\type\()_tbl) -   16b
234        .hword L(\type\()_tbl) -   80b
235        .hword L(\type\()_tbl) -   40b
236endfunc
237.endm
238
239bidir_fn avg, w6
240bidir_fn w_avg, w7
241bidir_fn mask, w7
242
243
244.macro w_mask_fn type
245function w_mask_\type\()_16bpc_neon, export=1
246        ldr             w8,  [sp]
247        clz             w9,  w4
248        adr             x10, L(w_mask_\type\()_tbl)
249        dup             v31.8h,  w8   // bitdepth_max
250        sub             w9,  w9,  #24
251        clz             w8,  w8       // clz(bitdepth_max)
252        ldrh            w9,  [x10,  x9,  lsl #1]
253        sub             x10, x10, w9,  uxtw
254        sub             w8,  w8,  #12 // sh = intermediate_bits + 6 = clz(bitdepth_max) - 12
255        mov             w9,  #PREP_BIAS*64
256        neg             w8,  w8       // -sh
257        mov             w11, #27615   // (64 + 1 - 38)<<mask_sh - 1 - mask_rnd
258        dup             v30.4s,  w9   // PREP_BIAS*64
259        dup             v29.4s,  w8   // -sh
260        dup             v0.8h,   w11
261.if \type == 444
262        movi            v1.16b,  #64
263.elseif \type == 422
264        dup             v2.8b,   w7
265        movi            v3.8b,   #129
266        sub             v3.8b,   v3.8b,   v2.8b
267.elseif \type == 420
268        dup             v2.8h,   w7
269        movi            v3.8h,   #1, lsl #8
270        sub             v3.8h,   v3.8h,   v2.8h
271.endif
272        add             x12,  x0,  x1
273        lsl             x1,   x1,  #1
274        br              x10
2754:
276        ld1             {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once)
277        ld1             {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once)
278        subs            w5,  w5,  #4
279        sabd            v20.8h,  v4.8h,   v6.8h   // abs(tmp1 - tmp2)
280        sabd            v21.8h,  v5.8h,   v7.8h
281        ssubl           v16.4s,  v6.4h,   v4.4h   // tmp2 - tmp1 (requires 17 bit)
282        ssubl2          v17.4s,  v6.8h,   v4.8h
283        ssubl           v18.4s,  v7.4h,   v5.4h
284        ssubl2          v19.4s,  v7.8h,   v5.8h
285        uqsub           v20.8h,  v0.8h,   v20.8h  // 27615 - abs()
286        uqsub           v21.8h,  v0.8h,   v21.8h
287        sshll2          v7.4s,   v5.8h,   #6      // tmp1 << 6
288        sshll           v6.4s,   v5.4h,   #6
289        sshll2          v5.4s,   v4.8h,   #6
290        sshll           v4.4s,   v4.4h,   #6
291        ushr            v20.8h,  v20.8h,  #10     // 64-m = (27615 - abs()) >> mask_sh
292        ushr            v21.8h,  v21.8h,  #10
293        add             v4.4s,   v4.4s,   v30.4s  // += PREP_BIAS*64
294        add             v5.4s,   v5.4s,   v30.4s
295        add             v6.4s,   v6.4s,   v30.4s
296        add             v7.4s,   v7.4s,   v30.4s
297        uxtl            v22.4s,  v20.4h
298        uxtl2           v23.4s,  v20.8h
299        uxtl            v24.4s,  v21.4h
300        uxtl2           v25.4s,  v21.8h
301        mla             v4.4s,   v16.4s,  v22.4s  // (tmp2-tmp1)*(64-m)
302        mla             v5.4s,   v17.4s,  v23.4s
303        mla             v6.4s,   v18.4s,  v24.4s
304        mla             v7.4s,   v19.4s,  v25.4s
305        srshl           v4.4s,   v4.4s,   v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
306        srshl           v5.4s,   v5.4s,   v29.4s
307        srshl           v6.4s,   v6.4s,   v29.4s
308        srshl           v7.4s,   v7.4s,   v29.4s
309        sqxtun          v4.4h,   v4.4s            // iclip_pixel
310        sqxtun2         v4.8h,   v5.4s
311        sqxtun          v5.4h,   v6.4s
312        sqxtun2         v5.8h,   v7.4s
313        umin            v4.8h,   v4.8h,   v31.8h  // iclip_pixel
314        umin            v5.8h,   v5.8h,   v31.8h
315.if \type == 444
316        xtn             v20.8b,  v20.8h           // 64 - m
317        xtn2            v20.16b, v21.8h
318        sub             v20.16b, v1.16b,  v20.16b // m
319        st1             {v20.16b}, [x6], #16
320.elseif \type == 422
321        addp            v20.8h,  v20.8h,  v21.8h  // (64 - m) + (64 - n) (column wise addition)
322        xtn             v20.8b,  v20.8h
323        uhsub           v20.8b,  v3.8b,   v20.8b  // ((129 - sign) - ((64 - m) + (64 - n)) >> 1
324        st1             {v20.8b}, [x6], #8
325.elseif \type == 420
326        trn1            v24.2d,  v20.2d,  v21.2d
327        trn2            v25.2d,  v20.2d,  v21.2d
328        add             v24.8h,  v24.8h,  v25.8h  // (64 - my1) + (64 - my2) (row wise addition)
329        addp            v20.8h,  v24.8h,  v24.8h  // (128 - m) + (128 - n) (column wise addition)
330        sub             v20.4h,  v3.4h,   v20.4h  // (256 - sign) - ((128 - m) + (128 - n))
331        rshrn           v20.8b,  v20.8h,  #2      // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
332        st1             {v20.s}[0], [x6], #4
333.endif
334        st1             {v4.d}[0],  [x0],  x1
335        st1             {v4.d}[1],  [x12], x1
336        st1             {v5.d}[0],  [x0],  x1
337        st1             {v5.d}[1],  [x12], x1
338        b.gt            4b
339        ret
3408:
341        ld1             {v4.8h, v5.8h}, [x2], #32 // tmp1
342        ld1             {v6.8h, v7.8h}, [x3], #32 // tmp2
343        subs            w5,  w5,  #2
344        sabd            v20.8h,  v4.8h,   v6.8h   // abs(tmp1 - tmp2)
345        sabd            v21.8h,  v5.8h,   v7.8h
346        ssubl           v16.4s,  v6.4h,   v4.4h   // tmp2 - tmp1 (requires 17 bit)
347        ssubl2          v17.4s,  v6.8h,   v4.8h
348        ssubl           v18.4s,  v7.4h,   v5.4h
349        ssubl2          v19.4s,  v7.8h,   v5.8h
350        uqsub           v20.8h,  v0.8h,   v20.8h  // 27615 - abs()
351        uqsub           v21.8h,  v0.8h,   v21.8h
352        sshll2          v7.4s,   v5.8h,   #6      // tmp1 << 6
353        sshll           v6.4s,   v5.4h,   #6
354        sshll2          v5.4s,   v4.8h,   #6
355        sshll           v4.4s,   v4.4h,   #6
356        ushr            v20.8h,  v20.8h,  #10     // 64-m = (27615 - abs()) >> mask_sh
357        ushr            v21.8h,  v21.8h,  #10
358        add             v4.4s,   v4.4s,   v30.4s  // += PREP_BIAS*64
359        add             v5.4s,   v5.4s,   v30.4s
360        add             v6.4s,   v6.4s,   v30.4s
361        add             v7.4s,   v7.4s,   v30.4s
362        uxtl            v22.4s,  v20.4h
363        uxtl2           v23.4s,  v20.8h
364        uxtl            v24.4s,  v21.4h
365        uxtl2           v25.4s,  v21.8h
366        mla             v4.4s,   v16.4s,  v22.4s  // (tmp2-tmp1)*(64-m)
367        mla             v5.4s,   v17.4s,  v23.4s
368        mla             v6.4s,   v18.4s,  v24.4s
369        mla             v7.4s,   v19.4s,  v25.4s
370        srshl           v4.4s,   v4.4s,   v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
371        srshl           v5.4s,   v5.4s,   v29.4s
372        srshl           v6.4s,   v6.4s,   v29.4s
373        srshl           v7.4s,   v7.4s,   v29.4s
374        sqxtun          v4.4h,   v4.4s            // iclip_pixel
375        sqxtun2         v4.8h,   v5.4s
376        sqxtun          v5.4h,   v6.4s
377        sqxtun2         v5.8h,   v7.4s
378        umin            v4.8h,   v4.8h,   v31.8h  // iclip_pixel
379        umin            v5.8h,   v5.8h,   v31.8h
380.if \type == 444
381        xtn             v20.8b,  v20.8h           // 64 - m
382        xtn2            v20.16b, v21.8h
383        sub             v20.16b, v1.16b,  v20.16b // m
384        st1             {v20.16b}, [x6], #16
385.elseif \type == 422
386        addp            v20.8h,  v20.8h,  v21.8h  // (64 - m) + (64 - n) (column wise addition)
387        xtn             v20.8b,  v20.8h
388        uhsub           v20.8b,  v3.8b,   v20.8b  // ((129 - sign) - ((64 - m) + (64 - n)) >> 1
389        st1             {v20.8b}, [x6], #8
390.elseif \type == 420
391        add             v20.8h,  v20.8h,  v21.8h  // (64 - my1) + (64 - my2) (row wise addition)
392        addp            v20.8h,  v20.8h,  v20.8h  // (128 - m) + (128 - n) (column wise addition)
393        sub             v20.4h,  v3.4h,   v20.4h  // (256 - sign) - ((128 - m) + (128 - n))
394        rshrn           v20.8b,  v20.8h,  #2      // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
395        st1             {v20.s}[0], [x6], #4
396.endif
397        st1             {v4.8h}, [x0],  x1
398        st1             {v5.8h}, [x12], x1
399        b.gt            8b
400        ret
4011280:
402640:
403320:
404160:
405        mov             w11, w4
406        sub             x1,  x1,  w4,  uxtw #1
407.if \type == 444
408        add             x10, x6,  w4,  uxtw
409.elseif \type == 422
410        add             x10, x6,  x11, lsr #1
411.endif
412        add             x9,  x3,  w4,  uxtw #1
413        add             x7,  x2,  w4,  uxtw #1
414161:
415        mov             w8,  w4
41616:
417        ld1             {v4.8h,   v5.8h},  [x2], #32 // tmp1
418        ld1             {v16.8h,  v17.8h}, [x3], #32 // tmp2
419        ld1             {v6.8h,   v7.8h},  [x7], #32
420        ld1             {v18.8h,  v19.8h}, [x9], #32
421        subs            w8,  w8,  #16
422        sabd            v20.8h,  v4.8h,   v16.8h  // abs(tmp1 - tmp2)
423        sabd            v21.8h,  v5.8h,   v17.8h
424        ssubl           v22.4s,  v16.4h,  v4.4h   // tmp2 - tmp1 (requires 17 bit)
425        ssubl2          v23.4s,  v16.8h,  v4.8h
426        ssubl           v24.4s,  v17.4h,  v5.4h
427        ssubl2          v25.4s,  v17.8h,  v5.8h
428        uqsub           v20.8h,  v0.8h,   v20.8h  // 27615 - abs()
429        uqsub           v21.8h,  v0.8h,   v21.8h
430        sshll2          v27.4s,  v5.8h,   #6      // tmp1 << 6
431        sshll           v26.4s,  v5.4h,   #6
432        sshll2          v5.4s,   v4.8h,   #6
433        sshll           v4.4s,   v4.4h,   #6
434        ushr            v20.8h,  v20.8h,  #10     // 64-m = (27615 - abs()) >> mask_sh
435        ushr            v21.8h,  v21.8h,  #10
436        add             v4.4s,   v4.4s,   v30.4s  // += PREP_BIAS*64
437        add             v5.4s,   v5.4s,   v30.4s
438        add             v26.4s,  v26.4s,  v30.4s
439        add             v27.4s,  v27.4s,  v30.4s
440        uxtl            v16.4s,  v20.4h
441        uxtl2           v17.4s,  v20.8h
442        uxtl            v28.4s,  v21.4h
443        mla             v4.4s,   v22.4s,  v16.4s  // (tmp2-tmp1)*(64-m)
444        uxtl2           v16.4s,  v21.8h
445        mla             v5.4s,   v23.4s,  v17.4s
446        mla             v26.4s,  v24.4s,  v28.4s
447        mla             v27.4s,  v25.4s,  v16.4s
448        srshl           v4.4s,   v4.4s,   v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
449        srshl           v5.4s,   v5.4s,   v29.4s
450        srshl           v26.4s,  v26.4s,  v29.4s
451        srshl           v27.4s,  v27.4s,  v29.4s
452        sqxtun          v4.4h,   v4.4s            // iclip_pixel
453        sqxtun2         v4.8h,   v5.4s
454        sqxtun          v5.4h,   v26.4s
455        sqxtun2         v5.8h,   v27.4s
456
457        // Start of other half
458        sabd            v22.8h,  v6.8h,   v18.8h  // abs(tmp1 - tmp2)
459        sabd            v23.8h,  v7.8h,   v19.8h
460
461        umin            v4.8h,   v4.8h,   v31.8h  // iclip_pixel
462        umin            v5.8h,   v5.8h,   v31.8h
463
464        ssubl           v16.4s,  v18.4h,  v6.4h   // tmp2 - tmp1 (requires 17 bit)
465        ssubl2          v17.4s,  v18.8h,  v6.8h
466        ssubl           v18.4s,  v19.4h,  v7.4h
467        ssubl2          v19.4s,  v19.8h,  v7.8h
468        uqsub           v22.8h,  v0.8h,   v22.8h  // 27615 - abs()
469        uqsub           v23.8h,  v0.8h,   v23.8h
470        sshll           v24.4s,  v6.4h,   #6      // tmp1 << 6
471        sshll2          v25.4s,  v6.8h,   #6
472        sshll           v26.4s,  v7.4h,   #6
473        sshll2          v27.4s,  v7.8h,   #6
474        ushr            v22.8h,  v22.8h,  #10     // 64-m = (27615 - abs()) >> mask_sh
475        ushr            v23.8h,  v23.8h,  #10
476        add             v24.4s,  v24.4s,  v30.4s  // += PREP_BIAS*64
477        add             v25.4s,  v25.4s,  v30.4s
478        add             v26.4s,  v26.4s,  v30.4s
479        add             v27.4s,  v27.4s,  v30.4s
480        uxtl            v6.4s,   v22.4h
481        uxtl2           v7.4s,   v22.8h
482        uxtl            v28.4s,  v23.4h
483        mla             v24.4s,  v16.4s,  v6.4s   // (tmp2-tmp1)*(64-m)
484        uxtl2           v6.4s,   v23.8h
485        mla             v25.4s,  v17.4s,  v7.4s
486        mla             v26.4s,  v18.4s,  v28.4s
487        mla             v27.4s,  v19.4s,  v6.4s
488        srshl           v24.4s,  v24.4s,  v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
489        srshl           v25.4s,  v25.4s,  v29.4s
490        srshl           v26.4s,  v26.4s,  v29.4s
491        srshl           v27.4s,  v27.4s,  v29.4s
492        sqxtun          v6.4h,   v24.4s           // iclip_pixel
493        sqxtun2         v6.8h,   v25.4s
494        sqxtun          v7.4h,   v26.4s
495        sqxtun2         v7.8h,   v27.4s
496        umin            v6.8h,   v6.8h,   v31.8h  // iclip_pixel
497        umin            v7.8h,   v7.8h,   v31.8h
498.if \type == 444
499        xtn             v20.8b,  v20.8h           // 64 - m
500        xtn2            v20.16b, v21.8h
501        xtn             v21.8b,  v22.8h
502        xtn2            v21.16b, v23.8h
503        sub             v20.16b, v1.16b,  v20.16b // m
504        sub             v21.16b, v1.16b,  v21.16b
505        st1             {v20.16b}, [x6],  #16
506        st1             {v21.16b}, [x10], #16
507.elseif \type == 422
508        addp            v20.8h,  v20.8h,  v21.8h  // (64 - m) + (64 - n) (column wise addition)
509        addp            v21.8h,  v22.8h,  v23.8h
510        xtn             v20.8b,  v20.8h
511        xtn             v21.8b,  v21.8h
512        uhsub           v20.8b,  v3.8b,   v20.8b  // ((129 - sign) - ((64 - m) + (64 - n)) >> 1
513        uhsub           v21.8b,  v3.8b,   v21.8b
514        st1             {v20.8b}, [x6],  #8
515        st1             {v21.8b}, [x10], #8
516.elseif \type == 420
517        add             v20.8h,  v20.8h,  v22.8h  // (64 - my1) + (64 - my2) (row wise addition)
518        add             v21.8h,  v21.8h,  v23.8h
519        addp            v20.8h,  v20.8h,  v21.8h  // (128 - m) + (128 - n) (column wise addition)
520        sub             v20.8h,  v3.8h,   v20.8h  // (256 - sign) - ((128 - m) + (128 - n))
521        rshrn           v20.8b,  v20.8h,  #2      // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
522        st1             {v20.8b}, [x6], #8
523.endif
524        st1             {v4.8h, v5.8h}, [x0],  #32
525        st1             {v6.8h, v7.8h}, [x12], #32
526        b.gt            16b
527        subs            w5,  w5,  #2
528        add             x2,  x2,  w4,  uxtw #1
529        add             x3,  x3,  w4,  uxtw #1
530        add             x7,  x7,  w4,  uxtw #1
531        add             x9,  x9,  w4,  uxtw #1
532.if \type == 444
533        add             x6,  x6,  w4,  uxtw
534        add             x10, x10, w4,  uxtw
535.elseif \type == 422
536        add             x6,  x6,  x11, lsr #1
537        add             x10, x10, x11, lsr #1
538.endif
539        add             x0,  x0,  x1
540        add             x12, x12, x1
541        b.gt            161b
542        ret
543L(w_mask_\type\()_tbl):
544        .hword L(w_mask_\type\()_tbl) - 1280b
545        .hword L(w_mask_\type\()_tbl) -  640b
546        .hword L(w_mask_\type\()_tbl) -  320b
547        .hword L(w_mask_\type\()_tbl) -  160b
548        .hword L(w_mask_\type\()_tbl) -    8b
549        .hword L(w_mask_\type\()_tbl) -    4b
550endfunc
551.endm
552
553w_mask_fn 444
554w_mask_fn 422
555w_mask_fn 420
556
557
558function blend_16bpc_neon, export=1
559        adr             x6,  L(blend_tbl)
560        clz             w3,  w3
561        sub             w3,  w3,  #26
562        ldrh            w3,  [x6,  x3,  lsl #1]
563        sub             x6,  x6,  w3,  uxtw
564        add             x8,  x0,  x1
565        br              x6
56640:
567        lsl             x1,  x1,  #1
5684:
569        ld1             {v2.8b},   [x5], #8
570        ld1             {v1.8h},   [x2], #16
571        ld1             {v0.d}[0], [x0]
572        neg             v2.8b,   v2.8b            // -m
573        subs            w4,  w4,  #2
574        ld1             {v0.d}[1], [x8]
575        sxtl            v2.8h,   v2.8b
576        shl             v2.8h,   v2.8h,   #9      // -m << 9
577        sub             v1.8h,   v0.8h,   v1.8h   // a - b
578        sqrdmulh        v1.8h,   v1.8h,   v2.8h   // ((a-b)*-m + 32) >> 6
579        add             v0.8h,   v0.8h,   v1.8h
580        st1             {v0.d}[0], [x0], x1
581        st1             {v0.d}[1], [x8], x1
582        b.gt            4b
583        ret
58480:
585        lsl             x1,  x1,  #1
5868:
587        ld1             {v4.16b},       [x5], #16
588        ld1             {v2.8h, v3.8h}, [x2], #32
589        neg             v5.16b,  v4.16b           // -m
590        ld1             {v0.8h},   [x0]
591        ld1             {v1.8h},   [x8]
592        sxtl            v4.8h,   v5.8b
593        sxtl2           v5.8h,   v5.16b
594        shl             v4.8h,   v4.8h,   #9      // -m << 9
595        shl             v5.8h,   v5.8h,   #9
596        sub             v2.8h,   v0.8h,   v2.8h   // a - b
597        sub             v3.8h,   v1.8h,   v3.8h
598        subs            w4,  w4,  #2
599        sqrdmulh        v2.8h,   v2.8h,   v4.8h   // ((a-b)*-m + 32) >> 6
600        sqrdmulh        v3.8h,   v3.8h,   v5.8h
601        add             v0.8h,   v0.8h,   v2.8h
602        add             v1.8h,   v1.8h,   v3.8h
603        st1             {v0.8h}, [x0], x1
604        st1             {v1.8h}, [x8], x1
605        b.gt            8b
606        ret
607160:
608        lsl             x1,  x1,  #1
60916:
610        ld1             {v16.16b, v17.16b},           [x5], #32
611        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
612        subs            w4,  w4,  #2
613        neg             v18.16b, v16.16b          // -m
614        neg             v19.16b, v17.16b
615        ld1             {v0.8h, v1.8h}, [x0]
616        sxtl            v16.8h,  v18.8b
617        sxtl2           v17.8h,  v18.16b
618        sxtl            v18.8h,  v19.8b
619        sxtl2           v19.8h,  v19.16b
620        ld1             {v2.8h, v3.8h}, [x8]
621        shl             v16.8h,  v16.8h,  #9      // -m << 9
622        shl             v17.8h,  v17.8h,  #9
623        shl             v18.8h,  v18.8h,  #9
624        shl             v19.8h,  v19.8h,  #9
625        sub             v4.8h,   v0.8h,   v4.8h   // a - b
626        sub             v5.8h,   v1.8h,   v5.8h
627        sub             v6.8h,   v2.8h,   v6.8h
628        sub             v7.8h,   v3.8h,   v7.8h
629        sqrdmulh        v4.8h,   v4.8h,   v16.8h  // ((a-b)*-m + 32) >> 6
630        sqrdmulh        v5.8h,   v5.8h,   v17.8h
631        sqrdmulh        v6.8h,   v6.8h,   v18.8h
632        sqrdmulh        v7.8h,   v7.8h,   v19.8h
633        add             v0.8h,   v0.8h,   v4.8h
634        add             v1.8h,   v1.8h,   v5.8h
635        add             v2.8h,   v2.8h,   v6.8h
636        add             v3.8h,   v3.8h,   v7.8h
637        st1             {v0.8h, v1.8h}, [x0], x1
638        st1             {v2.8h, v3.8h}, [x8], x1
639        b.gt            16b
640        ret
64132:
642        ld1             {v16.16b, v17.16b},           [x5], #32
643        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
644        subs            w4,  w4,  #1
645        neg             v18.16b, v16.16b          // -m
646        neg             v19.16b, v17.16b
647        sxtl            v16.8h,  v18.8b
648        sxtl2           v17.8h,  v18.16b
649        sxtl            v18.8h,  v19.8b
650        sxtl2           v19.8h,  v19.16b
651        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
652        shl             v16.8h,  v16.8h,  #9      // -m << 9
653        shl             v17.8h,  v17.8h,  #9
654        shl             v18.8h,  v18.8h,  #9
655        shl             v19.8h,  v19.8h,  #9
656        sub             v4.8h,   v0.8h,   v4.8h   // a - b
657        sub             v5.8h,   v1.8h,   v5.8h
658        sub             v6.8h,   v2.8h,   v6.8h
659        sub             v7.8h,   v3.8h,   v7.8h
660        sqrdmulh        v4.8h,   v4.8h,   v16.8h  // ((a-b)*-m + 32) >> 6
661        sqrdmulh        v5.8h,   v5.8h,   v17.8h
662        sqrdmulh        v6.8h,   v6.8h,   v18.8h
663        sqrdmulh        v7.8h,   v7.8h,   v19.8h
664        add             v0.8h,   v0.8h,   v4.8h
665        add             v1.8h,   v1.8h,   v5.8h
666        add             v2.8h,   v2.8h,   v6.8h
667        add             v3.8h,   v3.8h,   v7.8h
668        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
669        b.gt            32b
670        ret
671L(blend_tbl):
672        .hword L(blend_tbl) -  32b
673        .hword L(blend_tbl) - 160b
674        .hword L(blend_tbl) -  80b
675        .hword L(blend_tbl) -  40b
676endfunc
677
678function blend_h_16bpc_neon, export=1
679        adr             x6,  L(blend_h_tbl)
680        movrel          x5,  X(obmc_masks)
681        add             x5,  x5,  w4,  uxtw
682        sub             w4,  w4,  w4,  lsr #2
683        clz             w7,  w3
684        add             x8,  x0,  x1
685        lsl             x1,  x1,  #1
686        sub             w7,  w7,  #24
687        ldrh            w7,  [x6,  x7,  lsl #1]
688        sub             x6,  x6,  w7, uxtw
689        br              x6
6902:
691        ld2r            {v2.8b, v3.8b}, [x5], #2
692        ld1             {v1.4h},        [x2], #8
693        ext             v2.8b,   v2.8b,   v3.8b,   #6
694        subs            w4,  w4,  #2
695        neg             v2.8b,   v2.8b            // -m
696        ld1             {v0.s}[0], [x0]
697        ld1             {v0.s}[1], [x8]
698        sxtl            v2.8h,   v2.8b
699        shl             v2.4h,   v2.4h,   #9      // -m << 9
700        sub             v1.4h,   v0.4h,   v1.4h   // a - b
701        sqrdmulh        v1.4h,   v1.4h,   v2.4h   // ((a-b)*-m + 32) >> 6
702        add             v0.4h,   v0.4h,   v1.4h
703        st1             {v0.s}[0], [x0], x1
704        st1             {v0.s}[1], [x8], x1
705        b.gt            2b
706        ret
7074:
708        ld2r            {v2.8b, v3.8b}, [x5], #2
709        ld1             {v1.8h},        [x2], #16
710        ext             v2.8b,   v2.8b,   v3.8b,   #4
711        subs            w4,  w4,  #2
712        neg             v2.8b,   v2.8b            // -m
713        ld1             {v0.d}[0],   [x0]
714        ld1             {v0.d}[1],   [x8]
715        sxtl            v2.8h,   v2.8b
716        shl             v2.8h,   v2.8h,   #9      // -m << 9
717        sub             v1.8h,   v0.8h,   v1.8h   // a - b
718        sqrdmulh        v1.8h,   v1.8h,   v2.8h   // ((a-b)*-m + 32) >> 6
719        add             v0.8h,   v0.8h,   v1.8h
720        st1             {v0.d}[0], [x0], x1
721        st1             {v0.d}[1], [x8], x1
722        b.gt            4b
723        ret
7248:
725        ld2r            {v4.8b, v5.8b}, [x5], #2
726        ld1             {v2.8h, v3.8h}, [x2], #32
727        neg             v4.8b,   v4.8b            // -m
728        neg             v5.8b,   v5.8b
729        ld1             {v0.8h}, [x0]
730        subs            w4,  w4,  #2
731        sxtl            v4.8h,   v4.8b
732        sxtl            v5.8h,   v5.8b
733        ld1             {v1.8h}, [x8]
734        shl             v4.8h,   v4.8h,   #9      // -m << 9
735        shl             v5.8h,   v5.8h,   #9
736        sub             v2.8h,   v0.8h,   v2.8h   // a - b
737        sub             v3.8h,   v1.8h,   v3.8h
738        sqrdmulh        v2.8h,   v2.8h,   v4.8h   // ((a-b)*-m + 32) >> 6
739        sqrdmulh        v3.8h,   v3.8h,   v5.8h
740        add             v0.8h,   v0.8h,   v2.8h
741        add             v1.8h,   v1.8h,   v3.8h
742        st1             {v0.8h}, [x0], x1
743        st1             {v1.8h}, [x8], x1
744        b.gt            8b
745        ret
74616:
747        ld2r            {v16.8b, v17.8b}, [x5], #2
748        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
749        neg             v16.8b,  v16.8b           // -m
750        neg             v17.8b,  v17.8b
751        ld1             {v0.8h, v1.8h},  [x0]
752        ld1             {v2.8h, v3.8h},  [x8]
753        subs            w4,  w4,  #2
754        sxtl            v16.8h,  v16.8b
755        sxtl            v17.8h,  v17.8b
756        shl             v16.8h,  v16.8h,  #9      // -m << 9
757        shl             v17.8h,  v17.8h,  #9
758        sub             v4.8h,   v0.8h,   v4.8h   // a - b
759        sub             v5.8h,   v1.8h,   v5.8h
760        sub             v6.8h,   v2.8h,   v6.8h
761        sub             v7.8h,   v3.8h,   v7.8h
762        sqrdmulh        v4.8h,   v4.8h,   v16.8h  // ((a-b)*-m + 32) >> 6
763        sqrdmulh        v5.8h,   v5.8h,   v16.8h
764        sqrdmulh        v6.8h,   v6.8h,   v17.8h
765        sqrdmulh        v7.8h,   v7.8h,   v17.8h
766        add             v0.8h,   v0.8h,   v4.8h
767        add             v1.8h,   v1.8h,   v5.8h
768        add             v2.8h,   v2.8h,   v6.8h
769        add             v3.8h,   v3.8h,   v7.8h
770        st1             {v0.8h, v1.8h}, [x0], x1
771        st1             {v2.8h, v3.8h}, [x8], x1
772        b.gt            16b
773        ret
7741280:
775640:
776320:
777        sub             x1,  x1,  w3,  uxtw #1
778        add             x7,  x2,  w3,  uxtw #1
779321:
780        ld2r            {v24.8b, v25.8b}, [x5], #2
781        mov             w6,  w3
782        neg             v24.8b,  v24.8b           // -m
783        neg             v25.8b,  v25.8b
784        sxtl            v24.8h,  v24.8b
785        sxtl            v25.8h,  v25.8b
786        shl             v24.8h,  v24.8h,  #9      // -m << 9
787        shl             v25.8h,  v25.8h,  #9
78832:
789        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64
790        ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x0]
791        subs            w6,  w6,  #32
792        sub             v16.8h,  v0.8h,   v16.8h  // a - b
793        sub             v17.8h,  v1.8h,   v17.8h
794        sub             v18.8h,  v2.8h,   v18.8h
795        sub             v19.8h,  v3.8h,   v19.8h
796        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
797        ld1             {v4.8h,  v5.8h,  v6.8h,  v7.8h},  [x8]
798        sqrdmulh        v16.8h,  v16.8h,  v24.8h  // ((a-b)*-m + 32) >> 6
799        sqrdmulh        v17.8h,  v17.8h,  v24.8h
800        sqrdmulh        v18.8h,  v18.8h,  v24.8h
801        sqrdmulh        v19.8h,  v19.8h,  v24.8h
802        sub             v20.8h,  v4.8h,   v20.8h  // a - b
803        sub             v21.8h,  v5.8h,   v21.8h
804        sub             v22.8h,  v6.8h,   v22.8h
805        sub             v23.8h,  v7.8h,   v23.8h
806        add             v0.8h,   v0.8h,   v16.8h
807        add             v1.8h,   v1.8h,   v17.8h
808        add             v2.8h,   v2.8h,   v18.8h
809        add             v3.8h,   v3.8h,   v19.8h
810        sqrdmulh        v20.8h,  v20.8h,  v25.8h  // ((a-b)*-m + 32) >> 6
811        sqrdmulh        v21.8h,  v21.8h,  v25.8h
812        sqrdmulh        v22.8h,  v22.8h,  v25.8h
813        sqrdmulh        v23.8h,  v23.8h,  v25.8h
814        st1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x0], #64
815        add             v4.8h,   v4.8h,   v20.8h
816        add             v5.8h,   v5.8h,   v21.8h
817        add             v6.8h,   v6.8h,   v22.8h
818        add             v7.8h,   v7.8h,   v23.8h
819        st1             {v4.8h,  v5.8h,  v6.8h,  v7.8h},  [x8], #64
820        b.gt            32b
821        subs            w4,  w4,  #2
822        add             x0,  x0,  x1
823        add             x8,  x8,  x1
824        add             x2,  x2,  w3,  uxtw #1
825        add             x7,  x7,  w3,  uxtw #1
826        b.gt            321b
827        ret
828L(blend_h_tbl):
829        .hword L(blend_h_tbl) - 1280b
830        .hword L(blend_h_tbl) -  640b
831        .hword L(blend_h_tbl) -  320b
832        .hword L(blend_h_tbl) -   16b
833        .hword L(blend_h_tbl) -    8b
834        .hword L(blend_h_tbl) -    4b
835        .hword L(blend_h_tbl) -    2b
836endfunc
837
838function blend_v_16bpc_neon, export=1
839        adr             x6,  L(blend_v_tbl)
840        movrel          x5,  X(obmc_masks)
841        add             x5,  x5,  w3,  uxtw
842        clz             w3,  w3
843        add             x8,  x0,  x1
844        lsl             x1,  x1,  #1
845        sub             w3,  w3,  #26
846        ldrh            w3,  [x6,  x3,  lsl #1]
847        sub             x6,  x6,  w3,  uxtw
848        br              x6
84920:
850        ld1r            {v2.8b}, [x5]
851        neg             v2.8b,   v2.8b            // -m
852        sxtl            v2.8h,   v2.8b
853        shl             v2.4h,   v2.4h,   #9      // -m << 9
8542:
855        ld1             {v1.s}[0], [x2], #4
856        ld1             {v0.h}[0], [x0]
857        subs            w4,  w4,  #2
858        ld1             {v1.h}[1], [x2]
859        ld1             {v0.h}[1], [x8]
860        add             x2,  x2,  #4
861        sub             v1.4h,   v0.4h,   v1.4h   // a - b
862        sqrdmulh        v1.4h,   v1.4h,   v2.4h   // ((a-b)*-m + 32) >> 6
863        add             v0.4h,   v0.4h,   v1.4h
864        st1             {v0.h}[0], [x0],  x1
865        st1             {v0.h}[1], [x8],  x1
866        b.gt            2b
867        ret
86840:
869        ld1r            {v2.2s}, [x5]
870        sub             x1,  x1,  #4
871        neg             v2.8b,   v2.8b            // -m
872        sxtl            v2.8h,   v2.8b
873        shl             v2.8h,   v2.8h,   #9      // -m << 9
8744:
875        ld1             {v1.8h},   [x2], #16
876        ld1             {v0.d}[0], [x0]
877        ld1             {v0.d}[1], [x8]
878        subs            w4,  w4,  #2
879        sub             v1.8h,   v0.8h,   v1.8h   // a - b
880        sqrdmulh        v1.8h,   v1.8h,   v2.8h   // ((a-b)*-m + 32) >> 6
881        add             v0.8h,   v0.8h,   v1.8h
882        st1             {v0.s}[0], [x0], #4
883        st1             {v0.s}[2], [x8], #4
884        st1             {v0.h}[2], [x0], x1
885        st1             {v0.h}[6], [x8], x1
886        b.gt            4b
887        ret
88880:
889        ld1             {v4.8b}, [x5]
890        sub             x1,  x1,  #8
891        neg             v4.8b,   v4.8b            // -m
892        sxtl            v4.8h,   v4.8b
893        shl             v4.8h,   v4.8h,   #9      // -m << 9
8948:
895        ld1             {v2.8h, v3.8h}, [x2], #32
896        ld1             {v0.8h}, [x0]
897        ld1             {v1.8h}, [x8]
898        subs            w4,  w4,  #2
899        sub             v2.8h,   v0.8h,   v2.8h   // a - b
900        sub             v3.8h,   v1.8h,   v3.8h
901        sqrdmulh        v2.8h,   v2.8h,   v4.8h   // ((a-b)*-m + 32) >> 6
902        sqrdmulh        v3.8h,   v3.8h,   v4.8h
903        add             v0.8h,   v0.8h,   v2.8h
904        add             v1.8h,   v1.8h,   v3.8h
905        st1             {v0.d}[0], [x0], #8
906        st1             {v1.d}[0], [x8], #8
907        st1             {v0.s}[2], [x0], x1
908        st1             {v1.s}[2], [x8], x1
909        b.gt            8b
910        ret
911160:
912        ld1             {v16.8b, v17.8b}, [x5]
913        sub             x1,  x1,  #16
914        neg             v16.8b,  v16.8b           // -m
915        neg             v17.8b,  v17.8b
916        sxtl            v16.8h,  v16.8b
917        sxtl            v17.8h,  v17.8b
918        shl             v16.8h,  v16.8h,  #9      // -m << 9
919        shl             v17.4h,  v17.4h,  #9
92016:
921        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
922        ld1             {v0.8h, v1.8h}, [x0]
923        subs            w4,  w4,  #2
924        ld1             {v2.8h, v3.8h}, [x8]
925        sub             v4.8h,   v0.8h,   v4.8h   // a - b
926        sub             v5.4h,   v1.4h,   v5.4h
927        sub             v6.8h,   v2.8h,   v6.8h
928        sub             v7.4h,   v3.4h,   v7.4h
929        sqrdmulh        v4.8h,   v4.8h,   v16.8h  // ((a-b)*-m + 32) >> 6
930        sqrdmulh        v5.4h,   v5.4h,   v17.4h
931        sqrdmulh        v6.8h,   v6.8h,   v16.8h
932        sqrdmulh        v7.4h,   v7.4h,   v17.4h
933        add             v0.8h,   v0.8h,   v4.8h
934        add             v1.4h,   v1.4h,   v5.4h
935        add             v2.8h,   v2.8h,   v6.8h
936        add             v3.4h,   v3.4h,   v7.4h
937        st1             {v0.8h}, [x0], #16
938        st1             {v2.8h}, [x8], #16
939        st1             {v1.4h}, [x0], x1
940        st1             {v3.4h}, [x8], x1
941        b.gt            16b
942        ret
943320:
944        ld1             {v24.16b, v25.16b},  [x5]
945        neg             v26.16b, v24.16b          // -m
946        neg             v27.8b,  v25.8b
947        sxtl            v24.8h,  v26.8b
948        sxtl2           v25.8h,  v26.16b
949        sxtl            v26.8h,  v27.8b
950        shl             v24.8h,  v24.8h,  #9      // -m << 9
951        shl             v25.8h,  v25.8h,  #9
952        shl             v26.8h,  v26.8h,  #9
95332:
954        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64
955        ld1             {v0.8h, v1.8h, v2.8h}, [x0]
956        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], #64
957        ld1             {v4.8h, v5.8h, v6.8h}, [x8]
958        subs            w4,  w4,  #2
959        sub             v16.8h,  v0.8h,   v16.8h  // a - b
960        sub             v17.8h,  v1.8h,   v17.8h
961        sub             v18.8h,  v2.8h,   v18.8h
962        sub             v20.8h,  v4.8h,   v20.8h
963        sub             v21.8h,  v5.8h,   v21.8h
964        sub             v22.8h,  v6.8h,   v22.8h
965        sqrdmulh        v16.8h,  v16.8h,  v24.8h  // ((a-b)*-m + 32) >> 6
966        sqrdmulh        v17.8h,  v17.8h,  v25.8h
967        sqrdmulh        v18.8h,  v18.8h,  v26.8h
968        sqrdmulh        v20.8h,  v20.8h,  v24.8h
969        sqrdmulh        v21.8h,  v21.8h,  v25.8h
970        sqrdmulh        v22.8h,  v22.8h,  v26.8h
971        add             v0.8h,   v0.8h,   v16.8h
972        add             v1.8h,   v1.8h,   v17.8h
973        add             v2.8h,   v2.8h,   v18.8h
974        add             v4.8h,   v4.8h,   v20.8h
975        add             v5.8h,   v5.8h,   v21.8h
976        add             v6.8h,   v6.8h,   v22.8h
977        st1             {v0.8h, v1.8h, v2.8h}, [x0], x1
978        st1             {v4.8h, v5.8h, v6.8h}, [x8], x1
979        b.gt            32b
980        ret
981L(blend_v_tbl):
982        .hword L(blend_v_tbl) - 320b
983        .hword L(blend_v_tbl) - 160b
984        .hword L(blend_v_tbl) -  80b
985        .hword L(blend_v_tbl) -  40b
986        .hword L(blend_v_tbl) -  20b
987endfunc
988
989
990// This has got the same signature as the put_8tap functions,
991// and assumes that x9 is set to (clz(w)-24).
992function put_neon
993        adr             x10, L(put_tbl)
994        ldrh            w9, [x10, x9, lsl #1]
995        sub             x10, x10, w9, uxtw
996        br              x10
997
9982:
999        ld1             {v0.s}[0], [x2], x3
1000        ld1             {v1.s}[0], [x2], x3
1001        subs            w5,  w5,  #2
1002        st1             {v0.s}[0], [x0], x1
1003        st1             {v1.s}[0], [x0], x1
1004        b.gt            2b
1005        ret
10064:
1007        ld1             {v0.8b}, [x2], x3
1008        ld1             {v1.8b}, [x2], x3
1009        subs            w5,  w5,  #2
1010        st1             {v0.8b}, [x0], x1
1011        st1             {v1.8b}, [x0], x1
1012        b.gt            4b
1013        ret
101480:
1015        add             x8,  x0,  x1
1016        lsl             x1,  x1,  #1
1017        add             x9,  x2,  x3
1018        lsl             x3,  x3,  #1
10198:
1020        ld1             {v0.16b}, [x2], x3
1021        ld1             {v1.16b}, [x9], x3
1022        subs            w5,  w5,  #2
1023        st1             {v0.16b}, [x0], x1
1024        st1             {v1.16b}, [x8], x1
1025        b.gt            8b
1026        ret
102716:
1028        ldp             x6,  x7,  [x2]
1029        ldp             x8,  x9,  [x2, #16]
1030        stp             x6,  x7,  [x0]
1031        subs            w5,  w5,  #1
1032        stp             x8,  x9,  [x0, #16]
1033        add             x2,  x2,  x3
1034        add             x0,  x0,  x1
1035        b.gt            16b
1036        ret
103732:
1038        ldp             x6,  x7,  [x2]
1039        ldp             x8,  x9,  [x2, #16]
1040        stp             x6,  x7,  [x0]
1041        ldp             x10, x11, [x2, #32]
1042        stp             x8,  x9,  [x0, #16]
1043        subs            w5,  w5,  #1
1044        ldp             x12, x13, [x2, #48]
1045        stp             x10, x11, [x0, #32]
1046        stp             x12, x13, [x0, #48]
1047        add             x2,  x2,  x3
1048        add             x0,  x0,  x1
1049        b.gt            32b
1050        ret
105164:
1052        ldp             q0,  q1,  [x2]
1053        ldp             q2,  q3,  [x2, #32]
1054        stp             q0,  q1,  [x0]
1055        ldp             q4,  q5,  [x2, #64]
1056        stp             q2,  q3,  [x0, #32]
1057        ldp             q6,  q7,  [x2, #96]
1058        subs            w5,  w5,  #1
1059        stp             q4,  q5,  [x0, #64]
1060        stp             q6,  q7,  [x0, #96]
1061        add             x2,  x2,  x3
1062        add             x0,  x0,  x1
1063        b.gt            64b
1064        ret
1065128:
1066        ldp             q0,  q1,  [x2]
1067        ldp             q2,  q3,  [x2, #32]
1068        stp             q0,  q1,  [x0]
1069        ldp             q4,  q5,  [x2, #64]
1070        stp             q2,  q3,  [x0, #32]
1071        ldp             q6,  q7,  [x2, #96]
1072        subs            w5,  w5,  #1
1073        stp             q4,  q5,  [x0, #64]
1074        ldp             q16, q17, [x2, #128]
1075        stp             q6,  q7,  [x0, #96]
1076        ldp             q18, q19, [x2, #160]
1077        stp             q16, q17, [x0, #128]
1078        ldp             q20, q21, [x2, #192]
1079        stp             q18, q19, [x0, #160]
1080        ldp             q22, q23, [x2, #224]
1081        stp             q20, q21, [x0, #192]
1082        stp             q22, q23, [x0, #224]
1083        add             x2,  x2,  x3
1084        add             x0,  x0,  x1
1085        b.gt            128b
1086        ret
1087
1088L(put_tbl):
1089        .hword L(put_tbl) - 128b
1090        .hword L(put_tbl) -  64b
1091        .hword L(put_tbl) -  32b
1092        .hword L(put_tbl) -  16b
1093        .hword L(put_tbl) -  80b
1094        .hword L(put_tbl) -   4b
1095        .hword L(put_tbl) -   2b
1096endfunc
1097
1098
1099// This has got the same signature as the prep_8tap functions,
1100// and assumes that x9 is set to (clz(w)-24), w7 to intermediate_bits and
1101// x8 to w*2.
1102function prep_neon
1103        adr             x10, L(prep_tbl)
1104        ldrh            w9, [x10, x9, lsl #1]
1105        dup             v31.8h,  w7   // intermediate_bits
1106        movi            v30.8h,  #(PREP_BIAS >> 8), lsl #8
1107        sub             x10, x10, w9, uxtw
1108        br              x10
1109
111040:
1111        add             x9,  x1,  x2
1112        lsl             x2,  x2,  #1
11134:
1114        ld1             {v0.d}[0], [x1], x2
1115        ld1             {v0.d}[1], [x9], x2
1116        subs            w4,  w4,  #2
1117        sshl            v0.8h,   v0.8h,   v31.8h
1118        sub             v0.8h,   v0.8h,   v30.8h
1119        st1             {v0.8h}, [x0], #16
1120        b.gt            4b
1121        ret
112280:
1123        add             x9,  x1,  x2
1124        lsl             x2,  x2,  #1
11258:
1126        ld1             {v0.8h}, [x1], x2
1127        ld1             {v1.8h}, [x9], x2
1128        subs            w4,  w4,  #2
1129        sshl            v0.8h,   v0.8h,   v31.8h
1130        sshl            v1.8h,   v1.8h,   v31.8h
1131        sub             v0.8h,   v0.8h,   v30.8h
1132        sub             v1.8h,   v1.8h,   v30.8h
1133        st1             {v0.8h, v1.8h}, [x0], #32
1134        b.gt            8b
1135        ret
113616:
1137        ldp             q0,  q1,  [x1]
1138        add             x1,  x1,  x2
1139        sshl            v0.8h,   v0.8h,   v31.8h
1140        ldp             q2,  q3,  [x1]
1141        add             x1,  x1,  x2
1142        subs            w4,  w4,  #2
1143        sshl            v1.8h,   v1.8h,   v31.8h
1144        sshl            v2.8h,   v2.8h,   v31.8h
1145        sshl            v3.8h,   v3.8h,   v31.8h
1146        sub             v0.8h,   v0.8h,   v30.8h
1147        sub             v1.8h,   v1.8h,   v30.8h
1148        sub             v2.8h,   v2.8h,   v30.8h
1149        sub             v3.8h,   v3.8h,   v30.8h
1150        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
1151        b.gt            16b
1152        ret
115332:
1154        ldp             q0,  q1,  [x1]
1155        sshl            v0.8h,   v0.8h,   v31.8h
1156        ldp             q2,  q3,  [x1, #32]
1157        add             x1,  x1,  x2
1158        sshl            v1.8h,   v1.8h,   v31.8h
1159        sshl            v2.8h,   v2.8h,   v31.8h
1160        sshl            v3.8h,   v3.8h,   v31.8h
1161        subs            w4,  w4,  #1
1162        sub             v0.8h,   v0.8h,   v30.8h
1163        sub             v1.8h,   v1.8h,   v30.8h
1164        sub             v2.8h,   v2.8h,   v30.8h
1165        sub             v3.8h,   v3.8h,   v30.8h
1166        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
1167        b.gt            32b
1168        ret
116964:
1170        ldp             q0,  q1,  [x1]
1171        subs            w4,  w4,  #1
1172        sshl            v0.8h,   v0.8h,   v31.8h
1173        ldp             q2,  q3,  [x1, #32]
1174        sshl            v1.8h,   v1.8h,   v31.8h
1175        ldp             q4,  q5,  [x1, #64]
1176        sshl            v2.8h,   v2.8h,   v31.8h
1177        sshl            v3.8h,   v3.8h,   v31.8h
1178        ldp             q6,  q7,  [x1, #96]
1179        add             x1,  x1,  x2
1180        sshl            v4.8h,   v4.8h,   v31.8h
1181        sshl            v5.8h,   v5.8h,   v31.8h
1182        sshl            v6.8h,   v6.8h,   v31.8h
1183        sshl            v7.8h,   v7.8h,   v31.8h
1184        sub             v0.8h,   v0.8h,   v30.8h
1185        sub             v1.8h,   v1.8h,   v30.8h
1186        sub             v2.8h,   v2.8h,   v30.8h
1187        sub             v3.8h,   v3.8h,   v30.8h
1188        stp             q0,  q1,  [x0]
1189        sub             v4.8h,   v4.8h,   v30.8h
1190        sub             v5.8h,   v5.8h,   v30.8h
1191        stp             q2,  q3,  [x0, #32]
1192        sub             v6.8h,   v6.8h,   v30.8h
1193        sub             v7.8h,   v7.8h,   v30.8h
1194        stp             q4,  q5,  [x0, #64]
1195        stp             q6,  q7,  [x0, #96]
1196        add             x0,  x0,  x8
1197        b.gt            64b
1198        ret
1199128:
1200        ldp             q0,  q1,  [x1]
1201        subs            w4,  w4,  #1
1202        sshl            v0.8h,   v0.8h,   v31.8h
1203        ldp             q2,  q3,  [x1, #32]
1204        sshl            v1.8h,   v1.8h,   v31.8h
1205        ldp             q4,  q5,  [x1, #64]
1206        sshl            v2.8h,   v2.8h,   v31.8h
1207        sshl            v3.8h,   v3.8h,   v31.8h
1208        ldp             q6,  q7,  [x1, #96]
1209        sshl            v4.8h,   v4.8h,   v31.8h
1210        sshl            v5.8h,   v5.8h,   v31.8h
1211        ldp             q16, q17, [x1, #128]
1212        sshl            v6.8h,   v6.8h,   v31.8h
1213        sshl            v7.8h,   v7.8h,   v31.8h
1214        ldp             q18, q19, [x1, #160]
1215        sshl            v16.8h,  v16.8h,  v31.8h
1216        sshl            v17.8h,  v17.8h,  v31.8h
1217        ldp             q20, q21, [x1, #192]
1218        sshl            v18.8h,  v18.8h,  v31.8h
1219        sshl            v19.8h,  v19.8h,  v31.8h
1220        ldp             q22, q23, [x1, #224]
1221        add             x1,  x1,  x2
1222        sshl            v20.8h,  v20.8h,  v31.8h
1223        sshl            v21.8h,  v21.8h,  v31.8h
1224        sshl            v22.8h,  v22.8h,  v31.8h
1225        sshl            v23.8h,  v23.8h,  v31.8h
1226        sub             v0.8h,   v0.8h,   v30.8h
1227        sub             v1.8h,   v1.8h,   v30.8h
1228        sub             v2.8h,   v2.8h,   v30.8h
1229        sub             v3.8h,   v3.8h,   v30.8h
1230        stp             q0,  q1,  [x0]
1231        sub             v4.8h,   v4.8h,   v30.8h
1232        sub             v5.8h,   v5.8h,   v30.8h
1233        stp             q2,  q3,  [x0, #32]
1234        sub             v6.8h,   v6.8h,   v30.8h
1235        sub             v7.8h,   v7.8h,   v30.8h
1236        stp             q4,  q5,  [x0, #64]
1237        sub             v16.8h,  v16.8h,  v30.8h
1238        sub             v17.8h,  v17.8h,  v30.8h
1239        stp             q6,  q7,  [x0, #96]
1240        sub             v18.8h,  v18.8h,  v30.8h
1241        sub             v19.8h,  v19.8h,  v30.8h
1242        stp             q16, q17, [x0, #128]
1243        sub             v20.8h,  v20.8h,  v30.8h
1244        sub             v21.8h,  v21.8h,  v30.8h
1245        stp             q18, q19, [x0, #160]
1246        sub             v22.8h,  v22.8h,  v30.8h
1247        sub             v23.8h,  v23.8h,  v30.8h
1248        stp             q20, q21, [x0, #192]
1249        stp             q22, q23, [x0, #224]
1250        add             x0,  x0,  x8
1251        b.gt            128b
1252        ret
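        // Each of the copy-only prep widths above performs the same per-pixel
        // operation; roughly, assuming (as the setup earlier in this function
        // suggests) that v31 holds intermediate_bits and v30 holds PREP_BIAS:
        //   tmp[x] = (int16_t)((src[x] << intermediate_bits) - PREP_BIAS);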
1253
1254L(prep_tbl):
1255        .hword L(prep_tbl) - 128b
1256        .hword L(prep_tbl) -  64b
1257        .hword L(prep_tbl) -  32b
1258        .hword L(prep_tbl) -  16b
1259        .hword L(prep_tbl) -  80b
1260        .hword L(prep_tbl) -  40b
1261endfunc
1262
1263
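// Helper macros shared by the 8-tap put/prep paths below:
// load_slice/load_reg/load_regpair load rows alternately from two source
// pointers (two output rows are produced per iteration), interleave_1 packs
// neighbouring rows together for the 2 pixel wide columns, smull_smlal_4/_8
// accumulate a 4- or 8-tap filter into 32 bit, and sqrshrun_h, xtn_h, srshl_s
// and sub_h together with the shift_store_* macros apply the final rounding,
// clamping (put) or PREP_BIAS offset (prep) and store the results.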
1264.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
1265        ld1             {\d0\wd}[0], [\s0], \strd
1266        ld1             {\d1\wd}[0], [\s1], \strd
1267.ifnb \d2
1268        ld1             {\d2\wd}[0], [\s0], \strd
1269        ld1             {\d3\wd}[0], [\s1], \strd
1270.endif
1271.ifnb \d4
1272        ld1             {\d4\wd}[0], [\s0], \strd
1273.endif
1274.ifnb \d5
1275        ld1             {\d5\wd}[0], [\s1], \strd
1276.endif
1277.ifnb \d6
1278        ld1             {\d6\wd}[0], [\s0], \strd
1279.endif
1280.endm
1281.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
1282        ld1             {\d0\wd}, [\s0], \strd
1283        ld1             {\d1\wd}, [\s1], \strd
1284.ifnb \d2
1285        ld1             {\d2\wd}, [\s0], \strd
1286        ld1             {\d3\wd}, [\s1], \strd
1287.endif
1288.ifnb \d4
1289        ld1             {\d4\wd}, [\s0], \strd
1290.endif
1291.ifnb \d5
1292        ld1             {\d5\wd}, [\s1], \strd
1293.endif
1294.ifnb \d6
1295        ld1             {\d6\wd}, [\s0], \strd
1296.endif
1297.endm
1298.macro load_regpair s0, s1, strd, wd, d0, d1, d2, d3, d4, d5
1299        ld1             {\d0\wd, \d1\wd}, [\s0], \strd
1300.ifnb \d2
1301        ld1             {\d2\wd, \d3\wd}, [\s1], \strd
1302.endif
1303.ifnb \d4
1304        ld1             {\d4\wd, \d5\wd}, [\s0], \strd
1305.endif
1306.endm
1307.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
1308        load_slice      \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6
1309.endm
1310.macro load_4h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
1311        load_reg        \s0, \s1, \strd, .4h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
1312.endm
1313.macro load_8h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
1314        load_reg        \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
1315.endm
1316.macro load_16h s0, s1, strd, d0, d1, d2, d3, d4, d5
1317        load_regpair    \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5
1318.endm
1319.macro interleave_1 wd, r0, r1, r2, r3, r4
1320        trn1            \r0\wd, \r0\wd, \r1\wd
1321        trn1            \r1\wd, \r1\wd, \r2\wd
1322.ifnb \r3
1323        trn1            \r2\wd, \r2\wd, \r3\wd
1324        trn1            \r3\wd, \r3\wd, \r4\wd
1325.endif
1326.endm
1327.macro interleave_1_s r0, r1, r2, r3, r4
1328        interleave_1    .2s, \r0, \r1, \r2, \r3, \r4
1329.endm
1330.macro umin_h c, wd, r0, r1, r2, r3
1331        umin            \r0\wd,  \r0\wd,  \c\wd
1332.ifnb \r1
1333        umin            \r1\wd,  \r1\wd,  \c\wd
1334.endif
1335.ifnb \r2
1336        umin            \r2\wd,  \r2\wd,  \c\wd
1337        umin            \r3\wd,  \r3\wd,  \c\wd
1338.endif
1339.endm
1340.macro sub_h c, wd, r0, r1, r2, r3
1341        sub             \r0\wd,  \r0\wd,  \c\wd
1342.ifnb \r1
1343        sub             \r1\wd,  \r1\wd,  \c\wd
1344.endif
1345.ifnb \r2
1346        sub             \r2\wd,  \r2\wd,  \c\wd
1347        sub             \r3\wd,  \r3\wd,  \c\wd
1348.endif
1349.endm
1350.macro smull_smlal_4 d, s0, s1, s2, s3
1351        smull           \d\().4s,  \s0\().4h,  v0.h[0]
1352        smlal           \d\().4s,  \s1\().4h,  v0.h[1]
1353        smlal           \d\().4s,  \s2\().4h,  v0.h[2]
1354        smlal           \d\().4s,  \s3\().4h,  v0.h[3]
1355.endm
1356.macro smull2_smlal2_4 d, s0, s1, s2, s3
1357        smull2          \d\().4s,  \s0\().8h,  v0.h[0]
1358        smlal2          \d\().4s,  \s1\().8h,  v0.h[1]
1359        smlal2          \d\().4s,  \s2\().8h,  v0.h[2]
1360        smlal2          \d\().4s,  \s3\().8h,  v0.h[3]
1361.endm
1362.macro smull_smlal_8 d, s0, s1, s2, s3, s4, s5, s6, s7
1363        smull           \d\().4s,  \s0\().4h,  v0.h[0]
1364        smlal           \d\().4s,  \s1\().4h,  v0.h[1]
1365        smlal           \d\().4s,  \s2\().4h,  v0.h[2]
1366        smlal           \d\().4s,  \s3\().4h,  v0.h[3]
1367        smlal           \d\().4s,  \s4\().4h,  v0.h[4]
1368        smlal           \d\().4s,  \s5\().4h,  v0.h[5]
1369        smlal           \d\().4s,  \s6\().4h,  v0.h[6]
1370        smlal           \d\().4s,  \s7\().4h,  v0.h[7]
1371.endm
1372.macro smull2_smlal2_8 d, s0, s1, s2, s3, s4, s5, s6, s7
1373        smull2          \d\().4s,  \s0\().8h,  v0.h[0]
1374        smlal2          \d\().4s,  \s1\().8h,  v0.h[1]
1375        smlal2          \d\().4s,  \s2\().8h,  v0.h[2]
1376        smlal2          \d\().4s,  \s3\().8h,  v0.h[3]
1377        smlal2          \d\().4s,  \s4\().8h,  v0.h[4]
1378        smlal2          \d\().4s,  \s5\().8h,  v0.h[5]
1379        smlal2          \d\().4s,  \s6\().8h,  v0.h[6]
1380        smlal2          \d\().4s,  \s7\().8h,  v0.h[7]
1381.endm
1382.macro sqrshrun_h shift, r0, r1, r2, r3
1383        sqrshrun        \r0\().4h, \r0\().4s,  #\shift
1384.ifnb \r1
1385        sqrshrun2       \r0\().8h, \r1\().4s,  #\shift
1386.endif
1387.ifnb \r2
1388        sqrshrun        \r2\().4h, \r2\().4s,  #\shift
1389        sqrshrun2       \r2\().8h, \r3\().4s,  #\shift
1390.endif
1391.endm
1392.macro xtn_h r0, r1, r2, r3
1393        xtn             \r0\().4h,  \r0\().4s
1394        xtn2            \r0\().8h,  \r1\().4s
1395.ifnb \r2
1396        xtn             \r2\().4h,  \r2\().4s
1397        xtn2            \r2\().8h,  \r3\().4s
1398.endif
1399.endm
1400.macro srshl_s shift, r0, r1, r2, r3
1401        srshl           \r0\().4s,  \r0\().4s,  \shift\().4s
1402        srshl           \r1\().4s,  \r1\().4s,  \shift\().4s
1403.ifnb \r2
1404        srshl           \r2\().4s,  \r2\().4s,  \shift\().4s
1405        srshl           \r3\().4s,  \r3\().4s,  \shift\().4s
1406.endif
1407.endm
1408.macro st_s strd, reg, lanes
1409        st1             {\reg\().s}[0], [x0], \strd
1410        st1             {\reg\().s}[1], [x9], \strd
1411.if \lanes > 2
1412        st1             {\reg\().s}[2], [x0], \strd
1413        st1             {\reg\().s}[3], [x9], \strd
1414.endif
1415.endm
1416.macro st_d strd, r0, r1
1417        st1             {\r0\().d}[0], [x0], \strd
1418        st1             {\r0\().d}[1], [x9], \strd
1419.ifnb \r1
1420        st1             {\r1\().d}[0], [x0], \strd
1421        st1             {\r1\().d}[1], [x9], \strd
1422.endif
1423.endm
1424.macro shift_store_4 type, strd, r0, r1, r2, r3
1425.ifc \type, put
1426        sqrshrun_h      6,   \r0, \r1, \r2, \r3
1427        umin_h          v31, .8h, \r0, \r2
1428.else
1429        srshl_s         v30, \r0, \r1, \r2, \r3  // -(6-intermediate_bits)
1430        xtn_h           \r0, \r1, \r2, \r3
1431        sub_h           v29, .8h, \r0, \r2       // PREP_BIAS
1432.endif
1433        st_d            \strd, \r0, \r2
1434.endm
1435.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7
1436        st1             {\r0\wd}, [x0], \strd
1437        st1             {\r1\wd}, [x9], \strd
1438.ifnb \r2
1439        st1             {\r2\wd}, [x0], \strd
1440        st1             {\r3\wd}, [x9], \strd
1441.endif
1442.ifnb \r4
1443        st1             {\r4\wd}, [x0], \strd
1444        st1             {\r5\wd}, [x9], \strd
1445        st1             {\r6\wd}, [x0], \strd
1446        st1             {\r7\wd}, [x9], \strd
1447.endif
1448.endm
1449.macro st_8h strd, r0, r1, r2, r3, r4, r5, r6, r7
1450        st_reg          \strd, .8h, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
1451.endm
1452.macro shift_store_8 type, strd, r0, r1, r2, r3
1453.ifc \type, put
1454        sqrshrun_h      6,   \r0, \r1, \r2, \r3
1455        umin_h          v31, .8h, \r0, \r2
1456.else
1457        srshl_s         v30, \r0, \r1, \r2, \r3  // -(6-intermediate_bits)
1458        xtn_h           \r0, \r1, \r2, \r3
1459        sub_h           v29, .8h, \r0, \r2       // PREP_BIAS
1460.endif
1461        st_8h           \strd, \r0, \r2
1462.endm
1463.macro shift_store_16 type, strd, dst, r0, r1, r2, r3
1464.ifc \type, put
1465        sqrshrun_h      6,   \r0, \r1, \r2, \r3
1466        umin            \r0\().8h, \r0\().8h, v31.8h
1467        umin            \r1\().8h, \r2\().8h, v31.8h
1468.else
1469        srshl_s         v30, \r0, \r1, \r2, \r3  // -(6-intermediate_bits)
1470        xtn_h           \r0, \r1, \r2, \r3
1471        sub             \r0\().8h, \r0\().8h, v29.8h
1472        sub             \r1\().8h, \r2\().8h, v29.8h
1473.endif
1474        st1             {\r0\().8h, \r1\().8h}, [\dst], \strd
1475.endm
1476
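// make_8tap_fn emits one exported entry point per filter type combination;
// each one only loads the packed horizontal/vertical filter type constants
// into w9/w10 and branches to the shared put/prep_8tap_neon implementation.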
1477.macro make_8tap_fn op, type, type_h, type_v
1478function \op\()_8tap_\type\()_16bpc_neon, export=1
1479        mov             w9,  \type_h
1480        mov             w10, \type_v
1481        b               \op\()_8tap_neon
1482endfunc
1483.endm
1484
1485// No spaces in these expressions, due to gas-preprocessor.
1486#define REGULAR ((0*15<<7)|3*15)
1487#define SMOOTH  ((1*15<<7)|4*15)
1488#define SHARP   ((2*15<<7)|3*15)
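// Each of these packs two row offsets into mc_subpel_filters (each filter set
// spans 15 rows of coefficients): bits 0-6 select the set used for small
// blocks (w or h <= 4, which use a 4-tap variant) and bits 7-13 the set used
// otherwise. The multiply by 0x4081 below replicates mx/my into the same two
// fields before these offsets are added, and the copy left in bits 14 and up
// is what the tst #(0x7f << 14) checks use to detect a nonzero subpel offset.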
1489
1490.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2
1491make_8tap_fn \type, regular,        REGULAR, REGULAR
1492make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
1493make_8tap_fn \type, regular_sharp,  REGULAR, SHARP
1494make_8tap_fn \type, smooth,         SMOOTH,  SMOOTH
1495make_8tap_fn \type, smooth_regular, SMOOTH,  REGULAR
1496make_8tap_fn \type, smooth_sharp,   SMOOTH,  SHARP
1497make_8tap_fn \type, sharp,          SHARP,   SHARP
1498make_8tap_fn \type, sharp_regular,  SHARP,   REGULAR
1499make_8tap_fn \type, sharp_smooth,   SHARP,   SMOOTH
1500
1501function \type\()_8tap_neon
1502.ifc \bdmax, w8
1503        ldr             w8,  [sp]
1504.endif
1505        mov             w11,  #0x4081  // (1 << 14) | (1 << 7) | (1 << 0)
1506        mul             \mx,  \mx, w11
1507        mul             \my,  \my, w11
1508        add             \mx,  \mx, w9  // mx, 8tap_h, 4tap_h
1509        add             \my,  \my, w10 // my, 8tap_v, 4tap_v
1510.ifc \type, prep
1511        uxtw            \d_strd, \w
1512        lsl             \d_strd, \d_strd, #1
1513.endif
1514
1515        dup             v31.8h,  \bdmax        // bitdepth_max
1516        clz             \bdmax,  \bdmax
1517        clz             w9,  \w
1518        sub             \bdmax,  \bdmax,  #18  // intermediate_bits = clz(bitdepth_max) - 18
1519        mov             w12, #6
1520        tst             \mx, #(0x7f << 14)
1521        sub             w9,  w9,  #24
1522        add             w13, w12, \bdmax       // 6 + intermediate_bits
1523        sub             w12, w12, \bdmax       // 6 - intermediate_bits
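        // For 10 bpc input, bitdepth_max is 0x3ff: clz gives 22, so
        // intermediate_bits is 4 (w13 = 10, w12 = 2); for 12 bpc it is 2
        // (w13 = 8, w12 = 4).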
1524        movrel          x11, X(mc_subpel_filters), -8
1525        b.ne            L(\type\()_8tap_h)
1526        tst             \my, #(0x7f << 14)
1527        b.ne            L(\type\()_8tap_v)
1528        b               \type\()_neon
1529
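        // Horizontal-only filtering. With F[] being the selected filter row
        // (the 2 and 4 pixel wide paths use only its middle four taps), each
        // output pixel is, roughly:
        //   sum  = F[0]*src[x-3] + F[1]*src[x-2] + ... + F[7]*src[x+4]
        //   put:  dst[x] = min(rnd2(rnd2(sum, 6 - intermediate_bits),
        //                           intermediate_bits), bitdepth_max)
        //   prep: tmp[x] = rnd2(sum, 6 - intermediate_bits) - PREP_BIAS
        // where rnd2(v, s) = (v + (1 << (s - 1))) >> s, with saturation to
        // unsigned on the first narrowing in the put case.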
1530L(\type\()_8tap_h):
1531        cmp             \w,   #4
1532        ubfx            w10,  \mx, #7, #7
1533        and             \mx,  \mx, #0x7f
1534        b.le            4f
1535        mov             \mx,  w10
15364:
1537        tst             \my,  #(0x7f << 14)
1538        add             \xmx, x11, \mx, uxtw #3
1539        b.ne            L(\type\()_8tap_hv)
1540
1541        adr             x10, L(\type\()_8tap_h_tbl)
1542        dup             v30.4s,  w12           // 6 - intermediate_bits
1543        ldrh            w9,  [x10, x9, lsl #1]
1544        neg             v30.4s,  v30.4s        // -(6-intermediate_bits)
1545.ifc \type, put
1546        dup             v29.8h,  \bdmax        // intermediate_bits
1547.else
1548        movi            v28.8h,  #(PREP_BIAS >> 8), lsl #8
1549.endif
1550        sub             x10, x10, w9, uxtw
1551.ifc \type, put
1552        neg             v29.8h,  v29.8h        // -intermediate_bits
1553.endif
1554        br              x10
1555
155620:     // 2xN h
1557.ifc \type, put
1558        add             \xmx,  \xmx,  #2
1559        ld1             {v0.s}[0], [\xmx]
1560        sub             \src,  \src,  #2
1561        add             \ds2,  \dst,  \d_strd
1562        add             \sr2,  \src,  \s_strd
1563        lsl             \d_strd,  \d_strd,  #1
1564        lsl             \s_strd,  \s_strd,  #1
1565        sxtl            v0.8h,   v0.8b
15662:
1567        ld1             {v4.8h},  [\src], \s_strd
1568        ld1             {v6.8h},  [\sr2], \s_strd
1569        ext             v5.16b,  v4.16b,  v4.16b,  #2
1570        ext             v7.16b,  v6.16b,  v6.16b,  #2
1571        subs            \h,  \h,  #2
1572        trn1            v3.2s,   v4.2s,   v6.2s
1573        trn2            v6.2s,   v4.2s,   v6.2s
1574        trn1            v4.2s,   v5.2s,   v7.2s
1575        trn2            v7.2s,   v5.2s,   v7.2s
1576        smull           v3.4s,   v3.4h,   v0.h[0]
1577        smlal           v3.4s,   v4.4h,   v0.h[1]
1578        smlal           v3.4s,   v6.4h,   v0.h[2]
1579        smlal           v3.4s,   v7.4h,   v0.h[3]
1580        srshl           v3.4s,   v3.4s,   v30.4s // -(6-intermediate_bits)
1581        sqxtun          v3.4h,   v3.4s
1582        srshl           v3.4h,   v3.4h,   v29.4h // -intermediate_bits
1583        umin            v3.4h,   v3.4h,   v31.4h
1584        st1             {v3.s}[0], [\dst], \d_strd
1585        st1             {v3.s}[1], [\ds2], \d_strd
1586        b.gt            2b
1587        ret
1588.endif
1589
159040:     // 4xN h
1591        add             \xmx,  \xmx,  #2
1592        ld1             {v0.s}[0], [\xmx]
1593        sub             \src,  \src,  #2
1594        add             \ds2,  \dst,  \d_strd
1595        add             \sr2,  \src,  \s_strd
1596        lsl             \d_strd,  \d_strd,  #1
1597        lsl             \s_strd,  \s_strd,  #1
1598        sxtl            v0.8h,   v0.8b
15994:
1600        ld1             {v16.8h}, [\src], \s_strd
1601        ld1             {v20.8h}, [\sr2], \s_strd
1602        ext             v17.16b, v16.16b, v16.16b, #2
1603        ext             v18.16b, v16.16b, v16.16b, #4
1604        ext             v19.16b, v16.16b, v16.16b, #6
1605        ext             v21.16b, v20.16b, v20.16b, #2
1606        ext             v22.16b, v20.16b, v20.16b, #4
1607        ext             v23.16b, v20.16b, v20.16b, #6
1608        subs            \h,  \h,  #2
1609        smull           v16.4s,  v16.4h,  v0.h[0]
1610        smlal           v16.4s,  v17.4h,  v0.h[1]
1611        smlal           v16.4s,  v18.4h,  v0.h[2]
1612        smlal           v16.4s,  v19.4h,  v0.h[3]
1613        smull           v20.4s,  v20.4h,  v0.h[0]
1614        smlal           v20.4s,  v21.4h,  v0.h[1]
1615        smlal           v20.4s,  v22.4h,  v0.h[2]
1616        smlal           v20.4s,  v23.4h,  v0.h[3]
1617        srshl           v16.4s,  v16.4s,  v30.4s // -(6-intermediate_bits)
1618        srshl           v20.4s,  v20.4s,  v30.4s // -(6-intermediate_bits)
1619.ifc \type, put
1620        sqxtun          v16.4h,  v16.4s
1621        sqxtun2         v16.8h,  v20.4s
1622        srshl           v16.8h,  v16.8h,  v29.8h // -intermediate_bits
1623        umin            v16.8h,  v16.8h,  v31.8h
1624.else
1625        xtn             v16.4h,  v16.4s
1626        xtn2            v16.8h,  v20.4s
1627        sub             v16.8h,  v16.8h,  v28.8h // PREP_BIAS
1628.endif
1629        st1             {v16.d}[0], [\dst], \d_strd
1630        st1             {v16.d}[1], [\ds2], \d_strd
1631        b.gt            4b
1632        ret
1633
163480:
1635160:
1636320:
1637640:
16381280:   // 8xN, 16xN, 32xN, ... h
1639        ld1             {v0.8b}, [\xmx]
1640        sub             \src,  \src,  #6
1641        add             \ds2,  \dst,  \d_strd
1642        add             \sr2,  \src,  \s_strd
1643        lsl             \s_strd,  \s_strd,  #1
1644        sxtl            v0.8h,   v0.8b
1645
1646        sub             \s_strd,  \s_strd,  \w, uxtw #1
1647        sub             \s_strd,  \s_strd,  #16
1648.ifc \type, put
1649        lsl             \d_strd,  \d_strd,  #1
1650        sub             \d_strd,  \d_strd,  \w, uxtw #1
1651.endif
165281:
1653        ld1             {v16.8h, v17.8h},  [\src], #32
1654        ld1             {v20.8h, v21.8h},  [\sr2], #32
1655        mov             \mx, \w
1656
16578:
1658        smull           v18.4s,  v16.4h,  v0.h[0]
1659        smull2          v19.4s,  v16.8h,  v0.h[0]
1660        smull           v22.4s,  v20.4h,  v0.h[0]
1661        smull2          v23.4s,  v20.8h,  v0.h[0]
1662.irpc i, 1234567
1663        ext             v24.16b, v16.16b, v17.16b, #(2*\i)
1664        ext             v25.16b, v20.16b, v21.16b, #(2*\i)
1665        smlal           v18.4s,  v24.4h,  v0.h[\i]
1666        smlal2          v19.4s,  v24.8h,  v0.h[\i]
1667        smlal           v22.4s,  v25.4h,  v0.h[\i]
1668        smlal2          v23.4s,  v25.8h,  v0.h[\i]
1669.endr
1670        subs            \mx, \mx, #8
1671        srshl           v18.4s,  v18.4s,  v30.4s // -(6-intermediate_bits)
1672        srshl           v19.4s,  v19.4s,  v30.4s // -(6-intermediate_bits)
1673        srshl           v22.4s,  v22.4s,  v30.4s // -(6-intermediate_bits)
1674        srshl           v23.4s,  v23.4s,  v30.4s // -(6-intermediate_bits)
1675.ifc \type, put
1676        sqxtun          v18.4h,  v18.4s
1677        sqxtun2         v18.8h,  v19.4s
1678        sqxtun          v22.4h,  v22.4s
1679        sqxtun2         v22.8h,  v23.4s
1680        srshl           v18.8h,  v18.8h,  v29.8h // -intermediate_bits
1681        srshl           v22.8h,  v22.8h,  v29.8h // -intermediate_bits
1682        umin            v18.8h,  v18.8h,  v31.8h
1683        umin            v22.8h,  v22.8h,  v31.8h
1684.else
1685        xtn             v18.4h,  v18.4s
1686        xtn2            v18.8h,  v19.4s
1687        xtn             v22.4h,  v22.4s
1688        xtn2            v22.8h,  v23.4s
1689        sub             v18.8h,  v18.8h,  v28.8h // PREP_BIAS
1690        sub             v22.8h,  v22.8h,  v28.8h // PREP_BIAS
1691.endif
1692        st1             {v18.8h}, [\dst], #16
1693        st1             {v22.8h}, [\ds2], #16
1694        b.le            9f
1695
1696        mov             v16.16b, v17.16b
1697        mov             v20.16b, v21.16b
1698        ld1             {v17.8h}, [\src], #16
1699        ld1             {v21.8h}, [\sr2], #16
1700        b               8b
1701
17029:
1703        add             \dst,  \dst,  \d_strd
1704        add             \ds2,  \ds2,  \d_strd
1705        add             \src,  \src,  \s_strd
1706        add             \sr2,  \sr2,  \s_strd
1707
1708        subs            \h,  \h,  #2
1709        b.gt            81b
1710        ret
1711
1712L(\type\()_8tap_h_tbl):
1713        .hword L(\type\()_8tap_h_tbl) - 1280b
1714        .hword L(\type\()_8tap_h_tbl) -  640b
1715        .hword L(\type\()_8tap_h_tbl) -  320b
1716        .hword L(\type\()_8tap_h_tbl) -  160b
1717        .hword L(\type\()_8tap_h_tbl) -   80b
1718        .hword L(\type\()_8tap_h_tbl) -   40b
1719        .hword L(\type\()_8tap_h_tbl) -   20b
1720        .hword 0
1721
1722
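        // Vertical-only filtering: the selected 8-tap filter (4-tap when
        // h <= 4) is applied across rows instead; put rounds the sum by 6
        // and clamps to bitdepth_max, prep rounds by (6 - intermediate_bits)
        // and subtracts PREP_BIAS, as in the horizontal case.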
1723L(\type\()_8tap_v):
1724        cmp             \h,  #4
1725        ubfx            w10, \my, #7, #7
1726        and             \my, \my, #0x7f
1727        b.le            4f
1728        mov             \my, w10
17294:
1730        add             \xmy, x11, \my, uxtw #3
1731
1732.ifc \type, prep
1733        dup             v30.4s,  w12           // 6 - intermediate_bits
1734        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
1735.endif
1736        adr             x10, L(\type\()_8tap_v_tbl)
1737        ldrh            w9,  [x10, x9, lsl #1]
1738.ifc \type, prep
1739        neg             v30.4s,  v30.4s        // -(6-intermediate_bits)
1740.endif
1741        sub             x10, x10, w9, uxtw
1742        br              x10
1743
174420:     // 2xN v
1745.ifc \type, put
1746        b.gt            28f
1747
1748        cmp             \h,  #2
1749        add             \xmy, \xmy, #2
1750        ld1             {v0.s}[0], [\xmy]
1751        sub             \src,  \src,  \s_strd
1752        add             \ds2,  \dst,  \d_strd
1753        add             \sr2,  \src,  \s_strd
1754        lsl             \s_strd,  \s_strd,  #1
1755        lsl             \d_strd,  \d_strd,  #1
1756        sxtl            v0.8h,   v0.8b
1757
1758        // 2x2 v
1759        load_s          \src, \sr2, \s_strd, v1, v2, v3, v4, v5
1760        interleave_1_s  v1,  v2,  v3,  v4,  v5
1761        b.gt            24f
1762        smull_smlal_4   v6,  v1,  v2,  v3,  v4
1763        sqrshrun_h      6,   v6
1764        umin_h          v31, .8h, v6
1765        st_s            \d_strd, v6, 2
1766        ret
1767
176824:     // 2x4 v
1769        load_s          \sr2, \src, \s_strd, v6, v7
1770        interleave_1_s  v5,  v6,  v7
1771        smull_smlal_4   v16, v1,  v2,  v3,  v4
1772        smull_smlal_4   v17, v3,  v4,  v5,  v6
1773        sqrshrun_h      6,   v16, v17
1774        umin_h          v31, .8h, v16
1775        st_s            \d_strd, v16, 4
1776        ret
1777
177828:     // 2x8, 2x16 v
1779        ld1             {v0.8b}, [\xmy]
1780        sub             \sr2,  \src,  \s_strd, lsl #1
1781        add             \ds2,  \dst,  \d_strd
1782        sub             \src,  \sr2,  \s_strd
1783        lsl             \d_strd,  \d_strd,  #1
1784        lsl             \s_strd,  \s_strd,  #1
1785        sxtl            v0.8h,   v0.8b
1786
1787        load_s          \src, \sr2, \s_strd, v1,  v2,  v3,  v4, v5, v6, v7
1788        interleave_1_s  v1,  v2,  v3,  v4,  v5
1789        interleave_1_s  v5,  v6,  v7
1790216:
1791        subs            \h,  \h,  #8
1792        load_s          \sr2, \src, \s_strd, v16, v17, v18, v19
1793        load_s          \sr2, \src, \s_strd, v20, v21, v22, v23
1794        interleave_1_s  v7,  v16, v17, v18, v19
1795        interleave_1_s  v19, v20, v21, v22, v23
1796        smull_smlal_8   v24, v1,  v2,  v3,  v4,  v5,  v6,  v7,  v16
1797        smull_smlal_8   v25, v3,  v4,  v5,  v6,  v7,  v16, v17, v18
1798        smull_smlal_8   v26, v5,  v6,  v7,  v16, v17, v18, v19, v20
1799        smull_smlal_8   v27, v7,  v16, v17, v18, v19, v20, v21, v22
1800        sqrshrun_h      6,   v24, v25, v26, v27
1801        umin_h          v31, .8h, v24, v26
1802        st_s            \d_strd, v24, 4
1803        st_s            \d_strd, v26, 4
1804        b.le            0f
1805        mov             v1.16b,  v17.16b
1806        mov             v2.16b,  v18.16b
1807        mov             v3.16b,  v19.16b
1808        mov             v4.16b,  v20.16b
1809        mov             v5.16b,  v21.16b
1810        mov             v6.16b,  v22.16b
1811        mov             v7.16b,  v23.16b
1812        b               216b
18130:
1814        ret
1815.endif
1816
181740:
1818        b.gt            480f
1819
1820        // 4x2, 4x4 v
1821        cmp             \h,  #2
1822        add             \xmy, \xmy, #2
1823        ld1             {v0.s}[0], [\xmy]
1824        sub             \src, \src, \s_strd
1825        add             \ds2, \dst, \d_strd
1826        add             \sr2, \src, \s_strd
1827        lsl             \s_strd, \s_strd, #1
1828        lsl             \d_strd, \d_strd, #1
1829        sxtl            v0.8h,   v0.8b
1830
1831        load_4h         \src, \sr2, \s_strd, v1, v2, v3, v4, v5
1832        smull_smlal_4   v6,  v1,  v2,  v3,  v4
1833        smull_smlal_4   v7,  v2,  v3,  v4,  v5
1834        shift_store_4   \type, \d_strd, v6, v7
1835        b.le            0f
1836        load_4h         \sr2, \src, \s_strd, v6, v7
1837        smull_smlal_4   v1,  v3,  v4,  v5,  v6
1838        smull_smlal_4   v2,  v4,  v5,  v6,  v7
1839        shift_store_4   \type, \d_strd, v1, v2
18400:
1841        ret
1842
1843480:    // 4x8, 4x16 v
1844        ld1             {v0.8b}, [\xmy]
1845        sub             \sr2, \src, \s_strd, lsl #1
1846        add             \ds2, \dst, \d_strd
1847        sub             \src, \sr2, \s_strd
1848        lsl             \s_strd, \s_strd, #1
1849        lsl             \d_strd, \d_strd, #1
1850        sxtl            v0.8h,   v0.8b
1851
1852        load_4h         \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
1853
185448:
1855        subs            \h,  \h,  #4
1856        load_4h         \sr2, \src, \s_strd, v23, v24, v25, v26
1857        smull_smlal_8   v1,  v16, v17, v18, v19, v20, v21, v22, v23
1858        smull_smlal_8   v2,  v17, v18, v19, v20, v21, v22, v23, v24
1859        smull_smlal_8   v3,  v18, v19, v20, v21, v22, v23, v24, v25
1860        smull_smlal_8   v4,  v19, v20, v21, v22, v23, v24, v25, v26
1861        shift_store_4   \type, \d_strd, v1, v2, v3, v4
1862        b.le            0f
1863        mov             v16.8b,  v20.8b
1864        mov             v17.8b,  v21.8b
1865        mov             v18.8b,  v22.8b
1866        mov             v19.8b,  v23.8b
1867        mov             v20.8b,  v24.8b
1868        mov             v21.8b,  v25.8b
1869        mov             v22.8b,  v26.8b
1870        b               48b
18710:
1872        ret
1873
187480:
1875        b.gt            880f
1876
1877        // 8x2, 8x4 v
1878        cmp             \h,  #2
1879        add             \xmy, \xmy, #2
1880        ld1             {v0.s}[0], [\xmy]
1881        sub             \src, \src, \s_strd
1882        add             \ds2, \dst, \d_strd
1883        add             \sr2, \src, \s_strd
1884        lsl             \s_strd, \s_strd, #1
1885        lsl             \d_strd, \d_strd, #1
1886        sxtl            v0.8h,   v0.8b
1887
1888        load_8h         \src, \sr2, \s_strd, v1, v2, v3, v4, v5
1889        smull_smlal_4   v16, v1,  v2,  v3,  v4
1890        smull2_smlal2_4 v17, v1,  v2,  v3,  v4
1891        smull_smlal_4   v18, v2,  v3,  v4,  v5
1892        smull2_smlal2_4 v19, v2,  v3,  v4,  v5
1893        shift_store_8   \type, \d_strd, v16, v17, v18, v19
1894        b.le            0f
1895        load_8h         \sr2, \src, \s_strd, v6, v7
1896        smull_smlal_4   v16, v3,  v4,  v5,  v6
1897        smull2_smlal2_4 v17, v3,  v4,  v5,  v6
1898        smull_smlal_4   v18, v4,  v5,  v6,  v7
1899        smull2_smlal2_4 v19, v4,  v5,  v6,  v7
1900        shift_store_8   \type, \d_strd, v16, v17, v18, v19
19010:
1902        ret
1903
1904880:    // 8x6, 8x8, 8x16, 8x32 v
19051680:   // 16x8, 16x16, ...
1906320:    // 32x8, 32x16, ...
1907640:
19081280:
1909        ld1             {v0.8b}, [\xmy]
1910        sub             \src, \src, \s_strd
1911        sub             \src, \src, \s_strd, lsl #1
1912        sxtl            v0.8h,   v0.8b
1913        mov             \my,  \h
1914168:
1915        add             \ds2, \dst, \d_strd
1916        add             \sr2, \src, \s_strd
1917        lsl             \s_strd, \s_strd, #1
1918        lsl             \d_strd, \d_strd, #1
1919
1920        load_8h         \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
1921
192288:
1923        subs            \h,  \h,  #2
1924        load_8h         \sr2, \src, \s_strd, v23, v24
1925        smull_smlal_8   v1,  v16, v17, v18, v19, v20, v21, v22, v23
1926        smull2_smlal2_8 v2,  v16, v17, v18, v19, v20, v21, v22, v23
1927        smull_smlal_8   v3,  v17, v18, v19, v20, v21, v22, v23, v24
1928        smull2_smlal2_8 v4,  v17, v18, v19, v20, v21, v22, v23, v24
1929        shift_store_8   \type, \d_strd, v1, v2, v3, v4
1930        b.le            9f
1931        subs            \h,  \h,  #2
1932        load_8h         \sr2, \src, \s_strd, v25, v26
1933        smull_smlal_8   v1,  v18, v19, v20, v21, v22, v23, v24, v25
1934        smull2_smlal2_8 v2,  v18, v19, v20, v21, v22, v23, v24, v25
1935        smull_smlal_8   v3,  v19, v20, v21, v22, v23, v24, v25, v26
1936        smull2_smlal2_8 v4,  v19, v20, v21, v22, v23, v24, v25, v26
1937        shift_store_8   \type, \d_strd, v1, v2, v3, v4
1938        b.le            9f
1939        mov             v16.16b, v20.16b
1940        mov             v17.16b, v21.16b
1941        mov             v18.16b, v22.16b
1942        mov             v19.16b, v23.16b
1943        mov             v20.16b, v24.16b
1944        mov             v21.16b, v25.16b
1945        mov             v22.16b, v26.16b
1946        b               88b
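        // One 8 pixel wide column is done: restore the strides, step both
        // pointers back up by the saved height (the msub multiplies the
        // stride by the 64-bit alias of the height register), rewind src by
        // the extra rows it ran ahead for filter context, then advance both
        // by 8 pixels (16 bytes) to the next column.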
19479:
1948        subs            \w,  \w,  #8
1949        b.le            0f
1950        asr             \s_strd, \s_strd, #1
1951        asr             \d_strd, \d_strd, #1
1952        msub            \src, \s_strd, \xmy, \src
1953        msub            \dst, \d_strd, \xmy, \dst
1954        sub             \src, \src, \s_strd, lsl #3
1955        mov             \h,  \my
1956        add             \src, \src, #16
1957        add             \dst, \dst, #16
1958        b               168b
19590:
1960        ret
1961
1962160:
1963        b.gt            1680b
1964
1965        // 16x2, 16x4 v
1966        add             \xmy, \xmy, #2
1967        ld1             {v0.s}[0], [\xmy]
1968        sub             \src, \src, \s_strd
1969        sxtl            v0.8h,   v0.8b
1970
1971        load_16h        \src, \src, \s_strd, v16, v17, v18, v19, v20, v21
197216:
1973        load_16h        \src, \src, \s_strd, v22, v23
1974        subs            \h,  \h,  #1
1975        smull_smlal_4   v1,  v16, v18, v20, v22
1976        smull2_smlal2_4 v2,  v16, v18, v20, v22
1977        smull_smlal_4   v3,  v17, v19, v21, v23
1978        smull2_smlal2_4 v4,  v17, v19, v21, v23
1979        shift_store_16  \type, \d_strd, x0, v1, v2, v3, v4
1980        b.le            0f
1981        mov             v16.16b, v18.16b
1982        mov             v17.16b, v19.16b
1983        mov             v18.16b, v20.16b
1984        mov             v19.16b, v21.16b
1985        mov             v20.16b, v22.16b
1986        mov             v21.16b, v23.16b
1987        b               16b
19880:
1989        ret
1990
1991L(\type\()_8tap_v_tbl):
1992        .hword L(\type\()_8tap_v_tbl) - 1280b
1993        .hword L(\type\()_8tap_v_tbl) -  640b
1994        .hword L(\type\()_8tap_v_tbl) -  320b
1995        .hword L(\type\()_8tap_v_tbl) -  160b
1996        .hword L(\type\()_8tap_v_tbl) -   80b
1997        .hword L(\type\()_8tap_v_tbl) -   40b
1998        .hword L(\type\()_8tap_v_tbl) -   20b
1999        .hword 0
2000
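        // Combined horizontal + vertical filtering: the filter_2/filter_4/
        // filter_8 subroutines below first filter rows horizontally, rounding
        // by (6 - intermediate_bits) into 16 bit intermediates (kept as
        // 32 bit for the 2 pixel wide case); the vertical filter then runs
        // on those, with put rounding by (6 + intermediate_bits) and clamping
        // to bitdepth_max, and prep rounding by 6 and subtracting PREP_BIAS.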
2001L(\type\()_8tap_hv):
2002        cmp             \h,  #4
2003        ubfx            w10, \my, #7, #7
2004        and             \my, \my, #0x7f
2005        b.le            4f
2006        mov             \my,  w10
20074:
2008        add             \xmy, x11, \my, uxtw #3
2009
2010        adr             x10, L(\type\()_8tap_hv_tbl)
2011        dup             v30.4s,  w12           // 6 - intermediate_bits
2012        ldrh            w9,  [x10, x9, lsl #1]
2013        neg             v30.4s,  v30.4s        // -(6-intermediate_bits)
2014.ifc \type, put
2015        dup             v29.4s,  w13           // 6 + intermediate_bits
2016.else
2017        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
2018.endif
2019        sub             x10, x10, w9, uxtw
2020.ifc \type, put
2021        neg             v29.4s,  v29.4s        // -(6+intermediate_bits)
2022.endif
2023        br              x10
2024
202520:
2026.ifc \type, put
2027        add             \xmx,  \xmx,  #2
2028        ld1             {v0.s}[0],  [\xmx]
2029        b.gt            280f
2030        add             \xmy,  \xmy,  #2
2031        ld1             {v1.s}[0],  [\xmy]
2032
2033        // 2x2, 2x4 hv
2034        sub             \sr2, \src, #2
2035        sub             \src, \sr2, \s_strd
2036        add             \ds2, \dst, \d_strd
2037        lsl             \s_strd, \s_strd, #1
2038        lsl             \d_strd, \d_strd, #1
2039        sxtl            v0.8h,   v0.8b
2040        sxtl            v1.8h,   v1.8b
2041        mov             x15, x30
2042        sxtl            v1.4s,   v1.4h
2043
2044        ld1             {v27.8h}, [\src], \s_strd
2045        ext             v28.16b, v27.16b, v27.16b, #2
2046        smull           v27.4s,  v27.4h,  v0.4h
2047        smull           v28.4s,  v28.4h,  v0.4h
2048        addp            v27.4s,  v27.4s,  v28.4s
2049        addp            v16.4s,  v27.4s,  v27.4s
2050        srshl           v16.2s,  v16.2s,  v30.2s // -(6-intermediate_bits)
2051        bl              L(\type\()_8tap_filter_2)
2052
2053        trn1            v16.2d,  v16.2d,  v24.2d
2054        mov             v17.16b, v24.16b
2055
20562:
2057        bl              L(\type\()_8tap_filter_2)
2058
2059        ext             v18.16b, v17.16b, v24.16b, #8
2060        mov             v19.16b, v24.16b
2061        mul             v2.4s,   v16.4s,  v1.s[0]
2062        mla             v2.4s,   v17.4s,  v1.s[1]
2063        mla             v2.4s,   v18.4s,  v1.s[2]
2064        mla             v2.4s,   v19.4s,  v1.s[3]
2065
2066        srshl           v2.4s,   v2.4s,   v29.4s // -(6+intermediate_bits)
2067        sqxtun          v2.4h,   v2.4s
2068        umin            v2.4h,   v2.4h,   v31.4h
2069        subs            \h,  \h,  #2
2070        st1             {v2.s}[0], [\dst], \d_strd
2071        st1             {v2.s}[1], [\ds2], \d_strd
2072        b.le            0f
2073        mov             v16.16b, v18.16b
2074        mov             v17.16b, v19.16b
2075        b               2b
2076
2077280:    // 2x8, 2x16, 2x32 hv
2078        ld1             {v1.8b},  [\xmy]
2079        sub             \src, \src, #2
2080        sub             \sr2, \src, \s_strd, lsl #1
2081        sub             \src, \sr2, \s_strd
2082        add             \ds2, \dst, \d_strd
2083        lsl             \s_strd, \s_strd, #1
2084        lsl             \d_strd, \d_strd, #1
2085        sxtl            v0.8h,   v0.8b
2086        sxtl            v1.8h,   v1.8b
2087        mov             x15, x30
2088        sxtl2           v2.4s,   v1.8h
2089        sxtl            v1.4s,   v1.4h
2090
2091        ld1             {v27.8h}, [\src], \s_strd
2092        ext             v28.16b, v27.16b, v27.16b, #2
2093        smull           v27.4s,  v27.4h,  v0.4h
2094        smull           v28.4s,  v28.4h,  v0.4h
2095        addp            v27.4s,  v27.4s,  v28.4s
2096        addp            v16.4s,  v27.4s,  v27.4s
2097        srshl           v16.2s,  v16.2s,  v30.2s // -(6-intermediate_bits)
2098
2099        bl              L(\type\()_8tap_filter_2)
2100        trn1            v16.2d,  v16.2d,  v24.2d
2101        mov             v17.16b, v24.16b
2102        bl              L(\type\()_8tap_filter_2)
2103        ext             v18.16b, v17.16b, v24.16b, #8
2104        mov             v19.16b, v24.16b
2105        bl              L(\type\()_8tap_filter_2)
2106        ext             v20.16b, v19.16b, v24.16b, #8
2107        mov             v21.16b, v24.16b
2108
210928:
2110        bl              L(\type\()_8tap_filter_2)
2111        ext             v22.16b, v21.16b, v24.16b, #8
2112        mov             v23.16b, v24.16b
2113        mul             v3.4s,   v16.4s,  v1.s[0]
2114        mla             v3.4s,   v17.4s,  v1.s[1]
2115        mla             v3.4s,   v18.4s,  v1.s[2]
2116        mla             v3.4s,   v19.4s,  v1.s[3]
2117        mla             v3.4s,   v20.4s,  v2.s[0]
2118        mla             v3.4s,   v21.4s,  v2.s[1]
2119        mla             v3.4s,   v22.4s,  v2.s[2]
2120        mla             v3.4s,   v23.4s,  v2.s[3]
2121
2122        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
2123        sqxtun          v3.4h,   v3.4s
2124        umin            v3.4h,   v3.4h,   v31.4h
2125        subs            \h,  \h,  #2
2126        st1             {v3.s}[0], [\dst], \d_strd
2127        st1             {v3.s}[1], [\ds2], \d_strd
2128        b.le            0f
2129        mov             v16.16b, v18.16b
2130        mov             v17.16b, v19.16b
2131        mov             v18.16b, v20.16b
2132        mov             v19.16b, v21.16b
2133        mov             v20.16b, v22.16b
2134        mov             v21.16b, v23.16b
2135        b               28b
2136
21370:
2138        br              x15
2139
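        // Filters one row from sr2 and one from src with the 4-tap horizontal
        // filter in v0; the two 2-pixel results are returned in v24.4s
        // (sr2 row in the low half, src row in the high half), kept as 32 bit
        // and already rounded by (6 - intermediate_bits).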
2140L(\type\()_8tap_filter_2):
2141        ld1             {v25.8h},  [\sr2], \s_strd
2142        ld1             {v27.8h},  [\src], \s_strd
2143        ext             v26.16b, v25.16b, v25.16b, #2
2144        ext             v28.16b, v27.16b, v27.16b, #2
2145        trn1            v24.2s,  v25.2s,  v27.2s
2146        trn2            v27.2s,  v25.2s,  v27.2s
2147        trn1            v25.2s,  v26.2s,  v28.2s
2148        trn2            v28.2s,  v26.2s,  v28.2s
2149        smull           v24.4s,  v24.4h,  v0.h[0]
2150        smlal           v24.4s,  v25.4h,  v0.h[1]
2151        smlal           v24.4s,  v27.4h,  v0.h[2]
2152        smlal           v24.4s,  v28.4h,  v0.h[3]
2153        srshl           v24.4s,  v24.4s,  v30.4s // -(6-intermediate_bits)
2154        ret
2155.endif
2156
215740:
2158        add             \xmx, \xmx, #2
2159        ld1             {v0.s}[0],  [\xmx]
2160        b.gt            480f
2161        add             \xmy, \xmy,  #2
2162        ld1             {v1.s}[0],  [\xmy]
2163        sub             \sr2, \src, #2
2164        sub             \src, \sr2, \s_strd
2165        add             \ds2, \dst, \d_strd
2166        lsl             \s_strd, \s_strd, #1
2167        lsl             \d_strd, \d_strd, #1
2168        sxtl            v0.8h,   v0.8b
2169        sxtl            v1.8h,   v1.8b
2170        mov             x15, x30
2171
2172        // 4x2, 4x4 hv
2173        ld1             {v25.8h}, [\src], \s_strd
2174        ext             v26.16b, v25.16b, v25.16b, #2
2175        ext             v27.16b, v25.16b, v25.16b, #4
2176        ext             v28.16b, v25.16b, v25.16b, #6
2177        smull           v25.4s,  v25.4h,  v0.h[0]
2178        smlal           v25.4s,  v26.4h,  v0.h[1]
2179        smlal           v25.4s,  v27.4h,  v0.h[2]
2180        smlal           v25.4s,  v28.4h,  v0.h[3]
2181        srshl           v16.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
        // The intermediates from the horizontal pass fit in 16 bits without
        // any bias; we could just as well keep them as .4s, but narrowing
        // them to .4h gives a significant speedup on out-of-order cores
        // (at the cost of a smaller slowdown on in-order cores such as the A53).
2186        xtn             v16.4h,  v16.4s
2187
2188        bl              L(\type\()_8tap_filter_4)
2189        mov             v17.8b,  v24.8b
2190        mov             v18.8b,  v25.8b
2191
21924:
2193        bl              L(\type\()_8tap_filter_4)
2194        smull           v2.4s,   v16.4h,  v1.h[0]
2195        smlal           v2.4s,   v17.4h,  v1.h[1]
2196        smlal           v2.4s,   v18.4h,  v1.h[2]
2197        smlal           v2.4s,   v24.4h,  v1.h[3]
2198        smull           v3.4s,   v17.4h,  v1.h[0]
2199        smlal           v3.4s,   v18.4h,  v1.h[1]
2200        smlal           v3.4s,   v24.4h,  v1.h[2]
2201        smlal           v3.4s,   v25.4h,  v1.h[3]
2202.ifc \type, put
2203        srshl           v2.4s,   v2.4s,   v29.4s // -(6+intermediate_bits)
2204        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
2205        sqxtun          v2.4h,   v2.4s
2206        sqxtun2         v2.8h,   v3.4s
2207        umin            v2.8h,   v2.8h,   v31.8h
2208.else
2209        rshrn           v2.4h,   v2.4s,   #6
2210        rshrn2          v2.8h,   v3.4s,   #6
2211        sub             v2.8h,   v2.8h,   v29.8h // PREP_BIAS
2212.endif
2213        subs            \h,  \h,  #2
2214
2215        st1             {v2.d}[0], [\dst], \d_strd
2216        st1             {v2.d}[1], [\ds2], \d_strd
2217        b.le            0f
2218        mov             v16.8b,  v18.8b
2219        mov             v17.8b,  v24.8b
2220        mov             v18.8b,  v25.8b
2221        b               4b
2222
2223480:    // 4x8, 4x16, 4x32 hv
2224        ld1             {v1.8b},  [\xmy]
2225        sub             \src, \src, #2
2226        sub             \sr2, \src, \s_strd, lsl #1
2227        sub             \src, \sr2, \s_strd
2228        add             \ds2, \dst, \d_strd
2229        lsl             \s_strd, \s_strd, #1
2230        lsl             \d_strd, \d_strd, #1
2231        sxtl            v0.8h,   v0.8b
2232        sxtl            v1.8h,   v1.8b
2233        mov             x15, x30
2234
2235        ld1             {v25.8h}, [\src], \s_strd
2236        ext             v26.16b, v25.16b, v25.16b, #2
2237        ext             v27.16b, v25.16b, v25.16b, #4
2238        ext             v28.16b, v25.16b, v25.16b, #6
2239        smull           v25.4s,  v25.4h,  v0.h[0]
2240        smlal           v25.4s,  v26.4h,  v0.h[1]
2241        smlal           v25.4s,  v27.4h,  v0.h[2]
2242        smlal           v25.4s,  v28.4h,  v0.h[3]
2243        srshl           v16.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
        // The intermediates from the horizontal pass fit in 16 bits without
        // any bias; we could just as well keep them as .4s, but narrowing
        // them to .4h gives a significant speedup on out-of-order cores
        // (at the cost of a smaller slowdown on in-order cores such as the A53).
2248        xtn             v16.4h,  v16.4s
2249
2250        bl              L(\type\()_8tap_filter_4)
2251        mov             v17.8b,  v24.8b
2252        mov             v18.8b,  v25.8b
2253        bl              L(\type\()_8tap_filter_4)
2254        mov             v19.8b,  v24.8b
2255        mov             v20.8b,  v25.8b
2256        bl              L(\type\()_8tap_filter_4)
2257        mov             v21.8b,  v24.8b
2258        mov             v22.8b,  v25.8b
2259
226048:
2261        bl              L(\type\()_8tap_filter_4)
2262        smull           v3.4s,   v16.4h,  v1.h[0]
2263        smlal           v3.4s,   v17.4h,  v1.h[1]
2264        smlal           v3.4s,   v18.4h,  v1.h[2]
2265        smlal           v3.4s,   v19.4h,  v1.h[3]
2266        smlal           v3.4s,   v20.4h,  v1.h[4]
2267        smlal           v3.4s,   v21.4h,  v1.h[5]
2268        smlal           v3.4s,   v22.4h,  v1.h[6]
2269        smlal           v3.4s,   v24.4h,  v1.h[7]
2270        smull           v4.4s,   v17.4h,  v1.h[0]
2271        smlal           v4.4s,   v18.4h,  v1.h[1]
2272        smlal           v4.4s,   v19.4h,  v1.h[2]
2273        smlal           v4.4s,   v20.4h,  v1.h[3]
2274        smlal           v4.4s,   v21.4h,  v1.h[4]
2275        smlal           v4.4s,   v22.4h,  v1.h[5]
2276        smlal           v4.4s,   v24.4h,  v1.h[6]
2277        smlal           v4.4s,   v25.4h,  v1.h[7]
2278.ifc \type, put
2279        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
2280        srshl           v4.4s,   v4.4s,   v29.4s // -(6+intermediate_bits)
2281        sqxtun          v3.4h,   v3.4s
2282        sqxtun2         v3.8h,   v4.4s
2283        umin            v3.8h,   v3.8h,   v31.8h
2284.else
2285        rshrn           v3.4h,   v3.4s,   #6
2286        rshrn2          v3.8h,   v4.4s,   #6
2287        sub             v3.8h,   v3.8h,   v29.8h // PREP_BIAS
2288.endif
2289        subs            \h,  \h,  #2
2290        st1             {v3.d}[0], [\dst], \d_strd
2291        st1             {v3.d}[1], [\ds2], \d_strd
2292        b.le            0f
2293        mov             v16.8b,  v18.8b
2294        mov             v17.8b,  v19.8b
2295        mov             v18.8b,  v20.8b
2296        mov             v19.8b,  v21.8b
2297        mov             v20.8b,  v22.8b
2298        mov             v21.8b,  v24.8b
2299        mov             v22.8b,  v25.8b
2300        b               48b
23010:
2302        br              x15
2303
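        // Filters one row from sr2 and one from src with the 4-tap horizontal
        // filter in v0, rounding by (6 - intermediate_bits) and narrowing the
        // results to 16 bit in v24 (the sr2 row) and v25 (the src row).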
2304L(\type\()_8tap_filter_4):
2305        ld1             {v24.8h}, [\sr2], \s_strd
2306        ld1             {v25.8h}, [\src], \s_strd
2307        ext             v26.16b, v24.16b, v24.16b, #2
2308        ext             v27.16b, v24.16b, v24.16b, #4
2309        ext             v28.16b, v24.16b, v24.16b, #6
2310        smull           v24.4s,  v24.4h,  v0.h[0]
2311        smlal           v24.4s,  v26.4h,  v0.h[1]
2312        smlal           v24.4s,  v27.4h,  v0.h[2]
2313        smlal           v24.4s,  v28.4h,  v0.h[3]
2314        ext             v26.16b, v25.16b, v25.16b, #2
2315        ext             v27.16b, v25.16b, v25.16b, #4
2316        ext             v28.16b, v25.16b, v25.16b, #6
2317        smull           v25.4s,  v25.4h,  v0.h[0]
2318        smlal           v25.4s,  v26.4h,  v0.h[1]
2319        smlal           v25.4s,  v27.4h,  v0.h[2]
2320        smlal           v25.4s,  v28.4h,  v0.h[3]
2321        srshl           v24.4s,  v24.4s,  v30.4s // -(6-intermediate_bits)
2322        srshl           v25.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
2323        xtn             v24.4h,  v24.4s
2324        xtn             v25.4h,  v25.4s
2325        ret
2326
232780:
2328160:
2329320:
2330        b.gt            880f
2331        add             \xmy,  \xmy,  #2
2332        ld1             {v0.8b},  [\xmx]
2333        ld1             {v1.s}[0],  [\xmy]
2334        sub             \src,  \src,  #6
2335        sub             \src,  \src,  \s_strd
2336        sxtl            v0.8h,   v0.8b
2337        sxtl            v1.8h,   v1.8b
2338        mov             x15, x30
2339        mov             \my, \h
2340
2341164:    // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
2342        add             \ds2,  \dst,  \d_strd
2343        add             \sr2,  \src,  \s_strd
2344        lsl             \d_strd, \d_strd, #1
2345        lsl             \s_strd, \s_strd, #1
2346
2347        ld1             {v27.8h, v28.8h},  [\src], \s_strd
2348        smull           v24.4s,  v27.4h,  v0.h[0]
2349        smull2          v25.4s,  v27.8h,  v0.h[0]
2350.irpc i, 1234567
2351        ext             v26.16b, v27.16b, v28.16b, #(2*\i)
2352        smlal           v24.4s,  v26.4h,  v0.h[\i]
2353        smlal2          v25.4s,  v26.8h,  v0.h[\i]
2354.endr
2355        srshl           v24.4s,  v24.4s,  v30.4s // -(6-intermediate_bits)
2356        srshl           v25.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
        // The intermediates from the horizontal pass fit in 16 bits without
        // any bias; we could just as well keep them as .4s, but narrowing
        // them to .4h gives a significant speedup on out-of-order cores
        // (at the cost of a smaller slowdown on in-order cores such as the A53),
        // and conserves register space (no need to clobber v8-v15).
2362        xtn             v16.4h,  v24.4s
2363        xtn2            v16.8h,  v25.4s
2364
2365        bl              L(\type\()_8tap_filter_8)
2366        mov             v17.16b, v23.16b
2367        mov             v18.16b, v24.16b
2368
23698:
2370        smull           v2.4s,   v16.4h,  v1.h[0]
2371        smull2          v3.4s,   v16.8h,  v1.h[0]
2372        bl              L(\type\()_8tap_filter_8)
2373        smull           v4.4s,   v17.4h,  v1.h[0]
2374        smull2          v5.4s,   v17.8h,  v1.h[0]
2375        smlal           v2.4s,   v17.4h,  v1.h[1]
2376        smlal2          v3.4s,   v17.8h,  v1.h[1]
2377        smlal           v4.4s,   v18.4h,  v1.h[1]
2378        smlal2          v5.4s,   v18.8h,  v1.h[1]
2379        smlal           v2.4s,   v18.4h,  v1.h[2]
2380        smlal2          v3.4s,   v18.8h,  v1.h[2]
2381        smlal           v4.4s,   v23.4h,  v1.h[2]
2382        smlal2          v5.4s,   v23.8h,  v1.h[2]
2383        smlal           v2.4s,   v23.4h,  v1.h[3]
2384        smlal2          v3.4s,   v23.8h,  v1.h[3]
2385        smlal           v4.4s,   v24.4h,  v1.h[3]
2386        smlal2          v5.4s,   v24.8h,  v1.h[3]
2387.ifc \type, put
2388        srshl           v2.4s,   v2.4s,   v29.4s // -(6+intermediate_bits)
2389        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
2390        srshl           v4.4s,   v4.4s,   v29.4s // -(6+intermediate_bits)
2391        srshl           v5.4s,   v5.4s,   v29.4s // -(6+intermediate_bits)
2392        sqxtun          v2.4h,   v2.4s
2393        sqxtun2         v2.8h,   v3.4s
2394        sqxtun          v3.4h,   v4.4s
2395        sqxtun2         v3.8h,   v5.4s
2396        umin            v2.8h,   v2.8h,   v31.8h
2397        umin            v3.8h,   v3.8h,   v31.8h
2398.else
2399        rshrn           v2.4h,   v2.4s,   #6
2400        rshrn2          v2.8h,   v3.4s,   #6
2401        rshrn           v3.4h,   v4.4s,   #6
2402        rshrn2          v3.8h,   v5.4s,   #6
2403        sub             v2.8h,   v2.8h,   v29.8h // PREP_BIAS
2404        sub             v3.8h,   v3.8h,   v29.8h // PREP_BIAS
2405.endif
2406        subs            \h,  \h,  #2
2407        st1             {v2.8h}, [\dst], \d_strd
2408        st1             {v3.8h}, [\ds2], \d_strd
2409        b.le            9f
2410        mov             v16.16b, v18.16b
2411        mov             v17.16b, v23.16b
2412        mov             v18.16b, v24.16b
2413        b               8b
24149:
2415        subs            \w,  \w,  #8
2416        b.le            0f
2417        asr             \s_strd,  \s_strd,  #1
2418        asr             \d_strd,  \d_strd,  #1
2419        msub            \src,  \s_strd,  \xmy,  \src
2420        msub            \dst,  \d_strd,  \xmy,  \dst
2421        sub             \src,  \src,  \s_strd,  lsl #2
2422        mov             \h,  \my
2423        add             \src,  \src,  #16
2424        add             \dst,  \dst,  #16
2425        b               164b
2426
2427880:    // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
2428640:
24291280:
2430        ld1             {v0.8b},  [\xmx]
2431        ld1             {v1.8b},  [\xmy]
2432        sub             \src,  \src,  #6
2433        sub             \src,  \src,  \s_strd
2434        sub             \src,  \src,  \s_strd, lsl #1
2435        sxtl            v0.8h,   v0.8b
2436        sxtl            v1.8h,   v1.8b
2437        mov             x15, x30
2438        mov             \my, \h
2439
2440168:
2441        add             \ds2,  \dst,  \d_strd
2442        add             \sr2,  \src,  \s_strd
2443        lsl             \d_strd, \d_strd, #1
2444        lsl             \s_strd, \s_strd, #1
2445
2446        ld1             {v27.8h, v28.8h},  [\src], \s_strd
2447        smull           v24.4s,  v27.4h,  v0.h[0]
2448        smull2          v25.4s,  v27.8h,  v0.h[0]
2449.irpc i, 1234567
2450        ext             v26.16b, v27.16b, v28.16b, #(2*\i)
2451        smlal           v24.4s,  v26.4h,  v0.h[\i]
2452        smlal2          v25.4s,  v26.8h,  v0.h[\i]
2453.endr
2454        srshl           v24.4s,  v24.4s,  v30.4s // -(6-intermediate_bits)
2455        srshl           v25.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
        // The intermediates from the horizontal pass fit in 16 bits without
        // any bias; we could just as well keep them as .4s, but narrowing
        // them to .4h gives a significant speedup on out-of-order cores
        // (at the cost of a smaller slowdown on in-order cores such as the A53),
        // and conserves register space (no need to clobber v8-v15).
2461        xtn             v16.4h,  v24.4s
2462        xtn2            v16.8h,  v25.4s
2463
2464        bl              L(\type\()_8tap_filter_8)
2465        mov             v17.16b, v23.16b
2466        mov             v18.16b, v24.16b
2467        bl              L(\type\()_8tap_filter_8)
2468        mov             v19.16b, v23.16b
2469        mov             v20.16b, v24.16b
2470        bl              L(\type\()_8tap_filter_8)
2471        mov             v21.16b, v23.16b
2472        mov             v22.16b, v24.16b
2473
247488:
2475        smull           v2.4s,   v16.4h,  v1.h[0]
2476        smull2          v3.4s,   v16.8h,  v1.h[0]
2477        bl              L(\type\()_8tap_filter_8)
2478        smull           v4.4s,   v17.4h,  v1.h[0]
2479        smull2          v5.4s,   v17.8h,  v1.h[0]
2480        smlal           v2.4s,   v17.4h,  v1.h[1]
2481        smlal2          v3.4s,   v17.8h,  v1.h[1]
2482        smlal           v4.4s,   v18.4h,  v1.h[1]
2483        smlal2          v5.4s,   v18.8h,  v1.h[1]
2484        smlal           v2.4s,   v18.4h,  v1.h[2]
2485        smlal2          v3.4s,   v18.8h,  v1.h[2]
2486        smlal           v4.4s,   v19.4h,  v1.h[2]
2487        smlal2          v5.4s,   v19.8h,  v1.h[2]
2488        smlal           v2.4s,   v19.4h,  v1.h[3]
2489        smlal2          v3.4s,   v19.8h,  v1.h[3]
2490        smlal           v4.4s,   v20.4h,  v1.h[3]
2491        smlal2          v5.4s,   v20.8h,  v1.h[3]
2492        smlal           v2.4s,   v20.4h,  v1.h[4]
2493        smlal2          v3.4s,   v20.8h,  v1.h[4]
2494        smlal           v4.4s,   v21.4h,  v1.h[4]
2495        smlal2          v5.4s,   v21.8h,  v1.h[4]
2496        smlal           v2.4s,   v21.4h,  v1.h[5]
2497        smlal2          v3.4s,   v21.8h,  v1.h[5]
2498        smlal           v4.4s,   v22.4h,  v1.h[5]
2499        smlal2          v5.4s,   v22.8h,  v1.h[5]
2500        smlal           v2.4s,   v22.4h,  v1.h[6]
2501        smlal2          v3.4s,   v22.8h,  v1.h[6]
2502        smlal           v4.4s,   v23.4h,  v1.h[6]
2503        smlal2          v5.4s,   v23.8h,  v1.h[6]
2504        smlal           v2.4s,   v23.4h,  v1.h[7]
2505        smlal2          v3.4s,   v23.8h,  v1.h[7]
2506        smlal           v4.4s,   v24.4h,  v1.h[7]
2507        smlal2          v5.4s,   v24.8h,  v1.h[7]
2508.ifc \type, put
2509        srshl           v2.4s,   v2.4s,   v29.4s // -(6+intermediate_bits)
2510        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
2511        srshl           v4.4s,   v4.4s,   v29.4s // -(6+intermediate_bits)
2512        srshl           v5.4s,   v5.4s,   v29.4s // -(6+intermediate_bits)
2513        sqxtun          v2.4h,   v2.4s
2514        sqxtun2         v2.8h,   v3.4s
2515        sqxtun          v3.4h,   v4.4s
2516        sqxtun2         v3.8h,   v5.4s
2517        umin            v2.8h,   v2.8h,   v31.8h
2518        umin            v3.8h,   v3.8h,   v31.8h
2519.else
2520        rshrn           v2.4h,   v2.4s,   #6
2521        rshrn2          v2.8h,   v3.4s,   #6
2522        rshrn           v3.4h,   v4.4s,   #6
2523        rshrn2          v3.8h,   v5.4s,   #6
2524        sub             v2.8h,   v2.8h,   v29.8h // PREP_BIAS
2525        sub             v3.8h,   v3.8h,   v29.8h // PREP_BIAS
2526.endif
2527        subs            \h,  \h,  #2
2528        st1             {v2.8h}, [\dst], \d_strd
2529        st1             {v3.8h}, [\ds2], \d_strd
2530        b.le            9f
2531        mov             v16.16b, v18.16b
2532        mov             v17.16b, v19.16b
2533        mov             v18.16b, v20.16b
2534        mov             v19.16b, v21.16b
2535        mov             v20.16b, v22.16b
2536        mov             v21.16b, v23.16b
2537        mov             v22.16b, v24.16b
2538        b               88b
25399:
2540        subs            \w,  \w,  #8
2541        b.le            0f
2542        asr             \s_strd,  \s_strd,  #1
2543        asr             \d_strd,  \d_strd,  #1
2544        msub            \src,  \s_strd,  \xmy,  \src
2545        msub            \dst,  \d_strd,  \xmy,  \dst
2546        sub             \src,  \src,  \s_strd,  lsl #3
2547        mov             \h,  \my
2548        add             \src,  \src,  #16
2549        add             \dst,  \dst,  #16
2550        b               168b
25510:
2552        br              x15
2553
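        // Filters one row from sr2 and one from src with the 8-tap horizontal
        // filter in v0, rounding by (6 - intermediate_bits) and narrowing the
        // results to 16 bit in v23 (the sr2 row) and v24 (the src row).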
L(\type\()_8tap_filter_8):
        ld1             {v4.8h, v5.8h},  [\sr2], \s_strd
        ld1             {v6.8h, v7.8h},  [\src], \s_strd
        smull           v25.4s,  v4.4h,   v0.h[0]
        smull2          v26.4s,  v4.8h,   v0.h[0]
        smull           v27.4s,  v6.4h,   v0.h[0]
        smull2          v28.4s,  v6.8h,   v0.h[0]
.irpc i, 1234567
        ext             v23.16b, v4.16b,  v5.16b,  #(2*\i)
        ext             v24.16b, v6.16b,  v7.16b,  #(2*\i)
        smlal           v25.4s,  v23.4h,  v0.h[\i]
        smlal2          v26.4s,  v23.8h,  v0.h[\i]
        smlal           v27.4s,  v24.4h,  v0.h[\i]
        smlal2          v28.4s,  v24.8h,  v0.h[\i]
.endr
        srshl           v25.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
        srshl           v26.4s,  v26.4s,  v30.4s // -(6-intermediate_bits)
        srshl           v27.4s,  v27.4s,  v30.4s // -(6-intermediate_bits)
        srshl           v28.4s,  v28.4s,  v30.4s // -(6-intermediate_bits)
        xtn             v23.4h,  v25.4s
        xtn2            v23.8h,  v26.4s
        xtn             v24.4h,  v27.4s
        xtn2            v24.8h,  v28.4s
        ret

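// Jump table: halfword offsets from the table label back to the
// width-specific entry points (1280b .. 20b); the trailing 0 entry is unused.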
L(\type\()_8tap_hv_tbl):
        .hword L(\type\()_8tap_hv_tbl) - 1280b
        .hword L(\type\()_8tap_hv_tbl) -  640b
        .hword L(\type\()_8tap_hv_tbl) -  320b
        .hword L(\type\()_8tap_hv_tbl) -  160b
        .hword L(\type\()_8tap_hv_tbl) -   80b
        .hword L(\type\()_8tap_hv_tbl) -   40b
        .hword L(\type\()_8tap_hv_tbl) -   20b
        .hword 0
endfunc


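// Bilinear put/prep, 16 bpc. \mx and \my are the subpel fractions; the
// filter weights are (16-\mx, \mx) horizontally and (16-\my, \my)
// vertically. Zero fractions fall through to the plain copy in
// \type\()_neon, otherwise dispatch to the h-only, v-only or hv path.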
function \type\()_bilin_16bpc_neon, export=1
.ifc \bdmax, w8
        ldr             w8,  [sp]
.endif
        dup             v1.8h,   \mx
        dup             v3.8h,   \my
        mov             w10, #16
        sub             w9,  w10, \mx
        sub             w10, w10, \my
        dup             v0.8h,   w9
        dup             v2.8h,   w10
.ifc \type, prep
        uxtw            \d_strd, \w
        lsl             \d_strd, \d_strd, #1
.endif

        clz             \bdmax,   \bdmax       // bitdepth_max
        clz             w9,  \w
        sub             \bdmax,   \bdmax,  #18 // intermediate_bits = clz(bitdepth_max) - 18
        mov             w11, #4
        sub             w9,  w9,  #24
        sub             w11, w11, \bdmax  // 4 - intermediate_bits
        add             w12, \bdmax, #4   // 4 + intermediate_bits
        cbnz            \mx, L(\type\()_bilin_h)
        cbnz            \my, L(\type\()_bilin_v)
        b               \type\()_neon

L(\type\()_bilin_h):
        cbnz            \my, L(\type\()_bilin_hv)

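        // Dispatch to the width-specific loop via the clz(w)-based table
        // index computed above.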
        adr             x10, L(\type\()_bilin_h_tbl)
        dup             v31.8h,  w11      // 4 - intermediate_bits
        ldrh            w9,  [x10, x9, lsl #1]
        neg             v31.8h,  v31.8h   // -(4-intermediate_bits)
.ifc \type, put
        dup             v30.8h,  \bdmax   // intermediate_bits
.else
        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
.endif
        sub             x10, x10, w9, uxtw
.ifc \type, put
        neg             v30.8h,  v30.8h   // -intermediate_bits
.endif
        br              x10

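// Horizontal-only bilinear: px[0]*(16-mx) + px[1]*mx, computed in 16 bits.
// put rounds down by 4-intermediate_bits and then by intermediate_bits;
// prep only rounds by 4-intermediate_bits and subtracts PREP_BIAS.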
20:     // 2xN h
.ifc \type, put
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \d_strd,  \d_strd,  #1
        lsl             \s_strd,  \s_strd,  #1
2:
        ld1             {v4.4h},  [\src], \s_strd
        ld1             {v6.4h},  [\sr2], \s_strd
        ext             v5.8b,   v4.8b,   v4.8b,   #2
        ext             v7.8b,   v6.8b,   v6.8b,   #2
        trn1            v4.2s,   v4.2s,   v6.2s
        trn1            v5.2s,   v5.2s,   v7.2s
        subs            \h,  \h,  #2
        mul             v4.4h,   v4.4h,   v0.4h
        mla             v4.4h,   v5.4h,   v1.4h
        urshl           v4.4h,   v4.4h,   v31.4h
        urshl           v4.4h,   v4.4h,   v30.4h
        st1             {v4.s}[0], [\dst], \d_strd
        st1             {v4.s}[1], [\ds2], \d_strd
        b.gt            2b
        ret
.endif

40:     // 4xN h
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \d_strd,  \d_strd,  #1
        lsl             \s_strd,  \s_strd,  #1
4:
        ld1             {v4.8h}, [\src], \s_strd
        ld1             {v6.8h}, [\sr2], \s_strd
        ext             v5.16b,  v4.16b,  v4.16b,  #2
        ext             v7.16b,  v6.16b,  v6.16b,  #2
        trn1            v4.2d,   v4.2d,   v6.2d
        trn1            v5.2d,   v5.2d,   v7.2d
        subs            \h,  \h,  #2
        mul             v4.8h,   v4.8h,   v0.8h
        mla             v4.8h,   v5.8h,   v1.8h
        urshl           v4.8h,   v4.8h,   v31.8h
.ifc \type, put
        urshl           v4.8h,   v4.8h,   v30.8h
.else
        sub             v4.8h,   v4.8h,   v29.8h
.endif
        st1             {v4.d}[0], [\dst], \d_strd
        st1             {v4.d}[1], [\ds2], \d_strd
        b.gt            4b
        ret

80:     // 8xN h
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \d_strd,  \d_strd,  #1
        lsl             \s_strd,  \s_strd,  #1
8:
        ldr             h5,  [\src, #16]
        ldr             h7,  [\sr2, #16]
        ld1             {v4.8h}, [\src], \s_strd
        ld1             {v6.8h}, [\sr2], \s_strd
        ext             v5.16b,  v4.16b,  v5.16b,  #2
        ext             v7.16b,  v6.16b,  v7.16b,  #2
        subs            \h,  \h,  #2
        mul             v4.8h,   v4.8h,   v0.8h
        mla             v4.8h,   v5.8h,   v1.8h
        mul             v6.8h,   v6.8h,   v0.8h
        mla             v6.8h,   v7.8h,   v1.8h
        urshl           v4.8h,   v4.8h,   v31.8h
        urshl           v6.8h,   v6.8h,   v31.8h
.ifc \type, put
        urshl           v4.8h,   v4.8h,   v30.8h
        urshl           v6.8h,   v6.8h,   v30.8h
.else
        sub             v4.8h,   v4.8h,   v29.8h
        sub             v6.8h,   v6.8h,   v29.8h
.endif
        st1             {v4.8h}, [\dst], \d_strd
        st1             {v6.8h}, [\ds2], \d_strd
        b.gt            8b
        ret
160:
320:
640:
1280:   // 16xN, 32xN, ... h
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \s_strd,  \s_strd,  #1

        sub             \s_strd,  \s_strd,  \w, uxtw #1
        sub             \s_strd,  \s_strd,  #16
.ifc \type, put
        lsl             \d_strd,  \d_strd,  #1
        sub             \d_strd,  \d_strd,  \w, uxtw #1
.endif
161:
        ld1             {v16.8h},  [\src], #16
        ld1             {v21.8h},  [\sr2], #16
        mov             \mx, \w

16:
        ld1             {v17.8h, v18.8h},  [\src], #32
        ld1             {v22.8h, v23.8h},  [\sr2], #32
        ext             v19.16b, v16.16b, v17.16b, #2
        ext             v20.16b, v17.16b, v18.16b, #2
        ext             v24.16b, v21.16b, v22.16b, #2
        ext             v25.16b, v22.16b, v23.16b, #2
        mul             v16.8h,  v16.8h,  v0.8h
        mla             v16.8h,  v19.8h,  v1.8h
        mul             v17.8h,  v17.8h,  v0.8h
        mla             v17.8h,  v20.8h,  v1.8h
        mul             v21.8h,  v21.8h,  v0.8h
        mla             v21.8h,  v24.8h,  v1.8h
        mul             v22.8h,  v22.8h,  v0.8h
        mla             v22.8h,  v25.8h,  v1.8h
        urshl           v16.8h,  v16.8h,  v31.8h
        urshl           v17.8h,  v17.8h,  v31.8h
        urshl           v21.8h,  v21.8h,  v31.8h
        urshl           v22.8h,  v22.8h,  v31.8h
        subs            \mx, \mx, #16
.ifc \type, put
        urshl           v16.8h,  v16.8h,  v30.8h
        urshl           v17.8h,  v17.8h,  v30.8h
        urshl           v21.8h,  v21.8h,  v30.8h
        urshl           v22.8h,  v22.8h,  v30.8h
.else
        sub             v16.8h,  v16.8h,  v29.8h
        sub             v17.8h,  v17.8h,  v29.8h
        sub             v21.8h,  v21.8h,  v29.8h
        sub             v22.8h,  v22.8h,  v29.8h
.endif
        st1             {v16.8h, v17.8h}, [\dst], #32
        st1             {v21.8h, v22.8h}, [\ds2], #32
        b.le            9f

        mov             v16.16b, v18.16b
        mov             v21.16b, v23.16b
        b               16b

9:
        add             \dst,  \dst,  \d_strd
        add             \ds2,  \ds2,  \d_strd
        add             \src,  \src,  \s_strd
        add             \sr2,  \sr2,  \s_strd

        subs            \h,  \h,  #2
        b.gt            161b
        ret

L(\type\()_bilin_h_tbl):
        .hword L(\type\()_bilin_h_tbl) - 1280b
        .hword L(\type\()_bilin_h_tbl) -  640b
        .hword L(\type\()_bilin_h_tbl) -  320b
        .hword L(\type\()_bilin_h_tbl) -  160b
        .hword L(\type\()_bilin_h_tbl) -   80b
        .hword L(\type\()_bilin_h_tbl) -   40b
        .hword L(\type\()_bilin_h_tbl) -   20b
        .hword 0


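// Vertical-only bilinear: row[0]*(16-my) + row[1]*my. put rounds the 16 bit
// sum down by 4; prep rounds by 4-intermediate_bits and subtracts PREP_BIAS.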
L(\type\()_bilin_v):
        cmp             \h,  #4
        adr             x10, L(\type\()_bilin_v_tbl)
.ifc \type, prep
        dup             v31.8h,  w11      // 4 - intermediate_bits
.endif
        ldrh            w9,  [x10, x9, lsl #1]
.ifc \type, prep
        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
        neg             v31.8h,  v31.8h   // -(4-intermediate_bits)
.endif
        sub             x10, x10, w9, uxtw
        br              x10

20:     // 2xN v
.ifc \type, put
        cmp             \h,  #2
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \s_strd,  \s_strd,  #1
        lsl             \d_strd,  \d_strd,  #1

        // 2x2 v
        ld1             {v16.s}[0], [\src], \s_strd
        b.gt            24f
        ld1             {v17.s}[0], [\sr2], \s_strd
        ld1             {v18.s}[0], [\src], \s_strd
        trn1            v16.2s,  v16.2s,  v17.2s
        trn1            v17.2s,  v17.2s,  v18.2s
        mul             v4.4h,   v16.4h,  v2.4h
        mla             v4.4h,   v17.4h,  v3.4h
        urshr           v4.8h,   v4.8h,   #4
        st1             {v4.s}[0], [\dst]
        st1             {v4.s}[1], [\ds2]
        ret
24:     // 2x4, 2x8, ... v
        ld1             {v17.s}[0], [\sr2], \s_strd
        ld1             {v18.s}[0], [\src], \s_strd
        ld1             {v19.s}[0], [\sr2], \s_strd
        ld1             {v20.s}[0], [\src], \s_strd
        trn1            v16.2s,  v16.2s,  v17.2s
        trn1            v17.2s,  v17.2s,  v18.2s
        trn1            v18.2s,  v18.2s,  v19.2s
        trn1            v19.2s,  v19.2s,  v20.2s
        trn1            v16.2d,  v16.2d,  v18.2d
        trn1            v17.2d,  v17.2d,  v19.2d
        mul             v4.8h,   v16.8h,  v2.8h
        mla             v4.8h,   v17.8h,  v3.8h
        subs            \h,  \h,  #4
        urshr           v4.8h,   v4.8h,   #4
        st1             {v4.s}[0], [\dst], \d_strd
        st1             {v4.s}[1], [\ds2], \d_strd
        st1             {v4.s}[2], [\dst], \d_strd
        st1             {v4.s}[3], [\ds2], \d_strd
        b.le            0f
        mov             v16.8b,  v20.8b
        b               24b
0:
        ret
.endif

40:     // 4xN v
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \s_strd,  \s_strd,  #1
        lsl             \d_strd,  \d_strd,  #1
        ld1             {v16.4h}, [\src], \s_strd
4:
        ld1             {v17.4h}, [\sr2], \s_strd
        ld1             {v18.4h}, [\src], \s_strd
        trn1            v16.2d,  v16.2d,  v17.2d
        trn1            v17.2d,  v17.2d,  v18.2d
        mul             v4.8h,   v16.8h,  v2.8h
        mla             v4.8h,   v17.8h,  v3.8h
        subs            \h,  \h,  #2
.ifc \type, put
        urshr           v4.8h,   v4.8h,   #4
.else
        urshl           v4.8h,   v4.8h,   v31.8h
        sub             v4.8h,   v4.8h,   v29.8h
.endif
        st1             {v4.d}[0], [\dst], \d_strd
        st1             {v4.d}[1], [\ds2], \d_strd
        b.le            0f
        mov             v16.8b,  v18.8b
        b               4b
0:
        ret

80:     // 8xN v
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \s_strd,  \s_strd,  #1
        lsl             \d_strd,  \d_strd,  #1
        ld1             {v16.8h}, [\src], \s_strd
8:
        ld1             {v17.8h}, [\sr2], \s_strd
        ld1             {v18.8h}, [\src], \s_strd
        mul             v4.8h,   v16.8h,  v2.8h
        mla             v4.8h,   v17.8h,  v3.8h
        mul             v5.8h,   v17.8h,  v2.8h
        mla             v5.8h,   v18.8h,  v3.8h
        subs            \h,  \h,  #2
.ifc \type, put
        urshr           v4.8h,   v4.8h,   #4
        urshr           v5.8h,   v5.8h,   #4
.else
        urshl           v4.8h,   v4.8h,   v31.8h
        urshl           v5.8h,   v5.8h,   v31.8h
        sub             v4.8h,   v4.8h,   v29.8h
        sub             v5.8h,   v5.8h,   v29.8h
.endif
        st1             {v4.8h}, [\dst], \d_strd
        st1             {v5.8h}, [\ds2], \d_strd
        b.le            0f
        mov             v16.16b, v18.16b
        b               8b
0:
        ret

160:    // 16xN, 32xN, ...
320:
640:
1280:
        mov             \my, \h
1:
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        ld1             {v16.8h, v17.8h}, [\src], \s_strd
2:
        ld1             {v18.8h, v19.8h}, [\sr2], \s_strd
        ld1             {v20.8h, v21.8h}, [\src], \s_strd
        mul             v4.8h,   v16.8h,  v2.8h
        mla             v4.8h,   v18.8h,  v3.8h
        mul             v5.8h,   v17.8h,  v2.8h
        mla             v5.8h,   v19.8h,  v3.8h
        mul             v6.8h,   v18.8h,  v2.8h
        mla             v6.8h,   v20.8h,  v3.8h
        mul             v7.8h,   v19.8h,  v2.8h
        mla             v7.8h,   v21.8h,  v3.8h
        subs            \h,  \h,  #2
.ifc \type, put
        urshr           v4.8h,   v4.8h,   #4
        urshr           v5.8h,   v5.8h,   #4
        urshr           v6.8h,   v6.8h,   #4
        urshr           v7.8h,   v7.8h,   #4
.else
        urshl           v4.8h,   v4.8h,   v31.8h
        urshl           v5.8h,   v5.8h,   v31.8h
        urshl           v6.8h,   v6.8h,   v31.8h
        urshl           v7.8h,   v7.8h,   v31.8h
        sub             v4.8h,   v4.8h,   v29.8h
        sub             v5.8h,   v5.8h,   v29.8h
        sub             v6.8h,   v6.8h,   v29.8h
        sub             v7.8h,   v7.8h,   v29.8h
.endif
        st1             {v4.8h, v5.8h}, [\dst], \d_strd
        st1             {v6.8h, v7.8h}, [\ds2], \d_strd
        b.le            9f
        mov             v16.16b, v20.16b
        mov             v17.16b, v21.16b
        b               2b
9:
        subs            \w,  \w,  #16
        b.le            0f
        asr             \s_strd, \s_strd, #1
        asr             \d_strd, \d_strd, #1
        msub            \src, \s_strd, \xmy, \src
        msub            \dst, \d_strd, \xmy, \dst
        sub             \src, \src, \s_strd, lsl #1
        mov             \h,  \my
        add             \src, \src, #32
        add             \dst, \dst, #32
        b               1b
0:
        ret

L(\type\()_bilin_v_tbl):
        .hword L(\type\()_bilin_v_tbl) - 1280b
        .hword L(\type\()_bilin_v_tbl) -  640b
        .hword L(\type\()_bilin_v_tbl) -  320b
        .hword L(\type\()_bilin_v_tbl) -  160b
        .hword L(\type\()_bilin_v_tbl) -   80b
        .hword L(\type\()_bilin_v_tbl) -   40b
        .hword L(\type\()_bilin_v_tbl) -   20b
        .hword 0

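// 2D bilinear: the horizontal pass keeps intermediate_bits of extra
// precision (rounding by 4-intermediate_bits), the vertical pass widens to
// 32 bits and rounds by 4+intermediate_bits for put, or by 4 (keeping the
// intermediate precision) with PREP_BIAS subtracted for prep.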
L(\type\()_bilin_hv):
        adr             x10, L(\type\()_bilin_hv_tbl)
        dup             v31.8h,  w11      // 4 - intermediate_bits
        ldrh            w9,  [x10, x9, lsl #1]
        neg             v31.8h,  v31.8h   // -(4-intermediate_bits)
.ifc \type, put
        dup             v30.4s,  w12      // 4 + intermediate_bits
.else
        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
.endif
        sub             x10, x10, w9, uxtw
.ifc \type, put
        neg             v30.4s,  v30.4s   // -(4+intermediate_bits)
.endif
        br              x10

20:     // 2xN hv
.ifc \type, put
        add             \sr2, \src, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        ld1             {v20.4h},  [\src], \s_strd
        ext             v21.8b,  v20.8b,  v20.8b,  #2
        mul             v16.4h,  v20.4h,  v0.4h
        mla             v16.4h,  v21.4h,  v1.4h
        urshl           v16.4h,  v16.4h,  v31.4h

2:
        ld1             {v22.4h},  [\sr2], \s_strd
        ld1             {v24.4h},  [\src], \s_strd
        ext             v23.8b,  v22.8b,  v22.8b,  #2
        ext             v25.8b,  v24.8b,  v24.8b,  #2
        trn1            v22.2s,  v22.2s,  v24.2s
        trn1            v23.2s,  v23.2s,  v25.2s
        mul             v17.4h,  v22.4h,  v0.4h
        mla             v17.4h,  v23.4h,  v1.4h
        urshl           v17.4h,  v17.4h,  v31.4h

        trn1            v16.2s,  v16.2s,  v17.2s

        umull           v4.4s,   v16.4h,  v2.4h
        umlal           v4.4s,   v17.4h,  v3.4h
        urshl           v4.4s,   v4.4s,   v30.4s
        xtn             v4.4h,   v4.4s
        subs            \h,  \h,  #2
        st1             {v4.s}[0], [\dst], \d_strd
        st1             {v4.s}[1], [\ds2], \d_strd
        b.le            0f
        trn2            v16.2s,  v17.2s,  v17.2s
        b               2b
0:
        ret
.endif

40:     // 4xN hv
        add             \sr2, \src, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        ld1             {v20.8h},  [\src], \s_strd
        ext             v21.16b, v20.16b, v20.16b, #2
        mul             v16.4h,  v20.4h,  v0.4h
        mla             v16.4h,  v21.4h,  v1.4h
        urshl           v16.4h,  v16.4h,  v31.4h

4:
        ld1             {v22.8h},  [\sr2], \s_strd
        ld1             {v24.8h},  [\src], \s_strd
        ext             v23.16b, v22.16b, v22.16b, #2
        ext             v25.16b, v24.16b, v24.16b, #2
        trn1            v22.2d,  v22.2d,  v24.2d
        trn1            v23.2d,  v23.2d,  v25.2d
        mul             v17.8h,  v22.8h,  v0.8h
        mla             v17.8h,  v23.8h,  v1.8h
        urshl           v17.8h,  v17.8h,  v31.8h

        trn1            v16.2d,  v16.2d,  v17.2d

        umull           v4.4s,   v16.4h,  v2.4h
        umlal           v4.4s,   v17.4h,  v3.4h
        umull2          v5.4s,   v16.8h,  v2.8h
        umlal2          v5.4s,   v17.8h,  v3.8h
.ifc \type, put
        urshl           v4.4s,   v4.4s,   v30.4s
        urshl           v5.4s,   v5.4s,   v30.4s
        xtn             v4.4h,   v4.4s
        xtn2            v4.8h,   v5.4s
.else
        rshrn           v4.4h,   v4.4s,   #4
        rshrn2          v4.8h,   v5.4s,   #4
        sub             v4.8h,   v4.8h,   v29.8h
.endif
        subs            \h,  \h,  #2
        st1             {v4.d}[0], [\dst], \d_strd
        st1             {v4.d}[1], [\ds2], \d_strd
        b.le            0f
        trn2            v16.2d,  v17.2d,  v17.2d
        b               4b
0:
        ret

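// 8 pixels and wider: the 2-tap horizontal filter needs a 9th input pixel
// per 8 outputs, loaded separately into an h register and shifted in via ext.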
80:     // 8xN, 16xN, ... hv
160:
320:
640:
1280:
        mov             \my, \h

1:
        add             \sr2, \src, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        ldr             h21, [\src, #16]
        ld1             {v20.8h},  [\src], \s_strd
        ext             v21.16b, v20.16b, v21.16b, #2
        mul             v16.8h,  v20.8h,  v0.8h
        mla             v16.8h,  v21.8h,  v1.8h
        urshl           v16.8h,  v16.8h,  v31.8h

2:
        ldr             h23, [\sr2, #16]
        ld1             {v22.8h},  [\sr2], \s_strd
        ldr             h25, [\src, #16]
        ld1             {v24.8h},  [\src], \s_strd
        ext             v23.16b, v22.16b, v23.16b, #2
        ext             v25.16b, v24.16b, v25.16b, #2
        mul             v17.8h,  v22.8h,  v0.8h
        mla             v17.8h,  v23.8h,  v1.8h
        mul             v18.8h,  v24.8h,  v0.8h
        mla             v18.8h,  v25.8h,  v1.8h
        urshl           v17.8h,  v17.8h,  v31.8h
        urshl           v18.8h,  v18.8h,  v31.8h

        umull           v4.4s,   v16.4h,  v2.4h
        umlal           v4.4s,   v17.4h,  v3.4h
        umull2          v5.4s,   v16.8h,  v2.8h
        umlal2          v5.4s,   v17.8h,  v3.8h
        umull           v6.4s,   v17.4h,  v2.4h
        umlal           v6.4s,   v18.4h,  v3.4h
        umull2          v7.4s,   v17.8h,  v2.8h
        umlal2          v7.4s,   v18.8h,  v3.8h
.ifc \type, put
        urshl           v4.4s,   v4.4s,   v30.4s
        urshl           v5.4s,   v5.4s,   v30.4s
        urshl           v6.4s,   v6.4s,   v30.4s
        urshl           v7.4s,   v7.4s,   v30.4s
        xtn             v4.4h,   v4.4s
        xtn2            v4.8h,   v5.4s
        xtn             v5.4h,   v6.4s
        xtn2            v5.8h,   v7.4s
.else
        rshrn           v4.4h,   v4.4s,   #4
        rshrn2          v4.8h,   v5.4s,   #4
        rshrn           v5.4h,   v6.4s,   #4
        rshrn2          v5.8h,   v7.4s,   #4
        sub             v4.8h,   v4.8h,   v29.8h
        sub             v5.8h,   v5.8h,   v29.8h
.endif
        subs            \h,  \h,  #2
        st1             {v4.8h}, [\dst], \d_strd
        st1             {v5.8h}, [\ds2], \d_strd
        b.le            9f
        mov             v16.16b, v18.16b
        b               2b
9:
        subs            \w,  \w,  #8
        b.le            0f
        asr             \s_strd,  \s_strd,  #1
        asr             \d_strd,  \d_strd,  #1
        msub            \src,  \s_strd,  \xmy,  \src
        msub            \dst,  \d_strd,  \xmy,  \dst
        sub             \src,  \src,  \s_strd,  lsl #1
        mov             \h,  \my
        add             \src,  \src,  #16
        add             \dst,  \dst,  #16
        b               1b
0:
        ret

L(\type\()_bilin_hv_tbl):
        .hword L(\type\()_bilin_hv_tbl) - 1280b
        .hword L(\type\()_bilin_hv_tbl) -  640b
        .hword L(\type\()_bilin_hv_tbl) -  320b
        .hword L(\type\()_bilin_hv_tbl) -  160b
        .hword L(\type\()_bilin_hv_tbl) -   80b
        .hword L(\type\()_bilin_hv_tbl) -   40b
        .hword L(\type\()_bilin_hv_tbl) -   20b
        .hword 0
endfunc
.endm

filter_fn put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10
filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10

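// Load one 8-tap warp filter (8 bytes) from mc_warp_filter for the current
// fixed-point position in \src (the bits above the low 10 select the
// filter), then step the position by \inc.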
.macro load_filter_row dst, src, inc
        asr             w13, \src, #10
        ldr             \dst, [x11, w13, sxtw #3]
        add             \src, \src, \inc
.endm

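// One horizontal pass of the 8x8 warp: filter 8 output pixels from the row
// loaded into v16/v17, with a separate 8-tap filter per pixel (position
// stepped by abcd[0], and advanced by abcd[1] for the next row), pairwise
// accumulating into v16/v17 as 32 bit sums rounded by 7-intermediate_bits.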
function warp_filter_horz_neon
        add             w12, w5,  #512

        ld1             {v16.8h, v17.8h}, [x2], x3

        load_filter_row d0, w12, w7
        load_filter_row d1, w12, w7
        load_filter_row d2, w12, w7
        sxtl            v0.8h,   v0.8b
        load_filter_row d3, w12, w7
        sxtl            v1.8h,   v1.8b
        load_filter_row d4, w12, w7
        sxtl            v2.8h,   v2.8b
        load_filter_row d5, w12, w7
        sxtl            v3.8h,   v3.8b
        load_filter_row d6, w12, w7
        sxtl            v4.8h,   v4.8b
        load_filter_row d7, w12, w7
        sxtl            v5.8h,   v5.8b
        ext             v18.16b, v16.16b, v17.16b, #2*1
        smull           v8.4s,   v16.4h,  v0.4h
        smull2          v9.4s,   v16.8h,  v0.8h
        sxtl            v6.8h,   v6.8b
        ext             v19.16b, v16.16b, v17.16b, #2*2
        smull           v10.4s,  v18.4h,  v1.4h
        smull2          v11.4s,  v18.8h,  v1.8h
        sxtl            v7.8h,   v7.8b
        ext             v20.16b, v16.16b, v17.16b, #2*3
        smull           v0.4s,   v19.4h,  v2.4h
        smull2          v1.4s,   v19.8h,  v2.8h
        ext             v21.16b, v16.16b, v17.16b, #2*4
        addp            v8.4s,   v8.4s,   v9.4s
        smull           v2.4s,   v20.4h,  v3.4h
        smull2          v3.4s,   v20.8h,  v3.8h
        ext             v22.16b, v16.16b, v17.16b, #2*5
        addp            v9.4s,   v10.4s,  v11.4s
        smull           v10.4s,  v21.4h,  v4.4h
        smull2          v11.4s,  v21.8h,  v4.8h
        ext             v23.16b, v16.16b, v17.16b, #2*6
        addp            v0.4s,   v0.4s,   v1.4s
        smull           v18.4s,  v22.4h,  v5.4h
        smull2          v19.4s,  v22.8h,  v5.8h
        ext             v16.16b, v16.16b, v17.16b, #2*7
        addp            v1.4s,   v2.4s,   v3.4s
        addp            v2.4s,   v10.4s,  v11.4s
        smull           v20.4s,  v23.4h,  v6.4h
        smull2          v21.4s,  v23.8h,  v6.8h
        addp            v3.4s,   v18.4s,  v19.4s
        smull           v22.4s,  v16.4h,  v7.4h
        smull2          v23.4s,  v16.8h,  v7.8h
        addp            v4.4s,   v20.4s,  v21.4s
        addp            v5.4s,   v22.4s,  v23.4s

        addp            v8.4s,   v8.4s,   v9.4s
        addp            v0.4s,   v0.4s,   v1.4s
        addp            v2.4s,   v2.4s,   v3.4s
        addp            v4.4s,   v4.4s,   v5.4s

        addp            v16.4s,  v8.4s,   v0.4s
        addp            v17.4s,  v2.4s,   v4.4s

        add             w5,  w5,  w8

        srshl           v16.4s,  v16.4s,  v14.4s // -(7 - intermediate_bits)
        srshl           v17.4s,  v17.4s,  v14.4s // -(7 - intermediate_bits)

        ret
endfunc

// void dav1d_warp_affine_8x8_16bpc_neon(
//         pixel *dst, const ptrdiff_t dst_stride,
//         const pixel *src, const ptrdiff_t src_stride,
//         const int16_t *const abcd, int mx, int my,
//         const int bitdepth_max)
.macro warp t
function warp_affine_8x8\t\()_16bpc_neon, export=1
        stp             d8,  d9,  [sp, #-0x40]!
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]
        stp             d14, d15, [sp, #0x30]

.ifb \t
        dup             v15.8h,  w7        // bitdepth_max
.else
        movi            v15.8h,  #(PREP_BIAS >> 8), lsl #8
.endif
        clz             w7,  w7
                                           // intermediate_bits = clz(bitdepth_max) - 18
.ifb \t
        sub             w8,  w7,  #11      // 7 + intermediate_bits = clz(bitdepth_max) - 18 + 7
.endif
        sub             w7,  w7,  #25      // -(7 - intermediate_bits)
.ifb \t
        neg             w8,  w8            // -(7 + intermediate_bits)
.endif
        dup             v14.4s,  w7        // -(7 - intermediate_bits)
.ifb \t
        dup             v13.4s,  w8        // -(7 + intermediate_bits)
.endif

        ldr             x4,  [x4]
        sbfx            x7,  x4, #0,  #16
        sbfx            x8,  x4, #16, #16
        sbfx            x9,  x4, #32, #16
        sbfx            x4,  x4, #48, #16
        mov             w10, #8
        sub             x2,  x2,  x3, lsl #1
        sub             x2,  x2,  x3
        sub             x2,  x2,  #6
        movrel          x11, X(mc_warp_filter), 64*8
        mov             x15, x30
.ifnb \t
        lsl             x1,  x1,  #1
.endif

        bl              warp_filter_horz_neon
        xtn             v24.4h,  v16.4s
        xtn2            v24.8h,  v17.4s
        bl              warp_filter_horz_neon
        xtn             v25.4h,  v16.4s
        xtn2            v25.8h,  v17.4s
        bl              warp_filter_horz_neon
        xtn             v26.4h,  v16.4s
        xtn2            v26.8h,  v17.4s
        bl              warp_filter_horz_neon
        xtn             v27.4h,  v16.4s
        xtn2            v27.8h,  v17.4s
        bl              warp_filter_horz_neon
        xtn             v28.4h,  v16.4s
        xtn2            v28.8h,  v17.4s
        bl              warp_filter_horz_neon
        xtn             v29.4h,  v16.4s
        xtn2            v29.8h,  v17.4s
        bl              warp_filter_horz_neon
        xtn             v30.4h,  v16.4s
        xtn2            v30.8h,  v17.4s

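        // Vertical pass: 8 rows of horizontally filtered data are kept in
        // v24-v31; each output row applies per-column 8-tap filters
        // (position stepped by abcd[2], advanced by abcd[3] per row).
        // put rounds by 7+intermediate_bits, converts to unsigned and clamps
        // to bitdepth_max; prep rounds by 7 and subtracts PREP_BIAS.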
1:
        add             w14, w6,  #512
        bl              warp_filter_horz_neon
        xtn             v31.4h,  v16.4s
        xtn2            v31.8h,  v17.4s

        load_filter_row d0, w14, w9
        load_filter_row d1, w14, w9
        load_filter_row d2, w14, w9
        load_filter_row d3, w14, w9
        load_filter_row d4, w14, w9
        load_filter_row d5, w14, w9
        load_filter_row d6, w14, w9
        load_filter_row d7, w14, w9
        transpose_8x8b  v0, v1, v2, v3, v4, v5, v6, v7, v16, v17
        sxtl            v0.8h,   v0.8b
        sxtl            v1.8h,   v1.8b
        sxtl            v2.8h,   v2.8b
        sxtl            v3.8h,   v3.8b
        sxtl            v4.8h,   v4.8b
        sxtl            v5.8h,   v5.8b
        sxtl            v6.8h,   v6.8b
        sxtl            v7.8h,   v7.8b

        // This ordering of smull/smlal/smull2/smlal2 is highly
        // beneficial for Cortex A53 here.
        smull           v16.4s,  v24.4h,  v0.4h
        smlal           v16.4s,  v25.4h,  v1.4h
        smlal           v16.4s,  v26.4h,  v2.4h
        smlal           v16.4s,  v27.4h,  v3.4h
        smlal           v16.4s,  v28.4h,  v4.4h
        smlal           v16.4s,  v29.4h,  v5.4h
        smlal           v16.4s,  v30.4h,  v6.4h
        smlal           v16.4s,  v31.4h,  v7.4h
        smull2          v17.4s,  v24.8h,  v0.8h
        smlal2          v17.4s,  v25.8h,  v1.8h
        smlal2          v17.4s,  v26.8h,  v2.8h
        smlal2          v17.4s,  v27.8h,  v3.8h
        smlal2          v17.4s,  v28.8h,  v4.8h
        smlal2          v17.4s,  v29.8h,  v5.8h
        smlal2          v17.4s,  v30.8h,  v6.8h
        smlal2          v17.4s,  v31.8h,  v7.8h

        mov             v24.16b, v25.16b
        mov             v25.16b, v26.16b
.ifb \t
        srshl           v16.4s,  v16.4s,  v13.4s // -(7 + intermediate_bits)
        srshl           v17.4s,  v17.4s,  v13.4s // -(7 + intermediate_bits)
.else
        rshrn           v16.4h,  v16.4s,  #7
        rshrn2          v16.8h,  v17.4s,  #7
.endif
        mov             v26.16b, v27.16b
.ifb \t
        sqxtun          v16.4h,  v16.4s
        sqxtun2         v16.8h,  v17.4s
.else
        sub             v16.8h,  v16.8h,  v15.8h // PREP_BIAS
.endif
        mov             v27.16b, v28.16b
        mov             v28.16b, v29.16b
.ifb \t
        umin            v16.8h,  v16.8h,  v15.8h // bitdepth_max
.endif
        mov             v29.16b, v30.16b
        mov             v30.16b, v31.16b
        subs            w10, w10, #1
        st1             {v16.8h}, [x0], x1

        add             w6,  w6,  w4
        b.gt            1b

        ldp             d14, d15, [sp, #0x30]
        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x40

        br              x15
endfunc
.endm

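// Instantiate the put variant (warp_affine_8x8_16bpc_neon) and the prep
// variant (warp_affine_8x8t_16bpc_neon).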
warp
warp t
