1/*
2 * Copyright © 2018, VideoLAN and dav1d authors
3 * Copyright © 2018, Janne Grunau
4 * Copyright © 2020, Martin Storsjo
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice, this
11 *    list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 *    this list of conditions and the following disclaimer in the documentation
15 *    and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
21 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#include "src/arm/asm.S"
30#include "util.S"
31
32#define PREP_BIAS 8192
33
34.macro avg d0, d1, t0, t1, t2, t3
35        ld1             {\t0\().8h,\t1\().8h},  [x2],  32
36        ld1             {\t2\().8h,\t3\().8h},  [x3],  32
37        sqadd           \t0\().8h,  \t0\().8h,  \t2\().8h
38        sqadd           \t1\().8h,  \t1\().8h,  \t3\().8h
39        smax            \t0\().8h,  \t0\().8h,  v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
40        smax            \t1\().8h,  \t1\().8h,  v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
41        sqsub           \t0\().8h,  \t0\().8h,  v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
42        sqsub           \t1\().8h,  \t1\().8h,  v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
43        sshl            \d0\().8h,  \t0\().8h,  v29.8h // -(intermediate_bits+1)
44        sshl            \d1\().8h,  \t1\().8h,  v29.8h // -(intermediate_bits+1)
45.endm
46
47.macro w_avg d0, d1, t0, t1, t2, t3
48        ld1             {\t0\().8h,\t1\().8h},  [x2],  32
49        ld1             {\t2\().8h,\t3\().8h},  [x3],  32
50        // This difference requires a 17 bit range, and all bits are
51        // significant for the following multiplication.
52        ssubl           \d0\().4s,  \t2\().4h,  \t0\().4h
53        ssubl2          \t0\().4s,  \t2\().8h,  \t0\().8h
54        ssubl           \d1\().4s,  \t3\().4h,  \t1\().4h
55        ssubl2          \t1\().4s,  \t3\().8h,  \t1\().8h
56        mul             \d0\().4s,  \d0\().4s,  v27.4s
57        mul             \t0\().4s,  \t0\().4s,  v27.4s
58        mul             \d1\().4s,  \d1\().4s,  v27.4s
59        mul             \t1\().4s,  \t1\().4s,  v27.4s
60        sshr            \d0\().4s,  \d0\().4s,  #4
61        sshr            \t0\().4s,  \t0\().4s,  #4
62        sshr            \d1\().4s,  \d1\().4s,  #4
63        sshr            \t1\().4s,  \t1\().4s,  #4
64        saddw           \d0\().4s,  \d0\().4s,  \t2\().4h
65        saddw2          \t0\().4s,  \t0\().4s,  \t2\().8h
66        saddw           \d1\().4s,  \d1\().4s,  \t3\().4h
67        saddw2          \t1\().4s,  \t1\().4s,  \t3\().8h
68        xtn             \d0\().4h,  \d0\().4s
69        xtn2            \d0\().8h,  \t0\().4s
70        xtn             \d1\().4h,  \d1\().4s
71        xtn2            \d1\().8h,  \t1\().4s
72        srshl           \d0\().8h,  \d0\().8h,  v29.8h // -intermediate_bits
73        srshl           \d1\().8h,  \d1\().8h,  v29.8h // -intermediate_bits
74        add             \d0\().8h,  \d0\().8h,  v28.8h // PREP_BIAS >> intermediate_bits
75        add             \d1\().8h,  \d1\().8h,  v28.8h // PREP_BIAS >> intermediate_bits
76        smin            \d0\().8h,  \d0\().8h,  v31.8h // bitdepth_max
77        smin            \d1\().8h,  \d1\().8h,  v31.8h // bitdepth_max
78        smax            \d0\().8h,  \d0\().8h,  v30.8h // 0
79        smax            \d1\().8h,  \d1\().8h,  v30.8h // 0
80.endm
81
82.macro mask d0, d1, t0, t1, t2, t3
83        ld1             {v27.16b}, [x6],  16
84        ld1             {\t0\().8h,\t1\().8h},  [x2],  32
85        neg             v27.16b, v27.16b
86        ld1             {\t2\().8h,\t3\().8h},  [x3],  32
87        sxtl            v26.8h,  v27.8b
88        sxtl2           v27.8h,  v27.16b
89        sxtl            v24.4s,  v26.4h
90        sxtl2           v25.4s,  v26.8h
91        sxtl            v26.4s,  v27.4h
92        sxtl2           v27.4s,  v27.8h
93        ssubl           \d0\().4s,  \t2\().4h,  \t0\().4h
94        ssubl2          \t0\().4s,  \t2\().8h,  \t0\().8h
95        ssubl           \d1\().4s,  \t3\().4h,  \t1\().4h
96        ssubl2          \t1\().4s,  \t3\().8h,  \t1\().8h
97        mul             \d0\().4s,  \d0\().4s,  v24.4s
98        mul             \t0\().4s,  \t0\().4s,  v25.4s
99        mul             \d1\().4s,  \d1\().4s,  v26.4s
100        mul             \t1\().4s,  \t1\().4s,  v27.4s
101        sshr            \d0\().4s,  \d0\().4s,  #6
102        sshr            \t0\().4s,  \t0\().4s,  #6
103        sshr            \d1\().4s,  \d1\().4s,  #6
104        sshr            \t1\().4s,  \t1\().4s,  #6
105        saddw           \d0\().4s,  \d0\().4s,  \t2\().4h
106        saddw2          \t0\().4s,  \t0\().4s,  \t2\().8h
107        saddw           \d1\().4s,  \d1\().4s,  \t3\().4h
108        saddw2          \t1\().4s,  \t1\().4s,  \t3\().8h
109        xtn             \d0\().4h,  \d0\().4s
110        xtn2            \d0\().8h,  \t0\().4s
111        xtn             \d1\().4h,  \d1\().4s
112        xtn2            \d1\().8h,  \t1\().4s
113        srshl           \d0\().8h,  \d0\().8h,  v29.8h // -intermediate_bits
114        srshl           \d1\().8h,  \d1\().8h,  v29.8h // -intermediate_bits
115        add             \d0\().8h,  \d0\().8h,  v28.8h // PREP_BIAS >> intermediate_bits
116        add             \d1\().8h,  \d1\().8h,  v28.8h // PREP_BIAS >> intermediate_bits
117        smin            \d0\().8h,  \d0\().8h,  v31.8h // bitdepth_max
118        smin            \d1\().8h,  \d1\().8h,  v31.8h // bitdepth_max
119        smax            \d0\().8h,  \d0\().8h,  v30.8h // 0
120        smax            \d1\().8h,  \d1\().8h,  v30.8h // 0
121.endm
122
123.macro bidir_fn type, bdmax
124function \type\()_16bpc_neon, export=1
125        clz             w4,  w4
126.ifnc \type, avg
127        dup             v31.8h,  \bdmax // bitdepth_max
128        movi            v30.8h,  #0
129.endif
130        clz             w7,  \bdmax
131        sub             w7,  w7,  #18   // intermediate_bits = clz(bitdepth_max) - 18
132.ifc \type, avg
133        mov             w9,  #1
134        mov             w8,  #-2*PREP_BIAS
135        lsl             w9,  w9,  w7    // 1 << intermediate_bits
136        add             w7,  w7,  #1
137        sub             w8,  w8,  w9    // -2*PREP_BIAS - 1 << intermediate_bits
138        neg             w7,  w7         // -(intermediate_bits+1)
139        dup             v28.8h,   w8    // -2*PREP_BIAS - 1 << intermediate_bits
140        dup             v29.8h,   w7    // -(intermediate_bits+1)
141.else
142        mov             w8,  #PREP_BIAS
143        lsr             w8,  w8,  w7    // PREP_BIAS >> intermediate_bits
144        neg             w7,  w7         // -intermediate_bits
145        dup             v28.8h,  w8     // PREP_BIAS >> intermediate_bits
146        dup             v29.8h,  w7     // -intermediate_bits
147.endif
148.ifc \type, w_avg
149        dup             v27.4s,  w6
150        neg             v27.4s,  v27.4s
151.endif
152        adr             x7,  L(\type\()_tbl)
153        sub             w4,  w4,  #24
154        \type           v4,  v5,  v0,  v1,  v2,  v3
155        ldrh            w4,  [x7, x4, lsl #1]
156        sub             x7,  x7,  w4, uxtw
157        br              x7
15840:
159        add             x7,  x0,  x1
160        lsl             x1,  x1,  #1
1614:
162        subs            w5,  w5,  #4
163        st1             {v4.d}[0],  [x0], x1
164        st1             {v4.d}[1],  [x7], x1
165        st1             {v5.d}[0],  [x0], x1
166        st1             {v5.d}[1],  [x7], x1
167        b.le            0f
168        \type           v4,  v5,  v0,  v1,  v2,  v3
169        b               4b
17080:
171        add             x7,  x0,  x1
172        lsl             x1,  x1,  #1
1738:
174        st1             {v4.8h},  [x0], x1
175        subs            w5,  w5,  #2
176        st1             {v5.8h},  [x7], x1
177        b.le            0f
178        \type           v4,  v5,  v0,  v1,  v2,  v3
179        b               8b
18016:
181        \type           v6,  v7,  v0,  v1,  v2,  v3
182        st1             {v4.8h, v5.8h}, [x0], x1
183        subs            w5,  w5,  #2
184        st1             {v6.8h, v7.8h}, [x0], x1
185        b.le            0f
186        \type           v4,  v5,  v0,  v1,  v2,  v3
187        b               16b
18832:
189        \type           v6,  v7,  v0,  v1,  v2,  v3
190        subs            w5,  w5,  #1
191        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x1
192        b.le            0f
193        \type           v4,  v5,  v0,  v1,  v2,  v3
194        b               32b
195640:
196        add             x7,  x0,  #64
19764:
198        \type           v6,  v7,  v0,  v1,  v2,  v3
199        \type           v16, v17, v0,  v1,  v2,  v3
200        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x1
201        \type           v18, v19, v0,  v1,  v2,  v3
202        subs            w5,  w5,  #1
203        st1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1
204        b.le            0f
205        \type           v4,  v5,  v0,  v1,  v2,  v3
206        b               64b
2071280:
208        add             x7,  x0,  #64
209        mov             x8,  #128
210        sub             x1,  x1,  #128
211128:
212        \type           v6,  v7,  v0,  v1,  v2,  v3
213        \type           v16, v17, v0,  v1,  v2,  v3
214        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x8
215        \type           v18, v19, v0,  v1,  v2,  v3
216        st1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x8
217        \type           v4,  v5,  v0,  v1,  v2,  v3
218        \type           v6,  v7,  v0,  v1,  v2,  v3
219        \type           v16, v17, v0,  v1,  v2,  v3
220        subs            w5,  w5,  #1
221        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x1
222        \type           v18, v19, v0,  v1,  v2,  v3
223        st1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1
224        b.le            0f
225        \type           v4,  v5,  v0,  v1,  v2,  v3
226        b               128b
2270:
228        ret
229L(\type\()_tbl):
230        .hword L(\type\()_tbl) - 1280b
231        .hword L(\type\()_tbl) -  640b
232        .hword L(\type\()_tbl) -   32b
233        .hword L(\type\()_tbl) -   16b
234        .hword L(\type\()_tbl) -   80b
235        .hword L(\type\()_tbl) -   40b
236endfunc
237.endm
238
239bidir_fn avg, w6
240bidir_fn w_avg, w7
241bidir_fn mask, w7
242
243
244.macro w_mask_fn type
245function w_mask_\type\()_16bpc_neon, export=1
246        ldr             w8,  [sp]
247        clz             w9,  w4
248        adr             x10, L(w_mask_\type\()_tbl)
249        dup             v31.8h,  w8   // bitdepth_max
250        sub             w9,  w9,  #24
251        clz             w8,  w8       // clz(bitdepth_max)
252        ldrh            w9,  [x10,  x9,  lsl #1]
253        sub             x10, x10, w9,  uxtw
254        sub             w8,  w8,  #12 // sh = intermediate_bits + 6 = clz(bitdepth_max) - 12
255        mov             w9,  #PREP_BIAS*64
256        neg             w8,  w8       // -sh
257        mov             w11, #27615   // (64 + 1 - 38)<<mask_sh - 1 - mask_rnd
258        dup             v30.4s,  w9   // PREP_BIAS*64
259        dup             v29.4s,  w8   // -sh
260        dup             v0.8h,   w11
261.if \type == 444
262        movi            v1.16b,  #64
263.elseif \type == 422
264        dup             v2.8b,   w7
265        movi            v3.8b,   #129
266        sub             v3.8b,   v3.8b,   v2.8b
267.elseif \type == 420
268        dup             v2.8h,   w7
269        movi            v3.8h,   #1, lsl #8
270        sub             v3.8h,   v3.8h,   v2.8h
271.endif
272        add             x12,  x0,  x1
273        lsl             x1,   x1,  #1
274        br              x10
2754:
276        ld1             {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once)
277        ld1             {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once)
278        subs            w5,  w5,  #4
279        sabd            v20.8h,  v4.8h,   v6.8h   // abs(tmp1 - tmp2)
280        sabd            v21.8h,  v5.8h,   v7.8h
281        ssubl           v16.4s,  v6.4h,   v4.4h   // tmp2 - tmp1 (requires 17 bit)
282        ssubl2          v17.4s,  v6.8h,   v4.8h
283        ssubl           v18.4s,  v7.4h,   v5.4h
284        ssubl2          v19.4s,  v7.8h,   v5.8h
285        uqsub           v20.8h,  v0.8h,   v20.8h  // 27615 - abs()
286        uqsub           v21.8h,  v0.8h,   v21.8h
287        sshll2          v7.4s,   v5.8h,   #6      // tmp1 << 6
288        sshll           v6.4s,   v5.4h,   #6
289        sshll2          v5.4s,   v4.8h,   #6
290        sshll           v4.4s,   v4.4h,   #6
291        ushr            v20.8h,  v20.8h,  #10     // 64-m = (27615 - abs()) >> mask_sh
292        ushr            v21.8h,  v21.8h,  #10
293        add             v4.4s,   v4.4s,   v30.4s  // += PREP_BIAS*64
294        add             v5.4s,   v5.4s,   v30.4s
295        add             v6.4s,   v6.4s,   v30.4s
296        add             v7.4s,   v7.4s,   v30.4s
297        uxtl            v22.4s,  v20.4h
298        uxtl2           v23.4s,  v20.8h
299        uxtl            v24.4s,  v21.4h
300        uxtl2           v25.4s,  v21.8h
301        mla             v4.4s,   v16.4s,  v22.4s  // (tmp2-tmp1)*(64-m)
302        mla             v5.4s,   v17.4s,  v23.4s
303        mla             v6.4s,   v18.4s,  v24.4s
304        mla             v7.4s,   v19.4s,  v25.4s
305        srshl           v4.4s,   v4.4s,   v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
306        srshl           v5.4s,   v5.4s,   v29.4s
307        srshl           v6.4s,   v6.4s,   v29.4s
308        srshl           v7.4s,   v7.4s,   v29.4s
309        sqxtun          v4.4h,   v4.4s            // iclip_pixel
310        sqxtun2         v4.8h,   v5.4s
311        sqxtun          v5.4h,   v6.4s
312        sqxtun2         v5.8h,   v7.4s
313        umin            v4.8h,   v4.8h,   v31.8h  // iclip_pixel
314        umin            v5.8h,   v5.8h,   v31.8h
315.if \type == 444
316        xtn             v20.8b,  v20.8h           // 64 - m
317        xtn2            v20.16b, v21.8h
318        sub             v20.16b, v1.16b,  v20.16b // m
319        st1             {v20.16b}, [x6], #16
320.elseif \type == 422
321        addp            v20.8h,  v20.8h,  v21.8h  // (64 - m) + (64 - n) (column wise addition)
322        xtn             v20.8b,  v20.8h
323        uhsub           v20.8b,  v3.8b,   v20.8b  // ((129 - sign) - ((64 - m) + (64 - n)) >> 1
324        st1             {v20.8b}, [x6], #8
325.elseif \type == 420
326        trn1            v24.2d,  v20.2d,  v21.2d
327        trn2            v25.2d,  v20.2d,  v21.2d
328        add             v24.8h,  v24.8h,  v25.8h  // (64 - my1) + (64 - my2) (row wise addition)
329        addp            v20.8h,  v24.8h,  v24.8h  // (128 - m) + (128 - n) (column wise addition)
330        sub             v20.4h,  v3.4h,   v20.4h  // (256 - sign) - ((128 - m) + (128 - n))
331        rshrn           v20.8b,  v20.8h,  #2      // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
332        st1             {v20.s}[0], [x6], #4
333.endif
334        st1             {v4.d}[0],  [x0],  x1
335        st1             {v4.d}[1],  [x12], x1
336        st1             {v5.d}[0],  [x0],  x1
337        st1             {v5.d}[1],  [x12], x1
338        b.gt            4b
339        ret
3408:
341        ld1             {v4.8h, v5.8h}, [x2], #32 // tmp1
342        ld1             {v6.8h, v7.8h}, [x3], #32 // tmp2
343        subs            w5,  w5,  #2
344        sabd            v20.8h,  v4.8h,   v6.8h   // abs(tmp1 - tmp2)
345        sabd            v21.8h,  v5.8h,   v7.8h
346        ssubl           v16.4s,  v6.4h,   v4.4h   // tmp2 - tmp1 (requires 17 bit)
347        ssubl2          v17.4s,  v6.8h,   v4.8h
348        ssubl           v18.4s,  v7.4h,   v5.4h
349        ssubl2          v19.4s,  v7.8h,   v5.8h
350        uqsub           v20.8h,  v0.8h,   v20.8h  // 27615 - abs()
351        uqsub           v21.8h,  v0.8h,   v21.8h
352        sshll2          v7.4s,   v5.8h,   #6      // tmp1 << 6
353        sshll           v6.4s,   v5.4h,   #6
354        sshll2          v5.4s,   v4.8h,   #6
355        sshll           v4.4s,   v4.4h,   #6
356        ushr            v20.8h,  v20.8h,  #10     // 64-m = (27615 - abs()) >> mask_sh
357        ushr            v21.8h,  v21.8h,  #10
358        add             v4.4s,   v4.4s,   v30.4s  // += PREP_BIAS*64
359        add             v5.4s,   v5.4s,   v30.4s
360        add             v6.4s,   v6.4s,   v30.4s
361        add             v7.4s,   v7.4s,   v30.4s
362        uxtl            v22.4s,  v20.4h
363        uxtl2           v23.4s,  v20.8h
364        uxtl            v24.4s,  v21.4h
365        uxtl2           v25.4s,  v21.8h
366        mla             v4.4s,   v16.4s,  v22.4s  // (tmp2-tmp1)*(64-m)
367        mla             v5.4s,   v17.4s,  v23.4s
368        mla             v6.4s,   v18.4s,  v24.4s
369        mla             v7.4s,   v19.4s,  v25.4s
370        srshl           v4.4s,   v4.4s,   v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
371        srshl           v5.4s,   v5.4s,   v29.4s
372        srshl           v6.4s,   v6.4s,   v29.4s
373        srshl           v7.4s,   v7.4s,   v29.4s
374        sqxtun          v4.4h,   v4.4s            // iclip_pixel
375        sqxtun2         v4.8h,   v5.4s
376        sqxtun          v5.4h,   v6.4s
377        sqxtun2         v5.8h,   v7.4s
378        umin            v4.8h,   v4.8h,   v31.8h  // iclip_pixel
379        umin            v5.8h,   v5.8h,   v31.8h
380.if \type == 444
381        xtn             v20.8b,  v20.8h           // 64 - m
382        xtn2            v20.16b, v21.8h
383        sub             v20.16b, v1.16b,  v20.16b // m
384        st1             {v20.16b}, [x6], #16
385.elseif \type == 422
386        addp            v20.8h,  v20.8h,  v21.8h  // (64 - m) + (64 - n) (column wise addition)
387        xtn             v20.8b,  v20.8h
388        uhsub           v20.8b,  v3.8b,   v20.8b  // ((129 - sign) - ((64 - m) + (64 - n)) >> 1
389        st1             {v20.8b}, [x6], #8
390.elseif \type == 420
391        add             v20.8h,  v20.8h,  v21.8h  // (64 - my1) + (64 - my2) (row wise addition)
392        addp            v20.8h,  v20.8h,  v20.8h  // (128 - m) + (128 - n) (column wise addition)
393        sub             v20.4h,  v3.4h,   v20.4h  // (256 - sign) - ((128 - m) + (128 - n))
394        rshrn           v20.8b,  v20.8h,  #2      // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
395        st1             {v20.s}[0], [x6], #4
396.endif
397        st1             {v4.8h}, [x0],  x1
398        st1             {v5.8h}, [x12], x1
399        b.gt            8b
400        ret
4011280:
402640:
403320:
404160:
405        mov             w11, w4
406        sub             x1,  x1,  w4,  uxtw #1
407.if \type == 444
408        add             x10, x6,  w4,  uxtw
409.elseif \type == 422
410        add             x10, x6,  x11, lsr #1
411.endif
412        add             x9,  x3,  w4,  uxtw #1
413        add             x7,  x2,  w4,  uxtw #1
414161:
415        mov             w8,  w4
41616:
417        ld1             {v4.8h,   v5.8h},  [x2], #32 // tmp1
418        ld1             {v16.8h,  v17.8h}, [x3], #32 // tmp2
419        ld1             {v6.8h,   v7.8h},  [x7], #32
420        ld1             {v18.8h,  v19.8h}, [x9], #32
421        subs            w8,  w8,  #16
422        sabd            v20.8h,  v4.8h,   v16.8h  // abs(tmp1 - tmp2)
423        sabd            v21.8h,  v5.8h,   v17.8h
424        ssubl           v22.4s,  v16.4h,  v4.4h   // tmp2 - tmp1 (requires 17 bit)
425        ssubl2          v23.4s,  v16.8h,  v4.8h
426        ssubl           v24.4s,  v17.4h,  v5.4h
427        ssubl2          v25.4s,  v17.8h,  v5.8h
428        uqsub           v20.8h,  v0.8h,   v20.8h  // 27615 - abs()
429        uqsub           v21.8h,  v0.8h,   v21.8h
430        sshll2          v27.4s,  v5.8h,   #6      // tmp1 << 6
431        sshll           v26.4s,  v5.4h,   #6
432        sshll2          v5.4s,   v4.8h,   #6
433        sshll           v4.4s,   v4.4h,   #6
434        ushr            v20.8h,  v20.8h,  #10     // 64-m = (27615 - abs()) >> mask_sh
435        ushr            v21.8h,  v21.8h,  #10
436        add             v4.4s,   v4.4s,   v30.4s  // += PREP_BIAS*64
437        add             v5.4s,   v5.4s,   v30.4s
438        add             v26.4s,  v26.4s,  v30.4s
439        add             v27.4s,  v27.4s,  v30.4s
440        uxtl            v16.4s,  v20.4h
441        uxtl2           v17.4s,  v20.8h
442        uxtl            v28.4s,  v21.4h
443        mla             v4.4s,   v22.4s,  v16.4s  // (tmp2-tmp1)*(64-m)
444        uxtl2           v16.4s,  v21.8h
445        mla             v5.4s,   v23.4s,  v17.4s
446        mla             v26.4s,  v24.4s,  v28.4s
447        mla             v27.4s,  v25.4s,  v16.4s
448        srshl           v4.4s,   v4.4s,   v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
449        srshl           v5.4s,   v5.4s,   v29.4s
450        srshl           v26.4s,  v26.4s,  v29.4s
451        srshl           v27.4s,  v27.4s,  v29.4s
452        sqxtun          v4.4h,   v4.4s            // iclip_pixel
453        sqxtun2         v4.8h,   v5.4s
454        sqxtun          v5.4h,   v26.4s
455        sqxtun2         v5.8h,   v27.4s
456
457        // Start of other half
458        sabd            v22.8h,  v6.8h,   v18.8h  // abs(tmp1 - tmp2)
459        sabd            v23.8h,  v7.8h,   v19.8h
460
461        umin            v4.8h,   v4.8h,   v31.8h  // iclip_pixel
462        umin            v5.8h,   v5.8h,   v31.8h
463
464        ssubl           v16.4s,  v18.4h,  v6.4h   // tmp2 - tmp1 (requires 17 bit)
465        ssubl2          v17.4s,  v18.8h,  v6.8h
466        ssubl           v18.4s,  v19.4h,  v7.4h
467        ssubl2          v19.4s,  v19.8h,  v7.8h
468        uqsub           v22.8h,  v0.8h,   v22.8h  // 27615 - abs()
469        uqsub           v23.8h,  v0.8h,   v23.8h
470        sshll           v24.4s,  v6.4h,   #6      // tmp1 << 6
471        sshll2          v25.4s,  v6.8h,   #6
472        sshll           v26.4s,  v7.4h,   #6
473        sshll2          v27.4s,  v7.8h,   #6
474        ushr            v22.8h,  v22.8h,  #10     // 64-m = (27615 - abs()) >> mask_sh
475        ushr            v23.8h,  v23.8h,  #10
476        add             v24.4s,  v24.4s,  v30.4s  // += PREP_BIAS*64
477        add             v25.4s,  v25.4s,  v30.4s
478        add             v26.4s,  v26.4s,  v30.4s
479        add             v27.4s,  v27.4s,  v30.4s
480        uxtl            v6.4s,   v22.4h
481        uxtl2           v7.4s,   v22.8h
482        uxtl            v28.4s,  v23.4h
483        mla             v24.4s,  v16.4s,  v6.4s   // (tmp2-tmp1)*(64-m)
484        uxtl2           v6.4s,   v23.8h
485        mla             v25.4s,  v17.4s,  v7.4s
486        mla             v26.4s,  v18.4s,  v28.4s
487        mla             v27.4s,  v19.4s,  v6.4s
488        srshl           v24.4s,  v24.4s,  v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
489        srshl           v25.4s,  v25.4s,  v29.4s
490        srshl           v26.4s,  v26.4s,  v29.4s
491        srshl           v27.4s,  v27.4s,  v29.4s
492        sqxtun          v6.4h,   v24.4s           // iclip_pixel
493        sqxtun2         v6.8h,   v25.4s
494        sqxtun          v7.4h,   v26.4s
495        sqxtun2         v7.8h,   v27.4s
496        umin            v6.8h,   v6.8h,   v31.8h  // iclip_pixel
497        umin            v7.8h,   v7.8h,   v31.8h
498.if \type == 444
499        xtn             v20.8b,  v20.8h           // 64 - m
500        xtn2            v20.16b, v21.8h
501        xtn             v21.8b,  v22.8h
502        xtn2            v21.16b, v23.8h
503        sub             v20.16b, v1.16b,  v20.16b // m
504        sub             v21.16b, v1.16b,  v21.16b
505        st1             {v20.16b}, [x6],  #16
506        st1             {v21.16b}, [x10], #16
507.elseif \type == 422
508        addp            v20.8h,  v20.8h,  v21.8h  // (64 - m) + (64 - n) (column wise addition)
509        addp            v21.8h,  v22.8h,  v23.8h
510        xtn             v20.8b,  v20.8h
511        xtn             v21.8b,  v21.8h
512        uhsub           v20.8b,  v3.8b,   v20.8b  // ((129 - sign) - ((64 - m) + (64 - n)) >> 1
513        uhsub           v21.8b,  v3.8b,   v21.8b
514        st1             {v20.8b}, [x6],  #8
515        st1             {v21.8b}, [x10], #8
516.elseif \type == 420
517        add             v20.8h,  v20.8h,  v22.8h  // (64 - my1) + (64 - my2) (row wise addition)
518        add             v21.8h,  v21.8h,  v23.8h
519        addp            v20.8h,  v20.8h,  v21.8h  // (128 - m) + (128 - n) (column wise addition)
520        sub             v20.8h,  v3.8h,   v20.8h  // (256 - sign) - ((128 - m) + (128 - n))
521        rshrn           v20.8b,  v20.8h,  #2      // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
522        st1             {v20.8b}, [x6], #8
523.endif
524        st1             {v4.8h, v5.8h}, [x0],  #32
525        st1             {v6.8h, v7.8h}, [x12], #32
526        b.gt            16b
527        subs            w5,  w5,  #2
528        add             x2,  x2,  w4,  uxtw #1
529        add             x3,  x3,  w4,  uxtw #1
530        add             x7,  x7,  w4,  uxtw #1
531        add             x9,  x9,  w4,  uxtw #1
532.if \type == 444
533        add             x6,  x6,  w4,  uxtw
534        add             x10, x10, w4,  uxtw
535.elseif \type == 422
536        add             x6,  x6,  x11, lsr #1
537        add             x10, x10, x11, lsr #1
538.endif
539        add             x0,  x0,  x1
540        add             x12, x12, x1
541        b.gt            161b
542        ret
543L(w_mask_\type\()_tbl):
544        .hword L(w_mask_\type\()_tbl) - 1280b
545        .hword L(w_mask_\type\()_tbl) -  640b
546        .hword L(w_mask_\type\()_tbl) -  320b
547        .hword L(w_mask_\type\()_tbl) -  160b
548        .hword L(w_mask_\type\()_tbl) -    8b
549        .hword L(w_mask_\type\()_tbl) -    4b
550endfunc
551.endm
552
553w_mask_fn 444
554w_mask_fn 422
555w_mask_fn 420
556
557
558function blend_16bpc_neon, export=1
559        adr             x6,  L(blend_tbl)
560        clz             w3,  w3
561        sub             w3,  w3,  #26
562        ldrh            w3,  [x6,  x3,  lsl #1]
563        sub             x6,  x6,  w3,  uxtw
564        add             x8,  x0,  x1
565        br              x6
56640:
567        lsl             x1,  x1,  #1
5684:
569        ld1             {v2.8b},   [x5], #8
570        ld1             {v1.8h},   [x2], #16
571        ld1             {v0.d}[0], [x0]
572        neg             v2.8b,   v2.8b            // -m
573        subs            w4,  w4,  #2
574        ld1             {v0.d}[1], [x8]
575        sxtl            v2.8h,   v2.8b
576        shl             v2.8h,   v2.8h,   #9      // -m << 9
577        sub             v1.8h,   v0.8h,   v1.8h   // a - b
578        sqrdmulh        v1.8h,   v1.8h,   v2.8h   // ((a-b)*-m + 32) >> 6
579        add             v0.8h,   v0.8h,   v1.8h
580        st1             {v0.d}[0], [x0], x1
581        st1             {v0.d}[1], [x8], x1
582        b.gt            4b
583        ret
58480:
585        lsl             x1,  x1,  #1
5868:
587        ld1             {v4.16b},       [x5], #16
588        ld1             {v2.8h, v3.8h}, [x2], #32
589        neg             v5.16b,  v4.16b           // -m
590        ld1             {v0.8h},   [x0]
591        ld1             {v1.8h},   [x8]
592        sxtl            v4.8h,   v5.8b
593        sxtl2           v5.8h,   v5.16b
594        shl             v4.8h,   v4.8h,   #9      // -m << 9
595        shl             v5.8h,   v5.8h,   #9
596        sub             v2.8h,   v0.8h,   v2.8h   // a - b
597        sub             v3.8h,   v1.8h,   v3.8h
598        subs            w4,  w4,  #2
599        sqrdmulh        v2.8h,   v2.8h,   v4.8h   // ((a-b)*-m + 32) >> 6
600        sqrdmulh        v3.8h,   v3.8h,   v5.8h
601        add             v0.8h,   v0.8h,   v2.8h
602        add             v1.8h,   v1.8h,   v3.8h
603        st1             {v0.8h}, [x0], x1
604        st1             {v1.8h}, [x8], x1
605        b.gt            8b
606        ret
607160:
608        lsl             x1,  x1,  #1
60916:
610        ld1             {v16.16b, v17.16b},           [x5], #32
611        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
612        subs            w4,  w4,  #2
613        neg             v18.16b, v16.16b          // -m
614        neg             v19.16b, v17.16b
615        ld1             {v0.8h, v1.8h}, [x0]
616        sxtl            v16.8h,  v18.8b
617        sxtl2           v17.8h,  v18.16b
618        sxtl            v18.8h,  v19.8b
619        sxtl2           v19.8h,  v19.16b
620        ld1             {v2.8h, v3.8h}, [x8]
621        shl             v16.8h,  v16.8h,  #9      // -m << 9
622        shl             v17.8h,  v17.8h,  #9
623        shl             v18.8h,  v18.8h,  #9
624        shl             v19.8h,  v19.8h,  #9
625        sub             v4.8h,   v0.8h,   v4.8h   // a - b
626        sub             v5.8h,   v1.8h,   v5.8h
627        sub             v6.8h,   v2.8h,   v6.8h
628        sub             v7.8h,   v3.8h,   v7.8h
629        sqrdmulh        v4.8h,   v4.8h,   v16.8h  // ((a-b)*-m + 32) >> 6
630        sqrdmulh        v5.8h,   v5.8h,   v17.8h
631        sqrdmulh        v6.8h,   v6.8h,   v18.8h
632        sqrdmulh        v7.8h,   v7.8h,   v19.8h
633        add             v0.8h,   v0.8h,   v4.8h
634        add             v1.8h,   v1.8h,   v5.8h
635        add             v2.8h,   v2.8h,   v6.8h
636        add             v3.8h,   v3.8h,   v7.8h
637        st1             {v0.8h, v1.8h}, [x0], x1
638        st1             {v2.8h, v3.8h}, [x8], x1
639        b.gt            16b
640        ret
64132:
642        ld1             {v16.16b, v17.16b},           [x5], #32
643        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
644        subs            w4,  w4,  #1
645        neg             v18.16b, v16.16b          // -m
646        neg             v19.16b, v17.16b
647        sxtl            v16.8h,  v18.8b
648        sxtl2           v17.8h,  v18.16b
649        sxtl            v18.8h,  v19.8b
650        sxtl2           v19.8h,  v19.16b
651        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
652        shl             v16.8h,  v16.8h,  #9      // -m << 9
653        shl             v17.8h,  v17.8h,  #9
654        shl             v18.8h,  v18.8h,  #9
655        shl             v19.8h,  v19.8h,  #9
656        sub             v4.8h,   v0.8h,   v4.8h   // a - b
657        sub             v5.8h,   v1.8h,   v5.8h
658        sub             v6.8h,   v2.8h,   v6.8h
659        sub             v7.8h,   v3.8h,   v7.8h
660        sqrdmulh        v4.8h,   v4.8h,   v16.8h  // ((a-b)*-m + 32) >> 6
661        sqrdmulh        v5.8h,   v5.8h,   v17.8h
662        sqrdmulh        v6.8h,   v6.8h,   v18.8h
663        sqrdmulh        v7.8h,   v7.8h,   v19.8h
664        add             v0.8h,   v0.8h,   v4.8h
665        add             v1.8h,   v1.8h,   v5.8h
666        add             v2.8h,   v2.8h,   v6.8h
667        add             v3.8h,   v3.8h,   v7.8h
668        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
669        b.gt            32b
670        ret
671L(blend_tbl):
672        .hword L(blend_tbl) -  32b
673        .hword L(blend_tbl) - 160b
674        .hword L(blend_tbl) -  80b
675        .hword L(blend_tbl) -  40b
676endfunc
677
678function blend_h_16bpc_neon, export=1
679        adr             x6,  L(blend_h_tbl)
680        movrel          x5,  X(obmc_masks)
681        add             x5,  x5,  w4,  uxtw
682        sub             w4,  w4,  w4,  lsr #2
683        clz             w7,  w3
684        add             x8,  x0,  x1
685        lsl             x1,  x1,  #1
686        sub             w7,  w7,  #24
687        ldrh            w7,  [x6,  x7,  lsl #1]
688        sub             x6,  x6,  w7, uxtw
689        br              x6
6902:
691        ld2r            {v2.8b, v3.8b}, [x5], #2
692        ld1             {v1.4h},        [x2], #8
693        ext             v2.8b,   v2.8b,   v3.8b,   #6
694        subs            w4,  w4,  #2
695        neg             v2.8b,   v2.8b            // -m
696        ld1             {v0.s}[0], [x0]
697        ld1             {v0.s}[1], [x8]
698        sxtl            v2.8h,   v2.8b
699        shl             v2.4h,   v2.4h,   #9      // -m << 9
700        sub             v1.4h,   v0.4h,   v1.4h   // a - b
701        sqrdmulh        v1.4h,   v1.4h,   v2.4h   // ((a-b)*-m + 32) >> 6
702        add             v0.4h,   v0.4h,   v1.4h
703        st1             {v0.s}[0], [x0], x1
704        st1             {v0.s}[1], [x8], x1
705        b.gt            2b
706        ret
7074:
708        ld2r            {v2.8b, v3.8b}, [x5], #2
709        ld1             {v1.8h},        [x2], #16
710        ext             v2.8b,   v2.8b,   v3.8b,   #4
711        subs            w4,  w4,  #2
712        neg             v2.8b,   v2.8b            // -m
713        ld1             {v0.d}[0],   [x0]
714        ld1             {v0.d}[1],   [x8]
715        sxtl            v2.8h,   v2.8b
716        shl             v2.8h,   v2.8h,   #9      // -m << 9
717        sub             v1.8h,   v0.8h,   v1.8h   // a - b
718        sqrdmulh        v1.8h,   v1.8h,   v2.8h   // ((a-b)*-m + 32) >> 6
719        add             v0.8h,   v0.8h,   v1.8h
720        st1             {v0.d}[0], [x0], x1
721        st1             {v0.d}[1], [x8], x1
722        b.gt            4b
723        ret
7248:
725        ld2r            {v4.8b, v5.8b}, [x5], #2
726        ld1             {v2.8h, v3.8h}, [x2], #32
727        neg             v4.8b,   v4.8b            // -m
728        neg             v5.8b,   v5.8b
729        ld1             {v0.8h}, [x0]
730        subs            w4,  w4,  #2
731        sxtl            v4.8h,   v4.8b
732        sxtl            v5.8h,   v5.8b
733        ld1             {v1.8h}, [x8]
734        shl             v4.8h,   v4.8h,   #9      // -m << 9
735        shl             v5.8h,   v5.8h,   #9
736        sub             v2.8h,   v0.8h,   v2.8h   // a - b
737        sub             v3.8h,   v1.8h,   v3.8h
738        sqrdmulh        v2.8h,   v2.8h,   v4.8h   // ((a-b)*-m + 32) >> 6
739        sqrdmulh        v3.8h,   v3.8h,   v5.8h
740        add             v0.8h,   v0.8h,   v2.8h
741        add             v1.8h,   v1.8h,   v3.8h
742        st1             {v0.8h}, [x0], x1
743        st1             {v1.8h}, [x8], x1
744        b.gt            8b
745        ret
74616:
747        ld2r            {v16.8b, v17.8b}, [x5], #2
748        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
749        neg             v16.8b,  v16.8b           // -m
750        neg             v17.8b,  v17.8b
751        ld1             {v0.8h, v1.8h},  [x0]
752        ld1             {v2.8h, v3.8h},  [x8]
753        subs            w4,  w4,  #2
754        sxtl            v16.8h,  v16.8b
755        sxtl            v17.8h,  v17.8b
756        shl             v16.8h,  v16.8h,  #9      // -m << 9
757        shl             v17.8h,  v17.8h,  #9
758        sub             v4.8h,   v0.8h,   v4.8h   // a - b
759        sub             v5.8h,   v1.8h,   v5.8h
760        sub             v6.8h,   v2.8h,   v6.8h
761        sub             v7.8h,   v3.8h,   v7.8h
762        sqrdmulh        v4.8h,   v4.8h,   v16.8h  // ((a-b)*-m + 32) >> 6
763        sqrdmulh        v5.8h,   v5.8h,   v16.8h
764        sqrdmulh        v6.8h,   v6.8h,   v17.8h
765        sqrdmulh        v7.8h,   v7.8h,   v17.8h
766        add             v0.8h,   v0.8h,   v4.8h
767        add             v1.8h,   v1.8h,   v5.8h
768        add             v2.8h,   v2.8h,   v6.8h
769        add             v3.8h,   v3.8h,   v7.8h
770        st1             {v0.8h, v1.8h}, [x0], x1
771        st1             {v2.8h, v3.8h}, [x8], x1
772        b.gt            16b
773        ret
7741280:
775640:
776320:
777        sub             x1,  x1,  w3,  uxtw #1
778        add             x7,  x2,  w3,  uxtw #1
779321:
780        ld2r            {v24.8b, v25.8b}, [x5], #2
781        mov             w6,  w3
782        neg             v24.8b,  v24.8b           // -m
783        neg             v25.8b,  v25.8b
784        sxtl            v24.8h,  v24.8b
785        sxtl            v25.8h,  v25.8b
786        shl             v24.8h,  v24.8h,  #9      // -m << 9
787        shl             v25.8h,  v25.8h,  #9
78832:
789        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64
790        ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x0]
791        subs            w6,  w6,  #32
792        sub             v16.8h,  v0.8h,   v16.8h  // a - b
793        sub             v17.8h,  v1.8h,   v17.8h
794        sub             v18.8h,  v2.8h,   v18.8h
795        sub             v19.8h,  v3.8h,   v19.8h
796        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
797        ld1             {v4.8h,  v5.8h,  v6.8h,  v7.8h},  [x8]
798        sqrdmulh        v16.8h,  v16.8h,  v24.8h  // ((a-b)*-m + 32) >> 6
799        sqrdmulh        v17.8h,  v17.8h,  v24.8h
800        sqrdmulh        v18.8h,  v18.8h,  v24.8h
801        sqrdmulh        v19.8h,  v19.8h,  v24.8h
802        sub             v20.8h,  v4.8h,   v20.8h  // a - b
803        sub             v21.8h,  v5.8h,   v21.8h
804        sub             v22.8h,  v6.8h,   v22.8h
805        sub             v23.8h,  v7.8h,   v23.8h
806        add             v0.8h,   v0.8h,   v16.8h
807        add             v1.8h,   v1.8h,   v17.8h
808        add             v2.8h,   v2.8h,   v18.8h
809        add             v3.8h,   v3.8h,   v19.8h
810        sqrdmulh        v20.8h,  v20.8h,  v25.8h  // ((a-b)*-m + 32) >> 6
811        sqrdmulh        v21.8h,  v21.8h,  v25.8h
812        sqrdmulh        v22.8h,  v22.8h,  v25.8h
813        sqrdmulh        v23.8h,  v23.8h,  v25.8h
814        st1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x0], #64
815        add             v4.8h,   v4.8h,   v20.8h
816        add             v5.8h,   v5.8h,   v21.8h
817        add             v6.8h,   v6.8h,   v22.8h
818        add             v7.8h,   v7.8h,   v23.8h
819        st1             {v4.8h,  v5.8h,  v6.8h,  v7.8h},  [x8], #64
820        b.gt            32b
821        subs            w4,  w4,  #2
822        add             x0,  x0,  x1
823        add             x8,  x8,  x1
824        add             x2,  x2,  w3,  uxtw #1
825        add             x7,  x7,  w3,  uxtw #1
826        b.gt            321b
827        ret
828L(blend_h_tbl):
829        .hword L(blend_h_tbl) - 1280b
830        .hword L(blend_h_tbl) -  640b
831        .hword L(blend_h_tbl) -  320b
832        .hword L(blend_h_tbl) -   16b
833        .hword L(blend_h_tbl) -    8b
834        .hword L(blend_h_tbl) -    4b
835        .hword L(blend_h_tbl) -    2b
836endfunc
837
838function blend_v_16bpc_neon, export=1
839        adr             x6,  L(blend_v_tbl)
840        movrel          x5,  X(obmc_masks)
841        add             x5,  x5,  w3,  uxtw
842        clz             w3,  w3
843        add             x8,  x0,  x1
844        lsl             x1,  x1,  #1
845        sub             w3,  w3,  #26
846        ldrh            w3,  [x6,  x3,  lsl #1]
847        sub             x6,  x6,  w3,  uxtw
848        br              x6
84920:
850        ld1r            {v2.8b}, [x5]
851        neg             v2.8b,   v2.8b            // -m
852        sxtl            v2.8h,   v2.8b
853        shl             v2.4h,   v2.4h,   #9      // -m << 9
8542:
855        ld1             {v1.s}[0], [x2], #4
856        ld1             {v0.h}[0], [x0]
857        subs            w4,  w4,  #2
858        ld1             {v1.h}[1], [x2]
859        ld1             {v0.h}[1], [x8]
860        add             x2,  x2,  #4
861        sub             v1.4h,   v0.4h,   v1.4h   // a - b
862        sqrdmulh        v1.4h,   v1.4h,   v2.4h   // ((a-b)*-m + 32) >> 6
863        add             v0.4h,   v0.4h,   v1.4h
864        st1             {v0.h}[0], [x0],  x1
865        st1             {v0.h}[1], [x8],  x1
866        b.gt            2b
867        ret
86840:
869        ld1r            {v2.2s}, [x5]
870        sub             x1,  x1,  #4
871        neg             v2.8b,   v2.8b            // -m
872        sxtl            v2.8h,   v2.8b
873        shl             v2.8h,   v2.8h,   #9      // -m << 9
8744:
875        ld1             {v1.8h},   [x2], #16
876        ld1             {v0.d}[0], [x0]
877        ld1             {v0.d}[1], [x8]
878        subs            w4,  w4,  #2
879        sub             v1.8h,   v0.8h,   v1.8h   // a - b
880        sqrdmulh        v1.8h,   v1.8h,   v2.8h   // ((a-b)*-m + 32) >> 6
881        add             v0.8h,   v0.8h,   v1.8h
882        st1             {v0.s}[0], [x0], #4
883        st1             {v0.s}[2], [x8], #4
884        st1             {v0.h}[2], [x0], x1
885        st1             {v0.h}[6], [x8], x1
886        b.gt            4b
887        ret
88880:
889        ld1             {v4.8b}, [x5]
890        sub             x1,  x1,  #8
891        neg             v4.8b,   v4.8b            // -m
892        sxtl            v4.8h,   v4.8b
893        shl             v4.8h,   v4.8h,   #9      // -m << 9
8948:
895        ld1             {v2.8h, v3.8h}, [x2], #32
896        ld1             {v0.8h}, [x0]
897        ld1             {v1.8h}, [x8]
898        subs            w4,  w4,  #2
899        sub             v2.8h,   v0.8h,   v2.8h   // a - b
900        sub             v3.8h,   v1.8h,   v3.8h
901        sqrdmulh        v2.8h,   v2.8h,   v4.8h   // ((a-b)*-m + 32) >> 6
902        sqrdmulh        v3.8h,   v3.8h,   v4.8h
903        add             v0.8h,   v0.8h,   v2.8h
904        add             v1.8h,   v1.8h,   v3.8h
905        st1             {v0.d}[0], [x0], #8
906        st1             {v1.d}[0], [x8], #8
907        st1             {v0.s}[2], [x0], x1
908        st1             {v1.s}[2], [x8], x1
909        b.gt            8b
910        ret
911160:
912        ld1             {v16.8b, v17.8b}, [x5]
913        sub             x1,  x1,  #16
914        neg             v16.8b,  v16.8b           // -m
915        neg             v17.8b,  v17.8b
916        sxtl            v16.8h,  v16.8b
917        sxtl            v17.8h,  v17.8b
918        shl             v16.8h,  v16.8h,  #9      // -m << 9
919        shl             v17.4h,  v17.4h,  #9
92016:
921        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
922        ld1             {v0.8h, v1.8h}, [x0]
923        subs            w4,  w4,  #2
924        ld1             {v2.8h, v3.8h}, [x8]
925        sub             v4.8h,   v0.8h,   v4.8h   // a - b
926        sub             v5.4h,   v1.4h,   v5.4h
927        sub             v6.8h,   v2.8h,   v6.8h
928        sub             v7.4h,   v3.4h,   v7.4h
929        sqrdmulh        v4.8h,   v4.8h,   v16.8h  // ((a-b)*-m + 32) >> 6
930        sqrdmulh        v5.4h,   v5.4h,   v17.4h
931        sqrdmulh        v6.8h,   v6.8h,   v16.8h
932        sqrdmulh        v7.4h,   v7.4h,   v17.4h
933        add             v0.8h,   v0.8h,   v4.8h
934        add             v1.4h,   v1.4h,   v5.4h
935        add             v2.8h,   v2.8h,   v6.8h
936        add             v3.4h,   v3.4h,   v7.4h
937        st1             {v0.8h}, [x0], #16
938        st1             {v2.8h}, [x8], #16
939        st1             {v1.4h}, [x0], x1
940        st1             {v3.4h}, [x8], x1
941        b.gt            16b
942        ret
943320:
944        ld1             {v24.16b, v25.16b},  [x5]
945        neg             v26.16b, v24.16b          // -m
946        neg             v27.8b,  v25.8b
947        sxtl            v24.8h,  v26.8b
948        sxtl2           v25.8h,  v26.16b
949        sxtl            v26.8h,  v27.8b
950        shl             v24.8h,  v24.8h,  #9      // -m << 9
951        shl             v25.8h,  v25.8h,  #9
952        shl             v26.8h,  v26.8h,  #9
95332:
954        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64
955        ld1             {v0.8h, v1.8h, v2.8h}, [x0]
956        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], #64
957        ld1             {v4.8h, v5.8h, v6.8h}, [x8]
958        subs            w4,  w4,  #2
959        sub             v16.8h,  v0.8h,   v16.8h  // a - b
960        sub             v17.8h,  v1.8h,   v17.8h
961        sub             v18.8h,  v2.8h,   v18.8h
962        sub             v20.8h,  v4.8h,   v20.8h
963        sub             v21.8h,  v5.8h,   v21.8h
964        sub             v22.8h,  v6.8h,   v22.8h
965        sqrdmulh        v16.8h,  v16.8h,  v24.8h  // ((a-b)*-m + 32) >> 6
966        sqrdmulh        v17.8h,  v17.8h,  v25.8h
967        sqrdmulh        v18.8h,  v18.8h,  v26.8h
968        sqrdmulh        v20.8h,  v20.8h,  v24.8h
969        sqrdmulh        v21.8h,  v21.8h,  v25.8h
970        sqrdmulh        v22.8h,  v22.8h,  v26.8h
971        add             v0.8h,   v0.8h,   v16.8h
972        add             v1.8h,   v1.8h,   v17.8h
973        add             v2.8h,   v2.8h,   v18.8h
974        add             v4.8h,   v4.8h,   v20.8h
975        add             v5.8h,   v5.8h,   v21.8h
976        add             v6.8h,   v6.8h,   v22.8h
977        st1             {v0.8h, v1.8h, v2.8h}, [x0], x1
978        st1             {v4.8h, v5.8h, v6.8h}, [x8], x1
979        b.gt            32b
980        ret
981L(blend_v_tbl):
982        .hword L(blend_v_tbl) - 320b
983        .hword L(blend_v_tbl) - 160b
984        .hword L(blend_v_tbl) -  80b
985        .hword L(blend_v_tbl) -  40b
986        .hword L(blend_v_tbl) -  20b
987endfunc
988
989
990// This has got the same signature as the put_8tap functions,
991// and assumes that x9 is set to (clz(w)-24).
992function put_neon
993        adr             x10, L(put_tbl)
994        ldrh            w9, [x10, x9, lsl #1]
995        sub             x10, x10, w9, uxtw
996        br              x10
997
9982:
999        ld1             {v0.s}[0], [x2], x3
1000        ld1             {v1.s}[0], [x2], x3
1001        subs            w5,  w5,  #2
1002        st1             {v0.s}[0], [x0], x1
1003        st1             {v1.s}[0], [x0], x1
1004        b.gt            2b
1005        ret
10064:
1007        ld1             {v0.4h}, [x2], x3
1008        ld1             {v1.4h}, [x2], x3
1009        subs            w5,  w5,  #2
1010        st1             {v0.4h}, [x0], x1
1011        st1             {v1.4h}, [x0], x1
1012        b.gt            4b
1013        ret
101480:
1015        add             x8,  x0,  x1
1016        lsl             x1,  x1,  #1
1017        add             x9,  x2,  x3
1018        lsl             x3,  x3,  #1
10198:
1020        ld1             {v0.8h}, [x2], x3
1021        ld1             {v1.8h}, [x9], x3
1022        subs            w5,  w5,  #2
1023        st1             {v0.8h}, [x0], x1
1024        st1             {v1.8h}, [x8], x1
1025        b.gt            8b
1026        ret
102716:
1028        ldp             x6,  x7,  [x2]
1029        ldp             x8,  x9,  [x2, #16]
1030        stp             x6,  x7,  [x0]
1031        subs            w5,  w5,  #1
1032        stp             x8,  x9,  [x0, #16]
1033        add             x2,  x2,  x3
1034        add             x0,  x0,  x1
1035        b.gt            16b
1036        ret
103732:
1038        ldp             x6,  x7,  [x2]
1039        ldp             x8,  x9,  [x2, #16]
1040        stp             x6,  x7,  [x0]
1041        ldp             x10, x11, [x2, #32]
1042        stp             x8,  x9,  [x0, #16]
1043        subs            w5,  w5,  #1
1044        ldp             x12, x13, [x2, #48]
1045        stp             x10, x11, [x0, #32]
1046        stp             x12, x13, [x0, #48]
1047        add             x2,  x2,  x3
1048        add             x0,  x0,  x1
1049        b.gt            32b
1050        ret
105164:
1052        ldp             q0,  q1,  [x2]
1053        ldp             q2,  q3,  [x2, #32]
1054        stp             q0,  q1,  [x0]
1055        ldp             q4,  q5,  [x2, #64]
1056        stp             q2,  q3,  [x0, #32]
1057        ldp             q6,  q7,  [x2, #96]
1058        subs            w5,  w5,  #1
1059        stp             q4,  q5,  [x0, #64]
1060        stp             q6,  q7,  [x0, #96]
1061        add             x2,  x2,  x3
1062        add             x0,  x0,  x1
1063        b.gt            64b
1064        ret
1065128:
1066        ldp             q0,  q1,  [x2]
1067        ldp             q2,  q3,  [x2, #32]
1068        stp             q0,  q1,  [x0]
1069        ldp             q4,  q5,  [x2, #64]
1070        stp             q2,  q3,  [x0, #32]
1071        ldp             q6,  q7,  [x2, #96]
1072        subs            w5,  w5,  #1
1073        stp             q4,  q5,  [x0, #64]
1074        ldp             q16, q17, [x2, #128]
1075        stp             q6,  q7,  [x0, #96]
1076        ldp             q18, q19, [x2, #160]
1077        stp             q16, q17, [x0, #128]
1078        ldp             q20, q21, [x2, #192]
1079        stp             q18, q19, [x0, #160]
1080        ldp             q22, q23, [x2, #224]
1081        stp             q20, q21, [x0, #192]
1082        stp             q22, q23, [x0, #224]
1083        add             x2,  x2,  x3
1084        add             x0,  x0,  x1
1085        b.gt            128b
1086        ret
1087
1088L(put_tbl):
1089        .hword L(put_tbl) - 128b
1090        .hword L(put_tbl) -  64b
1091        .hword L(put_tbl) -  32b
1092        .hword L(put_tbl) -  16b
1093        .hword L(put_tbl) -  80b
1094        .hword L(put_tbl) -   4b
1095        .hword L(put_tbl) -   2b
1096endfunc
1097
1098
1099// This has got the same signature as the prep_8tap functions,
1100// and assumes that x9 is set to (clz(w)-24), w7 to intermediate_bits and
1101// x8 to w*2.
1102function prep_neon
1103        adr             x10, L(prep_tbl)
1104        ldrh            w9, [x10, x9, lsl #1]
1105        dup             v31.8h,  w7   // intermediate_bits
1106        movi            v30.8h,  #(PREP_BIAS >> 8), lsl #8
1107        sub             x10, x10, w9, uxtw
1108        br              x10
1109
111040:
1111        add             x9,  x1,  x2
1112        lsl             x2,  x2,  #1
11134:
1114        ld1             {v0.d}[0], [x1], x2
1115        ld1             {v0.d}[1], [x9], x2
1116        subs            w4,  w4,  #2
1117        sshl            v0.8h,   v0.8h,   v31.8h
1118        sub             v0.8h,   v0.8h,   v30.8h
1119        st1             {v0.8h}, [x0], #16
1120        b.gt            4b
1121        ret
112280:
1123        add             x9,  x1,  x2
1124        lsl             x2,  x2,  #1
11258:
1126        ld1             {v0.8h}, [x1], x2
1127        ld1             {v1.8h}, [x9], x2
1128        subs            w4,  w4,  #2
1129        sshl            v0.8h,   v0.8h,   v31.8h
1130        sshl            v1.8h,   v1.8h,   v31.8h
1131        sub             v0.8h,   v0.8h,   v30.8h
1132        sub             v1.8h,   v1.8h,   v30.8h
1133        st1             {v0.8h, v1.8h}, [x0], #32
1134        b.gt            8b
1135        ret
113616:
1137        ldp             q0,  q1,  [x1]
1138        add             x1,  x1,  x2
1139        sshl            v0.8h,   v0.8h,   v31.8h
1140        ldp             q2,  q3,  [x1]
1141        add             x1,  x1,  x2
1142        subs            w4,  w4,  #2
1143        sshl            v1.8h,   v1.8h,   v31.8h
1144        sshl            v2.8h,   v2.8h,   v31.8h
1145        sshl            v3.8h,   v3.8h,   v31.8h
1146        sub             v0.8h,   v0.8h,   v30.8h
1147        sub             v1.8h,   v1.8h,   v30.8h
1148        sub             v2.8h,   v2.8h,   v30.8h
1149        sub             v3.8h,   v3.8h,   v30.8h
1150        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
1151        b.gt            16b
1152        ret
115332:
1154        ldp             q0,  q1,  [x1]
1155        sshl            v0.8h,   v0.8h,   v31.8h
1156        ldp             q2,  q3,  [x1, #32]
1157        add             x1,  x1,  x2
1158        sshl            v1.8h,   v1.8h,   v31.8h
1159        sshl            v2.8h,   v2.8h,   v31.8h
1160        sshl            v3.8h,   v3.8h,   v31.8h
1161        subs            w4,  w4,  #1
1162        sub             v0.8h,   v0.8h,   v30.8h
1163        sub             v1.8h,   v1.8h,   v30.8h
1164        sub             v2.8h,   v2.8h,   v30.8h
1165        sub             v3.8h,   v3.8h,   v30.8h
1166        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
1167        b.gt            32b
1168        ret
116964:
1170        ldp             q0,  q1,  [x1]
1171        subs            w4,  w4,  #1
1172        sshl            v0.8h,   v0.8h,   v31.8h
1173        ldp             q2,  q3,  [x1, #32]
1174        sshl            v1.8h,   v1.8h,   v31.8h
1175        ldp             q4,  q5,  [x1, #64]
1176        sshl            v2.8h,   v2.8h,   v31.8h
1177        sshl            v3.8h,   v3.8h,   v31.8h
1178        ldp             q6,  q7,  [x1, #96]
1179        add             x1,  x1,  x2
1180        sshl            v4.8h,   v4.8h,   v31.8h
1181        sshl            v5.8h,   v5.8h,   v31.8h
1182        sshl            v6.8h,   v6.8h,   v31.8h
1183        sshl            v7.8h,   v7.8h,   v31.8h
1184        sub             v0.8h,   v0.8h,   v30.8h
1185        sub             v1.8h,   v1.8h,   v30.8h
1186        sub             v2.8h,   v2.8h,   v30.8h
1187        sub             v3.8h,   v3.8h,   v30.8h
1188        stp             q0,  q1,  [x0]
1189        sub             v4.8h,   v4.8h,   v30.8h
1190        sub             v5.8h,   v5.8h,   v30.8h
1191        stp             q2,  q3,  [x0, #32]
1192        sub             v6.8h,   v6.8h,   v30.8h
1193        sub             v7.8h,   v7.8h,   v30.8h
1194        stp             q4,  q5,  [x0, #64]
1195        stp             q6,  q7,  [x0, #96]
1196        add             x0,  x0,  x8
1197        b.gt            64b
1198        ret
1199128:
1200        ldp             q0,  q1,  [x1]
1201        subs            w4,  w4,  #1
1202        sshl            v0.8h,   v0.8h,   v31.8h
1203        ldp             q2,  q3,  [x1, #32]
1204        sshl            v1.8h,   v1.8h,   v31.8h
1205        ldp             q4,  q5,  [x1, #64]
1206        sshl            v2.8h,   v2.8h,   v31.8h
1207        sshl            v3.8h,   v3.8h,   v31.8h
1208        ldp             q6,  q7,  [x1, #96]
1209        sshl            v4.8h,   v4.8h,   v31.8h
1210        sshl            v5.8h,   v5.8h,   v31.8h
1211        ldp             q16, q17, [x1, #128]
1212        sshl            v6.8h,   v6.8h,   v31.8h
1213        sshl            v7.8h,   v7.8h,   v31.8h
1214        ldp             q18, q19, [x1, #160]
1215        sshl            v16.8h,  v16.8h,  v31.8h
1216        sshl            v17.8h,  v17.8h,  v31.8h
1217        ldp             q20, q21, [x1, #192]
1218        sshl            v18.8h,  v18.8h,  v31.8h
1219        sshl            v19.8h,  v19.8h,  v31.8h
1220        ldp             q22, q23, [x1, #224]
1221        add             x1,  x1,  x2
1222        sshl            v20.8h,  v20.8h,  v31.8h
1223        sshl            v21.8h,  v21.8h,  v31.8h
1224        sshl            v22.8h,  v22.8h,  v31.8h
1225        sshl            v23.8h,  v23.8h,  v31.8h
1226        sub             v0.8h,   v0.8h,   v30.8h
1227        sub             v1.8h,   v1.8h,   v30.8h
1228        sub             v2.8h,   v2.8h,   v30.8h
1229        sub             v3.8h,   v3.8h,   v30.8h
1230        stp             q0,  q1,  [x0]
1231        sub             v4.8h,   v4.8h,   v30.8h
1232        sub             v5.8h,   v5.8h,   v30.8h
1233        stp             q2,  q3,  [x0, #32]
1234        sub             v6.8h,   v6.8h,   v30.8h
1235        sub             v7.8h,   v7.8h,   v30.8h
1236        stp             q4,  q5,  [x0, #64]
1237        sub             v16.8h,  v16.8h,  v30.8h
1238        sub             v17.8h,  v17.8h,  v30.8h
1239        stp             q6,  q7,  [x0, #96]
1240        sub             v18.8h,  v18.8h,  v30.8h
1241        sub             v19.8h,  v19.8h,  v30.8h
1242        stp             q16, q17, [x0, #128]
1243        sub             v20.8h,  v20.8h,  v30.8h
1244        sub             v21.8h,  v21.8h,  v30.8h
1245        stp             q18, q19, [x0, #160]
1246        sub             v22.8h,  v22.8h,  v30.8h
1247        sub             v23.8h,  v23.8h,  v30.8h
1248        stp             q20, q21, [x0, #192]
1249        stp             q22, q23, [x0, #224]
1250        add             x0,  x0,  x8
1251        b.gt            128b
1252        ret
1253
1254L(prep_tbl):
1255        .hword L(prep_tbl) - 128b
1256        .hword L(prep_tbl) -  64b
1257        .hword L(prep_tbl) -  32b
1258        .hword L(prep_tbl) -  16b
1259        .hword L(prep_tbl) -  80b
1260        .hword L(prep_tbl) -  40b
1261endfunc
1262
1263
1264.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
1265        ld1             {\d0\wd}[0], [\s0], \strd
1266        ld1             {\d1\wd}[0], [\s1], \strd
1267.ifnb \d2
1268        ld1             {\d2\wd}[0], [\s0], \strd
1269        ld1             {\d3\wd}[0], [\s1], \strd
1270.endif
1271.ifnb \d4
1272        ld1             {\d4\wd}[0], [\s0], \strd
1273.endif
1274.ifnb \d5
1275        ld1             {\d5\wd}[0], [\s1], \strd
1276.endif
1277.ifnb \d6
1278        ld1             {\d6\wd}[0], [\s0], \strd
1279.endif
1280.endm
1281.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
1282        ld1             {\d0\wd}, [\s0], \strd
1283        ld1             {\d1\wd}, [\s1], \strd
1284.ifnb \d2
1285        ld1             {\d2\wd}, [\s0], \strd
1286        ld1             {\d3\wd}, [\s1], \strd
1287.endif
1288.ifnb \d4
1289        ld1             {\d4\wd}, [\s0], \strd
1290.endif
1291.ifnb \d5
1292        ld1             {\d5\wd}, [\s1], \strd
1293.endif
1294.ifnb \d6
1295        ld1             {\d6\wd}, [\s0], \strd
1296.endif
1297.endm
1298.macro load_regpair s0, s1, strd, wd, d0, d1, d2, d3, d4, d5
1299        ld1             {\d0\wd, \d1\wd}, [\s0], \strd
1300.ifnb \d2
1301        ld1             {\d2\wd, \d3\wd}, [\s1], \strd
1302.endif
1303.ifnb \d4
1304        ld1             {\d4\wd, \d5\wd}, [\s0], \strd
1305.endif
1306.endm
1307.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
1308        load_slice      \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6
1309.endm
1310.macro load_4h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
1311        load_reg        \s0, \s1, \strd, .4h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
1312.endm
1313.macro load_8h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
1314        load_reg        \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
1315.endm
1316.macro load_16h s0, s1, strd, d0, d1, d2, d3, d4, d5
1317        load_regpair    \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5
1318.endm
1319.macro interleave_1 wd, r0, r1, r2, r3, r4
1320        trn1            \r0\wd, \r0\wd, \r1\wd
1321        trn1            \r1\wd, \r1\wd, \r2\wd
1322.ifnb \r3
1323        trn1            \r2\wd, \r2\wd, \r3\wd
1324        trn1            \r3\wd, \r3\wd, \r4\wd
1325.endif
1326.endm
1327.macro interleave_1_s r0, r1, r2, r3, r4
1328        interleave_1    .2s, \r0, \r1, \r2, \r3, \r4
1329.endm
1330.macro umin_h c, wd, r0, r1, r2, r3
1331        umin            \r0\wd,  \r0\wd,  \c\wd
1332.ifnb \r1
1333        umin            \r1\wd,  \r1\wd,  \c\wd
1334.endif
1335.ifnb \r2
1336        umin            \r2\wd,  \r2\wd,  \c\wd
1337        umin            \r3\wd,  \r3\wd,  \c\wd
1338.endif
1339.endm
1340.macro sub_h c, wd, r0, r1, r2, r3
1341        sub             \r0\wd,  \r0\wd,  \c\wd
1342.ifnb \r1
1343        sub             \r1\wd,  \r1\wd,  \c\wd
1344.endif
1345.ifnb \r2
1346        sub             \r2\wd,  \r2\wd,  \c\wd
1347        sub             \r3\wd,  \r3\wd,  \c\wd
1348.endif
1349.endm
1350.macro smull_smlal_4 d, s0, s1, s2, s3
1351        smull           \d\().4s,  \s0\().4h,  v0.h[0]
1352        smlal           \d\().4s,  \s1\().4h,  v0.h[1]
1353        smlal           \d\().4s,  \s2\().4h,  v0.h[2]
1354        smlal           \d\().4s,  \s3\().4h,  v0.h[3]
1355.endm
1356.macro smull2_smlal2_4 d, s0, s1, s2, s3
1357        smull2          \d\().4s,  \s0\().8h,  v0.h[0]
1358        smlal2          \d\().4s,  \s1\().8h,  v0.h[1]
1359        smlal2          \d\().4s,  \s2\().8h,  v0.h[2]
1360        smlal2          \d\().4s,  \s3\().8h,  v0.h[3]
1361.endm
1362.macro smull_smlal_8 d, s0, s1, s2, s3, s4, s5, s6, s7
1363        smull           \d\().4s,  \s0\().4h,  v0.h[0]
1364        smlal           \d\().4s,  \s1\().4h,  v0.h[1]
1365        smlal           \d\().4s,  \s2\().4h,  v0.h[2]
1366        smlal           \d\().4s,  \s3\().4h,  v0.h[3]
1367        smlal           \d\().4s,  \s4\().4h,  v0.h[4]
1368        smlal           \d\().4s,  \s5\().4h,  v0.h[5]
1369        smlal           \d\().4s,  \s6\().4h,  v0.h[6]
1370        smlal           \d\().4s,  \s7\().4h,  v0.h[7]
1371.endm
1372.macro smull2_smlal2_8 d, s0, s1, s2, s3, s4, s5, s6, s7
1373        smull2          \d\().4s,  \s0\().8h,  v0.h[0]
1374        smlal2          \d\().4s,  \s1\().8h,  v0.h[1]
1375        smlal2          \d\().4s,  \s2\().8h,  v0.h[2]
1376        smlal2          \d\().4s,  \s3\().8h,  v0.h[3]
1377        smlal2          \d\().4s,  \s4\().8h,  v0.h[4]
1378        smlal2          \d\().4s,  \s5\().8h,  v0.h[5]
1379        smlal2          \d\().4s,  \s6\().8h,  v0.h[6]
1380        smlal2          \d\().4s,  \s7\().8h,  v0.h[7]
1381.endm
1382.macro sqrshrun_h shift, r0, r1, r2, r3
1383        sqrshrun        \r0\().4h, \r0\().4s,  #\shift
1384.ifnb \r1
1385        sqrshrun2       \r0\().8h, \r1\().4s,  #\shift
1386.endif
1387.ifnb \r2
1388        sqrshrun        \r2\().4h, \r2\().4s,  #\shift
1389        sqrshrun2       \r2\().8h, \r3\().4s,  #\shift
1390.endif
1391.endm
1392.macro xtn_h r0, r1, r2, r3
1393        xtn             \r0\().4h,  \r0\().4s
1394        xtn2            \r0\().8h,  \r1\().4s
1395.ifnb \r2
1396        xtn             \r2\().4h,  \r2\().4s
1397        xtn2            \r2\().8h,  \r3\().4s
1398.endif
1399.endm
1400.macro srshl_s shift, r0, r1, r2, r3
1401        srshl           \r0\().4s,  \r0\().4s,  \shift\().4s
1402        srshl           \r1\().4s,  \r1\().4s,  \shift\().4s
1403.ifnb \r2
1404        srshl           \r2\().4s,  \r2\().4s,  \shift\().4s
1405        srshl           \r3\().4s,  \r3\().4s,  \shift\().4s
1406.endif
1407.endm
1408.macro st_s strd, reg, lanes
1409        st1             {\reg\().s}[0], [x0], \strd
1410        st1             {\reg\().s}[1], [x9], \strd
1411.if \lanes > 2
1412        st1             {\reg\().s}[2], [x0], \strd
1413        st1             {\reg\().s}[3], [x9], \strd
1414.endif
1415.endm
1416.macro st_d strd, r0, r1
1417        st1             {\r0\().d}[0], [x0], \strd
1418        st1             {\r0\().d}[1], [x9], \strd
1419.ifnb \r1
1420        st1             {\r1\().d}[0], [x0], \strd
1421        st1             {\r1\().d}[1], [x9], \strd
1422.endif
1423.endm
1424.macro shift_store_4 type, strd, r0, r1, r2, r3
1425.ifc \type, put
1426        sqrshrun_h      6,   \r0, \r1, \r2, \r3
1427        umin_h          v31, .8h, \r0, \r2
1428.else
1429        srshl_s         v30, \r0, \r1, \r2, \r3  // -(6-intermediate_bits)
1430        xtn_h           \r0, \r1, \r2, \r3
1431        sub_h           v29, .8h, \r0, \r2       // PREP_BIAS
1432.endif
1433        st_d            \strd, \r0, \r2
1434.endm
1435.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7
1436        st1             {\r0\wd}, [x0], \strd
1437        st1             {\r1\wd}, [x9], \strd
1438.ifnb \r2
1439        st1             {\r2\wd}, [x0], \strd
1440        st1             {\r3\wd}, [x9], \strd
1441.endif
1442.ifnb \r4
1443        st1             {\r4\wd}, [x0], \strd
1444        st1             {\r5\wd}, [x9], \strd
1445        st1             {\r6\wd}, [x0], \strd
1446        st1             {\r7\wd}, [x9], \strd
1447.endif
1448.endm
1449.macro st_8h strd, r0, r1, r2, r3, r4, r5, r6, r7
1450        st_reg          \strd, .8h, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
1451.endm
1452.macro shift_store_8 type, strd, r0, r1, r2, r3
1453.ifc \type, put
1454        sqrshrun_h      6,   \r0, \r1, \r2, \r3
1455        umin_h          v31, .8h, \r0, \r2
1456.else
1457        srshl_s         v30, \r0, \r1, \r2, \r3  // -(6-intermediate_bits)
1458        xtn_h           \r0, \r1, \r2, \r3
1459        sub_h           v29, .8h, \r0, \r2       // PREP_BIAS
1460.endif
1461        st_8h           \strd, \r0, \r2
1462.endm
1463.macro shift_store_16 type, strd, dst, r0, r1, r2, r3
1464.ifc \type, put
1465        sqrshrun_h      6,   \r0, \r1, \r2, \r3
1466        umin            \r0\().8h, \r0\().8h, v31.8h
1467        umin            \r1\().8h, \r2\().8h, v31.8h
1468.else
1469        srshl_s         v30, \r0, \r1, \r2, \r3  // -(6-intermediate_bits)
1470        xtn_h           \r0, \r1, \r2, \r3
1471        sub             \r0\().8h, \r0\().8h, v29.8h
1472        sub             \r1\().8h, \r2\().8h, v29.8h
1473.endif
1474        st1             {\r0\().8h, \r1\().8h}, [\dst], \strd
1475.endm
1476
1477.macro make_8tap_fn op, type, type_h, type_v
1478function \op\()_8tap_\type\()_16bpc_neon, export=1
1479        mov             w9,  \type_h
1480        mov             w10, \type_v
1481        b               \op\()_8tap_neon
1482endfunc
1483.endm
1484
1485// No spaces in these expressions, due to gas-preprocessor.
1486#define REGULAR ((0*15<<7)|3*15)
1487#define SMOOTH  ((1*15<<7)|4*15)
1488#define SHARP   ((2*15<<7)|3*15)
1489
1490.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2
1491make_8tap_fn \type, regular,        REGULAR, REGULAR
1492make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
1493make_8tap_fn \type, regular_sharp,  REGULAR, SHARP
1494make_8tap_fn \type, smooth,         SMOOTH,  SMOOTH
1495make_8tap_fn \type, smooth_regular, SMOOTH,  REGULAR
1496make_8tap_fn \type, smooth_sharp,   SMOOTH,  SHARP
1497make_8tap_fn \type, sharp,          SHARP,   SHARP
1498make_8tap_fn \type, sharp_regular,  SHARP,   REGULAR
1499make_8tap_fn \type, sharp_smooth,   SHARP,   SMOOTH
1500
1501function \type\()_8tap_neon
1502.ifc \bdmax, w8
1503        ldr             w8,  [sp]
1504.endif
1505        mov             w11,  #0x4081  // (1 << 14) | (1 << 7) | (1 << 0)
1506        mul             \mx,  \mx, w11
1507        mul             \my,  \my, w11
1508        add             \mx,  \mx, w9  // mx, 8tap_h, 4tap_h
1509        add             \my,  \my, w10 // my, 8tap_v, 4tap_v
1510.ifc \type, prep
1511        uxtw            \d_strd, \w
1512        lsl             \d_strd, \d_strd, #1
1513.endif
1514
1515        dup             v31.8h,  \bdmax        // bitdepth_max
1516        clz             \bdmax,  \bdmax
1517        clz             w9,  \w
1518        sub             \bdmax,  \bdmax,  #18  // intermediate_bits = clz(bitdepth_max) - 18
1519        mov             w12, #6
1520        tst             \mx, #(0x7f << 14)
1521        sub             w9,  w9,  #24
1522        add             w13, w12, \bdmax       // 6 + intermediate_bits
1523        sub             w12, w12, \bdmax       // 6 - intermediate_bits
1524        movrel          x11, X(mc_subpel_filters), -8
1525        b.ne            L(\type\()_8tap_h)
1526        tst             \my, #(0x7f << 14)
1527        b.ne            L(\type\()_8tap_v)
1528        b               \type\()_neon
1529
1530L(\type\()_8tap_h):
1531        cmp             \w,   #4
1532        ubfx            w10,  \mx, #7, #7
1533        and             \mx,  \mx, #0x7f
1534        b.le            4f
1535        mov             \mx,  w10
15364:
1537        tst             \my,  #(0x7f << 14)
1538        add             \xmx, x11, \mx, uxtw #3
1539        b.ne            L(\type\()_8tap_hv)
1540
1541        adr             x10, L(\type\()_8tap_h_tbl)
1542        dup             v30.4s,  w12           // 6 - intermediate_bits
1543        ldrh            w9,  [x10, x9, lsl #1]
1544        neg             v30.4s,  v30.4s        // -(6-intermediate_bits)
1545.ifc \type, put
1546        dup             v29.8h,  \bdmax        // intermediate_bits
1547.else
1548        movi            v28.8h,  #(PREP_BIAS >> 8), lsl #8
1549.endif
1550        sub             x10, x10, w9, uxtw
1551.ifc \type, put
1552        neg             v29.8h,  v29.8h        // -intermediate_bits
1553.endif
1554        br              x10
1555
155620:     // 2xN h
1557.ifc \type, put
1558        add             \xmx,  \xmx,  #2
1559        ld1             {v0.s}[0], [\xmx]
1560        sub             \src,  \src,  #2
1561        add             \ds2,  \dst,  \d_strd
1562        add             \sr2,  \src,  \s_strd
1563        lsl             \d_strd,  \d_strd,  #1
1564        lsl             \s_strd,  \s_strd,  #1
1565        sxtl            v0.8h,   v0.8b
15662:
1567        ld1             {v4.8h},  [\src], \s_strd
1568        ld1             {v6.8h},  [\sr2], \s_strd
1569        ext             v5.16b,  v4.16b,  v4.16b,  #2
1570        ext             v7.16b,  v6.16b,  v6.16b,  #2
1571        subs            \h,  \h,  #2
1572        trn1            v3.2s,   v4.2s,   v6.2s
1573        trn2            v6.2s,   v4.2s,   v6.2s
1574        trn1            v4.2s,   v5.2s,   v7.2s
1575        trn2            v7.2s,   v5.2s,   v7.2s
1576        smull           v3.4s,   v3.4h,   v0.h[0]
1577        smlal           v3.4s,   v4.4h,   v0.h[1]
1578        smlal           v3.4s,   v6.4h,   v0.h[2]
1579        smlal           v3.4s,   v7.4h,   v0.h[3]
1580        srshl           v3.4s,   v3.4s,   v30.4s // -(6-intermediate_bits)
1581        sqxtun          v3.4h,   v3.4s
1582        srshl           v3.4h,   v3.4h,   v29.4h // -intermediate_bits
1583        umin            v3.4h,   v3.4h,   v31.4h
1584        st1             {v3.s}[0], [\dst], \d_strd
1585        st1             {v3.s}[1], [\ds2], \d_strd
1586        b.gt            2b
1587        ret
1588.endif
1589
159040:     // 4xN h
1591        add             \xmx,  \xmx,  #2
1592        ld1             {v0.s}[0], [\xmx]
1593        sub             \src,  \src,  #2
1594        add             \ds2,  \dst,  \d_strd
1595        add             \sr2,  \src,  \s_strd
1596        lsl             \d_strd,  \d_strd,  #1
1597        lsl             \s_strd,  \s_strd,  #1
1598        sxtl            v0.8h,   v0.8b
15994:
1600        ld1             {v16.8h}, [\src], \s_strd
1601        ld1             {v20.8h}, [\sr2], \s_strd
1602        ext             v17.16b, v16.16b, v16.16b, #2
1603        ext             v18.16b, v16.16b, v16.16b, #4
1604        ext             v19.16b, v16.16b, v16.16b, #6
1605        ext             v21.16b, v20.16b, v20.16b, #2
1606        ext             v22.16b, v20.16b, v20.16b, #4
1607        ext             v23.16b, v20.16b, v20.16b, #6
1608        subs            \h,  \h,  #2
1609        smull           v16.4s,  v16.4h,  v0.h[0]
1610        smlal           v16.4s,  v17.4h,  v0.h[1]
1611        smlal           v16.4s,  v18.4h,  v0.h[2]
1612        smlal           v16.4s,  v19.4h,  v0.h[3]
1613        smull           v20.4s,  v20.4h,  v0.h[0]
1614        smlal           v20.4s,  v21.4h,  v0.h[1]
1615        smlal           v20.4s,  v22.4h,  v0.h[2]
1616        smlal           v20.4s,  v23.4h,  v0.h[3]
1617        srshl           v16.4s,  v16.4s,  v30.4s // -(6-intermediate_bits)
1618        srshl           v20.4s,  v20.4s,  v30.4s // -(6-intermediate_bits)
1619.ifc \type, put
1620        sqxtun          v16.4h,  v16.4s
1621        sqxtun2         v16.8h,  v20.4s
1622        srshl           v16.8h,  v16.8h,  v29.8h // -intermediate_bits
1623        umin            v16.8h,  v16.8h,  v31.8h
1624.else
1625        xtn             v16.4h,  v16.4s
1626        xtn2            v16.8h,  v20.4s
1627        sub             v16.8h,  v16.8h,  v28.8h // PREP_BIAS
1628.endif
1629        st1             {v16.d}[0], [\dst], \d_strd
1630        st1             {v16.d}[1], [\ds2], \d_strd
1631        b.gt            4b
1632        ret
1633
163480:
1635160:
1636320:
1637640:
16381280:   // 8xN, 16xN, 32xN, ... h
1639        ld1             {v0.8b}, [\xmx]
1640        sub             \src,  \src,  #6
1641        add             \ds2,  \dst,  \d_strd
1642        add             \sr2,  \src,  \s_strd
1643        lsl             \s_strd,  \s_strd,  #1
1644        sxtl            v0.8h,   v0.8b
1645
1646        sub             \s_strd,  \s_strd,  \w, uxtw #1
1647        sub             \s_strd,  \s_strd,  #16
1648.ifc \type, put
1649        lsl             \d_strd,  \d_strd,  #1
1650        sub             \d_strd,  \d_strd,  \w, uxtw #1
1651.endif
165281:
1653        ld1             {v16.8h, v17.8h},  [\src], #32
1654        ld1             {v20.8h, v21.8h},  [\sr2], #32
1655        mov             \mx, \w
1656
16578:
1658        smull           v18.4s,  v16.4h,  v0.h[0]
1659        smull2          v19.4s,  v16.8h,  v0.h[0]
1660        smull           v22.4s,  v20.4h,  v0.h[0]
1661        smull2          v23.4s,  v20.8h,  v0.h[0]
1662.irpc i, 1234567
1663        ext             v24.16b, v16.16b, v17.16b, #(2*\i)
1664        ext             v25.16b, v20.16b, v21.16b, #(2*\i)
1665        smlal           v18.4s,  v24.4h,  v0.h[\i]
1666        smlal2          v19.4s,  v24.8h,  v0.h[\i]
1667        smlal           v22.4s,  v25.4h,  v0.h[\i]
1668        smlal2          v23.4s,  v25.8h,  v0.h[\i]
1669.endr
1670        subs            \mx, \mx, #8
1671        srshl           v18.4s,  v18.4s,  v30.4s // -(6-intermediate_bits)
1672        srshl           v19.4s,  v19.4s,  v30.4s // -(6-intermediate_bits)
1673        srshl           v22.4s,  v22.4s,  v30.4s // -(6-intermediate_bits)
1674        srshl           v23.4s,  v23.4s,  v30.4s // -(6-intermediate_bits)
1675.ifc \type, put
1676        sqxtun          v18.4h,  v18.4s
1677        sqxtun2         v18.8h,  v19.4s
1678        sqxtun          v22.4h,  v22.4s
1679        sqxtun2         v22.8h,  v23.4s
1680        srshl           v18.8h,  v18.8h,  v29.8h // -intermediate_bits
1681        srshl           v22.8h,  v22.8h,  v29.8h // -intermediate_bits
1682        umin            v18.8h,  v18.8h,  v31.8h
1683        umin            v22.8h,  v22.8h,  v31.8h
1684.else
1685        xtn             v18.4h,  v18.4s
1686        xtn2            v18.8h,  v19.4s
1687        xtn             v22.4h,  v22.4s
1688        xtn2            v22.8h,  v23.4s
1689        sub             v18.8h,  v18.8h,  v28.8h // PREP_BIAS
1690        sub             v22.8h,  v22.8h,  v28.8h // PREP_BIAS
1691.endif
1692        st1             {v18.8h}, [\dst], #16
1693        st1             {v22.8h}, [\ds2], #16
1694        b.le            9f
1695
1696        mov             v16.16b, v17.16b
1697        mov             v20.16b, v21.16b
1698        ld1             {v17.8h}, [\src], #16
1699        ld1             {v21.8h}, [\sr2], #16
1700        b               8b
1701
17029:
1703        add             \dst,  \dst,  \d_strd
1704        add             \ds2,  \ds2,  \d_strd
1705        add             \src,  \src,  \s_strd
1706        add             \sr2,  \sr2,  \s_strd
1707
1708        subs            \h,  \h,  #2
1709        b.gt            81b
1710        ret
1711
1712L(\type\()_8tap_h_tbl):
1713        .hword L(\type\()_8tap_h_tbl) - 1280b
1714        .hword L(\type\()_8tap_h_tbl) -  640b
1715        .hword L(\type\()_8tap_h_tbl) -  320b
1716        .hword L(\type\()_8tap_h_tbl) -  160b
1717        .hword L(\type\()_8tap_h_tbl) -   80b
1718        .hword L(\type\()_8tap_h_tbl) -   40b
1719        .hword L(\type\()_8tap_h_tbl) -   20b
1720        .hword 0
1721
1722
1723L(\type\()_8tap_v):
1724        cmp             \h,  #4
1725        ubfx            w10, \my, #7, #7
1726        and             \my, \my, #0x7f
1727        b.le            4f
1728        mov             \my, w10
17294:
1730        add             \xmy, x11, \my, uxtw #3
1731
1732.ifc \type, prep
1733        dup             v30.4s,  w12           // 6 - intermediate_bits
1734        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
1735.endif
1736        adr             x10, L(\type\()_8tap_v_tbl)
1737        ldrh            w9,  [x10, x9, lsl #1]
1738.ifc \type, prep
1739        neg             v30.4s,  v30.4s        // -(6-intermediate_bits)
1740.endif
1741        sub             x10, x10, w9, uxtw
1742        br              x10
1743
174420:     // 2xN v
1745.ifc \type, put
1746        b.gt            28f
1747
1748        cmp             \h,  #2
1749        add             \xmy, \xmy, #2
1750        ld1             {v0.s}[0], [\xmy]
1751        sub             \src,  \src,  \s_strd
1752        add             \ds2,  \dst,  \d_strd
1753        add             \sr2,  \src,  \s_strd
1754        lsl             \s_strd,  \s_strd,  #1
1755        lsl             \d_strd,  \d_strd,  #1
1756        sxtl            v0.8h,   v0.8b
1757
1758        // 2x2 v
1759        load_s          \src, \sr2, \s_strd, v1, v2, v3, v4, v5
1760        interleave_1_s  v1,  v2,  v3,  v4,  v5
1761        b.gt            24f
1762        smull_smlal_4   v6,  v1,  v2,  v3,  v4
1763        sqrshrun_h      6,   v6
1764        umin_h          v31, .8h, v6
1765        st_s            \d_strd, v6, 2
1766        ret
1767
176824:     // 2x4 v
1769        load_s          \sr2, \src, \s_strd, v6, v7
1770        interleave_1_s  v5,  v6,  v7
1771        smull_smlal_4   v16, v1,  v2,  v3,  v4
1772        smull_smlal_4   v17, v3,  v4,  v5,  v6
1773        sqrshrun_h      6,   v16, v17
1774        umin_h          v31, .8h, v16
1775        st_s            \d_strd, v16, 4
1776        ret
1777
177828:     // 2x8, 2x16 v
1779        ld1             {v0.8b}, [\xmy]
1780        sub             \sr2,  \src,  \s_strd, lsl #1
1781        add             \ds2,  \dst,  \d_strd
1782        sub             \src,  \sr2,  \s_strd
1783        lsl             \d_strd,  \d_strd,  #1
1784        lsl             \s_strd,  \s_strd,  #1
1785        sxtl            v0.8h,   v0.8b
1786
1787        load_s          \src, \sr2, \s_strd, v1,  v2,  v3,  v4, v5, v6, v7
1788        interleave_1_s  v1,  v2,  v3,  v4,  v5
1789        interleave_1_s  v5,  v6,  v7
1790216:
1791        subs            \h,  \h,  #8
1792        load_s          \sr2, \src, \s_strd, v16, v17, v18, v19
1793        load_s          \sr2, \src, \s_strd, v20, v21, v22, v23
1794        interleave_1_s  v7,  v16, v17, v18, v19
1795        interleave_1_s  v19, v20, v21, v22, v23
1796        smull_smlal_8   v24, v1,  v2,  v3,  v4,  v5,  v6,  v7,  v16
1797        smull_smlal_8   v25, v3,  v4,  v5,  v6,  v7,  v16, v17, v18
1798        smull_smlal_8   v26, v5,  v6,  v7,  v16, v17, v18, v19, v20
1799        smull_smlal_8   v27, v7,  v16, v17, v18, v19, v20, v21, v22
1800        sqrshrun_h      6,   v24, v25, v26, v27
1801        umin_h          v31, .8h, v24, v26
1802        st_s            \d_strd, v24, 4
1803        st_s            \d_strd, v26, 4
1804        b.le            0f
1805        mov             v1.16b,  v17.16b
1806        mov             v2.16b,  v18.16b
1807        mov             v3.16b,  v19.16b
1808        mov             v4.16b,  v20.16b
1809        mov             v5.16b,  v21.16b
1810        mov             v6.16b,  v22.16b
1811        mov             v7.16b,  v23.16b
1812        b               216b
18130:
1814        ret
1815.endif
1816
181740:
1818        b.gt            480f
1819
1820        // 4x2, 4x4 v
1821        cmp             \h,  #2
1822        add             \xmy, \xmy, #2
1823        ld1             {v0.s}[0], [\xmy]
1824        sub             \src, \src, \s_strd
1825        add             \ds2, \dst, \d_strd
1826        add             \sr2, \src, \s_strd
1827        lsl             \s_strd, \s_strd, #1
1828        lsl             \d_strd, \d_strd, #1
1829        sxtl            v0.8h,   v0.8b
1830
1831        load_4h         \src, \sr2, \s_strd, v1, v2, v3, v4, v5
1832        smull_smlal_4   v6,  v1,  v2,  v3,  v4
1833        smull_smlal_4   v7,  v2,  v3,  v4,  v5
1834        shift_store_4   \type, \d_strd, v6, v7
1835        b.le            0f
1836        load_4h         \sr2, \src, \s_strd, v6, v7
1837        smull_smlal_4   v1,  v3,  v4,  v5,  v6
1838        smull_smlal_4   v2,  v4,  v5,  v6,  v7
1839        shift_store_4   \type, \d_strd, v1, v2
18400:
1841        ret
1842
1843480:    // 4x8, 4x16 v
1844        ld1             {v0.8b}, [\xmy]
1845        sub             \sr2, \src, \s_strd, lsl #1
1846        add             \ds2, \dst, \d_strd
1847        sub             \src, \sr2, \s_strd
1848        lsl             \s_strd, \s_strd, #1
1849        lsl             \d_strd, \d_strd, #1
1850        sxtl            v0.8h,   v0.8b
1851
1852        load_4h         \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
1853
185448:
1855        subs            \h,  \h,  #4
1856        load_4h         \sr2, \src, \s_strd, v23, v24, v25, v26
1857        smull_smlal_8   v1,  v16, v17, v18, v19, v20, v21, v22, v23
1858        smull_smlal_8   v2,  v17, v18, v19, v20, v21, v22, v23, v24
1859        smull_smlal_8   v3,  v18, v19, v20, v21, v22, v23, v24, v25
1860        smull_smlal_8   v4,  v19, v20, v21, v22, v23, v24, v25, v26
1861        shift_store_4   \type, \d_strd, v1, v2, v3, v4
1862        b.le            0f
1863        mov             v16.8b,  v20.8b
1864        mov             v17.8b,  v21.8b
1865        mov             v18.8b,  v22.8b
1866        mov             v19.8b,  v23.8b
1867        mov             v20.8b,  v24.8b
1868        mov             v21.8b,  v25.8b
1869        mov             v22.8b,  v26.8b
1870        b               48b
18710:
1872        ret
1873
187480:
1875        b.gt            880f
1876
1877        // 8x2, 8x4 v
1878        cmp             \h,  #2
1879        add             \xmy, \xmy, #2
1880        ld1             {v0.s}[0], [\xmy]
1881        sub             \src, \src, \s_strd
1882        add             \ds2, \dst, \d_strd
1883        add             \sr2, \src, \s_strd
1884        lsl             \s_strd, \s_strd, #1
1885        lsl             \d_strd, \d_strd, #1
1886        sxtl            v0.8h,   v0.8b
1887
1888        load_8h         \src, \sr2, \s_strd, v1, v2, v3, v4, v5
1889        smull_smlal_4   v16, v1,  v2,  v3,  v4
1890        smull2_smlal2_4 v17, v1,  v2,  v3,  v4
1891        smull_smlal_4   v18, v2,  v3,  v4,  v5
1892        smull2_smlal2_4 v19, v2,  v3,  v4,  v5
1893        shift_store_8   \type, \d_strd, v16, v17, v18, v19
1894        b.le            0f
1895        load_8h         \sr2, \src, \s_strd, v6, v7
1896        smull_smlal_4   v16, v3,  v4,  v5,  v6
1897        smull2_smlal2_4 v17, v3,  v4,  v5,  v6
1898        smull_smlal_4   v18, v4,  v5,  v6,  v7
1899        smull2_smlal2_4 v19, v4,  v5,  v6,  v7
1900        shift_store_8   \type, \d_strd, v16, v17, v18, v19
19010:
1902        ret
1903
1904880:    // 8x6, 8x8, 8x16, 8x32 v
19051680:   // 16x8, 16x16, ...
1906320:    // 32x8, 32x16, ...
1907640:
19081280:
1909        ld1             {v0.8b}, [\xmy]
1910        sub             \src, \src, \s_strd
1911        sub             \src, \src, \s_strd, lsl #1
1912        sxtl            v0.8h,   v0.8b
1913        mov             \my,  \h
1914168:
1915        add             \ds2, \dst, \d_strd
1916        add             \sr2, \src, \s_strd
1917        lsl             \s_strd, \s_strd, #1
1918        lsl             \d_strd, \d_strd, #1
1919
1920        load_8h         \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
1921
192288:
1923        subs            \h,  \h,  #2
1924        load_8h         \sr2, \src, \s_strd, v23, v24
1925        smull_smlal_8   v1,  v16, v17, v18, v19, v20, v21, v22, v23
1926        smull2_smlal2_8 v2,  v16, v17, v18, v19, v20, v21, v22, v23
1927        smull_smlal_8   v3,  v17, v18, v19, v20, v21, v22, v23, v24
1928        smull2_smlal2_8 v4,  v17, v18, v19, v20, v21, v22, v23, v24
1929        shift_store_8   \type, \d_strd, v1, v2, v3, v4
1930        b.le            9f
1931        subs            \h,  \h,  #2
1932        load_8h         \sr2, \src, \s_strd, v25, v26
1933        smull_smlal_8   v1,  v18, v19, v20, v21, v22, v23, v24, v25
1934        smull2_smlal2_8 v2,  v18, v19, v20, v21, v22, v23, v24, v25
1935        smull_smlal_8   v3,  v19, v20, v21, v22, v23, v24, v25, v26
1936        smull2_smlal2_8 v4,  v19, v20, v21, v22, v23, v24, v25, v26
1937        shift_store_8   \type, \d_strd, v1, v2, v3, v4
1938        b.le            9f
1939        mov             v16.16b, v20.16b
1940        mov             v17.16b, v21.16b
1941        mov             v18.16b, v22.16b
1942        mov             v19.16b, v23.16b
1943        mov             v20.16b, v24.16b
1944        mov             v21.16b, v25.16b
1945        mov             v22.16b, v26.16b
1946        b               88b
19479:
1948        subs            \w,  \w,  #8
1949        b.le            0f
1950        asr             \s_strd, \s_strd, #1
1951        asr             \d_strd, \d_strd, #1
1952        msub            \src, \s_strd, \xmy, \src
1953        msub            \dst, \d_strd, \xmy, \dst
1954        sub             \src, \src, \s_strd, lsl #3
1955        mov             \h,  \my
1956        add             \src, \src, #16
1957        add             \dst, \dst, #16
1958        b               168b
19590:
1960        ret
1961
1962160:
1963        b.gt            1680b
1964
1965        // 16x2, 16x4 v
1966        add             \xmy, \xmy, #2
1967        ld1             {v0.s}[0], [\xmy]
1968        sub             \src, \src, \s_strd
1969        sxtl            v0.8h,   v0.8b
1970
1971        load_16h        \src, \src, \s_strd, v16, v17, v18, v19, v20, v21
197216:
1973        load_16h        \src, \src, \s_strd, v22, v23
1974        subs            \h,  \h,  #1
1975        smull_smlal_4   v1,  v16, v18, v20, v22
1976        smull2_smlal2_4 v2,  v16, v18, v20, v22
1977        smull_smlal_4   v3,  v17, v19, v21, v23
1978        smull2_smlal2_4 v4,  v17, v19, v21, v23
1979        shift_store_16  \type, \d_strd, x0, v1, v2, v3, v4
1980        b.le            0f
1981        mov             v16.16b, v18.16b
1982        mov             v17.16b, v19.16b
1983        mov             v18.16b, v20.16b
1984        mov             v19.16b, v21.16b
1985        mov             v20.16b, v22.16b
1986        mov             v21.16b, v23.16b
1987        b               16b
19880:
1989        ret
1990
1991L(\type\()_8tap_v_tbl):
1992        .hword L(\type\()_8tap_v_tbl) - 1280b
1993        .hword L(\type\()_8tap_v_tbl) -  640b
1994        .hword L(\type\()_8tap_v_tbl) -  320b
1995        .hword L(\type\()_8tap_v_tbl) -  160b
1996        .hword L(\type\()_8tap_v_tbl) -   80b
1997        .hword L(\type\()_8tap_v_tbl) -   40b
1998        .hword L(\type\()_8tap_v_tbl) -   20b
1999        .hword 0
2000
2001L(\type\()_8tap_hv):
2002        cmp             \h,  #4
2003        ubfx            w10, \my, #7, #7
2004        and             \my, \my, #0x7f
2005        b.le            4f
2006        mov             \my,  w10
20074:
2008        add             \xmy, x11, \my, uxtw #3
2009
2010        adr             x10, L(\type\()_8tap_hv_tbl)
2011        dup             v30.4s,  w12           // 6 - intermediate_bits
2012        ldrh            w9,  [x10, x9, lsl #1]
2013        neg             v30.4s,  v30.4s        // -(6-intermediate_bits)
2014.ifc \type, put
2015        dup             v29.4s,  w13           // 6 + intermediate_bits
2016.else
2017        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
2018.endif
2019        sub             x10, x10, w9, uxtw
2020.ifc \type, put
2021        neg             v29.4s,  v29.4s        // -(6+intermediate_bits)
2022.endif
2023        br              x10
2024
202520:
2026.ifc \type, put
2027        add             \xmx,  \xmx,  #2
2028        ld1             {v0.s}[0],  [\xmx]
2029        b.gt            280f
2030        add             \xmy,  \xmy,  #2
2031        ld1             {v1.s}[0],  [\xmy]
2032
2033        // 2x2, 2x4 hv
2034        sub             \sr2, \src, #2
2035        sub             \src, \sr2, \s_strd
2036        add             \ds2, \dst, \d_strd
2037        lsl             \s_strd, \s_strd, #1
2038        lsl             \d_strd, \d_strd, #1
2039        sxtl            v0.8h,   v0.8b
2040        sxtl            v1.8h,   v1.8b
2041        mov             x15, x30
2042
2043        ld1             {v27.8h}, [\src], \s_strd
2044        ext             v28.16b, v27.16b, v27.16b, #2
2045        smull           v27.4s,  v27.4h,  v0.4h
2046        smull           v28.4s,  v28.4h,  v0.4h
2047        addp            v27.4s,  v27.4s,  v28.4s
2048        addp            v16.4s,  v27.4s,  v27.4s
2049        srshl           v16.2s,  v16.2s,  v30.2s // -(6-intermediate_bits)
2050        bl              L(\type\()_8tap_filter_2)
2051        // The intermediates from the horizontal pass fit in 16 bit without
2052        // any bias; we could just as well keep them as .4s, but narrowing
2053        // them to .4h gives a significant speedup on out of order cores
2054        // (at the cost of a smaller slowdown on in-order cores such as A53).
2055        xtn             v16.4h,  v16.4s
2056
2057        trn1            v16.2s,  v16.2s,  v24.2s
2058        mov             v17.8b,  v24.8b
2059
20602:
2061        bl              L(\type\()_8tap_filter_2)
2062
2063        ext             v18.8b,  v17.8b,  v24.8b,  #4
2064        smull           v2.4s,   v16.4h,  v1.h[0]
2065        smlal           v2.4s,   v17.4h,  v1.h[1]
2066        smlal           v2.4s,   v18.4h,  v1.h[2]
2067        smlal           v2.4s,   v24.4h,  v1.h[3]
2068
2069        srshl           v2.4s,   v2.4s,   v29.4s // -(6+intermediate_bits)
2070        sqxtun          v2.4h,   v2.4s
2071        umin            v2.4h,   v2.4h,   v31.4h
2072        subs            \h,  \h,  #2
2073        st1             {v2.s}[0], [\dst], \d_strd
2074        st1             {v2.s}[1], [\ds2], \d_strd
2075        b.le            0f
2076        mov             v16.8b,  v18.8b
2077        mov             v17.8b,  v24.8b
2078        b               2b
2079
2080280:    // 2x8, 2x16, 2x32 hv
2081        ld1             {v1.8b},  [\xmy]
2082        sub             \src, \src, #2
2083        sub             \sr2, \src, \s_strd, lsl #1
2084        sub             \src, \sr2, \s_strd
2085        add             \ds2, \dst, \d_strd
2086        lsl             \s_strd, \s_strd, #1
2087        lsl             \d_strd, \d_strd, #1
2088        sxtl            v0.8h,   v0.8b
2089        sxtl            v1.8h,   v1.8b
2090        mov             x15, x30
2091
2092        ld1             {v27.8h}, [\src], \s_strd
2093        ext             v28.16b, v27.16b, v27.16b, #2
2094        smull           v27.4s,  v27.4h,  v0.4h
2095        smull           v28.4s,  v28.4h,  v0.4h
2096        addp            v27.4s,  v27.4s,  v28.4s
2097        addp            v16.4s,  v27.4s,  v27.4s
2098        srshl           v16.2s,  v16.2s,  v30.2s // -(6-intermediate_bits)
2099        // The intermediates from the horizontal pass fit in 16 bit without
2100        // any bias; we could just as well keep them as .4s, but narrowing
2101        // them to .4h gives a significant speedup on out of order cores
2102        // (at the cost of a smaller slowdown on in-order cores such as A53).
2103
2104        bl              L(\type\()_8tap_filter_2)
2105        xtn             v16.4h,  v16.4s
2106        trn1            v16.2s,  v16.2s,  v24.2s
2107        mov             v17.8b,  v24.8b
2108        bl              L(\type\()_8tap_filter_2)
2109        ext             v18.8b,  v17.8b,  v24.8b,  #4
2110        mov             v19.8b,  v24.8b
2111        bl              L(\type\()_8tap_filter_2)
2112        ext             v20.8b,  v19.8b,  v24.8b,  #4
2113        mov             v21.8b,  v24.8b
2114
211528:
2116        bl              L(\type\()_8tap_filter_2)
2117        ext             v22.8b,  v21.8b,  v24.8b,  #4
2118        smull           v3.4s,   v16.4h,  v1.h[0]
2119        smlal           v3.4s,   v17.4h,  v1.h[1]
2120        smlal           v3.4s,   v18.4h,  v1.h[2]
2121        smlal           v3.4s,   v19.4h,  v1.h[3]
2122        smlal           v3.4s,   v20.4h,  v1.h[4]
2123        smlal           v3.4s,   v21.4h,  v1.h[5]
2124        smlal           v3.4s,   v22.4h,  v1.h[6]
2125        smlal           v3.4s,   v24.4h,  v1.h[7]
2126
2127        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
2128        sqxtun          v3.4h,   v3.4s
2129        umin            v3.4h,   v3.4h,   v31.4h
2130        subs            \h,  \h,  #2
2131        st1             {v3.s}[0], [\dst], \d_strd
2132        st1             {v3.s}[1], [\ds2], \d_strd
2133        b.le            0f
2134        mov             v16.8b,  v18.8b
2135        mov             v17.8b,  v19.8b
2136        mov             v18.8b,  v20.8b
2137        mov             v19.8b,  v21.8b
2138        mov             v20.8b,  v22.8b
2139        mov             v21.8b,  v24.8b
2140        b               28b
2141
21420:
2143        br              x15
2144
2145L(\type\()_8tap_filter_2):
2146        ld1             {v25.8h},  [\sr2], \s_strd
2147        ld1             {v27.8h},  [\src], \s_strd
2148        ext             v26.16b, v25.16b, v25.16b, #2
2149        ext             v28.16b, v27.16b, v27.16b, #2
2150        trn1            v24.2s,  v25.2s,  v27.2s
2151        trn2            v27.2s,  v25.2s,  v27.2s
2152        trn1            v25.2s,  v26.2s,  v28.2s
2153        trn2            v28.2s,  v26.2s,  v28.2s
2154        smull           v24.4s,  v24.4h,  v0.h[0]
2155        smlal           v24.4s,  v25.4h,  v0.h[1]
2156        smlal           v24.4s,  v27.4h,  v0.h[2]
2157        smlal           v24.4s,  v28.4h,  v0.h[3]
2158        srshl           v24.4s,  v24.4s,  v30.4s // -(6-intermediate_bits)
2159        xtn             v24.4h,  v24.4s
2160        ret
2161.endif
2162
216340:
2164        add             \xmx, \xmx, #2
2165        ld1             {v0.s}[0],  [\xmx]
2166        b.gt            480f
2167        add             \xmy, \xmy,  #2
2168        ld1             {v1.s}[0],  [\xmy]
2169        sub             \sr2, \src, #2
2170        sub             \src, \sr2, \s_strd
2171        add             \ds2, \dst, \d_strd
2172        lsl             \s_strd, \s_strd, #1
2173        lsl             \d_strd, \d_strd, #1
2174        sxtl            v0.8h,   v0.8b
2175        sxtl            v1.8h,   v1.8b
2176        mov             x15, x30
2177
2178        // 4x2, 4x4 hv
2179        ld1             {v25.8h}, [\src], \s_strd
2180        ext             v26.16b, v25.16b, v25.16b, #2
2181        ext             v27.16b, v25.16b, v25.16b, #4
2182        ext             v28.16b, v25.16b, v25.16b, #6
2183        smull           v25.4s,  v25.4h,  v0.h[0]
2184        smlal           v25.4s,  v26.4h,  v0.h[1]
2185        smlal           v25.4s,  v27.4h,  v0.h[2]
2186        smlal           v25.4s,  v28.4h,  v0.h[3]
2187        srshl           v16.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
2188        // The intermediates from the horizontal pass fit in 16 bit without
2189        // any bias; we could just as well keep them as .4s, but narrowing
2190        // them to .4h gives a significant speedup on out of order cores
2191        // (at the cost of a smaller slowdown on in-order cores such as A53).
2192        xtn             v16.4h,  v16.4s
2193
2194        bl              L(\type\()_8tap_filter_4)
2195        mov             v17.8b,  v24.8b
2196        mov             v18.8b,  v25.8b
2197
21984:
2199        bl              L(\type\()_8tap_filter_4)
2200        smull           v2.4s,   v16.4h,  v1.h[0]
2201        smlal           v2.4s,   v17.4h,  v1.h[1]
2202        smlal           v2.4s,   v18.4h,  v1.h[2]
2203        smlal           v2.4s,   v24.4h,  v1.h[3]
2204        smull           v3.4s,   v17.4h,  v1.h[0]
2205        smlal           v3.4s,   v18.4h,  v1.h[1]
2206        smlal           v3.4s,   v24.4h,  v1.h[2]
2207        smlal           v3.4s,   v25.4h,  v1.h[3]
2208.ifc \type, put
2209        srshl           v2.4s,   v2.4s,   v29.4s // -(6+intermediate_bits)
2210        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
2211        sqxtun          v2.4h,   v2.4s
2212        sqxtun2         v2.8h,   v3.4s
2213        umin            v2.8h,   v2.8h,   v31.8h
2214.else
2215        rshrn           v2.4h,   v2.4s,   #6
2216        rshrn2          v2.8h,   v3.4s,   #6
2217        sub             v2.8h,   v2.8h,   v29.8h // PREP_BIAS
2218.endif
2219        subs            \h,  \h,  #2
2220
2221        st1             {v2.d}[0], [\dst], \d_strd
2222        st1             {v2.d}[1], [\ds2], \d_strd
2223        b.le            0f
2224        mov             v16.8b,  v18.8b
2225        mov             v17.8b,  v24.8b
2226        mov             v18.8b,  v25.8b
2227        b               4b
2228
2229480:    // 4x8, 4x16, 4x32 hv
2230        ld1             {v1.8b},  [\xmy]
2231        sub             \src, \src, #2
2232        sub             \sr2, \src, \s_strd, lsl #1
2233        sub             \src, \sr2, \s_strd
2234        add             \ds2, \dst, \d_strd
2235        lsl             \s_strd, \s_strd, #1
2236        lsl             \d_strd, \d_strd, #1
2237        sxtl            v0.8h,   v0.8b
2238        sxtl            v1.8h,   v1.8b
2239        mov             x15, x30
2240
2241        ld1             {v25.8h}, [\src], \s_strd
2242        ext             v26.16b, v25.16b, v25.16b, #2
2243        ext             v27.16b, v25.16b, v25.16b, #4
2244        ext             v28.16b, v25.16b, v25.16b, #6
2245        smull           v25.4s,  v25.4h,  v0.h[0]
2246        smlal           v25.4s,  v26.4h,  v0.h[1]
2247        smlal           v25.4s,  v27.4h,  v0.h[2]
2248        smlal           v25.4s,  v28.4h,  v0.h[3]
2249        srshl           v16.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
2250        // The intermediates from the horizontal pass fit in 16 bit without
2251        // any bias; we could just as well keep them as .4s, but narrowing
2252        // them to .4h gives a significant speedup on out of order cores
2253        // (at the cost of a smaller slowdown on in-order cores such as A53).
2254        xtn             v16.4h,  v16.4s
2255
2256        bl              L(\type\()_8tap_filter_4)
2257        mov             v17.8b,  v24.8b
2258        mov             v18.8b,  v25.8b
2259        bl              L(\type\()_8tap_filter_4)
2260        mov             v19.8b,  v24.8b
2261        mov             v20.8b,  v25.8b
2262        bl              L(\type\()_8tap_filter_4)
2263        mov             v21.8b,  v24.8b
2264        mov             v22.8b,  v25.8b
2265
226648:
2267        bl              L(\type\()_8tap_filter_4)
2268        smull           v3.4s,   v16.4h,  v1.h[0]
2269        smlal           v3.4s,   v17.4h,  v1.h[1]
2270        smlal           v3.4s,   v18.4h,  v1.h[2]
2271        smlal           v3.4s,   v19.4h,  v1.h[3]
2272        smlal           v3.4s,   v20.4h,  v1.h[4]
2273        smlal           v3.4s,   v21.4h,  v1.h[5]
2274        smlal           v3.4s,   v22.4h,  v1.h[6]
2275        smlal           v3.4s,   v24.4h,  v1.h[7]
2276        smull           v4.4s,   v17.4h,  v1.h[0]
2277        smlal           v4.4s,   v18.4h,  v1.h[1]
2278        smlal           v4.4s,   v19.4h,  v1.h[2]
2279        smlal           v4.4s,   v20.4h,  v1.h[3]
2280        smlal           v4.4s,   v21.4h,  v1.h[4]
2281        smlal           v4.4s,   v22.4h,  v1.h[5]
2282        smlal           v4.4s,   v24.4h,  v1.h[6]
2283        smlal           v4.4s,   v25.4h,  v1.h[7]
2284.ifc \type, put
2285        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
2286        srshl           v4.4s,   v4.4s,   v29.4s // -(6+intermediate_bits)
2287        sqxtun          v3.4h,   v3.4s
2288        sqxtun2         v3.8h,   v4.4s
2289        umin            v3.8h,   v3.8h,   v31.8h
2290.else
2291        rshrn           v3.4h,   v3.4s,   #6
2292        rshrn2          v3.8h,   v4.4s,   #6
2293        sub             v3.8h,   v3.8h,   v29.8h // PREP_BIAS
2294.endif
2295        subs            \h,  \h,  #2
2296        st1             {v3.d}[0], [\dst], \d_strd
2297        st1             {v3.d}[1], [\ds2], \d_strd
2298        b.le            0f
2299        mov             v16.8b,  v18.8b
2300        mov             v17.8b,  v19.8b
2301        mov             v18.8b,  v20.8b
2302        mov             v19.8b,  v21.8b
2303        mov             v20.8b,  v22.8b
2304        mov             v21.8b,  v24.8b
2305        mov             v22.8b,  v25.8b
2306        b               48b
23070:
2308        br              x15
2309
2310L(\type\()_8tap_filter_4):
2311        ld1             {v24.8h}, [\sr2], \s_strd
2312        ld1             {v25.8h}, [\src], \s_strd
2313        ext             v26.16b, v24.16b, v24.16b, #2
2314        ext             v27.16b, v24.16b, v24.16b, #4
2315        ext             v28.16b, v24.16b, v24.16b, #6
2316        smull           v24.4s,  v24.4h,  v0.h[0]
2317        smlal           v24.4s,  v26.4h,  v0.h[1]
2318        smlal           v24.4s,  v27.4h,  v0.h[2]
2319        smlal           v24.4s,  v28.4h,  v0.h[3]
2320        ext             v26.16b, v25.16b, v25.16b, #2
2321        ext             v27.16b, v25.16b, v25.16b, #4
2322        ext             v28.16b, v25.16b, v25.16b, #6
2323        smull           v25.4s,  v25.4h,  v0.h[0]
2324        smlal           v25.4s,  v26.4h,  v0.h[1]
2325        smlal           v25.4s,  v27.4h,  v0.h[2]
2326        smlal           v25.4s,  v28.4h,  v0.h[3]
2327        srshl           v24.4s,  v24.4s,  v30.4s // -(6-intermediate_bits)
2328        srshl           v25.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
2329        xtn             v24.4h,  v24.4s
2330        xtn             v25.4h,  v25.4s
2331        ret
2332
233380:
2334160:
2335320:
2336        b.gt            880f
2337        add             \xmy,  \xmy,  #2
2338        ld1             {v0.8b},  [\xmx]
2339        ld1             {v1.s}[0],  [\xmy]
2340        sub             \src,  \src,  #6
2341        sub             \src,  \src,  \s_strd
2342        sxtl            v0.8h,   v0.8b
2343        sxtl            v1.8h,   v1.8b
2344        mov             x15, x30
2345        mov             \my, \h
2346
2347164:    // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
2348        add             \ds2,  \dst,  \d_strd
2349        add             \sr2,  \src,  \s_strd
2350        lsl             \d_strd, \d_strd, #1
2351        lsl             \s_strd, \s_strd, #1
2352
2353        ld1             {v27.8h, v28.8h},  [\src], \s_strd
2354        smull           v24.4s,  v27.4h,  v0.h[0]
2355        smull2          v25.4s,  v27.8h,  v0.h[0]
2356.irpc i, 1234567
2357        ext             v26.16b, v27.16b, v28.16b, #(2*\i)
2358        smlal           v24.4s,  v26.4h,  v0.h[\i]
2359        smlal2          v25.4s,  v26.8h,  v0.h[\i]
2360.endr
2361        srshl           v24.4s,  v24.4s,  v30.4s // -(6-intermediate_bits)
2362        srshl           v25.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
2363        // The intermediates from the horizontal pass fit in 16 bit without
2364        // any bias; we could just as well keep them as .4s, but narrowing
2365        // them to .4h gives a significant speedup on out of order cores
2366        // (at the cost of a smaller slowdown on in-order cores such as A53),
2367        // and conserves register space (no need to clobber v8-v15).
2368        xtn             v16.4h,  v24.4s
2369        xtn2            v16.8h,  v25.4s
2370
2371        bl              L(\type\()_8tap_filter_8)
2372        mov             v17.16b, v23.16b
2373        mov             v18.16b, v24.16b
2374
23758:
2376        smull           v2.4s,   v16.4h,  v1.h[0]
2377        smull2          v3.4s,   v16.8h,  v1.h[0]
2378        bl              L(\type\()_8tap_filter_8)
2379        smull           v4.4s,   v17.4h,  v1.h[0]
2380        smull2          v5.4s,   v17.8h,  v1.h[0]
2381        smlal           v2.4s,   v17.4h,  v1.h[1]
2382        smlal2          v3.4s,   v17.8h,  v1.h[1]
2383        smlal           v4.4s,   v18.4h,  v1.h[1]
2384        smlal2          v5.4s,   v18.8h,  v1.h[1]
2385        smlal           v2.4s,   v18.4h,  v1.h[2]
2386        smlal2          v3.4s,   v18.8h,  v1.h[2]
2387        smlal           v4.4s,   v23.4h,  v1.h[2]
2388        smlal2          v5.4s,   v23.8h,  v1.h[2]
2389        smlal           v2.4s,   v23.4h,  v1.h[3]
2390        smlal2          v3.4s,   v23.8h,  v1.h[3]
2391        smlal           v4.4s,   v24.4h,  v1.h[3]
2392        smlal2          v5.4s,   v24.8h,  v1.h[3]
2393.ifc \type, put
2394        srshl           v2.4s,   v2.4s,   v29.4s // -(6+intermediate_bits)
2395        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
2396        srshl           v4.4s,   v4.4s,   v29.4s // -(6+intermediate_bits)
2397        srshl           v5.4s,   v5.4s,   v29.4s // -(6+intermediate_bits)
2398        sqxtun          v2.4h,   v2.4s
2399        sqxtun2         v2.8h,   v3.4s
2400        sqxtun          v3.4h,   v4.4s
2401        sqxtun2         v3.8h,   v5.4s
2402        umin            v2.8h,   v2.8h,   v31.8h
2403        umin            v3.8h,   v3.8h,   v31.8h
2404.else
2405        rshrn           v2.4h,   v2.4s,   #6
2406        rshrn2          v2.8h,   v3.4s,   #6
2407        rshrn           v3.4h,   v4.4s,   #6
2408        rshrn2          v3.8h,   v5.4s,   #6
2409        sub             v2.8h,   v2.8h,   v29.8h // PREP_BIAS
2410        sub             v3.8h,   v3.8h,   v29.8h // PREP_BIAS
2411.endif
2412        subs            \h,  \h,  #2
2413        st1             {v2.8h}, [\dst], \d_strd
2414        st1             {v3.8h}, [\ds2], \d_strd
2415        b.le            9f
2416        mov             v16.16b, v18.16b
2417        mov             v17.16b, v23.16b
2418        mov             v18.16b, v24.16b
2419        b               8b
24209:
2421        subs            \w,  \w,  #8
2422        b.le            0f
2423        asr             \s_strd,  \s_strd,  #1
2424        asr             \d_strd,  \d_strd,  #1
2425        msub            \src,  \s_strd,  \xmy,  \src
2426        msub            \dst,  \d_strd,  \xmy,  \dst
2427        sub             \src,  \src,  \s_strd,  lsl #2
2428        mov             \h,  \my
2429        add             \src,  \src,  #16
2430        add             \dst,  \dst,  #16
2431        b               164b
2432
2433880:    // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
2434640:
24351280:
2436        ld1             {v0.8b},  [\xmx]
2437        ld1             {v1.8b},  [\xmy]
2438        sub             \src,  \src,  #6
2439        sub             \src,  \src,  \s_strd
2440        sub             \src,  \src,  \s_strd, lsl #1
2441        sxtl            v0.8h,   v0.8b
2442        sxtl            v1.8h,   v1.8b
2443        mov             x15, x30
2444        mov             \my, \h
2445
2446168:
2447        add             \ds2,  \dst,  \d_strd
2448        add             \sr2,  \src,  \s_strd
2449        lsl             \d_strd, \d_strd, #1
2450        lsl             \s_strd, \s_strd, #1
2451
2452        ld1             {v27.8h, v28.8h},  [\src], \s_strd
2453        smull           v24.4s,  v27.4h,  v0.h[0]
2454        smull2          v25.4s,  v27.8h,  v0.h[0]
2455.irpc i, 1234567
2456        ext             v26.16b, v27.16b, v28.16b, #(2*\i)
2457        smlal           v24.4s,  v26.4h,  v0.h[\i]
2458        smlal2          v25.4s,  v26.8h,  v0.h[\i]
2459.endr
2460        srshl           v24.4s,  v24.4s,  v30.4s // -(6-intermediate_bits)
2461        srshl           v25.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
2462        // The intermediates from the horizontal pass fit in 16 bit without
2463        // any bias; we could just as well keep them as .4s, but narrowing
2464        // them to .4h gives a significant speedup on out of order cores
2465        // (at the cost of a smaller slowdown on in-order cores such as A53),
2466        // and conserves register space (no need to clobber v8-v15).
2467        xtn             v16.4h,  v24.4s
2468        xtn2            v16.8h,  v25.4s
2469
2470        bl              L(\type\()_8tap_filter_8)
2471        mov             v17.16b, v23.16b
2472        mov             v18.16b, v24.16b
2473        bl              L(\type\()_8tap_filter_8)
2474        mov             v19.16b, v23.16b
2475        mov             v20.16b, v24.16b
2476        bl              L(\type\()_8tap_filter_8)
2477        mov             v21.16b, v23.16b
2478        mov             v22.16b, v24.16b
2479
248088:
2481        smull           v2.4s,   v16.4h,  v1.h[0]
2482        smull2          v3.4s,   v16.8h,  v1.h[0]
2483        bl              L(\type\()_8tap_filter_8)
2484        smull           v4.4s,   v17.4h,  v1.h[0]
2485        smull2          v5.4s,   v17.8h,  v1.h[0]
2486        smlal           v2.4s,   v17.4h,  v1.h[1]
2487        smlal2          v3.4s,   v17.8h,  v1.h[1]
2488        smlal           v4.4s,   v18.4h,  v1.h[1]
2489        smlal2          v5.4s,   v18.8h,  v1.h[1]
2490        smlal           v2.4s,   v18.4h,  v1.h[2]
2491        smlal2          v3.4s,   v18.8h,  v1.h[2]
2492        smlal           v4.4s,   v19.4h,  v1.h[2]
2493        smlal2          v5.4s,   v19.8h,  v1.h[2]
2494        smlal           v2.4s,   v19.4h,  v1.h[3]
2495        smlal2          v3.4s,   v19.8h,  v1.h[3]
2496        smlal           v4.4s,   v20.4h,  v1.h[3]
2497        smlal2          v5.4s,   v20.8h,  v1.h[3]
2498        smlal           v2.4s,   v20.4h,  v1.h[4]
2499        smlal2          v3.4s,   v20.8h,  v1.h[4]
2500        smlal           v4.4s,   v21.4h,  v1.h[4]
2501        smlal2          v5.4s,   v21.8h,  v1.h[4]
2502        smlal           v2.4s,   v21.4h,  v1.h[5]
2503        smlal2          v3.4s,   v21.8h,  v1.h[5]
2504        smlal           v4.4s,   v22.4h,  v1.h[5]
2505        smlal2          v5.4s,   v22.8h,  v1.h[5]
2506        smlal           v2.4s,   v22.4h,  v1.h[6]
2507        smlal2          v3.4s,   v22.8h,  v1.h[6]
2508        smlal           v4.4s,   v23.4h,  v1.h[6]
2509        smlal2          v5.4s,   v23.8h,  v1.h[6]
2510        smlal           v2.4s,   v23.4h,  v1.h[7]
2511        smlal2          v3.4s,   v23.8h,  v1.h[7]
2512        smlal           v4.4s,   v24.4h,  v1.h[7]
2513        smlal2          v5.4s,   v24.8h,  v1.h[7]
2514.ifc \type, put
2515        srshl           v2.4s,   v2.4s,   v29.4s // -(6+intermediate_bits)
2516        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
2517        srshl           v4.4s,   v4.4s,   v29.4s // -(6+intermediate_bits)
2518        srshl           v5.4s,   v5.4s,   v29.4s // -(6+intermediate_bits)
2519        sqxtun          v2.4h,   v2.4s
2520        sqxtun2         v2.8h,   v3.4s
2521        sqxtun          v3.4h,   v4.4s
2522        sqxtun2         v3.8h,   v5.4s
2523        umin            v2.8h,   v2.8h,   v31.8h
2524        umin            v3.8h,   v3.8h,   v31.8h
2525.else
2526        rshrn           v2.4h,   v2.4s,   #6
2527        rshrn2          v2.8h,   v3.4s,   #6
2528        rshrn           v3.4h,   v4.4s,   #6
2529        rshrn2          v3.8h,   v5.4s,   #6
2530        sub             v2.8h,   v2.8h,   v29.8h // PREP_BIAS
2531        sub             v3.8h,   v3.8h,   v29.8h // PREP_BIAS
2532.endif
2533        subs            \h,  \h,  #2
2534        st1             {v2.8h}, [\dst], \d_strd
2535        st1             {v3.8h}, [\ds2], \d_strd
2536        b.le            9f
2537        mov             v16.16b, v18.16b
2538        mov             v17.16b, v19.16b
2539        mov             v18.16b, v20.16b
2540        mov             v19.16b, v21.16b
2541        mov             v20.16b, v22.16b
2542        mov             v21.16b, v23.16b
2543        mov             v22.16b, v24.16b
2544        b               88b
25459:
2546        subs            \w,  \w,  #8
2547        b.le            0f
2548        asr             \s_strd,  \s_strd,  #1
2549        asr             \d_strd,  \d_strd,  #1
2550        msub            \src,  \s_strd,  \xmy,  \src
2551        msub            \dst,  \d_strd,  \xmy,  \dst
2552        sub             \src,  \src,  \s_strd,  lsl #3
2553        mov             \h,  \my
2554        add             \src,  \src,  #16
2555        add             \dst,  \dst,  #16
2556        b               168b
25570:
2558        br              x15
2559
2560L(\type\()_8tap_filter_8):
2561        ld1             {v4.8h, v5.8h},  [\sr2], \s_strd
2562        ld1             {v6.8h, v7.8h},  [\src], \s_strd
2563        smull           v25.4s,  v4.4h,   v0.h[0]
2564        smull2          v26.4s,  v4.8h,   v0.h[0]
2565        smull           v27.4s,  v6.4h,   v0.h[0]
2566        smull2          v28.4s,  v6.8h,   v0.h[0]
2567.irpc i, 1234567
2568        ext             v23.16b, v4.16b,  v5.16b,  #(2*\i)
2569        ext             v24.16b, v6.16b,  v7.16b,  #(2*\i)
2570        smlal           v25.4s,  v23.4h,  v0.h[\i]
2571        smlal2          v26.4s,  v23.8h,  v0.h[\i]
2572        smlal           v27.4s,  v24.4h,  v0.h[\i]
2573        smlal2          v28.4s,  v24.8h,  v0.h[\i]
2574.endr
2575        srshl           v25.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
2576        srshl           v26.4s,  v26.4s,  v30.4s // -(6-intermediate_bits)
2577        srshl           v27.4s,  v27.4s,  v30.4s // -(6-intermediate_bits)
2578        srshl           v28.4s,  v28.4s,  v30.4s // -(6-intermediate_bits)
2579        xtn             v23.4h,  v25.4s
2580        xtn2            v23.8h,  v26.4s
2581        xtn             v24.4h,  v27.4s
2582        xtn2            v24.8h,  v28.4s
2583        ret
2584
2585L(\type\()_8tap_hv_tbl):
2586        .hword L(\type\()_8tap_hv_tbl) - 1280b
2587        .hword L(\type\()_8tap_hv_tbl) -  640b
2588        .hword L(\type\()_8tap_hv_tbl) -  320b
2589        .hword L(\type\()_8tap_hv_tbl) -  160b
2590        .hword L(\type\()_8tap_hv_tbl) -   80b
2591        .hword L(\type\()_8tap_hv_tbl) -   40b
2592        .hword L(\type\()_8tap_hv_tbl) -   20b
2593        .hword 0
2594endfunc
2595
2596
2597function \type\()_bilin_16bpc_neon, export=1
2598.ifc \bdmax, w8
2599        ldr             w8,  [sp]
2600.endif
2601        dup             v1.8h,   \mx
2602        dup             v3.8h,   \my
2603        mov             w10, #16
2604        sub             w9,  w10, \mx
2605        sub             w10, w10, \my
2606        dup             v0.8h,   w9
2607        dup             v2.8h,   w10
2608.ifc \type, prep
2609        uxtw            \d_strd, \w
2610        lsl             \d_strd, \d_strd, #1
2611.endif
2612
2613        clz             \bdmax,   \bdmax       // bitdepth_max
2614        clz             w9,  \w
2615        sub             \bdmax,   \bdmax,  #18 // intermediate_bits = clz(bitdepth_max) - 18
2616        mov             w11, #4
2617        sub             w9,  w9,  #24
2618        sub             w11, w11, \bdmax  // 4 - intermediate_bits
2619        add             w12, \bdmax, #4   // 4 + intermediate_bits
2620        cbnz            \mx, L(\type\()_bilin_h)
2621        cbnz            \my, L(\type\()_bilin_v)
2622        b               \type\()_neon
2623
2624L(\type\()_bilin_h):
2625        cbnz            \my, L(\type\()_bilin_hv)
2626
2627        adr             x10, L(\type\()_bilin_h_tbl)
2628        dup             v31.8h,  w11      // 4 - intermediate_bits
2629        ldrh            w9,  [x10, x9, lsl #1]
2630        neg             v31.8h,  v31.8h   // -(4-intermediate_bits)
2631.ifc \type, put
2632        dup             v30.8h,  \bdmax   // intermediate_bits
2633.else
2634        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
2635.endif
2636        sub             x10, x10, w9, uxtw
2637.ifc \type, put
2638        neg             v30.8h,  v30.8h   // -intermediate_bits
2639.endif
2640        br              x10
2641
264220:     // 2xN h
2643.ifc \type, put
2644        add             \ds2,  \dst,  \d_strd
2645        add             \sr2,  \src,  \s_strd
2646        lsl             \d_strd,  \d_strd,  #1
2647        lsl             \s_strd,  \s_strd,  #1
26482:
2649        ld1             {v4.4h},  [\src], \s_strd
2650        ld1             {v6.4h},  [\sr2], \s_strd
2651        ext             v5.8b,   v4.8b,   v4.8b,   #2
2652        ext             v7.8b,   v6.8b,   v6.8b,   #2
2653        trn1            v4.2s,   v4.2s,   v6.2s
2654        trn1            v5.2s,   v5.2s,   v7.2s
2655        subs            \h,  \h,  #2
2656        mul             v4.4h,   v4.4h,   v0.4h
2657        mla             v4.4h,   v5.4h,   v1.4h
2658        urshl           v4.4h,   v4.4h,   v31.4h
2659        urshl           v4.4h,   v4.4h,   v30.4h
2660        st1             {v4.s}[0], [\dst], \d_strd
2661        st1             {v4.s}[1], [\ds2], \d_strd
2662        b.gt            2b
2663        ret
2664.endif
2665
266640:     // 4xN h
2667        add             \ds2,  \dst,  \d_strd
2668        add             \sr2,  \src,  \s_strd
2669        lsl             \d_strd,  \d_strd,  #1
2670        lsl             \s_strd,  \s_strd,  #1
26714:
2672        ld1             {v4.8h}, [\src], \s_strd
2673        ld1             {v6.8h}, [\sr2], \s_strd
2674        ext             v5.16b,  v4.16b,  v4.16b,  #2
2675        ext             v7.16b,  v6.16b,  v6.16b,  #2
2676        trn1            v4.2d,   v4.2d,   v6.2d
2677        trn1            v5.2d,   v5.2d,   v7.2d
2678        subs            \h,  \h,  #2
2679        mul             v4.8h,   v4.8h,   v0.8h
2680        mla             v4.8h,   v5.8h,   v1.8h
2681        urshl           v4.8h,   v4.8h,   v31.8h
2682.ifc \type, put
2683        urshl           v4.8h,   v4.8h,   v30.8h
2684.else
2685        sub             v4.8h,   v4.8h,   v29.8h
2686.endif
2687        st1             {v4.d}[0], [\dst], \d_strd
2688        st1             {v4.d}[1], [\ds2], \d_strd
2689        b.gt            4b
2690        ret
2691
269280:     // 8xN h
2693        add             \ds2,  \dst,  \d_strd
2694        add             \sr2,  \src,  \s_strd
2695        lsl             \d_strd,  \d_strd,  #1
2696        lsl             \s_strd,  \s_strd,  #1
26978:
2698        ldr             h5,  [\src, #16]
2699        ldr             h7,  [\sr2, #16]
2700        ld1             {v4.8h}, [\src], \s_strd
2701        ld1             {v6.8h}, [\sr2], \s_strd
2702        ext             v5.16b,  v4.16b,  v5.16b,  #2
2703        ext             v7.16b,  v6.16b,  v7.16b,  #2
2704        subs            \h,  \h,  #2
2705        mul             v4.8h,   v4.8h,   v0.8h
2706        mla             v4.8h,   v5.8h,   v1.8h
2707        mul             v6.8h,   v6.8h,   v0.8h
2708        mla             v6.8h,   v7.8h,   v1.8h
2709        urshl           v4.8h,   v4.8h,   v31.8h
2710        urshl           v6.8h,   v6.8h,   v31.8h
2711.ifc \type, put
2712        urshl           v4.8h,   v4.8h,   v30.8h
2713        urshl           v6.8h,   v6.8h,   v30.8h
2714.else
2715        sub             v4.8h,   v4.8h,   v29.8h
2716        sub             v6.8h,   v6.8h,   v29.8h
2717.endif
2718        st1             {v4.8h}, [\dst], \d_strd
2719        st1             {v6.8h}, [\ds2], \d_strd
2720        b.gt            8b
2721        ret
2722160:
2723320:
2724640:
27251280:   // 16xN, 32xN, ... h
2726        add             \ds2,  \dst,  \d_strd
2727        add             \sr2,  \src,  \s_strd
2728        lsl             \s_strd,  \s_strd,  #1
2729
2730        sub             \s_strd,  \s_strd,  \w, uxtw #1
2731        sub             \s_strd,  \s_strd,  #16
2732.ifc \type, put
2733        lsl             \d_strd,  \d_strd,  #1
2734        sub             \d_strd,  \d_strd,  \w, uxtw #1
2735.endif
2736161:
2737        ld1             {v16.8h},  [\src], #16
2738        ld1             {v21.8h},  [\sr2], #16
2739        mov             \mx, \w
2740
274116:
2742        ld1             {v17.8h, v18.8h},  [\src], #32
2743        ld1             {v22.8h, v23.8h},  [\sr2], #32
2744        ext             v19.16b, v16.16b, v17.16b, #2
2745        ext             v20.16b, v17.16b, v18.16b, #2
2746        ext             v24.16b, v21.16b, v22.16b, #2
2747        ext             v25.16b, v22.16b, v23.16b, #2
2748        mul             v16.8h,  v16.8h,  v0.8h
2749        mla             v16.8h,  v19.8h,  v1.8h
2750        mul             v17.8h,  v17.8h,  v0.8h
2751        mla             v17.8h,  v20.8h,  v1.8h
2752        mul             v21.8h,  v21.8h,  v0.8h
2753        mla             v21.8h,  v24.8h,  v1.8h
2754        mul             v22.8h,  v22.8h,  v0.8h
2755        mla             v22.8h,  v25.8h,  v1.8h
2756        urshl           v16.8h,  v16.8h,  v31.8h
2757        urshl           v17.8h,  v17.8h,  v31.8h
2758        urshl           v21.8h,  v21.8h,  v31.8h
2759        urshl           v22.8h,  v22.8h,  v31.8h
2760        subs            \mx, \mx, #16
2761.ifc \type, put
2762        urshl           v16.8h,  v16.8h,  v30.8h
2763        urshl           v17.8h,  v17.8h,  v30.8h
2764        urshl           v21.8h,  v21.8h,  v30.8h
2765        urshl           v22.8h,  v22.8h,  v30.8h
2766.else
2767        sub             v16.8h,  v16.8h,  v29.8h
2768        sub             v17.8h,  v17.8h,  v29.8h
2769        sub             v21.8h,  v21.8h,  v29.8h
2770        sub             v22.8h,  v22.8h,  v29.8h
2771.endif
2772        st1             {v16.8h, v17.8h}, [\dst], #32
2773        st1             {v21.8h, v22.8h}, [\ds2], #32
2774        b.le            9f
2775
2776        mov             v16.16b, v18.16b
2777        mov             v21.16b, v23.16b
2778        b               16b
2779
27809:
2781        add             \dst,  \dst,  \d_strd
2782        add             \ds2,  \ds2,  \d_strd
2783        add             \src,  \src,  \s_strd
2784        add             \sr2,  \sr2,  \s_strd
2785
2786        subs            \h,  \h,  #2
2787        b.gt            161b
2788        ret
2789
2790L(\type\()_bilin_h_tbl):
2791        .hword L(\type\()_bilin_h_tbl) - 1280b
2792        .hword L(\type\()_bilin_h_tbl) -  640b
2793        .hword L(\type\()_bilin_h_tbl) -  320b
2794        .hword L(\type\()_bilin_h_tbl) -  160b
2795        .hword L(\type\()_bilin_h_tbl) -   80b
2796        .hword L(\type\()_bilin_h_tbl) -   40b
2797        .hword L(\type\()_bilin_h_tbl) -   20b
2798        .hword 0
2799
2800
2801L(\type\()_bilin_v):
2802        cmp             \h,  #4
2803        adr             x10, L(\type\()_bilin_v_tbl)
2804.ifc \type, prep
2805        dup             v31.8h,  w11      // 4 - intermediate_bits
2806.endif
2807        ldrh            w9,  [x10, x9, lsl #1]
2808.ifc \type, prep
2809        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
2810        neg             v31.8h,  v31.8h   // -(4-intermediate_bits)
2811.endif
2812        sub             x10, x10, w9, uxtw
2813        br              x10
2814
281520:     // 2xN v
2816.ifc \type, put
2817        cmp             \h,  #2
2818        add             \ds2,  \dst,  \d_strd
2819        add             \sr2,  \src,  \s_strd
2820        lsl             \s_strd,  \s_strd,  #1
2821        lsl             \d_strd,  \d_strd,  #1
2822
2823        // 2x2 v
2824        ld1             {v16.s}[0], [\src], \s_strd
2825        b.gt            24f
2826        ld1             {v17.s}[0], [\sr2], \s_strd
2827        ld1             {v18.s}[0], [\src], \s_strd
2828        trn1            v16.2s,  v16.2s,  v17.2s
2829        trn1            v17.2s,  v17.2s,  v18.2s
2830        mul             v4.4h,   v16.4h,  v2.4h
2831        mla             v4.4h,   v17.4h,  v3.4h
2832        urshr           v4.8h,   v4.8h,   #4
2833        st1             {v4.s}[0], [\dst]
2834        st1             {v4.s}[1], [\ds2]
2835        ret
283624:     // 2x4, 2x8, ... v
2837        ld1             {v17.s}[0], [\sr2], \s_strd
2838        ld1             {v18.s}[0], [\src], \s_strd
2839        ld1             {v19.s}[0], [\sr2], \s_strd
2840        ld1             {v20.s}[0], [\src], \s_strd
2841        trn1            v16.2s,  v16.2s,  v17.2s
2842        trn1            v17.2s,  v17.2s,  v18.2s
2843        trn1            v18.2s,  v18.2s,  v19.2s
2844        trn1            v19.2s,  v19.2s,  v20.2s
2845        trn1            v16.2d,  v16.2d,  v18.2d
2846        trn1            v17.2d,  v17.2d,  v19.2d
2847        mul             v4.8h,   v16.8h,  v2.8h
2848        mla             v4.8h,   v17.8h,  v3.8h
2849        subs            \h,  \h,  #4
2850        urshr           v4.8h,   v4.8h,   #4
2851        st1             {v4.s}[0], [\dst], \d_strd
2852        st1             {v4.s}[1], [\ds2], \d_strd
2853        st1             {v4.s}[2], [\dst], \d_strd
2854        st1             {v4.s}[3], [\ds2], \d_strd
2855        b.le            0f
2856        mov             v16.8b,  v20.8b
2857        b               24b
28580:
2859        ret
2860.endif
2861
286240:     // 4xN v
2863        add             \ds2,  \dst,  \d_strd
2864        add             \sr2,  \src,  \s_strd
2865        lsl             \s_strd,  \s_strd,  #1
2866        lsl             \d_strd,  \d_strd,  #1
2867        ld1             {v16.4h}, [\src], \s_strd
28684:
2869        ld1             {v17.4h}, [\sr2], \s_strd
2870        ld1             {v18.4h}, [\src], \s_strd
2871        trn1            v16.2d,  v16.2d,  v17.2d
2872        trn1            v17.2d,  v17.2d,  v18.2d
2873        mul             v4.8h,   v16.8h,  v2.8h
2874        mla             v4.8h,   v17.8h,  v3.8h
2875        subs            \h,  \h,  #2
2876.ifc \type, put
2877        urshr           v4.8h,   v4.8h,   #4
2878.else
2879        urshl           v4.8h,   v4.8h,   v31.8h
2880        sub             v4.8h,   v4.8h,   v29.8h
2881.endif
2882        st1             {v4.d}[0], [\dst], \d_strd
2883        st1             {v4.d}[1], [\ds2], \d_strd
2884        b.le            0f
2885        mov             v16.8b,  v18.8b
2886        b               4b
28870:
2888        ret
2889
289080:     // 8xN v
2891        add             \ds2,  \dst,  \d_strd
2892        add             \sr2,  \src,  \s_strd
2893        lsl             \s_strd,  \s_strd,  #1
2894        lsl             \d_strd,  \d_strd,  #1
2895        ld1             {v16.8h}, [\src], \s_strd
28968:
2897        ld1             {v17.8h}, [\sr2], \s_strd
2898        ld1             {v18.8h}, [\src], \s_strd
2899        mul             v4.8h,   v16.8h,  v2.8h
2900        mla             v4.8h,   v17.8h,  v3.8h
2901        mul             v5.8h,   v17.8h,  v2.8h
2902        mla             v5.8h,   v18.8h,  v3.8h
2903        subs            \h,  \h,  #2
2904.ifc \type, put
2905        urshr           v4.8h,   v4.8h,   #4
2906        urshr           v5.8h,   v5.8h,   #4
2907.else
2908        urshl           v4.8h,   v4.8h,   v31.8h
2909        urshl           v5.8h,   v5.8h,   v31.8h
2910        sub             v4.8h,   v4.8h,   v29.8h
2911        sub             v5.8h,   v5.8h,   v29.8h
2912.endif
2913        st1             {v4.8h}, [\dst], \d_strd
2914        st1             {v5.8h}, [\ds2], \d_strd
2915        b.le            0f
2916        mov             v16.16b, v18.16b
2917        b               8b
29180:
2919        ret
2920
2921160:    // 16xN, 32xN, ...
2922320:
2923640:
29241280:
2925        mov             \my, \h
29261:
2927        add             \ds2, \dst, \d_strd
2928        add             \sr2, \src, \s_strd
2929        lsl             \s_strd, \s_strd, #1
2930        lsl             \d_strd, \d_strd, #1
2931
2932        ld1             {v16.8h, v17.8h}, [\src], \s_strd
29332:
2934        ld1             {v18.8h, v19.8h}, [\sr2], \s_strd
2935        ld1             {v20.8h, v21.8h}, [\src], \s_strd
2936        mul             v4.8h,   v16.8h,  v2.8h
2937        mla             v4.8h,   v18.8h,  v3.8h
2938        mul             v5.8h,   v17.8h,  v2.8h
2939        mla             v5.8h,   v19.8h,  v3.8h
2940        mul             v6.8h,   v18.8h,  v2.8h
2941        mla             v6.8h,   v20.8h,  v3.8h
2942        mul             v7.8h,   v19.8h,  v2.8h
2943        mla             v7.8h,   v21.8h,  v3.8h
2944        subs            \h,  \h,  #2
2945.ifc \type, put
2946        urshr           v4.8h,   v4.8h,   #4
2947        urshr           v5.8h,   v5.8h,   #4
2948        urshr           v6.8h,   v6.8h,   #4
2949        urshr           v7.8h,   v7.8h,   #4
2950.else
2951        urshl           v4.8h,   v4.8h,   v31.8h
2952        urshl           v5.8h,   v5.8h,   v31.8h
2953        urshl           v6.8h,   v6.8h,   v31.8h
2954        urshl           v7.8h,   v7.8h,   v31.8h
2955        sub             v4.8h,   v4.8h,   v29.8h
2956        sub             v5.8h,   v5.8h,   v29.8h
2957        sub             v6.8h,   v6.8h,   v29.8h
2958        sub             v7.8h,   v7.8h,   v29.8h
2959.endif
2960        st1             {v4.8h, v5.8h}, [\dst], \d_strd
2961        st1             {v6.8h, v7.8h}, [\ds2], \d_strd
2962        b.le            9f
2963        mov             v16.16b, v20.16b
2964        mov             v17.16b, v21.16b
2965        b               2b
29669:
2967        subs            \w,  \w,  #16
2968        b.le            0f
2969        asr             \s_strd, \s_strd, #1
2970        asr             \d_strd, \d_strd, #1
2971        msub            \src, \s_strd, \xmy, \src
2972        msub            \dst, \d_strd, \xmy, \dst
2973        sub             \src, \src, \s_strd, lsl #1
2974        mov             \h,  \my
2975        add             \src, \src, #32
2976        add             \dst, \dst, #32
2977        b               1b
29780:
2979        ret
2980
2981L(\type\()_bilin_v_tbl):
2982        .hword L(\type\()_bilin_v_tbl) - 1280b
2983        .hword L(\type\()_bilin_v_tbl) -  640b
2984        .hword L(\type\()_bilin_v_tbl) -  320b
2985        .hword L(\type\()_bilin_v_tbl) -  160b
2986        .hword L(\type\()_bilin_v_tbl) -   80b
2987        .hword L(\type\()_bilin_v_tbl) -   40b
2988        .hword L(\type\()_bilin_v_tbl) -   20b
2989        .hword 0
2990
2991L(\type\()_bilin_hv):
2992        adr             x10, L(\type\()_bilin_hv_tbl)
2993        dup             v31.8h,  w11      // 4 - intermediate_bits
2994        ldrh            w9,  [x10, x9, lsl #1]
2995        neg             v31.8h,  v31.8h   // -(4-intermediate_bits)
2996.ifc \type, put
2997        dup             v30.4s,  w12      // 4 + intermediate_bits
2998.else
2999        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
3000.endif
3001        sub             x10, x10, w9, uxtw
3002.ifc \type, put
3003        neg             v30.4s,  v30.4s   // -(4+intermediate_bits)
3004.endif
3005        br              x10
3006
300720:     // 2xN hv
3008.ifc \type, put
3009        add             \sr2, \src, \s_strd
3010        add             \ds2, \dst, \d_strd
3011        lsl             \s_strd, \s_strd, #1
3012        lsl             \d_strd, \d_strd, #1
3013
3014        ld1             {v20.4h},  [\src], \s_strd
3015        ext             v21.8b,  v20.8b,  v20.8b,  #2
3016        mul             v16.4h,  v20.4h,  v0.4h
3017        mla             v16.4h,  v21.4h,  v1.4h
3018        urshl           v16.4h,  v16.4h,  v31.4h
3019
30202:
3021        ld1             {v22.4h},  [\sr2], \s_strd
3022        ld1             {v24.4h},  [\src], \s_strd
3023        ext             v23.8b,  v22.8b,  v22.8b,  #2
3024        ext             v25.8b,  v24.8b,  v24.8b,  #2
3025        trn1            v22.2s,  v22.2s,  v24.2s
3026        trn1            v23.2s,  v23.2s,  v25.2s
3027        mul             v17.4h,  v22.4h,  v0.4h
3028        mla             v17.4h,  v23.4h,  v1.4h
3029        urshl           v17.4h,  v17.4h,  v31.4h
3030
3031        trn1            v16.2s,  v16.2s,  v17.2s
3032
3033        umull           v4.4s,   v16.4h,  v2.4h
3034        umlal           v4.4s,   v17.4h,  v3.4h
3035        urshl           v4.4s,   v4.4s,   v30.4s
3036        xtn             v4.4h,   v4.4s
3037        subs            \h,  \h,  #2
3038        st1             {v4.s}[0], [\dst], \d_strd
3039        st1             {v4.s}[1], [\ds2], \d_strd
3040        b.le            0f
3041        trn2            v16.2s,  v17.2s,  v17.2s
3042        b               2b
30430:
3044        ret
3045.endif
3046
304740:     // 4xN hv
3048        add             \sr2, \src, \s_strd
3049        add             \ds2, \dst, \d_strd
3050        lsl             \s_strd, \s_strd, #1
3051        lsl             \d_strd, \d_strd, #1
3052
3053        ld1             {v20.8h},  [\src], \s_strd
3054        ext             v21.16b, v20.16b, v20.16b, #2
3055        mul             v16.4h,  v20.4h,  v0.4h
3056        mla             v16.4h,  v21.4h,  v1.4h
3057        urshl           v16.4h,  v16.4h,  v31.4h
3058
30594:
3060        ld1             {v22.8h},  [\sr2], \s_strd
3061        ld1             {v24.8h},  [\src], \s_strd
3062        ext             v23.16b, v22.16b, v22.16b, #2
3063        ext             v25.16b, v24.16b, v24.16b, #2
3064        trn1            v22.2d,  v22.2d,  v24.2d
3065        trn1            v23.2d,  v23.2d,  v25.2d
3066        mul             v17.8h,  v22.8h,  v0.8h
3067        mla             v17.8h,  v23.8h,  v1.8h
3068        urshl           v17.8h,  v17.8h,  v31.8h
3069
3070        trn1            v16.2d,  v16.2d,  v17.2d
3071
3072        umull           v4.4s,   v16.4h,  v2.4h
3073        umlal           v4.4s,   v17.4h,  v3.4h
3074        umull2          v5.4s,   v16.8h,  v2.8h
3075        umlal2          v5.4s,   v17.8h,  v3.8h
3076.ifc \type, put
3077        urshl           v4.4s,   v4.4s,   v30.4s
3078        urshl           v5.4s,   v5.4s,   v30.4s
3079        xtn             v4.4h,   v4.4s
3080        xtn2            v4.8h,   v5.4s
3081.else
3082        rshrn           v4.4h,   v4.4s,   #4
3083        rshrn2          v4.8h,   v5.4s,   #4
3084        sub             v4.8h,   v4.8h,   v29.8h
3085.endif
3086        subs            \h,  \h,  #2
3087        st1             {v4.d}[0], [\dst], \d_strd
3088        st1             {v4.d}[1], [\ds2], \d_strd
3089        b.le            0f
3090        trn2            v16.2d,  v17.2d,  v17.2d
3091        b               4b
30920:
3093        ret
3094
309580:     // 8xN, 16xN, ... hv
3096160:
3097320:
3098640:
30991280:
3100        mov             \my, \h
3101
31021:
3103        add             \sr2, \src, \s_strd
3104        add             \ds2, \dst, \d_strd
3105        lsl             \s_strd, \s_strd, #1
3106        lsl             \d_strd, \d_strd, #1
3107
3108        ldr             h21, [\src, #16]
3109        ld1             {v20.8h},  [\src], \s_strd
3110        ext             v21.16b, v20.16b, v21.16b, #2
3111        mul             v16.8h,  v20.8h,  v0.8h
3112        mla             v16.8h,  v21.8h,  v1.8h
3113        urshl           v16.8h,  v16.8h,  v31.8h
3114
31152:
3116        ldr             h23, [\sr2, #16]
3117        ld1             {v22.8h},  [\sr2], \s_strd
3118        ldr             h25, [\src, #16]
3119        ld1             {v24.8h},  [\src], \s_strd
3120        ext             v23.16b, v22.16b, v23.16b, #2
3121        ext             v25.16b, v24.16b, v25.16b, #2
3122        mul             v17.8h,  v22.8h,  v0.8h
3123        mla             v17.8h,  v23.8h,  v1.8h
3124        mul             v18.8h,  v24.8h,  v0.8h
3125        mla             v18.8h,  v25.8h,  v1.8h
3126        urshl           v17.8h,  v17.8h,  v31.8h
3127        urshl           v18.8h,  v18.8h,  v31.8h
3128
3129        umull           v4.4s,   v16.4h,  v2.4h
3130        umlal           v4.4s,   v17.4h,  v3.4h
3131        umull2          v5.4s,   v16.8h,  v2.8h
3132        umlal2          v5.4s,   v17.8h,  v3.8h
3133        umull           v6.4s,   v17.4h,  v2.4h
3134        umlal           v6.4s,   v18.4h,  v3.4h
3135        umull2          v7.4s,   v17.8h,  v2.8h
3136        umlal2          v7.4s,   v18.8h,  v3.8h
3137.ifc \type, put
3138        urshl           v4.4s,   v4.4s,   v30.4s
3139        urshl           v5.4s,   v5.4s,   v30.4s
3140        urshl           v6.4s,   v6.4s,   v30.4s
3141        urshl           v7.4s,   v7.4s,   v30.4s
3142        xtn             v4.4h,   v4.4s
3143        xtn2            v4.8h,   v5.4s
3144        xtn             v5.4h,   v6.4s
3145        xtn2            v5.8h,   v7.4s
3146.else
3147        rshrn           v4.4h,   v4.4s,   #4
3148        rshrn2          v4.8h,   v5.4s,   #4
3149        rshrn           v5.4h,   v6.4s,   #4
3150        rshrn2          v5.8h,   v7.4s,   #4
3151        sub             v4.8h,   v4.8h,   v29.8h
3152        sub             v5.8h,   v5.8h,   v29.8h
3153.endif
3154        subs            \h,  \h,  #2
3155        st1             {v4.8h}, [\dst], \d_strd
3156        st1             {v5.8h}, [\ds2], \d_strd
3157        b.le            9f
3158        mov             v16.16b, v18.16b
3159        b               2b
31609:
3161        subs            \w,  \w,  #8
3162        b.le            0f
3163        asr             \s_strd,  \s_strd,  #1
3164        asr             \d_strd,  \d_strd,  #1
3165        msub            \src,  \s_strd,  \xmy,  \src
3166        msub            \dst,  \d_strd,  \xmy,  \dst
3167        sub             \src,  \src,  \s_strd,  lsl #1
3168        mov             \h,  \my
3169        add             \src,  \src,  #16
3170        add             \dst,  \dst,  #16
3171        b               1b
31720:
3173        ret
3174
3175L(\type\()_bilin_hv_tbl):
3176        .hword L(\type\()_bilin_hv_tbl) - 1280b
3177        .hword L(\type\()_bilin_hv_tbl) -  640b
3178        .hword L(\type\()_bilin_hv_tbl) -  320b
3179        .hword L(\type\()_bilin_hv_tbl) -  160b
3180        .hword L(\type\()_bilin_hv_tbl) -   80b
3181        .hword L(\type\()_bilin_hv_tbl) -   40b
3182        .hword L(\type\()_bilin_hv_tbl) -   20b
3183        .hword 0
3184endfunc
3185.endm
3186
3187filter_fn put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10
3188filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10
3189
3190.macro load_filter_row dst, src, inc
3191        asr             w13, \src, #10
3192        ldr             \dst, [x11, w13, sxtw #3]
3193        add             \src, \src, \inc
3194.endm
3195
3196function warp_filter_horz_neon
3197        add             w12, w5,  #512
3198
3199        ld1             {v16.8h, v17.8h}, [x2], x3
3200
3201        load_filter_row d0, w12, w7
3202        load_filter_row d1, w12, w7
3203        load_filter_row d2, w12, w7
3204        sxtl            v0.8h,   v0.8b
3205        load_filter_row d3, w12, w7
3206        sxtl            v1.8h,   v1.8b
3207        load_filter_row d4, w12, w7
3208        sxtl            v2.8h,   v2.8b
3209        load_filter_row d5, w12, w7
3210        sxtl            v3.8h,   v3.8b
3211        load_filter_row d6, w12, w7
3212        sxtl            v4.8h,   v4.8b
3213        load_filter_row d7, w12, w7
3214        sxtl            v5.8h,   v5.8b
3215        ext             v18.16b, v16.16b, v17.16b, #2*1
3216        smull           v8.4s,   v16.4h,  v0.4h
3217        smull2          v9.4s,   v16.8h,  v0.8h
3218        sxtl            v6.8h,   v6.8b
3219        ext             v19.16b, v16.16b, v17.16b, #2*2
3220        smull           v10.4s,  v18.4h,  v1.4h
3221        smull2          v11.4s,  v18.8h,  v1.8h
3222        sxtl            v7.8h,   v7.8b
3223        ext             v20.16b, v16.16b, v17.16b, #2*3
3224        smull           v0.4s,   v19.4h,  v2.4h
3225        smull2          v1.4s,   v19.8h,  v2.8h
3226        ext             v21.16b, v16.16b, v17.16b, #2*4
3227        addp            v8.4s,   v8.4s,   v9.4s
3228        smull           v2.4s,   v20.4h,  v3.4h
3229        smull2          v3.4s,   v20.8h,  v3.8h
3230        ext             v22.16b, v16.16b, v17.16b, #2*5
3231        addp            v9.4s,   v10.4s,  v11.4s
3232        smull           v10.4s,  v21.4h,  v4.4h
3233        smull2          v11.4s,  v21.8h,  v4.8h
3234        ext             v23.16b, v16.16b, v17.16b, #2*6
3235        addp            v0.4s,   v0.4s,   v1.4s
3236        smull           v18.4s,  v22.4h,  v5.4h
3237        smull2          v19.4s,  v22.8h,  v5.8h
3238        ext             v16.16b, v16.16b, v17.16b, #2*7
3239        addp            v1.4s,   v2.4s,   v3.4s
3240        addp            v2.4s,   v10.4s,  v11.4s
3241        smull           v20.4s,  v23.4h,  v6.4h
3242        smull2          v21.4s,  v23.8h,  v6.8h
3243        addp            v3.4s,   v18.4s,  v19.4s
3244        smull           v22.4s,  v16.4h,  v7.4h
3245        smull2          v23.4s,  v16.8h,  v7.8h
3246        addp            v4.4s,   v20.4s,  v21.4s
3247        addp            v5.4s,   v22.4s,  v23.4s
3248
3249        addp            v8.4s,   v8.4s,   v9.4s
3250        addp            v0.4s,   v0.4s,   v1.4s
3251        addp            v2.4s,   v2.4s,   v3.4s
3252        addp            v4.4s,   v4.4s,   v5.4s
3253
3254        addp            v16.4s,  v8.4s,   v0.4s
3255        addp            v17.4s,  v2.4s,   v4.4s
3256
3257        add             w5,  w5,  w8
3258
3259        srshl           v16.4s,  v16.4s,  v14.4s // -(7 - intermediate_bits)
3260        srshl           v17.4s,  v17.4s,  v14.4s // -(7 - intermediate_bits)
3261
3262        ret
3263endfunc
3264
3265// void dav1d_warp_affine_8x8_16bpc_neon(
3266//         pixel *dst, const ptrdiff_t dst_stride,
3267//         const pixel *src, const ptrdiff_t src_stride,
3268//         const int16_t *const abcd, int mx, int my,
3269//         const int bitdepth_max)
3270.macro warp t
3271function warp_affine_8x8\t\()_16bpc_neon, export=1
3272        stp             d8,  d9,  [sp, #-0x40]!
3273        stp             d10, d11, [sp, #0x10]
3274        stp             d12, d13, [sp, #0x20]
3275        stp             d14, d15, [sp, #0x30]
3276
3277.ifb \t
3278        dup             v15.8h,  w7        // bitdepth_max
3279.else
3280        movi            v15.8h,  #(PREP_BIAS >> 8), lsl #8
3281.endif
3282        clz             w7,  w7
3283                                           // intermediate_bits = clz(bitdepth_max) - 18
3284.ifb \t
3285        sub             w8,  w7,  #11      // 7 + intermediate_bits = clz(bitdepth_max) - 18 + 7
3286.endif
3287        sub             w7,  w7,  #25      // -(7 - intermediate_bits)
3288.ifb \t
3289        neg             w8,  w8            // -(7 + intermediate_bits)
3290.endif
3291        dup             v14.4s,  w7        // -(7 - intermediate_bits)
3292.ifb \t
3293        dup             v13.4s,  w8        // -(7 + intermediate_bits)
3294.endif
3295
3296        ldr             x4,  [x4]
3297        sbfx            x7,  x4, #0,  #16
3298        sbfx            x8,  x4, #16, #16
3299        sbfx            x9,  x4, #32, #16
3300        sbfx            x4,  x4, #48, #16
3301        mov             w10, #8
3302        sub             x2,  x2,  x3, lsl #1
3303        sub             x2,  x2,  x3
3304        sub             x2,  x2,  #6
3305        movrel          x11, X(mc_warp_filter), 64*8
3306        mov             x15, x30
3307.ifnb \t
3308        lsl             x1,  x1,  #1
3309.endif
3310
3311        bl              warp_filter_horz_neon
3312        xtn             v24.4h,  v16.4s
3313        xtn2            v24.8h,  v17.4s
3314        bl              warp_filter_horz_neon
3315        xtn             v25.4h,  v16.4s
3316        xtn2            v25.8h,  v17.4s
3317        bl              warp_filter_horz_neon
3318        xtn             v26.4h,  v16.4s
3319        xtn2            v26.8h,  v17.4s
3320        bl              warp_filter_horz_neon
3321        xtn             v27.4h,  v16.4s
3322        xtn2            v27.8h,  v17.4s
3323        bl              warp_filter_horz_neon
3324        xtn             v28.4h,  v16.4s
3325        xtn2            v28.8h,  v17.4s
3326        bl              warp_filter_horz_neon
3327        xtn             v29.4h,  v16.4s
3328        xtn2            v29.8h,  v17.4s
3329        bl              warp_filter_horz_neon
3330        xtn             v30.4h,  v16.4s
3331        xtn2            v30.8h,  v17.4s
3332
33331:
3334        add             w14, w6,  #512
3335        bl              warp_filter_horz_neon
3336        xtn             v31.4h,  v16.4s
3337        xtn2            v31.8h,  v17.4s
3338
3339        load_filter_row d0, w14, w9
3340        load_filter_row d1, w14, w9
3341        load_filter_row d2, w14, w9
3342        load_filter_row d3, w14, w9
3343        load_filter_row d4, w14, w9
3344        load_filter_row d5, w14, w9
3345        load_filter_row d6, w14, w9
3346        load_filter_row d7, w14, w9
3347        transpose_8x8b  v0, v1, v2, v3, v4, v5, v6, v7, v16, v17
3348        sxtl            v0.8h,   v0.8b
3349        sxtl            v1.8h,   v1.8b
3350        sxtl            v2.8h,   v2.8b
3351        sxtl            v3.8h,   v3.8b
3352        sxtl            v4.8h,   v4.8b
3353        sxtl            v5.8h,   v5.8b
3354        sxtl            v6.8h,   v6.8b
3355        sxtl            v7.8h,   v7.8b
3356
3357        // This ordering of smull/smlal/smull2/smlal2 is highly
3358        // beneficial for Cortex A53 here.
3359        smull           v16.4s,  v24.4h,  v0.4h
3360        smlal           v16.4s,  v25.4h,  v1.4h
3361        smlal           v16.4s,  v26.4h,  v2.4h
3362        smlal           v16.4s,  v27.4h,  v3.4h
3363        smlal           v16.4s,  v28.4h,  v4.4h
3364        smlal           v16.4s,  v29.4h,  v5.4h
3365        smlal           v16.4s,  v30.4h,  v6.4h
3366        smlal           v16.4s,  v31.4h,  v7.4h
3367        smull2          v17.4s,  v24.8h,  v0.8h
3368        smlal2          v17.4s,  v25.8h,  v1.8h
3369        smlal2          v17.4s,  v26.8h,  v2.8h
3370        smlal2          v17.4s,  v27.8h,  v3.8h
3371        smlal2          v17.4s,  v28.8h,  v4.8h
3372        smlal2          v17.4s,  v29.8h,  v5.8h
3373        smlal2          v17.4s,  v30.8h,  v6.8h
3374        smlal2          v17.4s,  v31.8h,  v7.8h
3375
3376        mov             v24.16b, v25.16b
3377        mov             v25.16b, v26.16b
3378.ifb \t
3379        srshl           v16.4s,  v16.4s,  v13.4s // -(7 + intermediate_bits)
3380        srshl           v17.4s,  v17.4s,  v13.4s // -(7 + intermediate_bits)
3381.else
3382        rshrn           v16.4h,  v16.4s,  #7
3383        rshrn2          v16.8h,  v17.4s,  #7
3384.endif
3385        mov             v26.16b, v27.16b
3386.ifb \t
3387        sqxtun          v16.4h,  v16.4s
3388        sqxtun2         v16.8h,  v17.4s
3389.else
3390        sub             v16.8h,  v16.8h,  v15.8h // PREP_BIAS
3391.endif
3392        mov             v27.16b, v28.16b
3393        mov             v28.16b, v29.16b
3394.ifb \t
3395        umin            v16.8h,  v16.8h,  v15.8h // bitdepth_max
3396.endif
3397        mov             v29.16b, v30.16b
3398        mov             v30.16b, v31.16b
3399        subs            w10, w10, #1
3400        st1             {v16.8h}, [x0], x1
3401
3402        add             w6,  w6,  w4
3403        b.gt            1b
3404
3405        ldp             d14, d15, [sp, #0x30]
3406        ldp             d12, d13, [sp, #0x20]
3407        ldp             d10, d11, [sp, #0x10]
3408        ldp             d8,  d9,  [sp], 0x40
3409
3410        br              x15
3411endfunc
3412.endm
3413
3414warp
3415warp t
3416
3417// void dav1d_emu_edge_16bpc_neon(
3418//         const intptr_t bw, const intptr_t bh,
3419//         const intptr_t iw, const intptr_t ih,
3420//         const intptr_t x, const intptr_t y,
3421//         pixel *dst, const ptrdiff_t dst_stride,
3422//         const pixel *ref, const ptrdiff_t ref_stride)
3423function emu_edge_16bpc_neon, export=1
3424        ldp             x8,  x9,  [sp]
3425
3426        // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
3427        // ref += iclip(x, 0, iw - 1)
3428        sub             x12, x3,  #1           // ih - 1
3429        cmp             x5,  x3
3430        sub             x13, x2,  #1           // iw - 1
3431        csel            x12, x12, x5,  ge      // min(y, ih - 1)
3432        cmp             x4,  x2
3433        bic             x12, x12, x12, asr #63 // max(min(y, ih - 1), 0)
3434        csel            x13, x13, x4,  ge      // min(x, iw - 1)
3435        bic             x13, x13, x13, asr #63 // max(min(x, iw - 1), 0)
3436        madd            x8,  x12, x9,  x8      // ref += iclip() * stride
3437        add             x8,  x8,  x13, lsl #1  // ref += iclip()
3438
3439        // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
3440        // top_ext = iclip(-y, 0, bh - 1)
3441        add             x10, x5,  x1           // y + bh
3442        neg             x5,  x5                // -y
3443        sub             x10, x10, x3           // y + bh - ih
3444        sub             x12, x1,  #1           // bh - 1
3445        cmp             x10, x1
3446        bic             x5,  x5,  x5,  asr #63 // max(-y, 0)
3447        csel            x10, x10, x12, lt      // min(y + bh - ih, bh-1)
3448        cmp             x5,  x1
3449        bic             x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0)
3450        csel            x5,  x5,  x12, lt      // min(max(-y, 0), bh-1)
3451
3452        // right_ext = iclip(x + bw - iw, 0, bw - 1)
3453        // left_ext = iclip(-x, 0, bw - 1)
3454        add             x11, x4,  x0           // x + bw
3455        neg             x4,  x4                // -x
3456        sub             x11, x11, x2           // x + bw - iw
3457        sub             x13, x0,  #1           // bw - 1
3458        cmp             x11, x0
3459        bic             x4,  x4,  x4,  asr #63 // max(-x, 0)
3460        csel            x11, x11, x13, lt      // min(x + bw - iw, bw-1)
3461        cmp             x4,  x0
3462        bic             x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0)
3463        csel            x4,  x4,  x13, lt      // min(max(-x, 0), bw - 1)
3464
3465        // center_h = bh - top_ext - bottom_ext
3466        // dst += top_ext * PXSTRIDE(dst_stride)
3467        // center_w = bw - left_ext - right_ext
3468        sub             x1,  x1,  x5           // bh - top_ext
3469        madd            x6,  x5,  x7,  x6
3470        sub             x2,  x0,  x4           // bw - left_ext
3471        sub             x1,  x1,  x10          // center_h = bh - top_ext - bottom_ext
3472        sub             x2,  x2,  x11          // center_w = bw - left_ext - right_ext
3473
3474        mov             x14, x6                // backup of dst
3475
3476.macro v_loop need_left, need_right
34770:
3478.if \need_left
3479        ld1r            {v0.8h}, [x8]
3480        mov             x12, x6                // out = dst
3481        mov             x3,  x4
3482        mov             v1.16b,  v0.16b
34831:
3484        subs            x3,  x3,  #16
3485        st1             {v0.8h, v1.8h}, [x12], #32
3486        b.gt            1b
3487.endif
3488        mov             x13, x8
3489        add             x12, x6,  x4, lsl #1   // out = dst + left_ext
3490        mov             x3,  x2
34911:
3492        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x13], #64
3493        subs            x3,  x3,  #32
3494        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x12], #64
3495        b.gt            1b
3496.if \need_right
3497        add             x3,  x8,  x2, lsl #1   // in + center_w
3498        sub             x3,  x3,  #2           // in + center_w - 1
3499        add             x12, x6,  x4, lsl #1   // dst + left_ext
3500        ld1r            {v0.8h}, [x3]
3501        add             x12, x12, x2, lsl #1   // out = dst + left_ext + center_w
3502        mov             x3,  x11
3503        mov             v1.16b,  v0.16b
35041:
3505        subs            x3,  x3,  #16
3506        st1             {v0.8h, v1.8h}, [x12], #32
3507        b.gt            1b
3508.endif
3509
3510        subs            x1,  x1,  #1           // center_h--
3511        add             x6,  x6,  x7
3512        add             x8,  x8,  x9
3513        b.gt            0b
3514.endm
3515
3516        cbz             x4,  2f
3517        // need_left
3518        cbz             x11, 3f
3519        // need_left + need_right
3520        v_loop          1,   1
3521        b               5f
3522
35232:
3524        // !need_left
3525        cbz             x11, 4f
3526        // !need_left + need_right
3527        v_loop          0,   1
3528        b               5f
3529
35303:
3531        // need_left + !need_right
3532        v_loop          1,   0
3533        b               5f
3534
35354:
3536        // !need_left + !need_right
3537        v_loop          0,   0
3538
35395:
3540
3541        cbz             x10, 3f
3542        // need_bottom
3543        sub             x8,  x6,  x7           // ref = dst - stride
3544        mov             x4,  x0
35451:
3546        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x8], #64
3547        mov             x3,  x10
35482:
3549        subs            x3,  x3,  #1
3550        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7
3551        b.gt            2b
3552        msub            x6,  x7,  x10,  x6     // dst -= bottom_ext * stride
3553        subs            x4,  x4,  #32          // bw -= 32
3554        add             x6,  x6,  #64          // dst += 32
3555        b.gt            1b
3556
35573:
3558        cbz             x5,  3f
3559        // need_top
3560        msub            x6,  x7,  x5,  x14     // dst = stored_dst - top_ext * stride
35611:
3562        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x14], #64
3563        mov             x3,  x5
35642:
3565        subs            x3,  x3,  #1
3566        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7
3567        b.gt            2b
3568        msub            x6,  x7,  x5,  x6      // dst -= top_ext * stride
3569        subs            x0,  x0,  #32          // bw -= 32
3570        add             x6,  x6,  #64          // dst += 32
3571        b.gt            1b
3572
35733:
3574        ret
3575endfunc
3576