/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2019, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
27
#include "src/arm/asm.S"
#include "util.S"
30
// void ipred_dc_128_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                             const pixel *const topleft,
//                             const int width, const int height, const int a,
//                             const int max_width, const int max_height);
//
// Fills a width x height block of bytes with the constant value 128.
//
// Registers (AAPCS64 argument order): x0 = dst, x1 = stride, w3 = width,
// w4 = height. topleft (x2) and the remaining arguments are not read.
// The handler for the block width is selected through a table of 16-bit
// offsets indexed by clz(width) - 25; the table has five entries, ordered
// for widths 64, 32, 16, 8, 4. Width is not validated here.
// Every store loop writes four rows per iteration and repeats while the
// remaining height is > 0, so height is presumably a multiple of 4.
function ipred_dc_128_8bpc_neon, export=1
        clz             w3,  w3
        adr             x5,  L(ipred_dc_128_tbl)
        sub             w3,  w3,  #25            // table index: 0 for width 64 ... 4 for width 4
        ldrh            w3,  [x5, w3, uxtw #1]   // entry = table address - handler address
        movi            v0.16b,  #128
        sub             x5,  x5,  w3, uxtw       // x5 = handler address
        add             x6,  x0,  x1             // x6 = dst + stride; x0 and x6 write alternating rows
        lsl             x1,  x1,  #1             // both row pointers advance two rows per store
        br              x5
4:
        // Width 4: one 32-bit lane per row.
        st1             {v0.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        b.gt            4b
        ret
8:
        // Width 8: one 64-bit register half per row.
        st1             {v0.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        b.gt            8b
        ret
16:
        // Width 16: one full vector per row.
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        b.gt            16b
        ret
320:
        // Width 32 entry point: fill the second vector once, then loop at 32.
        movi            v1.16b,  #128
32:
        st1             {v0.16b, v1.16b}, [x0], x1
        st1             {v0.16b, v1.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b, v1.16b}, [x0], x1
        st1             {v0.16b, v1.16b}, [x6], x1
        b.gt            32b
        ret
640:
        // Width 64 entry point: fill three more vectors once, then loop at 64.
        movi            v1.16b,  #128
        movi            v2.16b,  #128
        movi            v3.16b,  #128
64:
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        b.gt            64b
        ret

// Handler offsets, stored as (table address - handler address).
L(ipred_dc_128_tbl):
        .hword L(ipred_dc_128_tbl) - 640b
        .hword L(ipred_dc_128_tbl) - 320b
        .hword L(ipred_dc_128_tbl) -  16b
        .hword L(ipred_dc_128_tbl) -   8b
        .hword L(ipred_dc_128_tbl) -   4b
endfunc
99
// void ipred_v_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                        const pixel *const topleft,
//                        const int width, const int height, const int a,
//                        const int max_width, const int max_height);
//
// Copies the `width` bytes starting at topleft + 1 into every row of the
// block.
//
// Registers (AAPCS64 argument order): x0 = dst, x1 = stride, x2 = topleft,
// w3 = width, w4 = height. The handler is selected through a table of
// 16-bit offsets indexed by clz(width) - 25 (five entries, ordered for
// widths 64, 32, 16, 8, 4); width is not validated here.
// Each NN0 label loads the source bytes once and falls through into the
// store loop NN, which writes four rows per iteration while the remaining
// height is > 0 (height is presumably a multiple of 4).
function ipred_v_8bpc_neon, export=1
        clz             w3,  w3
        adr             x5,  L(ipred_v_tbl)
        sub             w3,  w3,  #25            // table index: 0 for width 64 ... 4 for width 4
        ldrh            w3,  [x5, w3, uxtw #1]   // entry = table address - handler address
        add             x2,  x2,  #1             // x2 = first source byte, just after topleft
        sub             x5,  x5,  w3, uxtw       // x5 = handler address
        add             x6,  x0,  x1             // x6 = dst + stride; x0 and x6 write alternating rows
        lsl             x1,  x1,  #1             // both row pointers advance two rows per store
        br              x5
40:
        ld1             {v0.s}[0],  [x2]
4:
        st1             {v0.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        b.gt            4b
        ret
80:
        ld1             {v0.8b},  [x2]
8:
        st1             {v0.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        b.gt            8b
        ret
160:
        ld1             {v0.16b}, [x2]
16:
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        b.gt            16b
        ret
320:
        ld1             {v0.16b, v1.16b}, [x2]
32:
        st1             {v0.16b, v1.16b}, [x0], x1
        st1             {v0.16b, v1.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b, v1.16b}, [x0], x1
        st1             {v0.16b, v1.16b}, [x6], x1
        b.gt            32b
        ret
640:
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
64:
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        b.gt            64b
        ret

// Handler offsets, stored as (table address - handler address).
L(ipred_v_tbl):
        .hword L(ipred_v_tbl) - 640b
        .hword L(ipred_v_tbl) - 320b
        .hword L(ipred_v_tbl) - 160b
        .hword L(ipred_v_tbl) -  80b
        .hword L(ipred_v_tbl) -  40b
endfunc
172
// void ipred_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                        const pixel *const topleft,
//                        const int width, const int height, const int a,
//                        const int max_width, const int max_height);
//
// Fills each row of the block with a single replicated byte taken from
// below topleft in memory: row y is filled with topleft[-(1 + y)].
//
// Registers (AAPCS64 argument order): x0 = dst, x1 = stride, x2 = topleft,
// w3 = width, w4 = height. The handler is selected through a table of
// 16-bit offsets indexed by clz(width) - 25 (five entries, ordered for
// widths 64, 32, 16, 8, 4); width is not validated here.
// Each loop iteration reads four source bytes with ld4r (each byte is
// broadcast across its own register) and writes four rows, repeating while
// the remaining height is > 0 (height is presumably a multiple of 4).
function ipred_h_8bpc_neon, export=1
        clz             w3,  w3
        adr             x5,  L(ipred_h_tbl)
        sub             w3,  w3,  #25            // table index: 0 for width 64 ... 4 for width 4
        ldrh            w3,  [x5, w3, uxtw #1]   // entry = table address - handler address
        sub             x2,  x2,  #4             // x2 = &topleft[-4], source for the first four rows
        sub             x5,  x5,  w3, uxtw       // x5 = handler address
        mov             x7,  #-4                 // source pointer moves down four bytes per iteration
        add             x6,  x0,  x1             // x6 = dst + stride; x0 and x6 write alternating rows
        lsl             x1,  x1,  #1             // both row pointers advance two rows per store
        br              x5
4:
        // v3 holds the byte at the highest address (used for the first row
        // of the group), v0 the byte at the lowest (fourth row).
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7
        st1             {v3.s}[0],  [x0], x1
        st1             {v2.s}[0],  [x6], x1
        subs            w4,  w4,  #4
        st1             {v1.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        b.gt            4b
        ret
8:
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7
        st1             {v3.8b},  [x0], x1
        st1             {v2.8b},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v1.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        b.gt            8b
        ret
16:
        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7
        st1             {v3.16b}, [x0], x1
        st1             {v2.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v1.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        b.gt            16b
        ret
32:
        // Each row: str writes bytes 16..31 at a fixed offset first, then
        // st1 writes bytes 0..15 and post-increments the row pointer.
        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7
        str             q3,  [x0, #16]
        str             q2,  [x6, #16]
        st1             {v3.16b}, [x0], x1
        st1             {v2.16b}, [x6], x1
        subs            w4,  w4,  #4
        str             q1,  [x0, #16]
        str             q0,  [x6, #16]
        st1             {v1.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        b.gt            32b
        ret
64:
        // Each row: str writes bytes 16..31, stp writes bytes 32..63, then
        // st1 writes bytes 0..15 and post-increments the row pointer.
        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7
        str             q3,  [x0, #16]
        str             q2,  [x6, #16]
        stp             q3,  q3,  [x0, #32]
        stp             q2,  q2,  [x6, #32]
        st1             {v3.16b}, [x0], x1
        st1             {v2.16b}, [x6], x1
        subs            w4,  w4,  #4
        str             q1,  [x0, #16]
        str             q0,  [x6, #16]
        stp             q1,  q1,  [x0, #32]
        stp             q0,  q0,  [x6, #32]
        st1             {v1.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        b.gt            64b
        ret

// Handler offsets, stored as (table address - handler address).
L(ipred_h_tbl):
        .hword L(ipred_h_tbl) - 64b
        .hword L(ipred_h_tbl) - 32b
        .hword L(ipred_h_tbl) - 16b
        .hword L(ipred_h_tbl) -  8b
        .hword L(ipred_h_tbl) -  4b
endfunc
253
// void ipred_dc_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                             const pixel *const topleft,
//                             const int width, const int height, const int a,
//                             const int max_width, const int max_height);
//
// Fills the block with the rounded average of the `width` bytes starting
// at topleft + 1.
//
// Registers (AAPCS64 argument order): x0 = dst, x1 = stride, x2 = topleft,
// w3 = width, w4 = height. The handler is selected through a table of
// 16-bit offsets indexed by clz(width) - 25 (five entries, ordered for
// widths 64, 32, 16, 8, 4); width is not validated here.
// Each NN0 label computes the average (uaddlv sums across the vector into
// a 16-bit scalar, rshrn performs the rounding right shift) and falls
// through into the store loop NN, which writes four rows per iteration
// while the remaining height is > 0 (height is presumably a multiple of 4).
function ipred_dc_top_8bpc_neon, export=1
        clz             w3,  w3
        adr             x5,  L(ipred_dc_top_tbl)
        sub             w3,  w3,  #25            // table index: 0 for width 64 ... 4 for width 4
        ldrh            w3,  [x5, w3, uxtw #1]   // entry = table address - handler address
        add             x2,  x2,  #1             // x2 = first source byte, just after topleft
        sub             x5,  x5,  w3, uxtw       // x5 = handler address
        add             x6,  x0,  x1             // x6 = dst + stride; x0 and x6 write alternating rows
        lsl             x1,  x1,  #1             // both row pointers advance two rows per store
        br              x5
40:
        // ld1r duplicates the four source bytes into both 32-bit lanes, so
        // the 8-byte sum is twice the 4-byte sum and the rounding shift by
        // 3 yields (sum + 2) >> 2.
        ld1r            {v0.2s},  [x2]
        uaddlv          h0,      v0.8b
        rshrn           v0.8b,   v0.8h,   #3
        dup             v0.8b,   v0.b[0]
4:
        st1             {v0.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        b.gt            4b
        ret
80:
        ld1             {v0.8b},  [x2]
        uaddlv          h0,      v0.8b
        rshrn           v0.8b,   v0.8h,   #3     // (sum + 4) >> 3
        dup             v0.8b,   v0.b[0]
8:
        st1             {v0.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        b.gt            8b
        ret
160:
        ld1             {v0.16b}, [x2]
        uaddlv          h0,      v0.16b
        rshrn           v0.8b,   v0.8h,   #4     // (sum + 8) >> 4
        dup             v0.16b,  v0.b[0]
16:
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        b.gt            16b
        ret
320:
        ld1             {v0.16b, v1.16b}, [x2]
        uaddlv          h0,      v0.16b
        uaddlv          h1,      v1.16b
        add             v2.4h,   v0.4h,   v1.4h  // lane 0 = sum of all 32 bytes
        rshrn           v2.8b,   v2.8h,   #5     // (sum + 16) >> 5
        dup             v0.16b,  v2.b[0]
        dup             v1.16b,  v2.b[0]
32:
        st1             {v0.16b, v1.16b}, [x0], x1
        st1             {v0.16b, v1.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b, v1.16b}, [x0], x1
        st1             {v0.16b, v1.16b}, [x6], x1
        b.gt            32b
        ret
640:
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
        uaddlv          h0,      v0.16b
        uaddlv          h1,      v1.16b
        uaddlv          h2,      v2.16b
        uaddlv          h3,      v3.16b
        add             v4.4h,   v0.4h,   v1.4h
        add             v5.4h,   v2.4h,   v3.4h
        add             v4.4h,   v4.4h,   v5.4h  // lane 0 = sum of all 64 bytes
        rshrn           v4.8b,   v4.8h,   #6     // (sum + 32) >> 6
        dup             v0.16b,  v4.b[0]
        dup             v1.16b,  v4.b[0]
        dup             v2.16b,  v4.b[0]
        dup             v3.16b,  v4.b[0]
64:
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        b.gt            64b
        ret

// Handler offsets, stored as (table address - handler address).
L(ipred_dc_top_tbl):
        .hword L(ipred_dc_top_tbl) - 640b
        .hword L(ipred_dc_top_tbl) - 320b
        .hword L(ipred_dc_top_tbl) - 160b
        .hword L(ipred_dc_top_tbl) -  80b
        .hword L(ipred_dc_top_tbl) -  40b
endfunc
353
// void ipred_dc_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                              const pixel *const topleft,
//                              const int width, const int height, const int a,
//                              const int max_width, const int max_height);
//
// Fills the block with the rounded average of the `height` bytes that end
// just before topleft, i.e. topleft[-height] .. topleft[-1].
//
// Registers (AAPCS64 argument order): x0 = dst, x1 = stride, x2 = topleft,
// w3 = width, w4 = height.
// Dispatch is two-stage, using a single ten-entry table of 16-bit offsets:
//   - entries 0..4 (index clz(height) - 25) are the L(ipred_dc_left_hN)
//     blocks, which compute the average into every byte of v0;
//   - entries 5..9 (index clz(width) - 20) are the L(ipred_dc_left_wN)
//     store loops; their address is kept in x3 and reached with "br x3".
// Width and height are not validated here. The store loops write four rows
// per iteration while the remaining height is > 0 (height is presumably a
// multiple of 4).
function ipred_dc_left_8bpc_neon, export=1
        sub             x2,  x2,  w4, uxtw       // x2 = &topleft[-height]
        clz             w3,  w3
        clz             w7,  w4
        adr             x5,  L(ipred_dc_left_tbl)
        sub             w3,  w3,  #20 // 25 leading bits, minus table offset 5
        sub             w7,  w7,  #25
        ldrh            w3,  [x5, w3, uxtw #1]
        ldrh            w7,  [x5, w7, uxtw #1]
        sub             x3,  x5,  w3, uxtw       // x3 = store loop for this width
        sub             x5,  x5,  w7, uxtw       // x5 = averaging block for this height
        add             x6,  x0,  x1             // x6 = dst + stride; x0 and x6 write alternating rows
        lsl             x1,  x1,  #1             // both row pointers advance two rows per store
        br              x5

L(ipred_dc_left_h4):
        // ld1r duplicates the four source bytes into both 32-bit lanes, so
        // the 8-byte sum is twice the 4-byte sum and the rounding shift by
        // 3 yields (sum + 2) >> 2.
        ld1r            {v0.2s},  [x2]
        uaddlv          h0,      v0.8b
        rshrn           v0.8b,   v0.8h,   #3
        dup             v0.16b,  v0.b[0]
        br              x3
L(ipred_dc_left_w4):
        st1             {v0.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        b.gt            L(ipred_dc_left_w4)
        ret

L(ipred_dc_left_h8):
        ld1             {v0.8b},  [x2]
        uaddlv          h0,      v0.8b
        rshrn           v0.8b,   v0.8h,   #3     // (sum + 4) >> 3
        dup             v0.16b,  v0.b[0]
        br              x3
L(ipred_dc_left_w8):
        st1             {v0.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        b.gt            L(ipred_dc_left_w8)
        ret

L(ipred_dc_left_h16):
        ld1             {v0.16b}, [x2]
        uaddlv          h0,      v0.16b
        rshrn           v0.8b,   v0.8h,   #4     // (sum + 8) >> 4
        dup             v0.16b,  v0.b[0]
        br              x3
L(ipred_dc_left_w16):
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        b.gt            L(ipred_dc_left_w16)
        ret

L(ipred_dc_left_h32):
        ld1             {v0.16b, v1.16b}, [x2]
        uaddlv          h0,      v0.16b
        uaddlv          h1,      v1.16b
        add             v0.4h,   v0.4h,   v1.4h  // lane 0 = sum of all 32 bytes
        rshrn           v0.8b,   v0.8h,   #5     // (sum + 16) >> 5
        dup             v0.16b,  v0.b[0]
        br              x3
L(ipred_dc_left_w32):
        // The averaging blocks only fill v0; replicate it for the wide store.
        mov             v1.16b,  v0.16b
1:
        st1             {v0.16b, v1.16b}, [x0], x1
        st1             {v0.16b, v1.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b, v1.16b}, [x0], x1
        st1             {v0.16b, v1.16b}, [x6], x1
        b.gt            1b
        ret

L(ipred_dc_left_h64):
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
        uaddlv          h0,      v0.16b
        uaddlv          h1,      v1.16b
        uaddlv          h2,      v2.16b
        uaddlv          h3,      v3.16b
        add             v0.4h,   v0.4h,   v1.4h
        add             v2.4h,   v2.4h,   v3.4h
        add             v0.4h,   v0.4h,   v2.4h  // lane 0 = sum of all 64 bytes
        rshrn           v0.8b,   v0.8h,   #6     // (sum + 32) >> 6
        dup             v0.16b,  v0.b[0]
        br              x3
L(ipred_dc_left_w64):
        // The averaging blocks only fill v0; replicate it for the wide store.
        mov             v1.16b,  v0.16b
        mov             v2.16b,  v0.16b
        mov             v3.16b,  v0.16b
1:
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        b.gt            1b
        ret

// Offsets stored as (table address - target address): five averaging
// blocks indexed by height, followed by five store loops indexed by width.
L(ipred_dc_left_tbl):
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4)
endfunc
474
// void ipred_dc_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                         const pixel *const topleft,
//                         const int width, const int height, const int a,
//                         const int max_width, const int max_height);
//
// Fills the block with the average of the `height` bytes before topleft
// (topleft[-height] .. topleft[-1]) and the `width` bytes after it
// (topleft[1] .. topleft[width]):
//     (sum + ((width + height) >> 1)) / (width + height)
// The division is done as a right shift by ctz(width + height); when
// width != height this is followed by a fixed-point multiply (sqdmulh with
// half the constant, i.e. (x * C) >> 16) by C = 0x5556 (about 65536/3) or
// C = 0x3334 (about 65536/5), whichever odd factor remains in
// width + height.
//
// Registers (AAPCS64 argument order): x0 = dst, x1 = stride, x2 = topleft,
// w3 = width, w4 = height. v16 = (width + height) >> 1 and
// v17 = -ctz(width + height) are prepared before dispatch.
// Dispatch is two-stage, using a single ten-entry table of 16-bit offsets:
//   - entries 0..4 (index clz(height) - 25) are the L(ipred_dc_hN) blocks,
//     which sum the bytes before topleft into lane 0 of v0 and leave x2
//     pointing at topleft;
//   - entries 5..9 (index clz(width) - 20) are the L(ipred_dc_wN) blocks,
//     reached with "br x3", which add the bytes after topleft, finish the
//     average and store it.
// Width and height are not validated here. The store loops write four rows
// per iteration while the remaining height is > 0 (height is presumably a
// multiple of 4).
function ipred_dc_8bpc_neon, export=1
        sub             x2,  x2,  w4, uxtw       // x2 = &topleft[-height]
        add             w7,  w3,  w4             // width + height
        clz             w3,  w3
        clz             w6,  w4
        dup             v16.8h, w7               // width + height
        adr             x5,  L(ipred_dc_tbl)
        rbit            w7,  w7                  // rbit(width + height)
        sub             w3,  w3,  #20            // 25 leading bits, minus table offset 5
        sub             w6,  w6,  #25
        clz             w7,  w7                  // ctz(width + height)
        ldrh            w3,  [x5, w3, uxtw #1]
        ldrh            w6,  [x5, w6, uxtw #1]
        neg             w7,  w7                  // -ctz(width + height)
        sub             x3,  x5,  w3, uxtw       // x3 = second stage, chosen by width
        sub             x5,  x5,  w6, uxtw       // x5 = first stage, chosen by height
        ushr            v16.8h,  v16.8h,  #1     // (width + height) >> 1
        dup             v17.8h,  w7              // -ctz(width + height)
        add             x6,  x0,  x1             // x6 = dst + stride; x0 and x6 write alternating rows
        lsl             x1,  x1,  #1             // both row pointers advance two rows per store
        br              x5

L(ipred_dc_h4):
        ld1             {v0.s}[0],  [x2], #4
        ins             v0.s[1], wzr             // clear the upper lane so the 8-byte sum covers 4 bytes only
        uaddlv          h0,      v0.8b
        br              x3
L(ipred_dc_w4):
        add             x2,  x2,  #1             // skip topleft itself
        ld1             {v1.s}[0],  [x2]
        ins             v1.s[1], wzr             // clear the upper lane so the 8-byte sum covers 4 bytes only
        add             v0.4h,   v0.4h,   v16.4h // add the rounding term
        uaddlv          h1,      v1.8b
        cmp             w4,  #4
        add             v0.4h,   v0.4h,   v1.4h
        ushl            v0.4h,   v0.4h,   v17.4h // negative shift count = shift right
        b.eq            1f
        // h = 8/16
        // Both multipliers are packed into w16 and selected by shifting:
        // 2*h = 16 moves the upper half (0x5556/2) down; 2*h = 32 is taken
        // modulo 32 by the register shift, leaving the lower half
        // (0x3334/2) in place. dup then uses the low 16 bits.
        mov             w16, #(0x3334/2)
        movk            w16, #(0x5556/2), lsl #16
        add             w17, w4,  w4  // w17 = 2*h = 16 or 32
        lsr             w16, w16, w17
        dup             v16.4h,  w16
        sqdmulh         v0.4h,   v0.4h,   v16.4h
1:
        dup             v0.8b,   v0.b[0]
2:
        st1             {v0.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h8):
        ld1             {v0.8b},  [x2], #8
        uaddlv          h0,      v0.8b
        br              x3
L(ipred_dc_w8):
        add             x2,  x2,  #1             // skip topleft itself
        ld1             {v1.8b},  [x2]
        add             v0.4h,   v0.4h,   v16.4h // add the rounding term
        uaddlv          h1,      v1.8b
        cmp             w4,  #8
        add             v0.4h,   v0.4h,   v1.4h
        ushl            v0.4h,   v0.4h,   v17.4h // negative shift count = shift right
        b.eq            1f
        // h = 4/16/32
        cmp             w4,  #32
        mov             w16, #(0x3334/2)
        mov             w17, #(0x5556/2)
        csel            w16, w16, w17, eq        // h == 32: 0x3334/2, otherwise 0x5556/2
        dup             v16.4h,  w16
        sqdmulh         v0.4h,   v0.4h,   v16.4h
1:
        dup             v0.8b,   v0.b[0]
2:
        st1             {v0.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h16):
        ld1             {v0.16b}, [x2], #16
        uaddlv          h0,      v0.16b
        br              x3
L(ipred_dc_w16):
        add             x2,  x2,  #1             // skip topleft itself
        ld1             {v1.16b}, [x2]
        add             v0.4h,   v0.4h,   v16.4h // add the rounding term
        uaddlv          h1,      v1.16b
        cmp             w4,  #16
        add             v0.4h,   v0.4h,   v1.4h
        ushl            v0.4h,   v0.4h,   v17.4h // negative shift count = shift right
        b.eq            1f
        // h = 4/8/32/64
        tst             w4,  #(32+16+8) // 16 added to make a consecutive bitmask
        mov             w16, #(0x3334/2)
        mov             w17, #(0x5556/2)
        csel            w16, w16, w17, eq        // h == 4 or 64: 0x3334/2, h == 8 or 32: 0x5556/2
        dup             v16.4h,  w16
        sqdmulh         v0.4h,   v0.4h,   v16.4h
1:
        dup             v0.16b,  v0.b[0]
2:
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h32):
        ld1             {v0.16b, v1.16b}, [x2], #32
        uaddlv          h0,      v0.16b
        uaddlv          h1,      v1.16b
        add             v0.4h,   v0.4h,   v1.4h
        br              x3
L(ipred_dc_w32):
        add             x2,  x2,  #1             // skip topleft itself
        ld1             {v1.16b, v2.16b}, [x2]
        add             v0.4h,   v0.4h,   v16.4h // add the rounding term
        uaddlv          h1,      v1.16b
        uaddlv          h2,      v2.16b
        cmp             w4,  #32
        add             v0.4h,   v0.4h,   v1.4h
        add             v0.4h,   v0.4h,   v2.4h
        ushl            v4.4h,   v0.4h,   v17.4h // negative shift count = shift right
        b.eq            1f
        // h = 8/16/64
        cmp             w4,  #8
        mov             w16, #(0x3334/2)
        mov             w17, #(0x5556/2)
        csel            w16, w16, w17, eq        // h == 8: 0x3334/2, otherwise 0x5556/2
        dup             v16.4h,  w16
        sqdmulh         v4.4h,   v4.4h,   v16.4h
1:
        dup             v0.16b,  v4.b[0]
        dup             v1.16b,  v4.b[0]
2:
        st1             {v0.16b, v1.16b}, [x0], x1
        st1             {v0.16b, v1.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b, v1.16b}, [x0], x1
        st1             {v0.16b, v1.16b}, [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h64):
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64
        uaddlv          h0,      v0.16b
        uaddlv          h1,      v1.16b
        uaddlv          h2,      v2.16b
        uaddlv          h3,      v3.16b
        add             v0.4h,   v0.4h,   v1.4h
        add             v2.4h,   v2.4h,   v3.4h
        add             v0.4h,   v0.4h,   v2.4h
        br              x3
L(ipred_dc_w64):
        add             x2,  x2,  #1             // skip topleft itself
        ld1             {v1.16b, v2.16b, v3.16b, v4.16b}, [x2]
        add             v0.4h,   v0.4h,   v16.4h // add the rounding term
        uaddlv          h1,      v1.16b
        uaddlv          h2,      v2.16b
        uaddlv          h3,      v3.16b
        uaddlv          h4,      v4.16b
        add             v1.4h,   v1.4h,   v2.4h
        add             v3.4h,   v3.4h,   v4.4h
        cmp             w4,  #64
        add             v0.4h,   v0.4h,   v1.4h
        add             v0.4h,   v0.4h,   v3.4h
        ushl            v4.4h,   v0.4h,   v17.4h // negative shift count = shift right
        b.eq            1f
        // h = 16/32
        // Both multipliers are packed into w16 and selected by shifting by
        // h: 16 moves the upper half (0x3334/2) down; 32 is taken modulo 32
        // by the register shift, leaving the lower half (0x5556/2) in place.
        mov             w16, #(0x5556/2)
        movk            w16, #(0x3334/2), lsl #16
        lsr             w16, w16, w4
        dup             v16.4h,  w16
        sqdmulh         v4.4h,   v4.4h,   v16.4h
1:
        dup             v0.16b,  v4.b[0]
        dup             v1.16b,  v4.b[0]
        dup             v2.16b,  v4.b[0]
        dup             v3.16b,  v4.b[0]
2:
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        b.gt            2b
        ret

// Offsets stored as (table address - target address): five first-stage
// blocks indexed by height, followed by five second-stage blocks indexed
// by width.
L(ipred_dc_tbl):
        .hword L(ipred_dc_tbl) - L(ipred_dc_h64)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h32)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h16)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h8)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h4)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w64)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w32)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w16)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w8)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w4)
endfunc
689
690// void ipred_paeth_8bpc_neon(pixel *dst, const ptrdiff_t stride,
691//                            const pixel *const topleft,
692//                            const int width, const int height, const int a,
693//                            const int max_width, const int max_height);
694function ipred_paeth_8bpc_neon, export=1
695        clz             w9,  w3
696        adr             x5,  L(ipred_paeth_tbl)
697        sub             w9,  w9,  #25
698        ldrh            w9,  [x5, w9, uxtw #1]
699        ld1r            {v4.16b},  [x2]
700        add             x8,  x2,  #1
701        sub             x2,  x2,  #4
702        sub             x5,  x5,  w9, uxtw
703        mov             x7,  #-4
704        add             x6,  x0,  x1
705        lsl             x1,  x1,  #1
706        br              x5
70740:
708        ld1r            {v5.4s},  [x8]
709        usubl           v6.8h,   v5.8b,   v4.8b   // top - topleft
7104:
711        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7
712        zip1            v0.2s,   v0.2s,   v1.2s
713        zip1            v2.2s,   v2.2s,   v3.2s
714        uaddw           v16.8h,  v6.8h,   v0.8b
715        uaddw           v17.8h,  v6.8h,   v2.8b
716        sqxtun          v16.8b,  v16.8h           // base
717        sqxtun2         v16.16b, v17.8h
718        zip1            v0.2d,   v0.2d,   v2.2d
719        uabd            v20.16b, v5.16b,  v16.16b // tdiff
720        uabd            v22.16b, v4.16b,  v16.16b // tldiff
721        uabd            v16.16b, v0.16b,  v16.16b // ldiff
722        umin            v18.16b, v20.16b, v22.16b // min(tdiff, tldiff)
723        cmhs            v20.16b, v22.16b, v20.16b // tldiff >= tdiff
724        cmhs            v16.16b, v18.16b, v16.16b // min(tdiff, tldiff) >= ldiff
725        bsl             v20.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
726        bit             v20.16b, v0.16b,  v16.16b // ldiff <= min ? left : ...
727        st1             {v20.s}[3], [x0], x1
728        st1             {v20.s}[2], [x6], x1
729        subs            w4,  w4,  #4
730        st1             {v20.s}[1], [x0], x1
731        st1             {v20.s}[0], [x6], x1
732        b.gt            4b
733        ret
73480:
735        ld1r            {v5.2d},  [x8]
736        usubl           v6.8h,   v5.8b,   v4.8b   // top - topleft
7378:
738        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7
739        uaddw           v16.8h,  v6.8h,   v0.8b
740        uaddw           v17.8h,  v6.8h,   v1.8b
741        uaddw           v18.8h,  v6.8h,   v2.8b
742        uaddw           v19.8h,  v6.8h,   v3.8b
743        sqxtun          v16.8b,  v16.8h           // base
744        sqxtun2         v16.16b, v17.8h
745        sqxtun          v18.8b,  v18.8h
746        sqxtun2         v18.16b, v19.8h
747        zip1            v2.2d,   v2.2d,   v3.2d
748        zip1            v0.2d,   v0.2d,   v1.2d
749        uabd            v21.16b, v5.16b,  v18.16b // tdiff
750        uabd            v20.16b, v5.16b,  v16.16b
751        uabd            v23.16b, v4.16b,  v18.16b // tldiff
752        uabd            v22.16b, v4.16b,  v16.16b
753        uabd            v17.16b, v2.16b,  v18.16b // ldiff
754        uabd            v16.16b, v0.16b,  v16.16b
755        umin            v19.16b, v21.16b, v23.16b // min(tdiff, tldiff)
756        umin            v18.16b, v20.16b, v22.16b
757        cmhs            v21.16b, v23.16b, v21.16b // tldiff >= tdiff
758        cmhs            v20.16b, v22.16b, v20.16b
759        cmhs            v17.16b, v19.16b, v17.16b // min(tdiff, tldiff) >= ldiff
760        cmhs            v16.16b, v18.16b, v16.16b
761        bsl             v21.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
762        bsl             v20.16b, v5.16b,  v4.16b
763        bit             v21.16b, v2.16b,  v17.16b // ldiff <= min ? left : ...
764        bit             v20.16b, v0.16b,  v16.16b
765        st1             {v21.d}[1], [x0], x1
766        st1             {v21.d}[0], [x6], x1
767        subs            w4,  w4,  #4
768        st1             {v20.d}[1], [x0], x1
769        st1             {v20.d}[0], [x6], x1
770        b.gt            8b
771        ret
772160:
773320:
774640:
775        ld1             {v5.16b},  [x8], #16
776        mov             w9,  w3
777        // Set up pointers for four rows in parallel; x0, x6, x5, x10
778        add             x5,  x0,  x1
779        add             x10, x6,  x1
780        lsl             x1,  x1,  #1
781        sub             x1,  x1,  w3, uxtw
7821:
783        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7
7842:
785        usubl           v6.8h,   v5.8b,   v4.8b   // top - topleft
786        usubl2          v7.8h,   v5.16b,  v4.16b
787        uaddw           v24.8h,  v6.8h,   v0.8b
788        uaddw           v25.8h,  v7.8h,   v0.8b
789        uaddw           v26.8h,  v6.8h,   v1.8b
790        uaddw           v27.8h,  v7.8h,   v1.8b
791        uaddw           v28.8h,  v6.8h,   v2.8b
792        uaddw           v29.8h,  v7.8h,   v2.8b
793        uaddw           v30.8h,  v6.8h,   v3.8b
794        uaddw           v31.8h,  v7.8h,   v3.8b
795        sqxtun          v17.8b,  v26.8h           // base
796        sqxtun2         v17.16b, v27.8h
797        sqxtun          v16.8b,  v24.8h
798        sqxtun2         v16.16b, v25.8h
799        sqxtun          v19.8b,  v30.8h
800        sqxtun2         v19.16b, v31.8h
801        sqxtun          v18.8b,  v28.8h
802        sqxtun2         v18.16b, v29.8h
803        uabd            v23.16b, v5.16b,  v19.16b // tdiff
804        uabd            v22.16b, v5.16b,  v18.16b
805        uabd            v21.16b, v5.16b,  v17.16b
806        uabd            v20.16b, v5.16b,  v16.16b
807        uabd            v27.16b, v4.16b,  v19.16b // tldiff
808        uabd            v26.16b, v4.16b,  v18.16b
809        uabd            v25.16b, v4.16b,  v17.16b
810        uabd            v24.16b, v4.16b,  v16.16b
811        uabd            v19.16b, v3.16b,  v19.16b // ldiff
812        uabd            v18.16b, v2.16b,  v18.16b
813        uabd            v17.16b, v1.16b,  v17.16b
814        uabd            v16.16b, v0.16b,  v16.16b
815        umin            v31.16b, v23.16b, v27.16b // min(tdiff, tldiff)
816        umin            v30.16b, v22.16b, v26.16b
817        umin            v29.16b, v21.16b, v25.16b
818        umin            v28.16b, v20.16b, v24.16b
819        cmhs            v23.16b, v27.16b, v23.16b // tldiff >= tdiff
820        cmhs            v22.16b, v26.16b, v22.16b
821        cmhs            v21.16b, v25.16b, v21.16b
822        cmhs            v20.16b, v24.16b, v20.16b
823        cmhs            v19.16b, v31.16b, v19.16b // min(tdiff, tldiff) >= ldiff
824        cmhs            v18.16b, v30.16b, v18.16b
825        cmhs            v17.16b, v29.16b, v17.16b
826        cmhs            v16.16b, v28.16b, v16.16b
827        bsl             v23.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
828        bsl             v22.16b, v5.16b,  v4.16b
829        bsl             v21.16b, v5.16b,  v4.16b
830        bsl             v20.16b, v5.16b,  v4.16b
831        bit             v23.16b, v3.16b,  v19.16b // ldiff <= min ? left : ...
832        bit             v22.16b, v2.16b,  v18.16b
833        bit             v21.16b, v1.16b,  v17.16b
834        bit             v20.16b, v0.16b,  v16.16b
835        subs            w3,  w3,  #16
836        st1             {v23.16b}, [x0],  #16
837        st1             {v22.16b}, [x6],  #16
838        st1             {v21.16b}, [x5],  #16
839        st1             {v20.16b}, [x10], #16
840        b.le            8f
841        ld1             {v5.16b},  [x8], #16
842        b               2b
8438:
844        subs            w4,  w4,  #4
845        b.le            9f
846        // End of horizontal loop, move pointers to next four rows
847        sub             x8,  x8,  w9, uxtw
848        add             x0,  x0,  x1
849        add             x6,  x6,  x1
850        // Load the top row as early as possible
851        ld1             {v5.16b},  [x8], #16
852        add             x5,  x5,  x1
853        add             x10, x10, x1
854        mov             w3,  w9
855        b               1b
8569:
857        ret
858
859L(ipred_paeth_tbl):
860        .hword L(ipred_paeth_tbl) - 640b
861        .hword L(ipred_paeth_tbl) - 320b
862        .hword L(ipred_paeth_tbl) - 160b
863        .hword L(ipred_paeth_tbl) -  80b
864        .hword L(ipred_paeth_tbl) -  40b
865endfunc
866
// void ipred_smooth_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                             const pixel *const topleft,
//                             const int width, const int height, const int a,
//                             const int max_width, const int max_height);
//
// Smooth intra prediction: the average of a horizontal and a vertical
// interpolation. Per pixel, as computed below in 16-bit lanes:
//   hor = right*256  + (left[y] - right)  * weights_hor[x]
//   ver = bottom*256 + (top[x]  - bottom) * weights_ver[y]
//   dst[y][x] = (((hor + ver) >> 1) + 128) >> 8
// with top[x] = topleft[1 + x], left[y] = topleft[-(1 + y)],
// right = topleft[width], bottom = topleft[-height],
// weights_hor = sm_weights + width and weights_ver = sm_weights + height.
//
// Register use:
//   x0, x6    dst pointers for alternating rows (x1 is doubled to 2*stride)
//   x2, x7    left edge pointer (walks towards lower addresses) and its step
//   w3, w4    remaining width / remaining height; w9 keeps the full width
//             in the 16+ wide path
//   x8        top row pointer
//   x10, x11  weights_hor / weights_ver pointers
//   v4, v5    bottom / right pixel, replicated over all lanes
// The a, max_width and max_height arguments are never read; their registers
// (x5, x6, x7) are reused as scratch.
function ipred_smooth_8bpc_neon, export=1
        movrel          x10, X(sm_weights)
        add             x11, x10, w4, uxtw
        add             x10, x10, w3, uxtw
        // Jump table index: clz(width) - 25 gives 0..4 for width 64..4
        clz             w9,  w3
        adr             x5,  L(ipred_smooth_tbl)
        sub             x12, x2,  w4, uxtw
        sub             w9,  w9,  #25
        ldrh            w9,  [x5, w9, uxtw #1]
        ld1r            {v4.16b},  [x12] // bottom
        add             x8,  x2,  #1
        // Table entries are distances back from the table to the entry point
        sub             x5,  x5,  w9, uxtw
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x5
        // width == 4: four rows per iteration, two rows packed per register
40:
        ld1r            {v6.2s}, [x8]             // top
        ld1r            {v7.2s}, [x10]            // weights_hor
        sub             x2,  x2,  #4
        mov             x7,  #-4
        dup             v5.16b,  v6.b[3]          // right
        usubl           v6.8h,   v6.8b,   v4.8b   // top-bottom
        uxtl            v7.8h,   v7.8b            // weights_hor
4:
        // ld4r reads four left pixels in memory order, so v3 belongs to the
        // first row of this group and v0 to the fourth.
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 // left
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x11], #4 // weights_ver
        shll            v20.8h,  v5.8b,   #8      // right*256
        shll            v21.8h,  v5.8b,   #8
        zip1            v1.2s,   v1.2s,   v0.2s   // left, flipped
        zip1            v0.2s,   v3.2s,   v2.2s
        zip1            v16.2s,  v16.2s,  v17.2s  // weights_ver
        zip1            v18.2s,  v18.2s,  v19.2s
        shll            v22.8h,  v4.8b,   #8      // bottom*256
        shll            v23.8h,  v4.8b,   #8
        usubl           v0.8h,   v0.8b,   v5.8b   // left-right
        usubl           v1.8h,   v1.8b,   v5.8b
        uxtl            v16.8h,  v16.8b           // weights_ver
        uxtl            v18.8h,  v18.8b
        mla             v20.8h,  v0.8h,   v7.8h   // right*256  + (left-right)*weights_hor
        mla             v21.8h,  v1.8h,   v7.8h
        mla             v22.8h,  v6.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
        mla             v23.8h,  v6.8h,   v18.8h
        uhadd           v20.8h,  v20.8h,  v22.8h  // (hor + ver) >> 1
        uhadd           v21.8h,  v21.8h,  v23.8h
        rshrn           v20.8b,  v20.8h,  #8      // rounding shift back to 8 bit
        rshrn           v21.8b,  v21.8h,  #8
        st1             {v20.s}[0], [x0], x1
        st1             {v20.s}[1], [x6], x1
        subs            w4,  w4,  #4
        st1             {v21.s}[0], [x0], x1
        st1             {v21.s}[1], [x6], x1
        b.gt            4b
        ret
        // width == 8: four rows per iteration, one row per register
80:
        ld1             {v6.8b}, [x8]             // top
        ld1             {v7.8b}, [x10]            // weights_hor
        sub             x2,  x2,  #4
        mov             x7,  #-4
        dup             v5.16b,  v6.b[7]          // right
        usubl           v6.8h,   v6.8b,   v4.8b   // top-bottom
        uxtl            v7.8h,   v7.8b            // weights_hor
8:
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 // left
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x11], #4 // weights_ver
        shll            v20.8h,  v5.8b,   #8      // right*256
        shll            v21.8h,  v5.8b,   #8
        shll            v22.8h,  v5.8b,   #8
        shll            v23.8h,  v5.8b,   #8
        usubl           v0.8h,   v0.8b,   v5.8b   // left-right
        usubl           v1.8h,   v1.8b,   v5.8b
        usubl           v2.8h,   v2.8b,   v5.8b
        usubl           v3.8h,   v3.8b,   v5.8b
        shll            v24.8h,  v4.8b,   #8      // bottom*256
        shll            v25.8h,  v4.8b,   #8
        shll            v26.8h,  v4.8b,   #8
        shll            v27.8h,  v4.8b,   #8
        uxtl            v16.8h,  v16.8b           // weights_ver
        uxtl            v17.8h,  v17.8b
        uxtl            v18.8h,  v18.8b
        uxtl            v19.8h,  v19.8b
        mla             v20.8h,  v3.8h,   v7.8h   // right*256  + (left-right)*weights_hor
        mla             v21.8h,  v2.8h,   v7.8h   // (left flipped)
        mla             v22.8h,  v1.8h,   v7.8h
        mla             v23.8h,  v0.8h,   v7.8h
        mla             v24.8h,  v6.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
        mla             v25.8h,  v6.8h,   v17.8h
        mla             v26.8h,  v6.8h,   v18.8h
        mla             v27.8h,  v6.8h,   v19.8h
        uhadd           v20.8h,  v20.8h,  v24.8h  // (hor + ver) >> 1
        uhadd           v21.8h,  v21.8h,  v25.8h
        uhadd           v22.8h,  v22.8h,  v26.8h
        uhadd           v23.8h,  v23.8h,  v27.8h
        rshrn           v20.8b,  v20.8h,  #8
        rshrn           v21.8b,  v21.8h,  #8
        rshrn           v22.8b,  v22.8h,  #8
        rshrn           v23.8b,  v23.8h,  #8
        st1             {v20.8b}, [x0], x1
        st1             {v21.8b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v22.8b}, [x0], x1
        st1             {v23.8b}, [x6], x1
        b.gt            8b
        ret
        // width 16, 32 and 64 share one path: two rows per outer iteration
        // (1:), 16 columns per inner iteration (2:).
160:
320:
640:
        add             x12, x2,  w3, uxtw
        sub             x2,  x2,  #2
        mov             x7,  #-2
        ld1r            {v5.16b}, [x12]           // right
        // x1 = 2*stride - width: the stores already advance x0/x6 by width
        sub             x1,  x1,  w3, uxtw
        mov             w9,  w3

1:
        // v1/v16 belong to the first row of the pair, v0/v17 to the second
        ld2r            {v0.8b, v1.8b},   [x2],  x7 // left
        ld2r            {v16.8b, v17.8b}, [x11], #2 // weights_ver
        usubl           v0.8h,   v0.8b,   v5.8b   // left-right
        usubl           v1.8h,   v1.8b,   v5.8b
        uxtl            v16.8h,  v16.8b           // weights_ver
        uxtl            v17.8h,  v17.8b
2:
        ld1             {v7.16b}, [x10],  #16     // weights_hor
        ld1             {v3.16b}, [x8],   #16     // top
        shll            v20.8h,  v5.8b,   #8      // right*256
        shll            v21.8h,  v5.8b,   #8
        shll            v22.8h,  v5.8b,   #8
        shll            v23.8h,  v5.8b,   #8
        uxtl            v6.8h,   v7.8b            // weights_hor
        uxtl2           v7.8h,   v7.16b
        usubl           v2.8h,   v3.8b,   v4.8b   // top-bottom
        usubl2          v3.8h,   v3.16b,  v4.16b
        mla             v20.8h,  v1.8h,   v6.8h   // right*256  + (left-right)*weights_hor
        mla             v21.8h,  v1.8h,   v7.8h   // (left flipped)
        mla             v22.8h,  v0.8h,   v6.8h
        mla             v23.8h,  v0.8h,   v7.8h
        shll            v24.8h,  v4.8b,   #8      // bottom*256
        shll            v25.8h,  v4.8b,   #8
        shll            v26.8h,  v4.8b,   #8
        shll            v27.8h,  v4.8b,   #8
        mla             v24.8h,  v2.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
        mla             v25.8h,  v3.8h,   v16.8h
        mla             v26.8h,  v2.8h,   v17.8h
        mla             v27.8h,  v3.8h,   v17.8h
        uhadd           v20.8h,  v20.8h,  v24.8h  // (hor + ver) >> 1
        uhadd           v21.8h,  v21.8h,  v25.8h
        uhadd           v22.8h,  v22.8h,  v26.8h
        uhadd           v23.8h,  v23.8h,  v27.8h
        rshrn           v20.8b,  v20.8h,  #8
        rshrn2          v20.16b, v21.8h,  #8
        rshrn           v22.8b,  v22.8h,  #8
        rshrn2          v22.16b, v23.8h,  #8
        subs            w3,  w3,  #16
        st1             {v20.16b}, [x0],  #16
        st1             {v22.16b}, [x6],  #16
        b.gt            2b
        subs            w4,  w4,  #2
        b.le            9f
        // Next pair of rows: rewind the top and weights_hor pointers to the
        // start of the row and step the dst pointers down two rows.
        sub             x8,  x8,  w9, uxtw
        sub             x10, x10, w9, uxtw
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        mov             w3,  w9
        b               1b
9:
        ret

        // 16-bit distances from the table back to each entry point,
        // indexed by clz(width) - 25 (width 64, 32, 16, 8, 4).
L(ipred_smooth_tbl):
        .hword L(ipred_smooth_tbl) - 640b
        .hword L(ipred_smooth_tbl) - 320b
        .hword L(ipred_smooth_tbl) - 160b
        .hword L(ipred_smooth_tbl) -  80b
        .hword L(ipred_smooth_tbl) -  40b
endfunc
1044
// void ipred_smooth_v_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const topleft,
//                               const int width, const int height, const int a,
//                               const int max_width, const int max_height);
//
// Vertical smooth intra prediction. Per pixel, as computed below in
// 16-bit lanes:
//   dst[y][x] = (bottom*256 + (top[x] - bottom) * weights_ver[y] + 128) >> 8
// with top[x] = topleft[1 + x], bottom = topleft[-height] and
// weights_ver = sm_weights + height.
//
// Register use:
//   x0, x6    dst pointers for alternating rows (x1 is doubled to 2*stride);
//             the 16+ wide path adds x5 and x8 for four rows in parallel
//   x2        top row pointer
//   w3, w4    remaining width / remaining height; w9 keeps the full width
//             in the 16+ wide path
//   x7        weights_ver pointer
//   v4        bottom pixel, replicated over all lanes
// The a, max_width and max_height arguments are never read; their registers
// (x5, x6, x7) are reused as scratch.
function ipred_smooth_v_8bpc_neon, export=1
        movrel          x7,  X(sm_weights)
        add             x7,  x7,  w4, uxtw
        // Jump table index: clz(width) - 25 gives 0..4 for width 64..4
        clz             w9,  w3
        adr             x5,  L(ipred_smooth_v_tbl)
        sub             x8,  x2,  w4, uxtw
        sub             w9,  w9,  #25
        ldrh            w9,  [x5, w9, uxtw #1]
        ld1r            {v4.16b},  [x8] // bottom
        add             x2,  x2,  #1
        // Table entries are distances back from the table to the entry point
        sub             x5,  x5,  w9, uxtw
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x5
        // width == 4: four rows per iteration, two rows packed per register
40:
        ld1r            {v6.2s}, [x2]             // top
        usubl           v6.8h,   v6.8b,   v4.8b   // top-bottom
4:
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
        shll            v22.8h,  v4.8b,   #8      // bottom*256
        shll            v23.8h,  v4.8b,   #8
        zip1            v16.2s,  v16.2s,  v17.2s  // weights_ver
        zip1            v18.2s,  v18.2s,  v19.2s
        uxtl            v16.8h,  v16.8b           // weights_ver
        uxtl            v18.8h,  v18.8b
        mla             v22.8h,  v6.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
        mla             v23.8h,  v6.8h,   v18.8h
        rshrn           v22.8b,  v22.8h,  #8      // rounding shift back to 8 bit
        rshrn           v23.8b,  v23.8h,  #8
        st1             {v22.s}[0], [x0], x1
        st1             {v22.s}[1], [x6], x1
        subs            w4,  w4,  #4
        st1             {v23.s}[0], [x0], x1
        st1             {v23.s}[1], [x6], x1
        b.gt            4b
        ret
        // width == 8: four rows per iteration, one row per register
80:
        ld1             {v6.8b}, [x2]             // top
        usubl           v6.8h,   v6.8b,   v4.8b   // top-bottom
8:
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
        shll            v24.8h,  v4.8b,   #8      // bottom*256
        shll            v25.8h,  v4.8b,   #8
        shll            v26.8h,  v4.8b,   #8
        shll            v27.8h,  v4.8b,   #8
        uxtl            v16.8h,  v16.8b           // weights_ver
        uxtl            v17.8h,  v17.8b
        uxtl            v18.8h,  v18.8b
        uxtl            v19.8h,  v19.8b
        mla             v24.8h,  v6.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
        mla             v25.8h,  v6.8h,   v17.8h
        mla             v26.8h,  v6.8h,   v18.8h
        mla             v27.8h,  v6.8h,   v19.8h
        rshrn           v24.8b,  v24.8h,  #8
        rshrn           v25.8b,  v25.8h,  #8
        rshrn           v26.8b,  v26.8h,  #8
        rshrn           v27.8b,  v27.8h,  #8
        st1             {v24.8b}, [x0], x1
        st1             {v25.8b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v26.8b}, [x0], x1
        st1             {v27.8b}, [x6], x1
        b.gt            8b
        ret
        // width 16, 32 and 64 share one path: four rows per outer iteration
        // (1:), 16 columns per inner iteration (2:).
160:
320:
640:
        // Set up pointers for four rows in parallel; x0, x6, x5, x8
        // (x1 is 2*stride here, so x5/x8 are two rows below x0/x6; x8 is
        // free again because bottom has already been loaded through it)
        add             x5,  x0,  x1
        add             x8,  x6,  x1
        lsl             x1,  x1,  #1
        // x1 = 4*stride - width: the stores already advance each row
        // pointer by width
        sub             x1,  x1,  w3, uxtw
        mov             w9,  w3

1:
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
        uxtl            v16.8h,  v16.8b           // weights_ver
        uxtl            v17.8h,  v17.8b
        uxtl            v18.8h,  v18.8b
        uxtl            v19.8h,  v19.8b
2:
        ld1             {v3.16b}, [x2],   #16     // top
        shll            v20.8h,  v4.8b,   #8      // bottom*256
        shll            v21.8h,  v4.8b,   #8
        shll            v22.8h,  v4.8b,   #8
        shll            v23.8h,  v4.8b,   #8
        shll            v24.8h,  v4.8b,   #8
        shll            v25.8h,  v4.8b,   #8
        shll            v26.8h,  v4.8b,   #8
        shll            v27.8h,  v4.8b,   #8
        usubl           v2.8h,   v3.8b,   v4.8b   // top-bottom
        usubl2          v3.8h,   v3.16b,  v4.16b
        mla             v20.8h,  v2.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
        mla             v21.8h,  v3.8h,   v16.8h
        mla             v22.8h,  v2.8h,   v17.8h
        mla             v23.8h,  v3.8h,   v17.8h
        mla             v24.8h,  v2.8h,   v18.8h
        mla             v25.8h,  v3.8h,   v18.8h
        mla             v26.8h,  v2.8h,   v19.8h
        mla             v27.8h,  v3.8h,   v19.8h
        rshrn           v20.8b,  v20.8h,  #8
        rshrn2          v20.16b, v21.8h,  #8
        rshrn           v22.8b,  v22.8h,  #8
        rshrn2          v22.16b, v23.8h,  #8
        rshrn           v24.8b,  v24.8h,  #8
        rshrn2          v24.16b, v25.8h,  #8
        rshrn           v26.8b,  v26.8h,  #8
        rshrn2          v26.16b, v27.8h,  #8
        subs            w3,  w3,  #16
        st1             {v20.16b}, [x0],  #16
        st1             {v22.16b}, [x6],  #16
        st1             {v24.16b}, [x5],  #16
        st1             {v26.16b}, [x8],  #16
        b.gt            2b
        subs            w4,  w4,  #4
        b.le            9f
        // Next four rows: rewind the top pointer to the start of the row
        // and step all dst pointers down four rows.
        sub             x2,  x2,  w9, uxtw
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        add             x5,  x5,  x1
        add             x8,  x8,  x1
        mov             w3,  w9
        b               1b
9:
        ret

        // 16-bit distances from the table back to each entry point,
        // indexed by clz(width) - 25 (width 64, 32, 16, 8, 4).
L(ipred_smooth_v_tbl):
        .hword L(ipred_smooth_v_tbl) - 640b
        .hword L(ipred_smooth_v_tbl) - 320b
        .hword L(ipred_smooth_v_tbl) - 160b
        .hword L(ipred_smooth_v_tbl) -  80b
        .hword L(ipred_smooth_v_tbl) -  40b
endfunc
1182
// void ipred_smooth_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const topleft,
//                               const int width, const int height, const int a,
//                               const int max_width, const int max_height);
//
// Horizontal smooth intra prediction. Per pixel, as computed below in
// 16-bit lanes:
//   dst[y][x] = (right*256 + (left[y] - right) * weights_hor[x] + 128) >> 8
// with left[y] = topleft[-(1 + y)], right = topleft[width] and
// weights_hor = sm_weights + width.
//
// Register use:
//   x0, x6    dst pointers for alternating rows (x1 is doubled to 2*stride);
//             the 16+ wide path adds x5 and x10 for four rows in parallel
//   x2, x7    left edge pointer (walks towards lower addresses) and its step
//   w3, w4    remaining width / remaining height; w9 keeps the full width
//             in the 16+ wide path
//   x8        weights_hor pointer
//   v5        right pixel, replicated over all lanes
// The a, max_width and max_height arguments are never read; their registers
// (x5, x6, x7) are reused as scratch.
function ipred_smooth_h_8bpc_neon, export=1
        movrel          x8,  X(sm_weights)
        add             x8,  x8,  w3, uxtw
        // Jump table index: clz(width) - 25 gives 0..4 for width 64..4
        clz             w9,  w3
        adr             x5,  L(ipred_smooth_h_tbl)
        add             x12, x2,  w3, uxtw
        sub             w9,  w9,  #25
        ldrh            w9,  [x5, w9, uxtw #1]
        ld1r            {v5.16b},  [x12] // right
        // Table entries are distances back from the table to the entry point
        sub             x5,  x5,  w9, uxtw
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x5
        // width == 4: four rows per iteration, two rows packed per register
40:
        ld1r            {v7.2s}, [x8]             // weights_hor
        sub             x2,  x2,  #4
        mov             x7,  #-4
        uxtl            v7.8h,   v7.8b            // weights_hor
4:
        // ld4r reads four left pixels in memory order, so v3 belongs to the
        // first row of this group and v0 to the fourth.
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 // left
        shll            v20.8h,  v5.8b,   #8      // right*256
        shll            v21.8h,  v5.8b,   #8
        zip1            v1.2s,   v1.2s,   v0.2s   // left, flipped
        zip1            v0.2s,   v3.2s,   v2.2s
        usubl           v0.8h,   v0.8b,   v5.8b   // left-right
        usubl           v1.8h,   v1.8b,   v5.8b
        mla             v20.8h,  v0.8h,   v7.8h   // right*256  + (left-right)*weights_hor
        mla             v21.8h,  v1.8h,   v7.8h
        rshrn           v20.8b,  v20.8h,  #8      // rounding shift back to 8 bit
        rshrn           v21.8b,  v21.8h,  #8
        st1             {v20.s}[0], [x0], x1
        st1             {v20.s}[1], [x6], x1
        subs            w4,  w4,  #4
        st1             {v21.s}[0], [x0], x1
        st1             {v21.s}[1], [x6], x1
        b.gt            4b
        ret
        // width == 8: four rows per iteration, one row per register
80:
        ld1             {v7.8b}, [x8]             // weights_hor
        sub             x2,  x2,  #4
        mov             x7,  #-4
        uxtl            v7.8h,   v7.8b            // weights_hor
8:
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 // left
        shll            v20.8h,  v5.8b,   #8      // right*256
        shll            v21.8h,  v5.8b,   #8
        shll            v22.8h,  v5.8b,   #8
        shll            v23.8h,  v5.8b,   #8
        usubl           v3.8h,   v3.8b,   v5.8b   // left-right
        usubl           v2.8h,   v2.8b,   v5.8b
        usubl           v1.8h,   v1.8b,   v5.8b
        usubl           v0.8h,   v0.8b,   v5.8b
        mla             v20.8h,  v3.8h,   v7.8h   // right*256  + (left-right)*weights_hor
        mla             v21.8h,  v2.8h,   v7.8h   // (left flipped)
        mla             v22.8h,  v1.8h,   v7.8h
        mla             v23.8h,  v0.8h,   v7.8h
        rshrn           v20.8b,  v20.8h,  #8
        rshrn           v21.8b,  v21.8h,  #8
        rshrn           v22.8b,  v22.8h,  #8
        rshrn           v23.8b,  v23.8h,  #8
        st1             {v20.8b}, [x0], x1
        st1             {v21.8b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v22.8b}, [x0], x1
        st1             {v23.8b}, [x6], x1
        b.gt            8b
        ret
        // width 16, 32 and 64 share one path: four rows per outer iteration
        // (1:), 16 columns per inner iteration (2:).
160:
320:
640:
        sub             x2,  x2,  #4
        mov             x7,  #-4
        // Set up pointers for four rows in parallel; x0, x6, x5, x10
        // (x1 is 2*stride here, so x5/x10 are two rows below x0/x6)
        add             x5,  x0,  x1
        add             x10, x6,  x1
        lsl             x1,  x1,  #1
        // x1 = 4*stride - width: the stores already advance each row
        // pointer by width
        sub             x1,  x1,  w3, uxtw
        mov             w9,  w3

1:
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},   [x2],  x7 // left
        usubl           v0.8h,   v0.8b,   v5.8b   // left-right
        usubl           v1.8h,   v1.8b,   v5.8b
        usubl           v2.8h,   v2.8b,   v5.8b
        usubl           v3.8h,   v3.8b,   v5.8b
2:
        ld1             {v7.16b}, [x8],   #16     // weights_hor
        shll            v20.8h,  v5.8b,   #8      // right*256
        shll            v21.8h,  v5.8b,   #8
        shll            v22.8h,  v5.8b,   #8
        shll            v23.8h,  v5.8b,   #8
        shll            v24.8h,  v5.8b,   #8
        shll            v25.8h,  v5.8b,   #8
        shll            v26.8h,  v5.8b,   #8
        shll            v27.8h,  v5.8b,   #8
        uxtl            v6.8h,   v7.8b            // weights_hor
        uxtl2           v7.8h,   v7.16b
        mla             v20.8h,  v3.8h,   v6.8h   // right*256  + (left-right)*weights_hor
        mla             v21.8h,  v3.8h,   v7.8h   // (left flipped)
        mla             v22.8h,  v2.8h,   v6.8h
        mla             v23.8h,  v2.8h,   v7.8h
        mla             v24.8h,  v1.8h,   v6.8h
        mla             v25.8h,  v1.8h,   v7.8h
        mla             v26.8h,  v0.8h,   v6.8h
        mla             v27.8h,  v0.8h,   v7.8h
        rshrn           v20.8b,  v20.8h,  #8
        rshrn2          v20.16b, v21.8h,  #8
        rshrn           v22.8b,  v22.8h,  #8
        rshrn2          v22.16b, v23.8h,  #8
        rshrn           v24.8b,  v24.8h,  #8
        rshrn2          v24.16b, v25.8h,  #8
        rshrn           v26.8b,  v26.8h,  #8
        rshrn2          v26.16b, v27.8h,  #8
        subs            w3,  w3,  #16
        st1             {v20.16b}, [x0],  #16
        st1             {v22.16b}, [x6],  #16
        st1             {v24.16b}, [x5],  #16
        st1             {v26.16b}, [x10], #16
        b.gt            2b
        subs            w4,  w4,  #4
        b.le            9f
        // Next four rows: rewind the weights_hor pointer to the start of
        // the row and step all dst pointers down four rows.
        sub             x8,  x8,  w9, uxtw
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        add             x5,  x5,  x1
        add             x10, x10, x1
        mov             w3,  w9
        b               1b
9:
        ret

        // 16-bit distances from the table back to each entry point,
        // indexed by clz(width) - 25 (width 64, 32, 16, 8, 4).
L(ipred_smooth_h_tbl):
        .hword L(ipred_smooth_h_tbl) - 640b
        .hword L(ipred_smooth_h_tbl) - 320b
        .hword L(ipred_smooth_h_tbl) - 160b
        .hword L(ipred_smooth_h_tbl) -  80b
        .hword L(ipred_smooth_h_tbl) -  40b
endfunc
1325
1326// void ipred_filter_8bpc_neon(pixel *dst, const ptrdiff_t stride,
1327//                             const pixel *const topleft,
1328//                             const int width, const int height, const int filt_idx,
1329//                             const int max_width, const int max_height);
1330function ipred_filter_8bpc_neon, export=1
1331        and             w5,  w5,  #511
1332        movrel          x6,  X(filter_intra_taps)
1333        lsl             w5,  w5,  #6
1334        add             x6,  x6,  w5, uxtw
1335        ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
1336        clz             w9,  w3
1337        adr             x5,  L(ipred_filter_tbl)
1338        ld1             {v20.8b, v21.8b, v22.8b}, [x6]
1339        sub             w9,  w9,  #26
1340        ldrh            w9,  [x5, w9, uxtw #1]
1341        sxtl            v16.8h,  v16.8b
1342        sxtl            v17.8h,  v17.8b
1343        sub             x5,  x5,  w9, uxtw
1344        sxtl            v18.8h,  v18.8b
1345        sxtl            v19.8h,  v19.8b
1346        add             x6,  x0,  x1
1347        lsl             x1,  x1,  #1
1348        sxtl            v20.8h,  v20.8b
1349        sxtl            v21.8h,  v21.8b
1350        sxtl            v22.8h,  v22.8b
1351        br              x5
135240:
1353        ldur            s0,  [x2, #1]             // top (0-3)
1354        sub             x2,  x2,  #2
1355        mov             x7,  #-2
1356        uxtl            v0.8h,   v0.8b            // top (0-3)
13574:
1358        ld1             {v1.s}[0], [x2], x7       // left (0-1) + topleft (2)
1359        mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
1360        mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
1361        mla             v2.8h,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
1362        uxtl            v1.8h,   v1.8b            // left (0-1) + topleft (2)
1363        mla             v2.8h,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
1364        mla             v2.8h,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
1365        mla             v2.8h,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
1366        mla             v2.8h,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
1367        sqrshrun        v2.8b,   v2.8h,   #4
1368        subs            w4,  w4,  #2
1369        st1             {v2.s}[0], [x0], x1
1370        uxtl            v0.8h,   v2.8b
1371        st1             {v2.s}[1], [x6], x1
1372        ext             v0.16b,  v0.16b,  v0.16b, #8 // move top from [4-7] to [0-3]
1373        b.gt            4b
1374        ret
137580:
1376        ldur            d0,  [x2, #1]             // top (0-7)
1377        sub             x2,  x2,  #2
1378        mov             x7,  #-2
1379        uxtl            v0.8h,   v0.8b            // top (0-7)
13808:
1381        ld1             {v1.s}[0], [x2], x7       // left (0-1) + topleft (2)
1382        mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
1383        mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
1384        mla             v2.8h,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
1385        uxtl            v1.8h,   v1.8b            // left (0-1) + topleft (2)
1386        mla             v2.8h,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
1387        mla             v2.8h,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
1388        mla             v2.8h,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
1389        mla             v2.8h,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
1390        mul             v3.8h,   v17.8h,  v0.h[4] // p1(top[0]) * filter(1)
1391        mla             v3.8h,   v18.8h,  v0.h[5] // p2(top[1]) * filter(2)
1392        mla             v3.8h,   v19.8h,  v0.h[6] // p3(top[2]) * filter(3)
1393        sqrshrun        v2.8b,   v2.8h,   #4
1394        uxtl            v1.8h,   v2.8b            // first block, in 16 bit
1395        mla             v3.8h,   v20.8h,  v0.h[7] // p4(top[3]) * filter(4)
1396        mla             v3.8h,   v16.8h,  v0.h[3] // p0(topleft) * filter(0)
1397        mla             v3.8h,   v21.8h,  v1.h[3] // p5(left[0]) * filter(5)
1398        mla             v3.8h,   v22.8h,  v1.h[7] // p6(left[1]) * filter(6)
1399        sqrshrun        v3.8b,   v3.8h,   #4
1400        subs            w4,  w4,  #2
1401        st2             {v2.s, v3.s}[0], [x0], x1
1402        zip2            v0.2s,   v2.2s,   v3.2s
1403        st2             {v2.s, v3.s}[1], [x6], x1
1404        uxtl            v0.8h,   v0.8b
1405        b.gt            8b
1406        ret
1407160:
1408320:
1409        add             x8,  x2,  #1
1410        sub             x2,  x2,  #2
1411        mov             x7,  #-2
1412        sub             x1,  x1,  w3, uxtw
1413        mov             w9,  w3
1414
14151:
1416        ld1             {v0.s}[0], [x2], x7       // left (0-1) + topleft (2)
1417        uxtl            v0.8h,   v0.8b            // left (0-1) + topleft (2)
14182:
1419        ld1             {v2.16b}, [x8],   #16     // top(0-15)
1420        mul             v3.8h,   v16.8h,  v0.h[2] // p0(topleft) * filter(0)
1421        mla             v3.8h,   v21.8h,  v0.h[1] // p5(left[0]) * filter(5)
1422        uxtl            v1.8h,   v2.8b            // top(0-7)
1423        uxtl2           v2.8h,   v2.16b           // top(8-15)
1424        mla             v3.8h,   v22.8h,  v0.h[0] // p6(left[1]) * filter(6)
1425        mla             v3.8h,   v17.8h,  v1.h[0] // p1(top[0]) * filter(1)
1426        mla             v3.8h,   v18.8h,  v1.h[1] // p2(top[1]) * filter(2)
1427        mla             v3.8h,   v19.8h,  v1.h[2] // p3(top[2]) * filter(3)
1428        mla             v3.8h,   v20.8h,  v1.h[3] // p4(top[3]) * filter(4)
1429
1430        mul             v4.8h,   v17.8h,  v1.h[4] // p1(top[0]) * filter(1)
1431        mla             v4.8h,   v18.8h,  v1.h[5] // p2(top[1]) * filter(2)
1432        mla             v4.8h,   v19.8h,  v1.h[6] // p3(top[2]) * filter(3)
1433        sqrshrun        v3.8b,   v3.8h,   #4
1434        uxtl            v0.8h,   v3.8b            // first block, in 16 bit
1435        mla             v4.8h,   v20.8h,  v1.h[7] // p4(top[3]) * filter(4)
1436        mla             v4.8h,   v16.8h,  v1.h[3] // p0(topleft) * filter(0)
1437        mla             v4.8h,   v21.8h,  v0.h[3] // p5(left[0]) * filter(5)
1438        mla             v4.8h,   v22.8h,  v0.h[7] // p6(left[1]) * filter(6)
1439
1440        mul             v5.8h,   v17.8h,  v2.h[0] // p1(top[0]) * filter(1)
1441        mla             v5.8h,   v18.8h,  v2.h[1] // p2(top[1]) * filter(2)
1442        mla             v5.8h,   v19.8h,  v2.h[2] // p3(top[2]) * filter(3)
1443        sqrshrun        v4.8b,   v4.8h,   #4
1444        uxtl            v0.8h,   v4.8b            // second block, in 16 bit
1445        mla             v5.8h,   v20.8h,  v2.h[3] // p4(top[3]) * filter(4)
1446        mla             v5.8h,   v16.8h,  v1.h[7] // p0(topleft) * filter(0)
1447        mla             v5.8h,   v21.8h,  v0.h[3] // p5(left[0]) * filter(5)
1448        mla             v5.8h,   v22.8h,  v0.h[7] // p6(left[1]) * filter(6)
1449
1450        mul             v6.8h,   v17.8h,  v2.h[4] // p1(top[0]) * filter(1)
1451        mla             v6.8h,   v18.8h,  v2.h[5] // p2(top[1]) * filter(2)
1452        mla             v6.8h,   v19.8h,  v2.h[6] // p3(top[2]) * filter(3)
1453        sqrshrun        v5.8b,   v5.8h,   #4
1454        uxtl            v0.8h,   v5.8b            // third block, in 16 bit
1455        mla             v6.8h,   v20.8h,  v2.h[7] // p4(top[3]) * filter(4)
1456        mla             v6.8h,   v16.8h,  v2.h[3] // p0(topleft) * filter(0)
1457        mla             v6.8h,   v21.8h,  v0.h[3] // p5(left[0]) * filter(5)
1458        mla             v6.8h,   v22.8h,  v0.h[7] // p6(left[1]) * filter(6)
1459
1460        subs            w3,  w3,  #16
1461        sqrshrun        v6.8b,   v6.8h,   #4
1462
1463        st4             {v3.s, v4.s, v5.s, v6.s}[0], [x0], #16
1464        st4             {v3.s, v4.s, v5.s, v6.s}[1], [x6], #16
1465        b.le            8f
1466        ins             v0.h[2], v2.h[7]
1467        ins             v0.b[0], v6.b[7]
1468        ins             v0.b[2], v6.b[3]
1469        b               2b
14708:
1471        subs            w4,  w4,  #2
1472        b.le            9f
1473        sub             x8,  x6,  w9, uxtw
1474        add             x0,  x0,  x1
1475        add             x6,  x6,  x1
1476        mov             w3,  w9
1477        b               1b
14789:
1479        ret
1480
1481L(ipred_filter_tbl):
1482        .hword L(ipred_filter_tbl) - 320b
1483        .hword L(ipred_filter_tbl) - 160b
1484        .hword L(ipred_filter_tbl) -  80b
1485        .hword L(ipred_filter_tbl) -  40b
1486endfunc
1487
// void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                         const uint16_t *const pal, const uint8_t *idx,
//                         const int w, const int h);
//
// Replaces every index byte read from idx with the palette entry it selects
// and stores the result to dst.
//
// Register use:
//   x0      dst, rows 0, 2, 4, ...
//   x1      stride on entry, 2*stride after the prologue
//   x2      pal on entry; reused as dst + stride (rows 1, 3, 5, ...)
//   x3      idx, consumed sequentially
//   w4      w, only used to select the width-specific loop
//   w5      h, rows left to write
//   x6, w9  jump table address / table entry
//   v0      palette narrowed to bytes, used as the tbl lookup table
// Vector registers written besides v0: v1-v4 and v16-v23.
function pal_pred_8bpc_neon, export=1
        ld1             {v0.8h}, [x2]             // 8 palette entries, 16 bit each
        clz             w9,  w4
        adr             x6,  L(pal_pred_tbl)
        sub             w9,  w9,  #25             // table index: 0 for w=64 ... 4 for w=4
        ldrh            w9,  [x6, w9, uxtw #1]
        xtn             v0.8b,  v0.8h             // keep the low byte of each entry
        sub             x6,  x6,  w9, uxtw        // table entries are distances back from the table
        add             x2,  x0,  x1              // x2 = second row
        lsl             x1,  x1,  #1              // both row pointers advance two rows per store
        br              x6
4:      // 16 indices = 4 rows of 4 pixels per iteration
        ld1             {v1.16b}, [x3], #16
        subs            w5,  w5,  #4
        tbl             v1.16b, {v0.16b}, v1.16b
        st1             {v1.s}[0], [x0], x1
        st1             {v1.s}[1], [x2], x1
        st1             {v1.s}[2], [x0], x1
        st1             {v1.s}[3], [x2], x1
        b.gt            4b
        ret
8:      // 32 indices = 4 rows of 8 pixels per iteration
        ld1             {v1.16b, v2.16b}, [x3], #32
        subs            w5,  w5,  #4
        tbl             v1.16b, {v0.16b}, v1.16b
        st1             {v1.d}[0], [x0], x1
        tbl             v2.16b, {v0.16b}, v2.16b
        st1             {v1.d}[1], [x2], x1
        st1             {v2.d}[0], [x0], x1
        st1             {v2.d}[1], [x2], x1
        b.gt            8b
        ret
16:     // 64 indices = 4 rows of 16 pixels per iteration
        ld1             {v1.16b, v2.16b, v3.16b, v4.16b}, [x3], #64
        subs            w5,  w5,  #4
        tbl             v1.16b, {v0.16b}, v1.16b
        tbl             v2.16b, {v0.16b}, v2.16b
        st1             {v1.16b}, [x0], x1
        tbl             v3.16b, {v0.16b}, v3.16b
        st1             {v2.16b}, [x2], x1
        tbl             v4.16b, {v0.16b}, v4.16b
        st1             {v3.16b}, [x0], x1
        st1             {v4.16b}, [x2], x1
        b.gt            16b
        ret
32:     // 128 indices = 4 rows of 32 pixels per iteration
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64
        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x3], #64
        subs            w5,  w5,  #4
        tbl             v16.16b, {v0.16b}, v16.16b
        tbl             v17.16b, {v0.16b}, v17.16b
        tbl             v18.16b, {v0.16b}, v18.16b
        tbl             v19.16b, {v0.16b}, v19.16b
        tbl             v20.16b, {v0.16b}, v20.16b
        st1             {v16.16b, v17.16b}, [x0], x1
        tbl             v21.16b, {v0.16b}, v21.16b
        st1             {v18.16b, v19.16b}, [x2], x1
        tbl             v22.16b, {v0.16b}, v22.16b
        st1             {v20.16b, v21.16b}, [x0], x1
        tbl             v23.16b, {v0.16b}, v23.16b
        st1             {v22.16b, v23.16b}, [x2], x1
        b.gt            32b
        ret
64:     // 128 indices = 2 rows of 64 pixels per iteration
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64
        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x3], #64
        subs            w5,  w5,  #2
        tbl             v16.16b, {v0.16b}, v16.16b
        tbl             v17.16b, {v0.16b}, v17.16b
        tbl             v18.16b, {v0.16b}, v18.16b
        tbl             v19.16b, {v0.16b}, v19.16b
        st1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
        tbl             v20.16b, {v0.16b}, v20.16b
        tbl             v21.16b, {v0.16b}, v21.16b
        tbl             v22.16b, {v0.16b}, v22.16b
        tbl             v23.16b, {v0.16b}, v23.16b
        st1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x1
        b.gt            64b
        ret

// Indexed by clz(w) - 25, i.e. in the order w = 64, 32, 16, 8, 4.
L(pal_pred_tbl):
        .hword L(pal_pred_tbl) - 64b
        .hword L(pal_pred_tbl) - 32b
        .hword L(pal_pred_tbl) - 16b
        .hword L(pal_pred_tbl) -  8b
        .hword L(pal_pred_tbl) -  4b
endfunc
1578
// void ipred_cfl_128_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                              const pixel *const topleft,
//                              const int width, const int height,
//                              const int16_t *ac, const int alpha);
//
// CfL prediction with a fixed dc of 128. Every output pixel is
//   iclip_pixel(dc + ((ac * alpha + sign + 32) >> 6))
// where sign is the arithmetic shift of (ac * alpha) right by 15 and the
// product is kept in 16 bit lanes.
//
// The L(ipred_cfl_splat_w*) loops and L(ipred_cfl_splat_tbl) are shared:
// the other ipred_cfl_*_8bpc_neon functions in this file compute their own
// dc and then branch here. On entry to a splat loop:
//   x0      dst, first row            x6      dst + stride, second row
//   x1      2*stride                  x5      ac
//   w3      width (read by the w16 loop only)
//   w4      height
//   v0.8h   dc                        v1.8h   alpha
function ipred_cfl_128_8bpc_neon, export=1
        clz             w9,  w3
        adr             x7,  L(ipred_cfl_128_tbl)
        sub             w9,  w9,  #26             // table index: 0 for width 32 ... 3 for width 4
        ldrh            w9,  [x7, w9, uxtw #1]
        movi            v0.8h,   #128 // dc
        dup             v1.8h,   w6   // alpha
        sub             x7,  x7,  w9, uxtw        // table entries are distances back from the table
        add             x6,  x0,  x1              // x6 = second row; w6 (alpha) is no longer needed
        lsl             x1,  x1,  #1              // both row pointers advance two rows per store
        br              x7
L(ipred_cfl_splat_w4):
        // 16 ac values = 4 rows of 4 pixels per iteration
        ld1             {v2.8h, v3.8h}, [x5], #32
        mul             v2.8h,   v2.8h,   v1.8h  // diff = ac * alpha
        mul             v3.8h,   v3.8h,   v1.8h
        sshr            v4.8h,   v2.8h,   #15    // sign = diff >> 15
        sshr            v5.8h,   v3.8h,   #15
        add             v2.8h,   v2.8h,   v4.8h  // diff + sign
        add             v3.8h,   v3.8h,   v5.8h
        srshr           v2.8h,   v2.8h,   #6     // (diff + sign + 32) >> 6 = apply_sign()
        srshr           v3.8h,   v3.8h,   #6
        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
        add             v3.8h,   v3.8h,   v0.8h
        sqxtun          v2.8b,   v2.8h           // iclip_pixel(dc + apply_sign())
        sqxtun          v3.8b,   v3.8h
        st1             {v2.s}[0],  [x0], x1
        st1             {v2.s}[1],  [x6], x1
        subs            w4,  w4,  #4
        st1             {v3.s}[0],  [x0], x1
        st1             {v3.s}[1],  [x6], x1
        b.gt            L(ipred_cfl_splat_w4)
        ret
L(ipred_cfl_splat_w8):
        // 32 ac values = 4 rows of 8 pixels per iteration
        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x5], #64
        mul             v2.8h,   v2.8h,   v1.8h  // diff = ac * alpha
        mul             v3.8h,   v3.8h,   v1.8h
        mul             v4.8h,   v4.8h,   v1.8h
        mul             v5.8h,   v5.8h,   v1.8h
        sshr            v16.8h,  v2.8h,   #15    // sign = diff >> 15
        sshr            v17.8h,  v3.8h,   #15
        sshr            v18.8h,  v4.8h,   #15
        sshr            v19.8h,  v5.8h,   #15
        add             v2.8h,   v2.8h,   v16.8h // diff + sign
        add             v3.8h,   v3.8h,   v17.8h
        add             v4.8h,   v4.8h,   v18.8h
        add             v5.8h,   v5.8h,   v19.8h
        srshr           v2.8h,   v2.8h,   #6     // (diff + sign + 32) >> 6 = apply_sign()
        srshr           v3.8h,   v3.8h,   #6
        srshr           v4.8h,   v4.8h,   #6
        srshr           v5.8h,   v5.8h,   #6
        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
        add             v3.8h,   v3.8h,   v0.8h
        add             v4.8h,   v4.8h,   v0.8h
        add             v5.8h,   v5.8h,   v0.8h
        sqxtun          v2.8b,   v2.8h           // iclip_pixel(dc + apply_sign())
        sqxtun          v3.8b,   v3.8h
        sqxtun          v4.8b,   v4.8h
        sqxtun          v5.8b,   v5.8h
        st1             {v2.8b},  [x0], x1
        st1             {v3.8b},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v4.8b},  [x0], x1
        st1             {v5.8b},  [x6], x1
        b.gt            L(ipred_cfl_splat_w8)
        ret
L(ipred_cfl_splat_w16):
        // Used for width 16 and 32: two rows at a time, 16 pixels per pass
        add             x7,  x5,  w3, uxtw #1     // x7 = ac for the second row
        sub             x1,  x1,  w3, uxtw        // 2*stride minus the bytes stored per row
        mov             w9,  w3                   // keep width to restart each row pair
1:
        ld1             {v2.8h, v3.8h}, [x5], #32
        ld1             {v4.8h, v5.8h}, [x7], #32
        mul             v2.8h,   v2.8h,   v1.8h  // diff = ac * alpha
        mul             v3.8h,   v3.8h,   v1.8h
        mul             v4.8h,   v4.8h,   v1.8h
        mul             v5.8h,   v5.8h,   v1.8h
        sshr            v16.8h,  v2.8h,   #15    // sign = diff >> 15
        sshr            v17.8h,  v3.8h,   #15
        sshr            v18.8h,  v4.8h,   #15
        sshr            v19.8h,  v5.8h,   #15
        add             v2.8h,   v2.8h,   v16.8h // diff + sign
        add             v3.8h,   v3.8h,   v17.8h
        add             v4.8h,   v4.8h,   v18.8h
        add             v5.8h,   v5.8h,   v19.8h
        srshr           v2.8h,   v2.8h,   #6     // (diff + sign + 32) >> 6 = apply_sign()
        srshr           v3.8h,   v3.8h,   #6
        srshr           v4.8h,   v4.8h,   #6
        srshr           v5.8h,   v5.8h,   #6
        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
        add             v3.8h,   v3.8h,   v0.8h
        add             v4.8h,   v4.8h,   v0.8h
        add             v5.8h,   v5.8h,   v0.8h
        sqxtun          v2.8b,   v2.8h           // iclip_pixel(dc + apply_sign())
        sqxtun          v3.8b,   v3.8h
        sqxtun          v4.8b,   v4.8h
        sqxtun          v5.8b,   v5.8h
        subs            w3,  w3,  #16
        st1             {v2.8b, v3.8b},  [x0], #16
        st1             {v4.8b, v5.8b},  [x6], #16
        b.gt            1b
        // Row pair done: step each ac pointer over the row the other one read
        subs            w4,  w4,  #2
        add             x5,  x5,  w9, uxtw #1
        add             x7,  x7,  w9, uxtw #1
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        mov             w3,  w9
        b.gt            1b
        ret

// Indexed by clz(width) - 26, i.e. in the order width = 32, 16, 8, 4.
// Both names label the same table.
L(ipred_cfl_128_tbl):
L(ipred_cfl_splat_tbl):
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8)
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4)
endfunc
1699
// void ipred_cfl_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                              const pixel *const topleft,
//                              const int width, const int height,
//                              const int16_t *ac, const int alpha);
//
// dc is the rounded average of the `width` pixels that start one byte after
// topleft. Prediction then continues in the L(ipred_cfl_splat_w*) loop that
// matches the width, with v0.8h = dc and v1.8h = alpha.
function ipred_cfl_top_8bpc_neon, export=1
        clz             w9,  w3
        adr             x7,  L(ipred_cfl_top_tbl)
        sub             w9,  w9,  #26             // table index: 0 for width 32 ... 3 for width 4
        ldrh            w9,  [x7, w9, uxtw #1]
        dup             v1.8h,   w6   // alpha
        add             x2,  x2,  #1              // x2 = topleft + 1
        sub             x7,  x7,  w9, uxtw        // table entries are distances back from the table
        add             x6,  x0,  x1              // x6 = second row; w6 (alpha) is no longer needed
        lsl             x1,  x1,  #1              // x1 = 2*stride
        br              x7
4:
        ld1r            {v0.2s},  [x2]            // the 4 pixels, present twice in the vector
        uaddlv          h0,      v0.8b            // = 2 * sum of the 4 pixels
        urshr           v0.4h,   v0.4h,   #3      // (2*sum + 4) >> 3 = (sum + 2) >> 2
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w4)
8:
        ld1             {v0.8b},  [x2]
        uaddlv          h0,      v0.8b
        urshr           v0.4h,   v0.4h,   #3      // (sum + 4) >> 3
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w8)
16:
        ld1             {v0.16b}, [x2]
        uaddlv          h0,      v0.16b
        urshr           v0.4h,   v0.4h,   #4      // (sum + 8) >> 4
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w16)
32:
        ld1             {v2.16b, v3.16b}, [x2]
        uaddlv          h2,      v2.16b
        uaddlv          h3,      v3.16b
        add             v2.4h,   v2.4h,   v3.4h   // sum of all 32 pixels
        urshr           v2.4h,   v2.4h,   #5      // (sum + 16) >> 5
        dup             v0.8h,   v2.h[0]
        b               L(ipred_cfl_splat_w16)    // the w16 loop also handles width 32

// Indexed by clz(width) - 26, i.e. in the order width = 32, 16, 8, 4.
L(ipred_cfl_top_tbl):
        .hword L(ipred_cfl_top_tbl) - 32b
        .hword L(ipred_cfl_top_tbl) - 16b
        .hword L(ipred_cfl_top_tbl) -  8b
        .hword L(ipred_cfl_top_tbl) -  4b
endfunc
1748
// void ipred_cfl_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const topleft,
//                               const int width, const int height,
//                               const int16_t *ac, const int alpha);
//
// dc is the rounded average of the `height` pixels stored directly before
// topleft. Two table lookups are done up front:
//   x7 = handler that averages `height` pixels (L(ipred_cfl_left_tbl))
//   x9 = splat loop for `width`                (L(ipred_cfl_splat_tbl))
// Each handler leaves dc in v0.8h and branches to x9; v1.8h holds alpha.
function ipred_cfl_left_8bpc_neon, export=1
        sub             x2,  x2,  w4, uxtw        // x2 = topleft - height
        clz             w9,  w3
        clz             w8,  w4
        adr             x10, L(ipred_cfl_splat_tbl)
        adr             x7,  L(ipred_cfl_left_tbl)
        sub             w9,  w9,  #26             // width index:  0 for 32 ... 3 for 4
        sub             w8,  w8,  #26             // height index: 0 for 32 ... 3 for 4
        ldrh            w9,  [x10, w9, uxtw #1]
        ldrh            w8,  [x7,  w8, uxtw #1]
        dup             v1.8h,   w6   // alpha
        sub             x9,  x10, w9, uxtw        // table entries are distances back from the table
        sub             x7,  x7,  w8, uxtw
        add             x6,  x0,  x1              // x6 = second row; w6 (alpha) is no longer needed
        lsl             x1,  x1,  #1              // x1 = 2*stride
        br              x7

L(ipred_cfl_left_h4):
        ld1r            {v0.2s},  [x2]            // the 4 pixels, present twice in the vector
        uaddlv          h0,      v0.8b            // = 2 * sum of the 4 pixels
        urshr           v0.4h,   v0.4h,   #3      // (2*sum + 4) >> 3 = (sum + 2) >> 2
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_h8):
        ld1             {v0.8b},  [x2]
        uaddlv          h0,      v0.8b
        urshr           v0.4h,   v0.4h,   #3      // (sum + 4) >> 3
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_h16):
        ld1             {v0.16b}, [x2]
        uaddlv          h0,      v0.16b
        urshr           v0.4h,   v0.4h,   #4      // (sum + 8) >> 4
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_h32):
        ld1             {v2.16b, v3.16b}, [x2]
        uaddlv          h2,      v2.16b
        uaddlv          h3,      v3.16b
        add             v2.4h,   v2.4h,   v3.4h   // sum of all 32 pixels
        urshr           v2.4h,   v2.4h,   #5      // (sum + 16) >> 5
        dup             v0.8h,   v2.h[0]
        br              x9

// Indexed by clz(height) - 26, i.e. in the order height = 32, 16, 8, 4.
L(ipred_cfl_left_tbl):
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32)
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16)
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8)
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4)
endfunc
1806
// void ipred_cfl_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                          const pixel *const topleft,
//                          const int width, const int height,
//                          const int16_t *ac, const int alpha);
//
// dc is the average of the `height` pixels stored directly before topleft
// and the `width` pixels stored directly after it:
//   sum = left + top + ((width + height) >> 1)
//   dc  = sum >> ctz(width + height)
// When width != height, width + height is 3 or 5 times a power of two, so
// dc is additionally multiplied by 0x5556 (about 65536/3) or 0x3334 (about
// 65536/5) and shifted right by 16. sqdmulh doubles the product, which is
// why half of each constant is loaded.
//
// Dispatch is two-staged through L(ipred_cfl_tbl):
//   x7 = L(ipred_cfl_h*), sums the left pixels into v0, then branches to x9
//   x9 = L(ipred_cfl_w*), adds the top pixels, finishes dc and branches to
//        the matching L(ipred_cfl_splat_w*) loop
// Register use after the prologue:
//   x2      topleft - height, advanced to topleft by the h handler
//   w4      height
//   x6      dst + stride (alpha has been moved to v1.8h by then)
//   x1      2*stride
//   v16     (width + height) >> 1, later reused for the multiplier
//   v17     -ctz(width + height), used as a right shift through ushl
//   w16/w17 scratch for the multiplier selection
function ipred_cfl_8bpc_neon, export=1
        sub             x2,  x2,  w4, uxtw
        add             w8,  w3,  w4             // width + height
        dup             v1.8h,   w6              // alpha
        clz             w9,  w3
        clz             w6,  w4
        dup             v16.8h, w8               // width + height
        adr             x7,  L(ipred_cfl_tbl)
        rbit            w8,  w8                  // rbit(width + height)
        sub             w9,  w9,  #22            // 26 leading bits, minus table offset 4
        sub             w6,  w6,  #26
        clz             w8,  w8                  // ctz(width + height)
        ldrh            w9,  [x7, w9, uxtw #1]
        ldrh            w6,  [x7, w6, uxtw #1]
        neg             w8,  w8                  // -ctz(width + height)
        sub             x9,  x7,  w9, uxtw
        sub             x7,  x7,  w6, uxtw
        ushr            v16.8h,  v16.8h,  #1     // (width + height) >> 1
        dup             v17.8h,  w8              // -ctz(width + height)
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x7

L(ipred_cfl_h4):
        ld1             {v0.s}[0],  [x2], #4
        ins             v0.s[1], wzr             // clear the 4 bytes that were not loaded
        uaddlv          h0,      v0.8b
        br              x9
L(ipred_cfl_w4):
        add             x2,  x2,  #1             // x2 = topleft + 1
        ld1             {v2.s}[0],  [x2]
        ins             v2.s[1], wzr             // clear the 4 bytes that were not loaded
        add             v0.4h,   v0.4h,   v16.4h
        uaddlv          h2,      v2.8b
        cmp             w4,  #4
        add             v0.4h,   v0.4h,   v2.4h
        ushl            v0.4h,   v0.4h,   v17.4h // >> ctz(width + height)
        b.eq            1f
        // h = 8/16
        // w16 = (0x5556/2) << 16 | (0x3334/2). A register lsr uses the shift
        // amount modulo 32, so h = 8 selects the high half (4 + 8 = 3 * 4)
        // and h = 16 leaves the low half in place (4 + 16 = 5 * 4).
        mov             w16, #(0x3334/2)
        movk            w16, #(0x5556/2), lsl #16
        add             w17, w4,  w4  // w17 = 2*h = 16 or 32
        lsr             w16, w16, w17
        dup             v16.4h,  w16
        sqdmulh         v0.4h,   v0.4h,   v16.4h
1:
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w4)

L(ipred_cfl_h8):
        ld1             {v0.8b},  [x2], #8
        uaddlv          h0,      v0.8b
        br              x9
L(ipred_cfl_w8):
        add             x2,  x2,  #1             // x2 = topleft + 1
        ld1             {v2.8b},  [x2]
        add             v0.4h,   v0.4h,   v16.4h
        uaddlv          h2,      v2.8b
        cmp             w4,  #8
        add             v0.4h,   v0.4h,   v2.4h
        ushl            v0.4h,   v0.4h,   v17.4h // >> ctz(width + height)
        b.eq            1f
        // h = 4/16/32
        // 8 + 32 = 5 * 8 needs the 1/5 constant; 8 + 4 and 8 + 16 need 1/3.
        cmp             w4,  #32
        mov             w16, #(0x3334/2)
        mov             w17, #(0x5556/2)
        csel            w16, w16, w17, eq
        dup             v16.4h,  w16
        sqdmulh         v0.4h,   v0.4h,   v16.4h
1:
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w8)

L(ipred_cfl_h16):
        ld1             {v0.16b}, [x2], #16
        uaddlv          h0,      v0.16b
        br              x9
L(ipred_cfl_w16):
        add             x2,  x2,  #1             // x2 = topleft + 1
        ld1             {v2.16b}, [x2]
        add             v0.4h,   v0.4h,   v16.4h
        uaddlv          h2,      v2.16b
        cmp             w4,  #16
        add             v0.4h,   v0.4h,   v2.4h
        ushl            v0.4h,   v0.4h,   v17.4h // >> ctz(width + height)
        b.eq            1f
        // h = 4/8/32
        // 16 + 4 = 5 * 4 needs the 1/5 constant; 16 + 8 and 16 + 32 need 1/3.
        cmp             w4,  #4
        mov             w16, #(0x3334/2)
        mov             w17, #(0x5556/2)
        csel            w16, w16, w17, eq
        dup             v16.4h,  w16
        sqdmulh         v0.4h,   v0.4h,   v16.4h
1:
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w16)

L(ipred_cfl_h32):
        ld1             {v2.16b, v3.16b}, [x2], #32
        uaddlv          h2,      v2.16b
        uaddlv          h3,      v3.16b
        add             v0.4h,   v2.4h,   v3.4h
        br              x9
L(ipred_cfl_w32):
        add             x2,  x2,  #1             // x2 = topleft + 1
        ld1             {v2.16b, v3.16b}, [x2]
        add             v0.4h,   v0.4h,   v16.4h
        uaddlv          h2,      v2.16b
        uaddlv          h3,      v3.16b
        cmp             w4,  #32
        add             v0.4h,   v0.4h,   v2.4h
        add             v0.4h,   v0.4h,   v3.4h
        ushl            v0.4h,   v0.4h,   v17.4h // >> ctz(width + height)
        b.eq            1f
        // h = 8/16
        // Halves swapped relative to the w4 case: h = 8 selects 0x3334/2
        // (32 + 8 = 5 * 8), h = 16 keeps 0x5556/2 (32 + 16 = 3 * 16).
        mov             w16, #(0x5556/2)
        movk            w16, #(0x3334/2), lsl #16
        add             w17, w4,  w4  // w17 = 2*h = 16 or 32
        lsr             w16, w16, w17
        dup             v16.4h,  w16
        sqdmulh         v0.4h,   v0.4h,   v16.4h
1:
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w16)   // the w16 loop also handles width 32

// Entries 0-3 are indexed by clz(height) - 26, entries 4-7 by
// clz(width) - 22; both run in the order 32, 16, 8, 4.
L(ipred_cfl_tbl):
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4)
endfunc
1946
// void ipred_cfl_ac_420_8bpc_neon(int16_t *const ac, const pixel *const ypx,
//                                 const ptrdiff_t stride, const int w_pad,
//                                 const int h_pad, const int cw, const int ch);
1950function ipred_cfl_ac_420_8bpc_neon, export=1
1951        clz             w8,  w5
1952        lsl             w4,  w4,  #2
1953        adr             x7,  L(ipred_cfl_ac_420_tbl)
1954        sub             w8,  w8,  #27
1955        ldrh            w8,  [x7, w8, uxtw #1]
1956        movi            v16.8h,  #0
1957        movi            v17.8h,  #0
1958        movi            v18.8h,  #0
1959        movi            v19.8h,  #0
1960        sub             x7,  x7,  w8, uxtw
1961        sub             w8,  w6,  w4         // height - h_pad
1962        rbit            w9,  w5              // rbit(width)
1963        rbit            w10, w6              // rbit(height)
1964        clz             w9,  w9              // ctz(width)
1965        clz             w10, w10             // ctz(height)
1966        add             w9,  w9,  w10        // log2sz
1967        add             x10, x1,  x2
1968        dup             v31.4s,  w9
1969        lsl             x2,  x2,  #1
1970        neg             v31.4s,  v31.4s      // -log2sz
1971        br              x7
1972
1973L(ipred_cfl_ac_420_w4):
19741:      // Copy and subsample input
1975        ld1             {v0.8b},   [x1],  x2
1976        ld1             {v1.8b},   [x10], x2
1977        ld1             {v0.d}[1], [x1],  x2
1978        ld1             {v1.d}[1], [x10], x2
1979        uaddlp          v0.8h,   v0.16b
1980        uaddlp          v1.8h,   v1.16b
1981        add             v0.8h,   v0.8h,   v1.8h
1982        shl             v0.8h,   v0.8h,   #1
1983        subs            w8,  w8,  #2
1984        st1             {v0.8h}, [x0], #16
1985        add             v16.8h,  v16.8h,  v0.8h
1986        b.gt            1b
1987        trn2            v1.2d,   v0.2d,   v0.2d
1988        trn2            v0.2d,   v0.2d,   v0.2d
1989L(ipred_cfl_ac_420_w4_hpad):
1990        cbz             w4,  3f
19912:      // Vertical padding (h_pad > 0)
1992        subs            w4,  w4,  #4
1993        st1             {v0.8h, v1.8h}, [x0], #32
1994        add             v16.8h,  v16.8h,  v0.8h
1995        add             v17.8h,  v17.8h,  v1.8h
1996        b.gt            2b
19973:
1998        // Aggregate the sums
1999        add             v0.8h,   v16.8h,  v17.8h
2000        uaddlv          s0,  v0.8h                // sum
2001        sub             x0,  x0,  w6, uxtw #3
2002        urshl           v4.2s,   v0.2s,   v31.2s  // (sum + (1 << (log2sz - 1))) >>= log2sz
2003        dup             v4.8h,   v4.h[0]
20046:      // Subtract dc from ac
2005        ld1             {v0.8h, v1.8h}, [x0]
2006        subs            w6,  w6,  #4
2007        sub             v0.8h,   v0.8h,   v4.8h
2008        sub             v1.8h,   v1.8h,   v4.8h
2009        st1             {v0.8h, v1.8h}, [x0], #32
2010        b.gt            6b
2011        ret
2012
2013L(ipred_cfl_ac_420_w8):
2014        cbnz            w3,  L(ipred_cfl_ac_420_w8_wpad)
20151:      // Copy and subsample input, without padding
2016        ld1             {v0.16b}, [x1],  x2
2017        ld1             {v1.16b}, [x10], x2
2018        ld1             {v2.16b}, [x1],  x2
2019        uaddlp          v0.8h,   v0.16b
2020        ld1             {v3.16b}, [x10], x2
2021        uaddlp          v1.8h,   v1.16b
2022        uaddlp          v2.8h,   v2.16b
2023        uaddlp          v3.8h,   v3.16b
2024        add             v0.8h,   v0.8h,   v1.8h
2025        add             v2.8h,   v2.8h,   v3.8h
2026        shl             v0.8h,   v0.8h,   #1
2027        shl             v1.8h,   v2.8h,   #1
2028        subs            w8,  w8,  #2
2029        st1             {v0.8h, v1.8h}, [x0], #32
2030        add             v16.8h,  v16.8h,  v0.8h
2031        add             v17.8h,  v17.8h,  v1.8h
2032        b.gt            1b
2033        mov             v0.16b,  v1.16b
2034        b               L(ipred_cfl_ac_420_w8_hpad)
2035
2036L(ipred_cfl_ac_420_w8_wpad):
20371:      // Copy and subsample input, padding 4
2038        ld1             {v0.8b},   [x1],  x2
2039        ld1             {v1.8b},   [x10], x2
2040        ld1             {v0.d}[1], [x1],  x2
2041        ld1             {v1.d}[1], [x10], x2
2042        uaddlp          v0.8h,   v0.16b
2043        uaddlp          v1.8h,   v1.16b
2044        add             v0.8h,   v0.8h,   v1.8h
2045        shl             v0.8h,   v0.8h,   #1
2046        dup             v1.4h,   v0.h[3]
2047        dup             v3.4h,   v0.h[7]
2048        trn2            v2.2d,   v0.2d,   v0.2d
2049        subs            w8,  w8,  #2
2050        st1             {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32
2051        add             v16.4h,  v16.4h,  v0.4h
2052        add             v17.4h,  v17.4h,  v1.4h
2053        add             v18.4h,  v18.4h,  v2.4h
2054        add             v19.4h,  v19.4h,  v3.4h
2055        b.gt            1b
2056        trn1            v0.2d,   v2.2d,   v3.2d
2057        trn1            v1.2d,   v2.2d,   v3.2d
2058
2059L(ipred_cfl_ac_420_w8_hpad):
2060        cbz             w4,  3f
20612:      // Vertical padding (h_pad > 0)
2062        subs            w4,  w4,  #4
2063        st1             {v0.8h, v1.8h}, [x0], #32
2064        add             v16.8h,  v16.8h,  v0.8h
2065        add             v17.8h,  v17.8h,  v1.8h
2066        st1             {v0.8h, v1.8h}, [x0], #32
2067        add             v18.8h,  v18.8h,  v0.8h
2068        add             v19.8h,  v19.8h,  v1.8h
2069        b.gt            2b
20703:
2071
2072L(ipred_cfl_ac_420_w8_calc_subtract_dc):
2073        // Aggregate the sums
2074        add             v0.8h,   v16.8h,  v17.8h
2075        add             v2.8h,   v18.8h,  v19.8h
2076        uaddlp          v0.4s,   v0.8h
2077        uaddlp          v2.4s,   v2.8h
2078        add             v0.4s,   v0.4s,   v2.4s
2079        addv            s0,  v0.4s                // sum
2080        sub             x0,  x0,  w6, uxtw #4
2081        urshl           v4.2s,   v0.2s,   v31.2s  // (sum + (1 << (log2sz - 1))) >>= log2sz
2082        dup             v4.8h,   v4.h[0]
2083L(ipred_cfl_ac_420_w8_subtract_dc):
20846:      // Subtract dc from ac
2085        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
2086        subs            w6,  w6,  #4
2087        sub             v0.8h,   v0.8h,   v4.8h
2088        sub             v1.8h,   v1.8h,   v4.8h
2089        sub             v2.8h,   v2.8h,   v4.8h
2090        sub             v3.8h,   v3.8h,   v4.8h
2091        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2092        b.gt            6b
2093        ret
2094
2095L(ipred_cfl_ac_420_w16):
2096        adr             x7,  L(ipred_cfl_ac_420_w16_tbl)
2097        ldrh            w3,  [x7, w3, uxtw #1]
2098        sub             x7,  x7,  w3, uxtw
2099        br              x7
2100
2101L(ipred_cfl_ac_420_w16_wpad0):
21021:      // Copy and subsample input, without padding
2103        ld1             {v0.16b, v1.16b}, [x1],  x2
2104        ld1             {v2.16b, v3.16b}, [x10], x2
2105        uaddlp          v0.8h,   v0.16b
2106        ld1             {v4.16b, v5.16b}, [x1],  x2
2107        uaddlp          v1.8h,   v1.16b
2108        ld1             {v6.16b, v7.16b}, [x10], x2
2109        uaddlp          v2.8h,   v2.16b
2110        uaddlp          v3.8h,   v3.16b
2111        uaddlp          v4.8h,   v4.16b
2112        uaddlp          v5.8h,   v5.16b
2113        uaddlp          v6.8h,   v6.16b
2114        uaddlp          v7.8h,   v7.16b
2115        add             v0.8h,   v0.8h,   v2.8h
2116        add             v1.8h,   v1.8h,   v3.8h
2117        add             v4.8h,   v4.8h,   v6.8h
2118        add             v5.8h,   v5.8h,   v7.8h
2119        shl             v0.8h,   v0.8h,   #1
2120        shl             v1.8h,   v1.8h,   #1
2121        shl             v2.8h,   v4.8h,   #1
2122        shl             v3.8h,   v5.8h,   #1
2123        subs            w8,  w8,  #2
2124        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2125        add             v16.8h,  v16.8h,  v0.8h
2126        add             v17.8h,  v17.8h,  v1.8h
2127        add             v18.8h,  v18.8h,  v2.8h
2128        add             v19.8h,  v19.8h,  v3.8h
2129        b.gt            1b
2130        mov             v0.16b,  v2.16b
2131        mov             v1.16b,  v3.16b
2132        b               L(ipred_cfl_ac_420_w16_hpad)
2133
2134L(ipred_cfl_ac_420_w16_wpad1):
21351:      // Copy and subsample input, padding 4
2136        ldr             d1,  [x1,  #16]
2137        ld1             {v0.16b}, [x1],  x2
2138        ldr             d3,  [x10, #16]
2139        ld1             {v2.16b}, [x10], x2
2140        uaddlp          v1.4h,   v1.8b
2141        ldr             d5,  [x1,  #16]
2142        uaddlp          v0.8h,   v0.16b
2143        ld1             {v4.16b}, [x1],  x2
2144        uaddlp          v3.4h,   v3.8b
2145        ldr             d7,  [x10, #16]
2146        uaddlp          v2.8h,   v2.16b
2147        ld1             {v6.16b}, [x10], x2
2148        uaddlp          v5.4h,   v5.8b
2149        uaddlp          v4.8h,   v4.16b
2150        uaddlp          v7.4h,   v7.8b
2151        uaddlp          v6.8h,   v6.16b
2152        add             v1.4h,   v1.4h,   v3.4h
2153        add             v0.8h,   v0.8h,   v2.8h
2154        add             v5.4h,   v5.4h,   v7.4h
2155        add             v4.8h,   v4.8h,   v6.8h
2156        shl             v1.4h,   v1.4h,   #1
2157        shl             v0.8h,   v0.8h,   #1
2158        shl             v3.4h,   v5.4h,   #1
2159        shl             v2.8h,   v4.8h,   #1
2160        dup             v4.4h,   v1.h[3]
2161        dup             v5.4h,   v3.h[3]
2162        trn1            v1.2d,   v1.2d,   v4.2d
2163        trn1            v3.2d,   v3.2d,   v5.2d
2164        subs            w8,  w8,  #2
2165        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2166        add             v16.8h,  v16.8h,  v0.8h
2167        add             v17.8h,  v17.8h,  v1.8h
2168        add             v18.8h,  v18.8h,  v2.8h
2169        add             v19.8h,  v19.8h,  v3.8h
2170        b.gt            1b
2171        mov             v0.16b,  v2.16b
2172        mov             v1.16b,  v3.16b
2173        b               L(ipred_cfl_ac_420_w16_hpad)
2174
2175L(ipred_cfl_ac_420_w16_wpad2):
21761:      // Copy and subsample input, padding 8
2177        ld1             {v0.16b}, [x1],  x2
2178        ld1             {v2.16b}, [x10], x2
2179        ld1             {v4.16b}, [x1],  x2
2180        uaddlp          v0.8h,   v0.16b
2181        ld1             {v6.16b}, [x10], x2
2182        uaddlp          v2.8h,   v2.16b
2183        uaddlp          v4.8h,   v4.16b
2184        uaddlp          v6.8h,   v6.16b
2185        add             v0.8h,   v0.8h,   v2.8h
2186        add             v4.8h,   v4.8h,   v6.8h
2187        shl             v0.8h,   v0.8h,   #1
2188        shl             v2.8h,   v4.8h,   #1
2189        dup             v1.8h,   v0.h[7]
2190        dup             v3.8h,   v2.h[7]
2191        subs            w8,  w8,  #2
2192        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2193        add             v16.8h,  v16.8h,  v0.8h
2194        add             v17.8h,  v17.8h,  v1.8h
2195        add             v18.8h,  v18.8h,  v2.8h
2196        add             v19.8h,  v19.8h,  v3.8h
2197        b.gt            1b
2198        mov             v0.16b,  v2.16b
2199        mov             v1.16b,  v3.16b
2200        b               L(ipred_cfl_ac_420_w16_hpad)
2201
2202L(ipred_cfl_ac_420_w16_wpad3):
22031:      // Copy and subsample input, padding 12
2204        ld1             {v0.8b}, [x1],  x2
2205        ld1             {v2.8b}, [x10], x2
2206        ld1             {v4.8b}, [x1],  x2
2207        uaddlp          v0.4h,   v0.8b
2208        ld1             {v6.8b}, [x10], x2
2209        uaddlp          v2.4h,   v2.8b
2210        uaddlp          v4.4h,   v4.8b
2211        uaddlp          v6.4h,   v6.8b
2212        add             v0.4h,   v0.4h,   v2.4h
2213        add             v4.4h,   v4.4h,   v6.4h
2214        shl             v0.4h,   v0.4h,   #1
2215        shl             v2.4h,   v4.4h,   #1
2216        dup             v1.8h,   v0.h[3]
2217        dup             v3.8h,   v2.h[3]
2218        trn1            v0.2d,   v0.2d,   v1.2d
2219        trn1            v2.2d,   v2.2d,   v3.2d
2220        subs            w8,  w8,  #2
2221        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2222        add             v16.8h,  v16.8h,  v0.8h
2223        add             v17.8h,  v17.8h,  v1.8h
2224        add             v18.8h,  v18.8h,  v2.8h
2225        add             v19.8h,  v19.8h,  v3.8h
2226        b.gt            1b
2227        mov             v0.16b,  v2.16b
2228        mov             v1.16b,  v3.16b
2229
2230L(ipred_cfl_ac_420_w16_hpad):
2231        cbz             w4,  3f
22322:      // Vertical padding (h_pad > 0)
2233        subs            w4,  w4,  #4
2234        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2235        add             v16.8h,  v16.8h,  v0.8h
2236        add             v17.8h,  v17.8h,  v1.8h
2237        add             v18.8h,  v18.8h,  v2.8h
2238        add             v19.8h,  v19.8h,  v3.8h
2239        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2240        add             v16.8h,  v16.8h,  v0.8h
2241        add             v17.8h,  v17.8h,  v1.8h
2242        add             v18.8h,  v18.8h,  v2.8h
2243        add             v19.8h,  v19.8h,  v3.8h
2244        b.gt            2b
22453:
2246
2247        // Double the height and reuse the w8 summing/subtracting
2248        lsl             w6,  w6,  #1
2249        b               L(ipred_cfl_ac_420_w8_calc_subtract_dc)
2250
2251L(ipred_cfl_ac_420_tbl):
2252        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16)
2253        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8)
2254        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4)
2255        .hword 0
2256
2257L(ipred_cfl_ac_420_w16_tbl):
2258        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0)
2259        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1)
2260        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2)
2261        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3)
2262endfunc
2263
2264// void ipred_cfl_ac_422_8bpc_neon(int16_t *const ac, const pixel *const ypx,
2265//                                 const ptrdiff_t stride, const int w_pad,
2266//                                 const int h_pad, const int cw, const int ch);
//
// Fills the ac buffer for the 422 layout: every stored 16 bit sample is the
// sum of two horizontally adjacent input bytes, shifted left by 2. Columns
// covered by w_pad are filled by repeating the last computed sample of the
// row. Once all input rows are consumed, control moves to the *_hpad labels
// inside ipred_cfl_ac_420_8bpc_neon, which take over from there.
//
// Register use in this function:
//   x0       ac, post-incremented by every store
//   x1       ypx, walks rows 0, 2, 4, ... of the input
//   x10      ypx + stride, walks rows 1, 3, 5, ...
//   x2       stride, doubled on entry so that x1 and x10 each skip one row
//   w3       w_pad (zero test for cw == 8, table index for cw == 16)
//   w4       h_pad * 4
//   w5, w6   cw, ch
//   w8       first the table offset, then the number of input rows left to
//            read (ch - 4 * h_pad)
//   v16-v19  16 bit running sums of every sample stored so far
//   v31      -(ctz(cw) + ctz(ch)) in each 32 bit lane; not consumed in this
//            function, presumably used as shift amount by the shared tail
//            (not visible here, TODO confirm)
2267function ipred_cfl_ac_422_8bpc_neon, export=1
2268        clz             w8,  w5              // clz(cw) is 27, 28, 29 for cw = 16, 8, 4
2269        lsl             w4,  w4,  #2         // h_pad *= 4
2270        adr             x7,  L(ipred_cfl_ac_422_tbl)
2271        sub             w8,  w8,  #27        // table index: 0 = w16, 1 = w8, 2 = w4
2272        ldrh            w8,  [x7, w8, uxtw #1]  // 16 bit distance from the table back to the handler
2273        movi            v16.8h,  #0          // clear the four sum accumulators
2274        movi            v17.8h,  #0
2275        movi            v18.8h,  #0
2276        movi            v19.8h,  #0
2277        sub             x7,  x7,  w8, uxtw   // x7 = address of the width handler
2278        sub             w8,  w6,  w4         // height - h_pad (rows to read from the input)
2279        rbit            w9,  w5              // rbit(width)
2280        rbit            w10, w6              // rbit(height)
2281        clz             w9,  w9              // ctz(width)
2282        clz             w10, w10             // ctz(height)
2283        add             w9,  w9,  w10        // log2sz
2284        add             x10, x1,  x2         // x10 = second input row
2285        dup             v31.4s,  w9
2286        lsl             x2,  x2,  #1         // stride *= 2, x1 and x10 each step over one row
2287        neg             v31.4s,  v31.4s      // -log2sz
2288        br              x7
2289
2290L(ipred_cfl_ac_422_w4):
22911:      // Copy and subsample input
// Four input rows per iteration, 8 bytes from each; two rows share one vector.
2292        ld1             {v0.8b},   [x1],  x2   // row 0 -> low half of v0
2293        ld1             {v0.d}[1], [x10], x2   // row 1 -> high half of v0
2294        ld1             {v1.8b},   [x1],  x2   // row 2 -> low half of v1
2295        ld1             {v1.d}[1], [x10], x2   // row 3 -> high half of v1
2296        uaddlp          v0.8h,   v0.16b        // add adjacent byte pairs, widening to 16 bit
2297        uaddlp          v1.8h,   v1.16b
2298        shl             v0.8h,   v0.8h,   #2   // (a + b) << 2
2299        shl             v1.8h,   v1.8h,   #2
2300        subs            w8,  w8,  #4           // four rows consumed; flags are tested by b.gt below
2301        add             v16.8h,  v16.8h,  v0.8h
2302        add             v17.8h,  v17.8h,  v1.8h
2303        st1             {v0.8h, v1.8h}, [x0], #32  // 4 rows of 4 samples
2304        b.gt            1b
2305        trn2            v0.2d,   v1.2d,   v1.2d    // v0 = high half of v1 (the last row), twice
2306        trn2            v1.2d,   v1.2d,   v1.2d    // v1 = the same
2307        b               L(ipred_cfl_ac_420_w4_hpad)
2308
2309L(ipred_cfl_ac_422_w8):
2310        cbnz            w3,  L(ipred_cfl_ac_422_w8_wpad)  // any non-zero w_pad takes the padded loop
23111:      // Copy and subsample input, without padding
// Four input rows per iteration, 16 bytes from each: v0-v3 = rows 0-3.
2312        ld1             {v0.16b}, [x1],  x2
2313        ld1             {v1.16b}, [x10], x2
2314        ld1             {v2.16b}, [x1],  x2
2315        uaddlp          v0.8h,   v0.16b        // add adjacent byte pairs, widening to 16 bit
2316        ld1             {v3.16b}, [x10], x2
2317        uaddlp          v1.8h,   v1.16b
2318        uaddlp          v2.8h,   v2.16b
2319        uaddlp          v3.8h,   v3.16b
2320        shl             v0.8h,   v0.8h,   #2   // (a + b) << 2
2321        shl             v1.8h,   v1.8h,   #2
2322        shl             v2.8h,   v2.8h,   #2
2323        shl             v3.8h,   v3.8h,   #2
2324        subs            w8,  w8,  #4           // four rows consumed
2325        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64  // 4 rows of 8 samples
2326        add             v16.8h,  v16.8h,  v0.8h
2327        add             v17.8h,  v17.8h,  v1.8h
2328        add             v18.8h,  v18.8h,  v2.8h
2329        add             v19.8h,  v19.8h,  v3.8h
2330        b.gt            1b
2331        mov             v0.16b,  v3.16b        // v0 = v1 = last row
2332        mov             v1.16b,  v3.16b
2333        b               L(ipred_cfl_ac_420_w8_hpad)
2334
2335L(ipred_cfl_ac_422_w8_wpad):
23361:      // Copy and subsample input, padding 4
// Four input rows per iteration, only 8 bytes (4 output samples) from each.
// The right 4 samples of every output row repeat the last computed one.
2337        ld1             {v0.8b},   [x1],  x2   // row 0 -> low half of v0
2338        ld1             {v0.d}[1], [x10], x2   // row 1 -> high half of v0
2339        ld1             {v2.8b},   [x1],  x2   // row 2 -> low half of v2
2340        ld1             {v2.d}[1], [x10], x2   // row 3 -> high half of v2
2341        uaddlp          v0.8h,   v0.16b
2342        uaddlp          v2.8h,   v2.16b
2343        shl             v0.8h,   v0.8h,   #2
2344        shl             v2.8h,   v2.8h,   #2
2345        dup             v4.4h,   v0.h[3]       // last sample of row 0
2346        dup             v5.8h,   v0.h[7]       // last sample of row 1
2347        dup             v6.4h,   v2.h[3]       // last sample of row 2
2348        dup             v7.8h,   v2.h[7]       // last sample of row 3
2349        trn2            v1.2d,   v0.2d,   v5.2d  // row 1 + padding; reads v0 before the next line overwrites it
2350        trn1            v0.2d,   v0.2d,   v4.2d  // row 0 + padding
2351        trn2            v3.2d,   v2.2d,   v7.2d  // row 3 + padding; reads v2 before the next line overwrites it
2352        trn1            v2.2d,   v2.2d,   v6.2d  // row 2 + padding
2353        subs            w8,  w8,  #4
2354        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2355        add             v16.8h,  v16.8h,  v0.8h
2356        add             v17.8h,  v17.8h,  v1.8h
2357        add             v18.8h,  v18.8h,  v2.8h
2358        add             v19.8h,  v19.8h,  v3.8h
2359        b.gt            1b
2360        mov             v0.16b,  v3.16b        // v0 = v1 = last row
2361        mov             v1.16b,  v3.16b
2362        b               L(ipred_cfl_ac_420_w8_hpad)
2363
2364L(ipred_cfl_ac_422_w16):
// Second dispatch level: w_pad selects one of four loops.
2365        adr             x7,  L(ipred_cfl_ac_422_w16_tbl)
2366        ldrh            w3,  [x7, w3, uxtw #1]  // entry number w_pad
2367        sub             x7,  x7,  w3, uxtw
2368        br              x7
2369
2370L(ipred_cfl_ac_422_w16_wpad0):
23711:      // Copy and subsample input, without padding
// Two input rows per iteration, 32 bytes each: v0/v1 = row 0, v2/v3 = row 1.
2372        ld1             {v0.16b, v1.16b}, [x1],  x2
2373        ld1             {v2.16b, v3.16b}, [x10], x2
2374        uaddlp          v0.8h,   v0.16b
2375        uaddlp          v1.8h,   v1.16b
2376        uaddlp          v2.8h,   v2.16b
2377        uaddlp          v3.8h,   v3.16b
2378        shl             v0.8h,   v0.8h,   #2
2379        shl             v1.8h,   v1.8h,   #2
2380        shl             v2.8h,   v2.8h,   #2
2381        shl             v3.8h,   v3.8h,   #2
2382        subs            w8,  w8,  #2           // two rows consumed
2383        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64  // 2 rows of 16 samples
2384        add             v16.8h,  v16.8h,  v0.8h
2385        add             v17.8h,  v17.8h,  v1.8h
2386        add             v18.8h,  v18.8h,  v2.8h
2387        add             v19.8h,  v19.8h,  v3.8h
2388        b.gt            1b
2389        mov             v0.16b,  v2.16b        // v0-v3 = last row twice, the block that
2390        mov             v1.16b,  v3.16b        // ipred_cfl_ac_420_w16_hpad stores as padding
2391        b               L(ipred_cfl_ac_420_w16_hpad)
2392
2393L(ipred_cfl_ac_422_w16_wpad1):
23941:      // Copy and subsample input, padding 4
// Two input rows per iteration, 24 bytes (12 output samples) from each.
2395        ldr             d1,  [x1,  #16]        // bytes 16-23 of row 0, read before x1 is post-incremented
2396        ld1             {v0.16b}, [x1],  x2
2397        ldr             d3,  [x10, #16]        // bytes 16-23 of row 1
2398        ld1             {v2.16b}, [x10], x2
2399        uaddlp          v1.4h,   v1.8b
2400        uaddlp          v0.8h,   v0.16b
2401        uaddlp          v3.4h,   v3.8b
2402        uaddlp          v2.8h,   v2.16b
2403        shl             v1.4h,   v1.4h,   #2
2404        shl             v0.8h,   v0.8h,   #2
2405        shl             v3.4h,   v3.4h,   #2
2406        shl             v2.8h,   v2.8h,   #2
2407        dup             v4.4h,   v1.h[3]       // last computed sample of row 0
2408        dup             v5.4h,   v3.h[3]       // last computed sample of row 1
2409        trn1            v1.2d,   v1.2d,   v4.2d  // samples 8-11 followed by 4 copies of sample 11
2410        trn1            v3.2d,   v3.2d,   v5.2d
2411        subs            w8,  w8,  #2
2412        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2413        add             v16.8h,  v16.8h,  v0.8h
2414        add             v17.8h,  v17.8h,  v1.8h
2415        add             v18.8h,  v18.8h,  v2.8h
2416        add             v19.8h,  v19.8h,  v3.8h
2417        b.gt            1b
2418        mov             v0.16b,  v2.16b        // v0-v3 = last row twice
2419        mov             v1.16b,  v3.16b
2420        b               L(ipred_cfl_ac_420_w16_hpad)
2421
2422L(ipred_cfl_ac_422_w16_wpad2):
24231:      // Copy and subsample input, padding 8
// Two input rows per iteration, 16 bytes (8 output samples) from each.
2424        ld1             {v0.16b}, [x1],  x2
2425        ld1             {v2.16b}, [x10], x2
2426        uaddlp          v0.8h,   v0.16b
2427        uaddlp          v2.8h,   v2.16b
2428        shl             v0.8h,   v0.8h,   #2
2429        shl             v2.8h,   v2.8h,   #2
2430        dup             v1.8h,   v0.h[7]       // right half of row 0 = its last computed sample
2431        dup             v3.8h,   v2.h[7]       // right half of row 1 = its last computed sample
2432        subs            w8,  w8,  #2
2433        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2434        add             v16.8h,  v16.8h,  v0.8h
2435        add             v17.8h,  v17.8h,  v1.8h
2436        add             v18.8h,  v18.8h,  v2.8h
2437        add             v19.8h,  v19.8h,  v3.8h
2438        b.gt            1b
2439        mov             v0.16b,  v2.16b        // v0-v3 = last row twice
2440        mov             v1.16b,  v3.16b
2441        b               L(ipred_cfl_ac_420_w16_hpad)
2442
2443L(ipred_cfl_ac_422_w16_wpad3):
24441:      // Copy and subsample input, padding 12
// Two input rows per iteration, 8 bytes (4 output samples) from each.
2445        ld1             {v0.8b}, [x1],  x2
2446        ld1             {v2.8b}, [x10], x2
2447        uaddlp          v0.4h,   v0.8b
2448        uaddlp          v2.4h,   v2.8b
2449        shl             v0.4h,   v0.4h,   #2
2450        shl             v2.4h,   v2.4h,   #2
2451        dup             v1.8h,   v0.h[3]       // samples 8-15 of row 0 = its last computed sample
2452        dup             v3.8h,   v2.h[3]       // samples 8-15 of row 1 = its last computed sample
2453        trn1            v0.2d,   v0.2d,   v1.2d  // samples 0-3 followed by 4 copies of sample 3
2454        trn1            v2.2d,   v2.2d,   v3.2d
2455        subs            w8,  w8,  #2
2456        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2457        add             v16.8h,  v16.8h,  v0.8h
2458        add             v17.8h,  v17.8h,  v1.8h
2459        add             v18.8h,  v18.8h,  v2.8h
2460        add             v19.8h,  v19.8h,  v3.8h
2461        b.gt            1b
2462        mov             v0.16b,  v2.16b        // v0-v3 = last row twice
2463        mov             v1.16b,  v3.16b
2464        b               L(ipred_cfl_ac_420_w16_hpad)
2465
2466L(ipred_cfl_ac_422_tbl):
// Width dispatch table, indexed by clz(cw) - 27. Every entry is the distance
// from the table back to its handler, which the entry code subtracts from
// the table address.
2467        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16)
2468        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8)
2469        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4)
2470        .hword 0    // no handler behind this slot; indices 0-2 cover cw = 16, 8, 4
2471
2472L(ipred_cfl_ac_422_w16_tbl):
// Padding dispatch table for cw == 16, indexed by w_pad.
2473        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0)
2474        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1)
2475        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2)
2476        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3)
2477endfunc
2478
2479// void ipred_cfl_ac_444_8bpc_neon(int16_t *const ac, const pixel *const ypx,
2480//                                 const ptrdiff_t stride, const int w_pad,
2481//                                 const int h_pad, const int cw, const int ch);
//
// Fills the ac buffer for the 444 layout: every input byte is widened to
// 16 bit and shifted left by 3, without any subsampling. Columns covered by
// w_pad are filled by repeating the last computed sample of the row.
// Widths 4, 8 and 16 finish in the *_hpad labels of
// ipred_cfl_ac_420_8bpc_neon. Width 32 does its own vertical padding and
// DC computation below and only reuses
// ipred_cfl_ac_420_w8_subtract_dc for the final subtraction.
//
// Register use in this function:
//   x0       ac, post-incremented by every store
//   x1       ypx, walks rows 0, 2, 4, ... of the input
//   x10      ypx + stride, walks rows 1, 3, 5, ...
//   x2       stride, doubled on entry so that x1 and x10 each skip one row
//   w3       w_pad (zero test for cw == 16, table byte offset for cw == 32)
//   w4       h_pad * 4
//   w5, w6   cw, ch
//   w8       first the table offset, then the number of input rows left to
//            read (ch - 4 * h_pad)
//   v16-v19  16 bit running sums of every sample stored so far
//   v31      -(ctz(cw) + ctz(ch)) in each 32 bit lane, the shift amount
//            applied to the sum in the w32 tail
2482function ipred_cfl_ac_444_8bpc_neon, export=1
2483        clz             w8,  w5              // clz(cw) is 26, 27, 28, 29 for cw = 32, 16, 8, 4
2484        lsl             w4,  w4,  #2         // h_pad *= 4
2485        adr             x7,  L(ipred_cfl_ac_444_tbl)
2486        sub             w8,  w8,  #26        // table index: 0 = w32, 1 = w16, 2 = w8, 3 = w4
2487        ldrh            w8,  [x7, w8, uxtw #1]  // 16 bit distance from the table back to the handler
2488        movi            v16.8h,  #0          // clear the four sum accumulators
2489        movi            v17.8h,  #0
2490        movi            v18.8h,  #0
2491        movi            v19.8h,  #0
2492        sub             x7,  x7,  w8, uxtw   // x7 = address of the width handler
2493        sub             w8,  w6,  w4         // height - h_pad (rows to read from the input)
2494        rbit            w9,  w5              // rbit(width)
2495        rbit            w10, w6              // rbit(height)
2496        clz             w9,  w9              // ctz(width)
2497        clz             w10, w10             // ctz(height)
2498        add             w9,  w9,  w10        // log2sz
2499        add             x10, x1,  x2         // x10 = second input row
2500        dup             v31.4s,  w9
2501        lsl             x2,  x2,  #1         // stride *= 2, x1 and x10 each step over one row
2502        neg             v31.4s,  v31.4s      // -log2sz
2503        br              x7
2504
2505L(ipred_cfl_ac_444_w4):
25061:      // Copy and expand input
// Four input rows per iteration, 4 bytes from each; two rows share the low
// 8 bytes of one vector.
2507        ld1             {v0.s}[0], [x1],  x2   // row 0
2508        ld1             {v0.s}[1], [x10], x2   // row 1
2509        ld1             {v1.s}[0], [x1],  x2   // row 2
2510        ld1             {v1.s}[1], [x10], x2   // row 3
2511        ushll           v0.8h,   v0.8b,   #3   // widen the low 8 bytes to 16 bit, << 3
2512        ushll           v1.8h,   v1.8b,   #3
2513        subs            w8,  w8,  #4           // four rows consumed; flags are tested by b.gt below
2514        add             v16.8h,  v16.8h,  v0.8h
2515        add             v17.8h,  v17.8h,  v1.8h
2516        st1             {v0.8h, v1.8h}, [x0], #32  // 4 rows of 4 samples
2517        b.gt            1b
2518        trn2            v0.2d,   v1.2d,   v1.2d    // v0 = high half of v1 (the last row), twice
2519        trn2            v1.2d,   v1.2d,   v1.2d    // v1 = the same
2520        b               L(ipred_cfl_ac_420_w4_hpad)
2521
2522L(ipred_cfl_ac_444_w8):
25231:      // Copy and expand input
// Four input rows per iteration, 8 bytes from each: v0-v3 = rows 0-3.
2524        ld1             {v0.8b}, [x1],  x2
2525        ld1             {v1.8b}, [x10], x2
2526        ld1             {v2.8b}, [x1],  x2
2527        ushll           v0.8h,   v0.8b,   #3   // widen to 16 bit, << 3
2528        ld1             {v3.8b}, [x10], x2
2529        ushll           v1.8h,   v1.8b,   #3
2530        ushll           v2.8h,   v2.8b,   #3
2531        ushll           v3.8h,   v3.8b,   #3
2532        subs            w8,  w8,  #4           // four rows consumed
2533        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64  // 4 rows of 8 samples
2534        add             v16.8h,  v16.8h,  v0.8h
2535        add             v17.8h,  v17.8h,  v1.8h
2536        add             v18.8h,  v18.8h,  v2.8h
2537        add             v19.8h,  v19.8h,  v3.8h
2538        b.gt            1b
2539        mov             v0.16b,  v3.16b        // v0 = v1 = last row
2540        mov             v1.16b,  v3.16b
2541        b               L(ipred_cfl_ac_420_w8_hpad)
2542
2543L(ipred_cfl_ac_444_w16):
2544        cbnz            w3,  L(ipred_cfl_ac_444_w16_wpad)  // any non-zero w_pad takes the padded loop
25451:      // Copy and expand input, without padding
// Four input rows per iteration, 16 bytes from each. Row r ends up in the
// register pair v(2r), v(2r+1).
2546        ld1             {v0.16b}, [x1],  x2
2547        ld1             {v2.16b}, [x10], x2
2548        ld1             {v4.16b}, [x1],  x2
2549        ushll2          v1.8h,   v0.16b,  #3   // high 8 bytes first, the next line overwrites v0
2550        ushll           v0.8h,   v0.8b,   #3
2551        ld1             {v6.16b}, [x10], x2
2552        ushll2          v3.8h,   v2.16b,  #3
2553        ushll           v2.8h,   v2.8b,   #3
2554        ushll2          v5.8h,   v4.16b,  #3
2555        ushll           v4.8h,   v4.8b,   #3
2556        ushll2          v7.8h,   v6.16b,  #3
2557        ushll           v6.8h,   v6.8b,   #3
2558        subs            w8,  w8,  #4           // four rows consumed
2559        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64  // rows 0 and 1
2560        add             v16.8h,  v16.8h,  v0.8h
2561        add             v17.8h,  v17.8h,  v1.8h
2562        add             v18.8h,  v18.8h,  v2.8h
2563        add             v19.8h,  v19.8h,  v3.8h
2564        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64  // rows 2 and 3
2565        add             v16.8h,  v16.8h,  v4.8h
2566        add             v17.8h,  v17.8h,  v5.8h
2567        add             v18.8h,  v18.8h,  v6.8h
2568        add             v19.8h,  v19.8h,  v7.8h
2569        b.gt            1b
2570        mov             v0.16b,  v6.16b        // v0-v3 = last row (v6/v7) twice, the block that
2571        mov             v1.16b,  v7.16b        // ipred_cfl_ac_420_w16_hpad stores as padding
2572        mov             v2.16b,  v6.16b
2573        mov             v3.16b,  v7.16b
2574        b               L(ipred_cfl_ac_420_w16_hpad)
2575
2576L(ipred_cfl_ac_444_w16_wpad):
25771:      // Copy and expand input, padding 8
// Four input rows per iteration, only 8 bytes from each. The right 8 samples
// of every output row repeat the last computed one.
2578        ld1             {v0.8b}, [x1],  x2
2579        ld1             {v2.8b}, [x10], x2
2580        ld1             {v4.8b}, [x1],  x2
2581        ld1             {v6.8b}, [x10], x2
2582        ushll           v0.8h,   v0.8b,   #3
2583        ushll           v2.8h,   v2.8b,   #3
2584        ushll           v4.8h,   v4.8b,   #3
2585        ushll           v6.8h,   v6.8b,   #3
2586        dup             v1.8h,   v0.h[7]       // right half of row 0
2587        dup             v3.8h,   v2.h[7]       // right half of row 1
2588        dup             v5.8h,   v4.h[7]       // right half of row 2
2589        dup             v7.8h,   v6.h[7]       // right half of row 3
2590        subs            w8,  w8,  #4
2591        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2592        add             v16.8h,  v16.8h,  v0.8h
2593        add             v17.8h,  v17.8h,  v1.8h
2594        add             v18.8h,  v18.8h,  v2.8h
2595        add             v19.8h,  v19.8h,  v3.8h
2596        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
2597        add             v16.8h,  v16.8h,  v4.8h
2598        add             v17.8h,  v17.8h,  v5.8h
2599        add             v18.8h,  v18.8h,  v6.8h
2600        add             v19.8h,  v19.8h,  v7.8h
2601        b.gt            1b
2602        mov             v0.16b,  v6.16b        // v0-v3 = last row (v6/v7) twice
2603        mov             v1.16b,  v7.16b
2604        mov             v2.16b,  v6.16b
2605        mov             v3.16b,  v7.16b
2606        b               L(ipred_cfl_ac_420_w16_hpad)
2607
2608L(ipred_cfl_ac_444_w32):
// Second dispatch level. The index register is not scaled here, so w_pad is
// used directly as byte offset into the table of 16 bit entries:
// w_pad = 0, 2, 4, 6 selects entry 0, 1, 2, 3.
// NOTE(review): this relies on w_pad being even for cw == 32 - confirm
// against the callers.
2609        adr             x7,  L(ipred_cfl_ac_444_w32_tbl)
2610        ldrh            w3,  [x7, w3, uxtw] // (w3>>1) << 1
2611        sub             x7,  x7,  w3, uxtw
2612        br              x7
2613
2614L(ipred_cfl_ac_444_w32_wpad0):
26151:      // Copy and expand input, without padding
// Two input rows per iteration, 32 bytes each: v0-v3 = row 0, v4-v7 = row 1.
2616        ld1             {v2.16b, v3.16b}, [x1],  x2
2617        ld1             {v6.16b, v7.16b}, [x10], x2
2618        ushll           v0.8h,   v2.8b,   #3
2619        ushll2          v1.8h,   v2.16b,  #3
2620        ushll           v2.8h,   v3.8b,   #3   // v2 may be overwritten, both of its halves are expanded
2621        ushll2          v3.8h,   v3.16b,  #3
2622        ushll           v4.8h,   v6.8b,   #3
2623        ushll2          v5.8h,   v6.16b,  #3
2624        ushll           v6.8h,   v7.8b,   #3   // same for v6
2625        ushll2          v7.8h,   v7.16b,  #3
2626        subs            w8,  w8,  #2           // two rows consumed
2627        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64  // row 0
2628        add             v16.8h,  v16.8h,  v0.8h
2629        add             v17.8h,  v17.8h,  v1.8h
2630        add             v18.8h,  v18.8h,  v2.8h
2631        add             v19.8h,  v19.8h,  v3.8h
2632        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64  // row 1
2633        add             v16.8h,  v16.8h,  v4.8h
2634        add             v17.8h,  v17.8h,  v5.8h
2635        add             v18.8h,  v18.8h,  v6.8h
2636        add             v19.8h,  v19.8h,  v7.8h
2637        b.gt            1b
2638        b               L(ipred_cfl_ac_444_w32_hpad)  // v4-v7 still hold the last row
2639
2640L(ipred_cfl_ac_444_w32_wpad2):
26411:      // Copy and expand input, padding 8
// Two input rows per iteration, 24 bytes from each.
2642        ldr             d2,  [x1,  #16]        // bytes 16-23 of row 0, read before x1 is post-incremented
2643        ld1             {v1.16b}, [x1],  x2
2644        ldr             d6,  [x10, #16]        // bytes 16-23 of row 1
2645        ld1             {v5.16b}, [x10], x2
2646        ushll           v2.8h,   v2.8b,   #3
2647        ushll           v0.8h,   v1.8b,   #3
2648        ushll2          v1.8h,   v1.16b,  #3
2649        ushll           v6.8h,   v6.8b,   #3
2650        ushll           v4.8h,   v5.8b,   #3
2651        ushll2          v5.8h,   v5.16b,  #3
2652        dup             v3.8h,   v2.h[7]       // samples 24-31 of row 0 = sample 23
2653        dup             v7.8h,   v6.h[7]       // samples 24-31 of row 1 = sample 23
2654        subs            w8,  w8,  #2
2655        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2656        add             v16.8h,  v16.8h,  v0.8h
2657        add             v17.8h,  v17.8h,  v1.8h
2658        add             v18.8h,  v18.8h,  v2.8h
2659        add             v19.8h,  v19.8h,  v3.8h
2660        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
2661        add             v16.8h,  v16.8h,  v4.8h
2662        add             v17.8h,  v17.8h,  v5.8h
2663        add             v18.8h,  v18.8h,  v6.8h
2664        add             v19.8h,  v19.8h,  v7.8h
2665        b.gt            1b
2666        b               L(ipred_cfl_ac_444_w32_hpad)
2667
2668L(ipred_cfl_ac_444_w32_wpad4):
26691:      // Copy and expand input, padding 16
// Two input rows per iteration, 16 bytes from each.
2670        ld1             {v1.16b}, [x1],  x2
2671        ld1             {v5.16b}, [x10], x2
2672        ushll           v0.8h,   v1.8b,   #3
2673        ushll2          v1.8h,   v1.16b,  #3
2674        ushll           v4.8h,   v5.8b,   #3
2675        ushll2          v5.8h,   v5.16b,  #3
2676        dup             v2.8h,   v1.h[7]       // samples 16-31 of row 0 = sample 15
2677        dup             v3.8h,   v1.h[7]
2678        dup             v6.8h,   v5.h[7]       // samples 16-31 of row 1 = sample 15
2679        dup             v7.8h,   v5.h[7]
2680        subs            w8,  w8,  #2
2681        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2682        add             v16.8h,  v16.8h,  v0.8h
2683        add             v17.8h,  v17.8h,  v1.8h
2684        add             v18.8h,  v18.8h,  v2.8h
2685        add             v19.8h,  v19.8h,  v3.8h
2686        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
2687        add             v16.8h,  v16.8h,  v4.8h
2688        add             v17.8h,  v17.8h,  v5.8h
2689        add             v18.8h,  v18.8h,  v6.8h
2690        add             v19.8h,  v19.8h,  v7.8h
2691        b.gt            1b
2692        b               L(ipred_cfl_ac_444_w32_hpad)
2693
2694L(ipred_cfl_ac_444_w32_wpad6):
26951:      // Copy and expand input, padding 24
// Two input rows per iteration, 8 bytes from each.
2696        ld1             {v0.8b}, [x1],  x2
2697        ld1             {v4.8b}, [x10], x2
2698        ushll           v0.8h,   v0.8b,   #3
2699        ushll           v4.8h,   v4.8b,   #3
2700        dup             v1.8h,   v0.h[7]       // samples 8-31 of row 0 = sample 7
2701        dup             v2.8h,   v0.h[7]
2702        dup             v3.8h,   v0.h[7]
2703        dup             v5.8h,   v4.h[7]       // samples 8-31 of row 1 = sample 7
2704        dup             v6.8h,   v4.h[7]
2705        dup             v7.8h,   v4.h[7]
2706        subs            w8,  w8,  #2
2707        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2708        add             v16.8h,  v16.8h,  v0.8h
2709        add             v17.8h,  v17.8h,  v1.8h
2710        add             v18.8h,  v18.8h,  v2.8h
2711        add             v19.8h,  v19.8h,  v3.8h
2712        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
2713        add             v16.8h,  v16.8h,  v4.8h
2714        add             v17.8h,  v17.8h,  v5.8h
2715        add             v18.8h,  v18.8h,  v6.8h
2716        add             v19.8h,  v19.8h,  v7.8h
2717        b.gt            1b
// Falls through into the vertical padding below.
2718
2719L(ipred_cfl_ac_444_w32_hpad):
// Entered from all four w32 loops with the last output row in v4-v7 and
// w4 = 4 * h_pad.
2720        cbz             w4,  3f                // nothing to pad
27212:      // Vertical padding (h_pad > 0)
2722        subs            w4,  w4,  #2           // two padding rows per iteration
2723        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64  // repeat the last row
2724        add             v16.8h,  v16.8h,  v4.8h
2725        add             v17.8h,  v17.8h,  v5.8h
2726        add             v18.8h,  v18.8h,  v6.8h
2727        add             v19.8h,  v19.8h,  v7.8h
2728        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64  // and once more
2729        add             v16.8h,  v16.8h,  v4.8h
2730        add             v17.8h,  v17.8h,  v5.8h
2731        add             v18.8h,  v18.8h,  v6.8h
2732        add             v19.8h,  v19.8h,  v7.8h
2733        b.gt            2b
27343:
2735
2736        // Quadruple the height and reuse the w8 subtracting
        // (the subtract loop counts w6 down by 4 for every 64 bytes, which
        // is one 32 sample row here)
2737        lsl             w6,  w6,  #2
2738        // Aggregate the sums, with wider intermediates earlier than in
2739        // ipred_cfl_ac_420_w8_calc_subtract_dc.
2740        uaddlp          v0.4s,   v16.8h        // pairwise add the 16 bit sums into 32 bit lanes
2741        uaddlp          v1.4s,   v17.8h
2742        uaddlp          v2.4s,   v18.8h
2743        uaddlp          v3.4s,   v19.8h
2744        add             v0.4s,   v0.4s,   v1.4s
2745        add             v2.4s,   v2.4s,   v3.4s
2746        add             v0.4s,   v0.4s,   v2.4s
2747        addv            s0,  v0.4s                // sum
2748        sub             x0,  x0,  w6, uxtw #4     // rewind ac by (4 * ch) * 16 bytes = ch rows of 32 samples
2749        urshl           v4.2s,   v0.2s,   v31.2s  // (sum + (1 << (log2sz - 1))) >>= log2sz
2750        dup             v4.8h,   v4.h[0]          // broadcast the low 16 bits of the result for the subtract loop
2751        b               L(ipred_cfl_ac_420_w8_subtract_dc)
2752
2753L(ipred_cfl_ac_444_tbl):
// Width dispatch table, indexed by clz(cw) - 26. Every entry is the distance
// from the table back to its handler, which the entry code subtracts from
// the table address.
2754        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32)
2755        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16)
2756        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8)
2757        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4)
2758
2759L(ipred_cfl_ac_444_w32_tbl):
// Padding dispatch table for cw == 32, addressed with w_pad as byte offset.
2760        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0)
2761        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2)
2762        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4)
2763        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6)
2764endfunc
2765