1/*
2 * Copyright © 2018, VideoLAN and dav1d authors
3 * Copyright © 2019, Martin Storsjo
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright notice, this
10 *    list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright notice,
13 *    this list of conditions and the following disclaimer in the documentation
14 *    and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include "src/arm/asm.S"
29#include "util.S"
30
// void ipred_dc_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                              const pixel *const topleft,
//                              const int width, const int height, const int a,
//                              const int max_width, const int max_height,
//                              const int bitdepth_max);
//
// Fills the block at dst with the constant (bitdepth_max + 1) >> 1.
//
// In:   x0 = dst, x1 = stride (bytes), w3 = width, w4 = height,
//       [sp] = bitdepth_max (ninth argument, passed on the stack)
// Uses: x5 (dispatch target), x6 (odd-row pointer), w8, v0-v3, flags
//
// The store loop is picked from a table of 16-bit (table - target) offsets
// indexed by clz(width) - 25 (width 64 -> entry 0 ... width 4 -> entry 4).
// Every loop iteration writes four rows: x0 walks rows 0, 2, ... and x6 rows
// 1, 3, ..., both advancing by 2 * stride.
//
// Each dispatch target (40/80/160/320/640) carries the
// AARCH64_VALID_JUMP_TARGET marker needed by the indirect `br`; the loops
// branch back to a separate label placed after it so the marker is not
// executed again on every iteration.
function ipred_dc_128_16bpc_neon, export=1
        ldr             w8,  [sp]                // bitdepth_max
        clz             w3,  w3
        adr             x5,  L(ipred_dc_128_tbl)
        sub             w3,  w3,  #25            // table index
        ldrh            w3,  [x5, w3, uxtw #1]
        dup             v0.8h,   w8
        sub             x5,  x5,  w3, uxtw       // target = table - offset
        add             x6,  x0,  x1             // x6 -> row 1
        lsl             x1,  x1,  #1             // step two rows at a time
        urshr           v0.8h,   v0.8h,  #1      // (bitdepth_max + 1) >> 1
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
4:
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
8:
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        mov             v1.16b,  v0.16b
16:
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        b.gt            16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        mov             v1.16b,  v0.16b
        mov             v2.16b,  v0.16b
        mov             v3.16b,  v0.16b
32:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            32b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
        mov             v1.16b,  v0.16b
        mov             v2.16b,  v0.16b
        mov             v3.16b,  v0.16b
        // A 64 pixel row takes two 64 byte stores; the first post-increments
        // by 64, so the second only has to add the remainder of 2 * stride.
        sub             x1,  x1,  #64
64:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            64b
        ret

L(ipred_dc_128_tbl):
        .hword L(ipred_dc_128_tbl) - 640b
        .hword L(ipred_dc_128_tbl) - 320b
        .hword L(ipred_dc_128_tbl) - 160b
        .hword L(ipred_dc_128_tbl) -  80b
        .hword L(ipred_dc_128_tbl) -  40b
endfunc
116
// void ipred_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                         const pixel *const topleft,
//                         const int width, const int height, const int a,
//                         const int max_width, const int max_height);
//
// Copies the `width` pixels that start one pixel past topleft into every
// row of the block.
//
// In:   x0 = dst, x1 = stride (bytes), x2 = topleft, w3 = width, w4 = height
// Uses: x5 (dispatch target), x6 (odd-row pointer), v0-v7, flags
//
// Dispatch goes through a table of 16-bit (table - target) offsets indexed
// by clz(width) - 25 (width 64 -> entry 0 ... width 4 -> entry 4). Each loop
// iteration writes four rows, alternating between x0 (even rows) and x6
// (odd rows), both advancing by 2 * stride.
function ipred_v_16bpc_neon, export=1
        add             x2,  x2,  #2             // skip the topleft pixel itself
        adr             x5,  L(ipred_v_tbl)
        clz             w3,  w3
        add             x6,  x0,  x1             // x6 -> row 1
        sub             w3,  w3,  #25            // table index
        lsl             x1,  x1,  #1             // step two rows at a time
        ldrh            w3,  [x5, w3, uxtw #1]
        sub             x5,  x5,  w3, uxtw       // target = table - offset
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.4h},  [x2]
1:
        subs            w4,  w4,  #4             // four rows per iteration
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        b.gt            1b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h},  [x2]
1:
        subs            w4,  w4,  #4
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            1b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h}, [x2]
1:
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        b.gt            1b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
1:
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            1b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
        // The first half-row store post-increments by 64 bytes, so the second
        // one only adds what is left of 2 * stride.
        sub             x1,  x1,  #64
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
1:
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
        b.gt            1b
        ret

L(ipred_v_tbl):
        .hword L(ipred_v_tbl) - 640b
        .hword L(ipred_v_tbl) - 320b
        .hword L(ipred_v_tbl) - 160b
        .hword L(ipred_v_tbl) -  80b
        .hword L(ipred_v_tbl) -  40b
endfunc
200
// void ipred_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                         const pixel *const topleft,
//                         const int width, const int height, const int a,
//                         const int max_width, const int max_height);
//
// Fills row y of the block with the single pixel topleft[-(1 + y)].
//
// In:   x0 = dst, x1 = stride (bytes), x2 = topleft, w3 = width, w4 = height
// Uses: x5 (dispatch target), x6 (odd-row pointer), x7 (load step),
//       v0-v3, flags
//
// Dispatch goes through a table of 16-bit (table - target) offsets indexed
// by clz(width) - 25 (width 64 -> entry 0 ... width 4 -> entry 4).
//
// Each iteration loads four pixels with ld4r, which broadcasts them into
// v0-v3 (v3 = highest address, i.e. the pixel for the first of the four
// rows), then steps x2 back by 8 bytes. Rows alternate between x0 (even)
// and x6 (odd), both advancing by 2 * stride.
//
// Each dispatch target (40 ... 640) carries the AARCH64_VALID_JUMP_TARGET
// marker needed by the indirect `br`; the loops branch back to a separate
// label placed after it so the marker is not executed on every iteration.
function ipred_h_16bpc_neon, export=1
        clz             w3,  w3
        adr             x5,  L(ipred_h_tbl)
        sub             w3,  w3,  #25            // table index
        ldrh            w3,  [x5, w3, uxtw #1]
        sub             x2,  x2,  #8             // x2 -> the four pixels before topleft
        sub             x5,  x5,  w3, uxtw       // target = table - offset
        mov             x7,  #-8                 // walk backwards, four pixels per load
        add             x6,  x0,  x1             // x6 -> row 1
        lsl             x1,  x1,  #1             // step two rows at a time
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
4:
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
        st1             {v3.4h},  [x0], x1
        st1             {v2.4h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v1.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
8:
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
        st1             {v3.8h},  [x0], x1
        st1             {v2.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v1.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
16:
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
        // Upper part of each row is written with offset stores first; the
        // final st1 writes the first 16 bytes and advances the row pointer.
        str             q3,  [x0, #16]
        str             q2,  [x6, #16]
        st1             {v3.8h}, [x0], x1
        st1             {v2.8h}, [x6], x1
        subs            w4,  w4,  #4
        str             q1,  [x0, #16]
        str             q0,  [x6, #16]
        st1             {v1.8h}, [x0], x1
        st1             {v0.8h}, [x6], x1
        b.gt            16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
32:
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
        str             q3,  [x0, #16]
        str             q2,  [x6, #16]
        stp             q3,  q3,  [x0, #32]
        stp             q2,  q2,  [x6, #32]
        st1             {v3.8h}, [x0], x1
        st1             {v2.8h}, [x6], x1
        subs            w4,  w4,  #4
        str             q1,  [x0, #16]
        str             q0,  [x6, #16]
        stp             q1,  q1,  [x0, #32]
        stp             q0,  q0,  [x6, #32]
        st1             {v1.8h}, [x0], x1
        st1             {v0.8h}, [x6], x1
        b.gt            32b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
64:
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
        str             q3,  [x0, #16]
        str             q2,  [x6, #16]
        stp             q3,  q3,  [x0, #32]
        stp             q2,  q2,  [x6, #32]
        stp             q3,  q3,  [x0, #64]
        stp             q2,  q2,  [x6, #64]
        stp             q3,  q3,  [x0, #96]
        stp             q2,  q2,  [x6, #96]
        st1             {v3.8h}, [x0], x1
        st1             {v2.8h}, [x6], x1
        subs            w4,  w4,  #4
        str             q1,  [x0, #16]
        str             q0,  [x6, #16]
        stp             q1,  q1,  [x0, #32]
        stp             q0,  q0,  [x6, #32]
        stp             q1,  q1,  [x0, #64]
        stp             q0,  q0,  [x6, #64]
        stp             q1,  q1,  [x0, #96]
        stp             q0,  q0,  [x6, #96]
        st1             {v1.8h}, [x0], x1
        st1             {v0.8h}, [x6], x1
        b.gt            64b
        ret

L(ipred_h_tbl):
        .hword L(ipred_h_tbl) - 640b
        .hword L(ipred_h_tbl) - 320b
        .hword L(ipred_h_tbl) - 160b
        .hword L(ipred_h_tbl) -  80b
        .hword L(ipred_h_tbl) -  40b
endfunc
302
// void ipred_dc_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                              const pixel *const topleft,
//                              const int width, const int height, const int a,
//                              const int max_width, const int max_height);
//
// Fills the block with the rounded average of the `width` pixels that start
// one pixel past topleft.
//
// In:   x0 = dst, x1 = stride (bytes), x2 = topleft, w3 = width, w4 = height
// Uses: x5 (dispatch target), x6 (odd-row pointer), v0-v7, flags
//
// Dispatch goes through a table of 16-bit (table - target) offsets indexed
// by clz(width) - 25 (width 64 -> entry 0 ... width 4 -> entry 4). The sum
// is divided by width with a rounding right shift by log2(width). Each store
// loop iteration writes four rows, alternating between x0 (even rows) and
// x6 (odd rows), both advancing by 2 * stride.
function ipred_dc_top_16bpc_neon, export=1
        add             x2,  x2,  #2             // skip the topleft pixel itself
        adr             x5,  L(ipred_dc_top_tbl)
        clz             w3,  w3
        add             x6,  x0,  x1             // x6 -> row 1
        sub             w3,  w3,  #25            // table index
        lsl             x1,  x1,  #1             // step two rows at a time
        ldrh            w3,  [x5, w3, uxtw #1]
        sub             x5,  x5,  w3, uxtw       // target = table - offset
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.4h},  [x2]
        addv            h0,      v0.4h           // sum of 4 pixels
        urshr           v0.4h,   v0.4h,   #2     // rounded sum / 4
        dup             v0.4h,   v0.h[0]
1:
        subs            w4,  w4,  #4             // four rows per iteration
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        b.gt            1b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h},  [x2]
        addv            h0,      v0.8h           // sum of 8 pixels
        urshr           v0.4h,   v0.4h,   #3     // rounded sum / 8
        dup             v0.8h,   v0.h[0]
1:
        subs            w4,  w4,  #4
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            1b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h}, [x2]
        addp            v0.8h,   v0.8h,   v1.8h
        addv            h0,      v0.8h           // sum of 16 pixels
        urshr           v2.4h,   v0.4h,   #4     // rounded sum / 16
        dup             v1.8h,   v2.h[0]
        dup             v0.8h,   v2.h[0]
1:
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        b.gt            1b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
        addp            v0.8h,   v0.8h,   v1.8h
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v0.8h,   v0.8h,   v2.8h
        uaddlv          s0,      v0.8h           // final reduction widened to 32 bit
        rshrn           v4.4h,   v0.4s,   #5     // rounded sum / 32
        dup             v3.8h,   v4.h[0]
        dup             v2.8h,   v4.h[0]
        dup             v1.8h,   v4.h[0]
        dup             v0.8h,   v4.h[0]
1:
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            1b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
        // The first half-row store post-increments by 64 bytes, so the second
        // one only adds what is left of 2 * stride.
        sub             x1,  x1,  #64
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
        addp            v0.8h,   v0.8h,   v1.8h
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v6.8h,   v6.8h,   v7.8h
        addp            v0.8h,   v0.8h,   v2.8h
        addp            v4.8h,   v4.8h,   v6.8h
        addp            v0.8h,   v0.8h,   v4.8h
        uaddlv          s0,      v0.8h           // final reduction widened to 32 bit
        rshrn           v4.4h,   v0.4s,   #6     // rounded sum / 64
        dup             v3.8h,   v4.h[0]
        dup             v2.8h,   v4.h[0]
        dup             v1.8h,   v4.h[0]
        dup             v0.8h,   v4.h[0]
1:
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            1b
        ret

L(ipred_dc_top_tbl):
        .hword L(ipred_dc_top_tbl) - 640b
        .hword L(ipred_dc_top_tbl) - 320b
        .hword L(ipred_dc_top_tbl) - 160b
        .hword L(ipred_dc_top_tbl) -  80b
        .hword L(ipred_dc_top_tbl) -  40b
endfunc
419
// void ipred_dc_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const topleft,
//                               const int width, const int height, const int a,
//                               const int max_width, const int max_height);
//
// Fills the block with the rounded average of the `height` pixels located
// immediately before topleft.
//
// In:   x0 = dst, x1 = stride (bytes), x2 = topleft, w3 = width, w4 = height
// Uses: x3 (width handler), x5 (height handler), x6 (odd-row pointer), w7,
//       v0-v7, flags
//
// Two-stage dispatch through one table of 16-bit (table - target) offsets:
//   entries 0-4: h64 ... h4, indexed by clz(height) - 25; compute the
//                average and leave it broadcast in v0.8h
//   entries 5-9: w64 ... w4, indexed by clz(width) - 20; replicate v0 as
//                far as the row width requires and run the store loop
// Each height handler ends with `br x3` into the width handler. Every store
// loop iteration writes four rows, alternating between x0 (even rows) and
// x6 (odd rows), both advancing by 2 * stride.
//
// Height handlers only set up v0: every width handler that needs v1-v3
// copies them from v0 itself. The store loops branch back to a label placed
// after AARCH64_VALID_JUMP_TARGET so that marker is only executed on entry.
function ipred_dc_left_16bpc_neon, export=1
        sub             x2,  x2,  w4, uxtw #1    // x2 = topleft - height pixels
        clz             w3,  w3
        clz             w7,  w4
        adr             x5,  L(ipred_dc_left_tbl)
        sub             w3,  w3,  #20 // 25 leading bits, minus table offset 5
        sub             w7,  w7,  #25
        ldrh            w3,  [x5, w3, uxtw #1]
        ldrh            w7,  [x5, w7, uxtw #1]
        sub             x3,  x5,  w3, uxtw       // x3 = width handler
        sub             x5,  x5,  w7, uxtw       // x5 = height handler
        add             x6,  x0,  x1             // x6 -> row 1
        lsl             x1,  x1,  #1             // step two rows at a time
        br              x5

L(ipred_dc_left_h4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.4h},  [x2]
        addv            h0,      v0.4h
        urshr           v0.4h,   v0.4h,   #2     // rounded sum / 4
        dup             v0.8h,   v0.h[0]
        br              x3
L(ipred_dc_left_w4):
        AARCH64_VALID_JUMP_TARGET
1:
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        b.gt            1b
        ret

L(ipred_dc_left_h8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h},  [x2]
        addv            h0,      v0.8h
        urshr           v0.4h,   v0.4h,   #3     // rounded sum / 8
        dup             v0.8h,   v0.h[0]
        br              x3
L(ipred_dc_left_w8):
        AARCH64_VALID_JUMP_TARGET
1:
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            1b
        ret

L(ipred_dc_left_h16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h}, [x2]
        addp            v0.8h,   v0.8h,   v1.8h
        addv            h0,      v0.8h
        urshr           v2.4h,   v0.4h,   #4     // rounded sum / 16
        dup             v0.8h,   v2.h[0]
        br              x3
L(ipred_dc_left_w16):
        AARCH64_VALID_JUMP_TARGET
        mov             v1.16b,  v0.16b
1:
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        b.gt            1b
        ret

L(ipred_dc_left_h32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
        addp            v0.8h,   v0.8h,   v1.8h
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v0.8h,   v0.8h,   v2.8h
        uaddlp          v0.4s,   v0.8h           // widen before the final reduction
        addv            s0,      v0.4s
        rshrn           v4.4h,   v0.4s,   #5     // rounded sum / 32
        dup             v0.8h,   v4.h[0]
        br              x3
L(ipred_dc_left_w32):
        AARCH64_VALID_JUMP_TARGET
        mov             v1.16b,  v0.16b
        mov             v2.16b,  v0.16b
        mov             v3.16b,  v0.16b
1:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            1b
        ret

L(ipred_dc_left_h64):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
        addp            v0.8h,   v0.8h,   v1.8h
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v6.8h,   v6.8h,   v7.8h
        addp            v0.8h,   v0.8h,   v2.8h
        addp            v4.8h,   v4.8h,   v6.8h
        addp            v0.8h,   v0.8h,   v4.8h
        uaddlv          s0,      v0.8h           // final reduction widened to 32 bit
        rshrn           v4.4h,   v0.4s,   #6     // rounded sum / 64
        dup             v0.8h,   v4.h[0]
        br              x3
L(ipred_dc_left_w64):
        AARCH64_VALID_JUMP_TARGET
        mov             v1.16b,  v0.16b
        mov             v2.16b,  v0.16b
        mov             v3.16b,  v0.16b
        // The first half-row store post-increments by 64 bytes, so the second
        // one only adds what is left of 2 * stride.
        sub             x1,  x1,  #64
1:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            1b
        ret

L(ipred_dc_left_tbl):
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4)
endfunc
565
// void ipred_dc_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                          const pixel *const topleft,
//                          const int width, const int height, const int a,
//                          const int max_width, const int max_height);
//
// Fills the block with
//   (sum_before + sum_after + ((width + height) >> 1)) / (width + height)
// where sum_before covers the `height` pixels immediately before topleft and
// sum_after the `width` pixels starting one pixel past topleft.
//
// In:   x0 = dst, x1 = stride (bytes), x2 = topleft, w3 = width, w4 = height
// Uses: x3 (width handler), x5 (height handler), x6 (odd-row pointer), w7,
//       w16, w17, v0-v7, v16, v17, v20-v23, flags
//
// Two-stage dispatch through one table of 16-bit (table - target) offsets:
//   entries 0-4: h64 ... h4, indexed by clz(height) - 25; leave sum_before
//                in s0 and x2 pointing one pixel past topleft
//   entries 5-9: w64 ... w4, indexed by clz(width) - 20; add sum_after and
//                the rounding term, divide, broadcast and store
//
// The division is a right shift by ctz(width + height) (ushl with a negative
// count held in v17). When width != height a factor of 3 or 5 is left over;
// it is removed by multiplying with 0xAAAB (~ 2^17 / 3) or 0x6667
// (~ 2^17 / 5) and shifting right by 17.
//
// Every store loop iteration writes four rows, alternating between x0 (even
// rows) and x6 (odd rows), both advancing by 2 * stride.
function ipred_dc_16bpc_neon, export=1
        adr             x5,  L(ipred_dc_tbl)
        sub             x2,  x2,  w4, uxtw #1    // x2 = topleft - height pixels
        add             w7,  w3,  w4             // width + height
        clz             w6,  w4
        clz             w3,  w3
        sub             w6,  w6,  #25            // height entry index
        sub             w3,  w3,  #20            // width entry index (25 leading bits, minus table offset 5)
        ldrh            w6,  [x5, w6, uxtw #1]
        ldrh            w3,  [x5, w3, uxtw #1]
        dup             v16.4s, w7               // width + height
        rbit            w7,  w7                  // rbit(width + height)
        clz             w7,  w7                  // ctz(width + height)
        neg             w7,  w7                  // -ctz(width + height)
        ushr            v16.4s,  v16.4s,  #1     // (width + height) >> 1
        dup             v17.4s,  w7              // -ctz(width + height)
        sub             x3,  x5,  w3, uxtw       // x3 = width handler
        sub             x5,  x5,  w6, uxtw       // x5 = height handler
        add             x6,  x0,  x1             // x6 -> row 1
        lsl             x1,  x1,  #1             // step two rows at a time
        br              x5

L(ipred_dc_h4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.4h},  [x2], #8
        add             x2,  x2,  #2             // step over the topleft pixel
        uaddlv          s0,      v0.4h
        br              x3
L(ipred_dc_w4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.4h},  [x2]
        uaddlv          s1,      v1.4h
        add             v0.2s,   v0.2s,   v16.2s // + rounding term
        add             v0.2s,   v0.2s,   v1.2s
        ushl            v0.2s,   v0.2s,   v17.2s // >> ctz(width + height)
        cmp             w4,  #4
        b.eq            1f                       // square block: shift was enough
        // h = 8/16
        mov             w17, #0xAAAB
        mov             w16, #0x6667
        cmp             w4,  #16
        csel            w16, w16, w17, eq        // h == 16: /5, otherwise /3
        dup             v16.2s,  w16
        mul             v0.2s,   v0.2s,   v16.2s
        ushr            v0.2s,   v0.2s,   #17
1:
        dup             v0.4h,   v0.h[0]
2:
        subs            w4,  w4,  #4             // four rows per iteration
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h},  [x2], #16
        add             x2,  x2,  #2             // step over the topleft pixel
        uaddlv          s0,      v0.8h
        br              x3
L(ipred_dc_w8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.8h},  [x2]
        uaddlv          s1,      v1.8h
        add             v0.2s,   v0.2s,   v16.2s // + rounding term
        add             v0.2s,   v0.2s,   v1.2s
        ushl            v0.2s,   v0.2s,   v17.2s // >> ctz(width + height)
        cmp             w4,  #8
        b.eq            1f                       // square block: shift was enough
        // h = 4/16/32
        mov             w17, #0xAAAB
        mov             w16, #0x6667
        cmp             w4,  #32
        csel            w16, w16, w17, eq        // h == 32: /5, otherwise /3
        dup             v16.2s,  w16
        mul             v0.2s,   v0.2s,   v16.2s
        ushr            v0.2s,   v0.2s,   #17
1:
        dup             v0.8h,   v0.h[0]
2:
        subs            w4,  w4,  #4
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h}, [x2], #32
        add             x2,  x2,  #2             // step over the topleft pixel
        addp            v0.8h,   v0.8h,   v1.8h
        uaddlv          s0,      v0.8h
        br              x3
L(ipred_dc_w16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.8h, v2.8h}, [x2]
        addp            v1.8h,   v1.8h,   v2.8h
        uaddlv          s1,      v1.8h
        add             v0.2s,   v0.2s,   v16.2s // + rounding term
        add             v0.2s,   v0.2s,   v1.2s
        ushl            v4.2s,   v0.2s,   v17.2s // >> ctz(width + height)
        cmp             w4,  #16
        b.eq            1f                       // square block: shift was enough
        // h = 4/8/32/64
        mov             w17, #0xAAAB
        mov             w16, #0x6667
        tst             w4,  #(32+16+8) // 16 added to make a consecutive bitmask
        csel            w16, w16, w17, eq        // h == 4 or 64: /5, otherwise /3
        dup             v16.2s,  w16
        mul             v4.2s,   v4.2s,   v16.2s
        ushr            v4.2s,   v4.2s,   #17
1:
        dup             v1.8h,   v4.h[0]
        dup             v0.8h,   v4.h[0]
2:
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
        add             x2,  x2,  #2             // step over the topleft pixel
        addp            v0.8h,   v0.8h,   v1.8h
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v0.8h,   v0.8h,   v2.8h
        uaddlv          s0,      v0.8h
        br              x3
L(ipred_dc_w32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.8h, v2.8h, v3.8h, v4.8h}, [x2]
        addp            v1.8h,   v1.8h,   v2.8h
        addp            v3.8h,   v3.8h,   v4.8h
        addp            v1.8h,   v1.8h,   v3.8h
        uaddlv          s1,      v1.8h
        add             v0.2s,   v0.2s,   v16.2s // + rounding term
        add             v0.2s,   v0.2s,   v1.2s
        ushl            v4.2s,   v0.2s,   v17.2s // >> ctz(width + height)
        cmp             w4,  #32
        b.eq            1f                       // square block: shift was enough
        // h = 8/16/64
        mov             w17, #0xAAAB
        mov             w16, #0x6667
        cmp             w4,  #8
        csel            w16, w16, w17, eq        // h == 8: /5, otherwise /3
        dup             v16.2s,  w16
        mul             v4.2s,   v4.2s,   v16.2s
        ushr            v4.2s,   v4.2s,   #17
1:
        dup             v3.8h,   v4.h[0]
        dup             v2.8h,   v4.h[0]
        dup             v1.8h,   v4.h[0]
        dup             v0.8h,   v4.h[0]
2:
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h64):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
        addp            v0.8h,   v0.8h,   v1.8h
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
        add             x2,  x2,  #2             // step over the topleft pixel
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v6.8h,   v6.8h,   v7.8h
        addp            v0.8h,   v0.8h,   v2.8h
        addp            v4.8h,   v4.8h,   v6.8h
        addp            v0.8h,   v0.8h,   v4.8h
        uaddlv          s0,      v0.8h
        br              x3
L(ipred_dc_w64):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.8h, v2.8h, v3.8h, v4.8h}, [x2], #64
        addp            v1.8h,   v1.8h,   v2.8h
        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x2]
        addp            v3.8h,   v3.8h,   v4.8h
        addp            v20.8h,  v20.8h,  v21.8h
        addp            v22.8h,  v22.8h,  v23.8h
        addp            v1.8h,   v1.8h,   v3.8h
        addp            v20.8h,  v20.8h,  v22.8h
        addp            v1.8h,   v1.8h,   v20.8h
        uaddlv          s1,      v1.8h
        add             v0.2s,   v0.2s,   v16.2s // + rounding term
        add             v0.2s,   v0.2s,   v1.2s
        ushl            v4.2s,   v0.2s,   v17.2s // >> ctz(width + height)
        cmp             w4,  #64
        b.eq            1f                       // square block: shift was enough
        // h = 16/32
        mov             w17, #0xAAAB
        mov             w16, #0x6667
        cmp             w4,  #16
        csel            w16, w16, w17, eq        // h == 16: /5, otherwise /3
        dup             v16.2s,  w16
        mul             v4.2s,   v4.2s,   v16.2s
        ushr            v4.2s,   v4.2s,   #17
1:
        // The first half-row store post-increments by 64 bytes, so the second
        // one only adds what is left of 2 * stride.
        sub             x1,  x1,  #64
        dup             v3.8h,   v4.h[0]
        dup             v2.8h,   v4.h[0]
        dup             v1.8h,   v4.h[0]
        dup             v0.8h,   v4.h[0]
2:
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            2b
        ret

L(ipred_dc_tbl):
        .hword L(ipred_dc_tbl) - L(ipred_dc_h64)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h32)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h16)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h8)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h4)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w64)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w32)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w16)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w8)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w4)
endfunc
810
// void ipred_paeth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                             const pixel *const topleft,
//                             const int width, const int height, const int a,
//                             const int max_width, const int max_height);
//
// Paeth prediction for 16 bit pixels. For every output pixel, with
//   base   = left + top - topleft
//   ldiff  = |left    - base|
//   tdiff  = |top     - base|
//   tldiff = |topleft - base|
// the result is left if ldiff <= min(tdiff, tldiff), otherwise top if
// tdiff <= tldiff, otherwise topleft.
//
// Arguments used (AAPCS64): x0 = dst, x1 = stride (bytes), x2 = topleft,
// w3 = width, w4 = height. The remaining arguments are not read.
// The top row is read forwards starting one pixel after topleft; the left
// edge is read backwards starting one pixel before topleft, four pixels
// (i.e. four output rows) per load.
function ipred_paeth_16bpc_neon, export=1
        clz             w9,  w3
        adr             x5,  L(ipred_paeth_tbl)
        sub             w9,  w9,  #25             // table index: 0 for width 64 ... 4 for width 4
        ldrh            w9,  [x5, w9, uxtw #1]
        ld1r            {v4.8h},  [x2]            // topleft in all lanes
        add             x8,  x2,  #2              // x8 = top row
        sub             x2,  x2,  #8              // x2 = the four left pixels preceding topleft
        sub             x5,  x5,  w9, uxtw        // table entries are (table - target)
        mov             x7,  #-8                  // left pointer step: four pixels backwards
        add             x6,  x0,  x1              // x6 = second row
        lsl             x1,  x1,  #1              // stride *= 2
        br              x5
40:
        // width == 4: two rows per vector, four rows per iteration
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v5.2d},  [x8]            // top[0-3] in both halves
        sub             v6.8h,   v5.8h,   v4.8h   // top - topleft
4:
        // v0-v3 = topleft[-4], [-3], [-2], [-1], each replicated to all lanes
        ld4r            {v0.4h, v1.4h, v2.4h, v3.4h},  [x2], x7
        zip1            v0.2d,   v0.2d,   v1.2d   // left for rows 3 (low half), 2 (high half)
        zip1            v2.2d,   v2.2d,   v3.2d   // left for rows 1 (low half), 0 (high half)
        add             v16.8h,  v6.8h,   v0.8h   // base
        add             v17.8h,  v6.8h,   v2.8h
        sabd            v20.8h,  v5.8h,   v16.8h  // tdiff
        sabd            v21.8h,  v5.8h,   v17.8h
        sabd            v22.8h,  v4.8h,   v16.8h  // tldiff
        sabd            v23.8h,  v4.8h,   v17.8h
        sabd            v16.8h,  v0.8h,   v16.8h  // ldiff
        sabd            v17.8h,  v2.8h,   v17.8h
        umin            v18.8h,  v20.8h,  v22.8h  // min(tdiff, tldiff)
        umin            v19.8h,  v21.8h,  v23.8h
        cmge            v20.8h,  v22.8h,  v20.8h  // tldiff >= tdiff
        cmge            v21.8h,  v23.8h,  v21.8h
        cmge            v16.8h,  v18.8h,  v16.8h  // min(tdiff, tldiff) >= ldiff
        cmge            v17.8h,  v19.8h,  v17.8h
        bsl             v21.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
        bsl             v20.16b, v5.16b,  v4.16b
        bit             v21.16b, v2.16b,  v17.16b // ldiff <= min ? left : ...
        bit             v20.16b, v0.16b,  v16.16b
        st1             {v21.d}[1], [x0], x1      // row 0
        st1             {v21.d}[0], [x6], x1      // row 1
        subs            w4,  w4,  #4
        st1             {v20.d}[1], [x0], x1      // row 2
        st1             {v20.d}[0], [x6], x1      // row 3
        b.gt            4b
        ret
80:
160:
320:
640:
        // width >= 8: four rows in parallel, eight pixels per inner iteration
        AARCH64_VALID_JUMP_TARGET
        ld1             {v5.8h},  [x8], #16       // top[0-7]
        mov             w9,  w3                   // keep the width for restarting each row group
        // Set up pointers for four rows in parallel; x0, x6, x5, x10
        add             x5,  x0,  x1
        add             x10, x6,  x1
        lsl             x1,  x1,  #1              // stride *= 4 (already doubled above)
        sub             x1,  x1,  w3, uxtw #1     // minus the bytes written per row
1:
        // v0-v3 = topleft[-4], [-3], [-2], [-1]: left for rows 3, 2, 1, 0
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
2:
        sub             v6.8h,   v5.8h,   v4.8h   // top - topleft
        add             v16.8h,  v6.8h,   v0.8h   // base
        add             v17.8h,  v6.8h,   v1.8h
        add             v18.8h,  v6.8h,   v2.8h
        add             v19.8h,  v6.8h,   v3.8h
        sabd            v20.8h,  v5.8h,   v16.8h  // tdiff
        sabd            v21.8h,  v5.8h,   v17.8h
        sabd            v22.8h,  v5.8h,   v18.8h
        sabd            v23.8h,  v5.8h,   v19.8h
        sabd            v24.8h,  v4.8h,   v16.8h  // tldiff
        sabd            v25.8h,  v4.8h,   v17.8h
        sabd            v26.8h,  v4.8h,   v18.8h
        sabd            v27.8h,  v4.8h,   v19.8h
        sabd            v16.8h,  v0.8h,   v16.8h  // ldiff
        sabd            v17.8h,  v1.8h,   v17.8h
        sabd            v18.8h,  v2.8h,   v18.8h
        sabd            v19.8h,  v3.8h,   v19.8h
        umin            v28.8h,  v20.8h,  v24.8h  // min(tdiff, tldiff)
        umin            v29.8h,  v21.8h,  v25.8h
        umin            v30.8h,  v22.8h,  v26.8h
        umin            v31.8h,  v23.8h,  v27.8h
        cmge            v20.8h,  v24.8h,  v20.8h  // tldiff >= tdiff
        cmge            v21.8h,  v25.8h,  v21.8h
        cmge            v22.8h,  v26.8h,  v22.8h
        cmge            v23.8h,  v27.8h,  v23.8h
        cmge            v16.8h,  v28.8h,  v16.8h  // min(tdiff, tldiff) >= ldiff
        cmge            v17.8h,  v29.8h,  v17.8h
        cmge            v18.8h,  v30.8h,  v18.8h
        cmge            v19.8h,  v31.8h,  v19.8h
        bsl             v23.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
        bsl             v22.16b, v5.16b,  v4.16b
        bsl             v21.16b, v5.16b,  v4.16b
        bsl             v20.16b, v5.16b,  v4.16b
        bit             v23.16b, v3.16b,  v19.16b // ldiff <= min ? left : ...
        bit             v22.16b, v2.16b,  v18.16b
        bit             v21.16b, v1.16b,  v17.16b
        bit             v20.16b, v0.16b,  v16.16b
        st1             {v23.8h}, [x0], #16       // row 0
        st1             {v22.8h}, [x6], #16       // row 1
        subs            w3,  w3,  #8
        st1             {v21.8h}, [x5], #16       // row 2
        st1             {v20.8h}, [x10], #16      // row 3
        b.le            8f
        ld1             {v5.8h},  [x8], #16       // next eight top pixels
        b               2b
8:
        subs            w4,  w4,  #4
        b.le            9f
        // End of horizontal loop, move pointers to next four rows
        sub             x8,  x8,  w9, uxtw #1     // rewind top to the start of the row
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        // Load the top row as early as possible
        ld1             {v5.8h},  [x8], #16
        add             x5,  x5,  x1
        add             x10, x10, x1
        mov             w3,  w9                   // restore the width counter
        b               1b
9:
        ret

// Offsets from the table back to each entry point, indexed by clz(width) - 25
L(ipred_paeth_tbl):
        .hword L(ipred_paeth_tbl) - 640b
        .hword L(ipred_paeth_tbl) - 320b
        .hword L(ipred_paeth_tbl) - 160b
        .hword L(ipred_paeth_tbl) -  80b
        .hword L(ipred_paeth_tbl) -  40b
endfunc
944
// void ipred_smooth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                              const pixel *const topleft,
//                              const int width, const int height, const int a,
//                              const int max_width, const int max_height);
//
// Two-dimensional smooth prediction for 16 bit pixels. Each output pixel is
//   ((bottom+right)*256 + (left-right)*weights_hor[x]
//                       + (top-bottom)*weights_ver[y] + 256) >> 9
// where bottom is the pixel at topleft[-height], right is the pixel at
// topleft[width], weights_hor is read from sm_weights + width and
// weights_ver from sm_weights + height (one byte per weight).
//
// Arguments used (AAPCS64): x0 = dst, x1 = stride (bytes), x2 = topleft,
// w3 = width, w4 = height. The remaining arguments are not read.
// The left edge is read backwards starting one pixel before topleft, so
// the registers it is loaded into are consumed in reverse ("flipped").
function ipred_smooth_16bpc_neon, export=1
        movrel          x10, X(sm_weights)
        add             x11, x10, w4, uxtw        // x11 = weights_ver
        add             x10, x10, w3, uxtw        // x10 = weights_hor
        clz             w9,  w3
        adr             x5,  L(ipred_smooth_tbl)
        sub             x12, x2,  w4, uxtw #1     // x12 = &topleft[-height]
        sub             w9,  w9,  #25             // table index: 0 for width 64 ... 4 for width 4
        ldrh            w9,  [x5, w9, uxtw #1]
        ld1r            {v4.8h},  [x12] // bottom
        add             x8,  x2,  #2              // x8 = top row
        sub             x5,  x5,  w9, uxtw        // table entries are (table - target)
        add             x6,  x0,  x1              // x6 = second row
        lsl             x1,  x1,  #1              // stride *= 2
        br              x5
40:
        // width == 4: four rows per iteration, one 4 pixel row per accumulator
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v6.2d}, [x8]             // top
        ld1r            {v7.2s}, [x10]            // weights_hor
        sub             x2,  x2,  #8              // x2 = the four left pixels preceding topleft
        mov             x7,  #-8                  // left pointer step: four pixels backwards
        dup             v5.8h,   v6.h[3]          // right
        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
        uxtl            v7.8h,   v7.8b            // weights_hor
        add             v31.4h,  v4.4h,   v5.4h   // bottom+right
4:
        ld4r            {v0.4h, v1.4h, v2.4h, v3.4h},  [x2], x7 // left
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x11], #4 // weights_ver
        ushll           v20.4s,  v31.4h,  #8      // (bottom+right)*256
        ushll           v21.4s,  v31.4h,  #8
        ushll           v22.4s,  v31.4h,  #8
        ushll           v23.4s,  v31.4h,  #8
        zip1            v1.2d,   v1.2d,   v0.2d   // left, flipped
        zip1            v0.2d,   v3.2d,   v2.2d
        zip1            v16.2s,  v16.2s,  v17.2s  // weights_ver
        zip1            v18.2s,  v18.2s,  v19.2s
        sub             v0.8h,   v0.8h,   v5.8h   // left-right
        sub             v1.8h,   v1.8h,   v5.8h
        uxtl            v16.8h,  v16.8b           // weights_ver
        uxtl            v18.8h,  v18.8b
        smlal           v20.4s,  v0.4h,   v7.4h   // += (left-right)*weights_hor
        smlal2          v21.4s,  v0.8h,   v7.8h
        smlal           v22.4s,  v1.4h,   v7.4h
        smlal2          v23.4s,  v1.8h,   v7.8h
        smlal           v20.4s,  v6.4h,   v16.4h  // += (top-bottom)*weights_ver
        smlal2          v21.4s,  v6.8h,   v16.8h
        smlal           v22.4s,  v6.4h,   v18.4h
        smlal2          v23.4s,  v6.8h,   v18.8h
        rshrn           v20.4h,  v20.4s,  #9      // round and narrow: (sum + 256) >> 9
        rshrn           v21.4h,  v21.4s,  #9
        rshrn           v22.4h,  v22.4s,  #9
        rshrn           v23.4h,  v23.4s,  #9
        st1             {v20.4h}, [x0], x1        // row 0
        st1             {v21.4h}, [x6], x1        // row 1
        subs            w4,  w4,  #4
        st1             {v22.4h}, [x0], x1        // row 2
        st1             {v23.4h}, [x6], x1        // row 3
        b.gt            4b
        ret
80:
        // width == 8: four rows per iteration, two accumulators per row
        AARCH64_VALID_JUMP_TARGET
        ld1             {v6.8h}, [x8]             // top
        ld1             {v7.8b}, [x10]            // weights_hor
        sub             x2,  x2,  #8              // x2 = the four left pixels preceding topleft
        mov             x7,  #-8                  // left pointer step: four pixels backwards
        dup             v5.8h,   v6.h[7]          // right
        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
        uxtl            v7.8h,   v7.8b            // weights_hor
        add             v31.4h,  v4.4h,   v5.4h   // bottom+right
8:
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7 // left
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x11], #4 // weights_ver
        ushll           v20.4s,  v31.4h,  #8      // (bottom+right)*256
        ushll           v21.4s,  v31.4h,  #8
        ushll           v22.4s,  v31.4h,  #8
        ushll           v23.4s,  v31.4h,  #8
        ushll           v24.4s,  v31.4h,  #8
        ushll           v25.4s,  v31.4h,  #8
        ushll           v26.4s,  v31.4h,  #8
        ushll           v27.4s,  v31.4h,  #8
        sub             v0.8h,   v0.8h,   v5.8h   // left-right
        sub             v1.8h,   v1.8h,   v5.8h
        sub             v2.8h,   v2.8h,   v5.8h
        sub             v3.8h,   v3.8h,   v5.8h
        uxtl            v16.8h,  v16.8b           // weights_ver
        uxtl            v17.8h,  v17.8b
        uxtl            v18.8h,  v18.8b
        uxtl            v19.8h,  v19.8b
        smlal           v20.4s,  v3.4h,   v7.4h   // += (left-right)*weights_hor
        smlal2          v21.4s,  v3.8h,   v7.8h   // (left flipped)
        smlal           v22.4s,  v2.4h,   v7.4h
        smlal2          v23.4s,  v2.8h,   v7.8h
        smlal           v24.4s,  v1.4h,   v7.4h
        smlal2          v25.4s,  v1.8h,   v7.8h
        smlal           v26.4s,  v0.4h,   v7.4h
        smlal2          v27.4s,  v0.8h,   v7.8h
        smlal           v20.4s,  v6.4h,   v16.4h  // += (top-bottom)*weights_ver
        smlal2          v21.4s,  v6.8h,   v16.8h
        smlal           v22.4s,  v6.4h,   v17.4h
        smlal2          v23.4s,  v6.8h,   v17.8h
        smlal           v24.4s,  v6.4h,   v18.4h
        smlal2          v25.4s,  v6.8h,   v18.8h
        smlal           v26.4s,  v6.4h,   v19.4h
        smlal2          v27.4s,  v6.8h,   v19.8h
        rshrn           v20.4h,  v20.4s,  #9      // round and narrow: (sum + 256) >> 9
        rshrn2          v20.8h,  v21.4s,  #9
        rshrn           v21.4h,  v22.4s,  #9
        rshrn2          v21.8h,  v23.4s,  #9
        rshrn           v22.4h,  v24.4s,  #9
        rshrn2          v22.8h,  v25.4s,  #9
        rshrn           v23.4h,  v26.4s,  #9
        rshrn2          v23.8h,  v27.4s,  #9
        st1             {v20.8h}, [x0], x1        // row 0
        st1             {v21.8h}, [x6], x1        // row 1
        subs            w4,  w4,  #4
        st1             {v22.8h}, [x0], x1        // row 2
        st1             {v23.8h}, [x6], x1        // row 3
        b.gt            8b
        ret
160:
320:
640:
        // width >= 16: two rows in parallel, 16 pixels per inner iteration
        AARCH64_VALID_JUMP_TARGET
        add             x12, x2,  w3, uxtw #1     // x12 = &topleft[width]
        sub             x1,  x1,  w3, uxtw #1     // 2*stride minus the bytes written per row
        ld1r            {v5.8h}, [x12]            // right
        sub             x2,  x2,  #4              // x2 = the two left pixels preceding topleft
        mov             x7,  #-4                  // left pointer step: two pixels backwards
        mov             w9,  w3                   // keep the width for restarting each row pair
        add             v31.4h,  v4.4h,   v5.4h   // bottom+right

1:
        ld2r            {v0.8h, v1.8h},   [x2],  x7 // left
        ld2r            {v16.8b, v17.8b}, [x11], #2 // weights_ver
        sub             v0.8h,   v0.8h,   v5.8h   // left-right
        sub             v1.8h,   v1.8h,   v5.8h
        uxtl            v16.8h,  v16.8b           // weights_ver
        uxtl            v17.8h,  v17.8b
2:
        ld1             {v7.16b}, [x10],  #16     // weights_hor
        ld1             {v2.8h, v3.8h}, [x8], #32 // top
        ushll           v20.4s,  v31.4h,  #8      // (bottom+right)*256
        ushll           v21.4s,  v31.4h,  #8
        ushll           v22.4s,  v31.4h,  #8
        ushll           v23.4s,  v31.4h,  #8
        ushll           v24.4s,  v31.4h,  #8
        ushll           v25.4s,  v31.4h,  #8
        ushll           v26.4s,  v31.4h,  #8
        ushll           v27.4s,  v31.4h,  #8
        uxtl            v6.8h,   v7.8b            // weights_hor
        uxtl2           v7.8h,   v7.16b
        sub             v2.8h,   v2.8h,   v4.8h   // top-bottom
        sub             v3.8h,   v3.8h,   v4.8h
        smlal           v20.4s,  v1.4h,   v6.4h   // += (left-right)*weights_hor
        smlal2          v21.4s,  v1.8h,   v6.8h   // (left flipped)
        smlal           v22.4s,  v1.4h,   v7.4h
        smlal2          v23.4s,  v1.8h,   v7.8h
        smlal           v24.4s,  v0.4h,   v6.4h
        smlal2          v25.4s,  v0.8h,   v6.8h
        smlal           v26.4s,  v0.4h,   v7.4h
        smlal2          v27.4s,  v0.8h,   v7.8h
        smlal           v20.4s,  v2.4h,   v16.4h  // += (top-bottom)*weights_ver
        smlal2          v21.4s,  v2.8h,   v16.8h
        smlal           v22.4s,  v3.4h,   v16.4h
        smlal2          v23.4s,  v3.8h,   v16.8h
        smlal           v24.4s,  v2.4h,   v17.4h
        smlal2          v25.4s,  v2.8h,   v17.8h
        smlal           v26.4s,  v3.4h,   v17.4h
        smlal2          v27.4s,  v3.8h,   v17.8h
        rshrn           v20.4h,  v20.4s,  #9      // round and narrow: (sum + 256) >> 9
        rshrn2          v20.8h,  v21.4s,  #9
        rshrn           v21.4h,  v22.4s,  #9
        rshrn2          v21.8h,  v23.4s,  #9
        rshrn           v22.4h,  v24.4s,  #9
        rshrn2          v22.8h,  v25.4s,  #9
        rshrn           v23.4h,  v26.4s,  #9
        rshrn2          v23.8h,  v27.4s,  #9
        subs            w3,  w3,  #16
        st1             {v20.8h, v21.8h}, [x0], #32 // first row of the pair
        st1             {v22.8h, v23.8h}, [x6], #32 // second row of the pair
        b.gt            2b
        subs            w4,  w4,  #2
        b.le            9f
        sub             x8,  x8,  w9, uxtw #1     // rewind top to the start of the row
        sub             x10, x10, w9, uxtw        // rewind weights_hor (one byte per weight)
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        mov             w3,  w9                   // restore the width counter
        b               1b
9:
        ret

// Offsets from the table back to each entry point, indexed by clz(width) - 25
L(ipred_smooth_tbl):
        .hword L(ipred_smooth_tbl) - 640b
        .hword L(ipred_smooth_tbl) - 320b
        .hword L(ipred_smooth_tbl) - 160b
        .hword L(ipred_smooth_tbl) -  80b
        .hword L(ipred_smooth_tbl) -  40b
endfunc
1148
// void ipred_smooth_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                                const pixel *const topleft,
//                                const int width, const int height, const int a,
//                                const int max_width, const int max_height);
//
// Vertical smooth prediction for 16 bit pixels. Each output pixel is
//   bottom + (((top[x]-bottom)*weights_ver[y] + 128) >> 8)
// where bottom is the pixel at topleft[-height] and weights_ver is read
// from sm_weights + height (one byte per weight). The rounded multiply is
// done with sqrdmulh on weights pre-shifted left by 7:
//   (2*d*(w << 7) + 0x8000) >> 16  ==  (d*w + 128) >> 8
//
// Arguments used (AAPCS64): x0 = dst, x1 = stride (bytes), x2 = topleft,
// w3 = width, w4 = height. The remaining arguments are not read.
function ipred_smooth_v_16bpc_neon, export=1
        movrel          x7,  X(sm_weights)
        add             x7,  x7,  w4, uxtw        // x7 = weights_ver
        clz             w9,  w3
        adr             x5,  L(ipred_smooth_v_tbl)
        sub             x8,  x2,  w4, uxtw #1     // x8 = &topleft[-height]
        sub             w9,  w9,  #25             // table index: 0 for width 64 ... 4 for width 4
        ldrh            w9,  [x5, w9, uxtw #1]
        ld1r            {v4.8h},  [x8] // bottom
        add             x2,  x2,  #2              // x2 = top row
        sub             x5,  x5,  w9, uxtw        // table entries are (table - target)
        add             x6,  x0,  x1              // x6 = second row
        lsl             x1,  x1,  #1              // stride *= 2
        br              x5
40:
        // width == 4: two rows per vector, four rows per iteration
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v6.2d}, [x2]             // top
        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
4:
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
        zip1            v16.2s,  v16.2s,  v17.2s  // weights_ver
        zip1            v18.2s,  v18.2s,  v19.2s
        ushll           v16.8h,  v16.8b,  #7      // weights_ver << 7
        ushll           v18.8h,  v18.8b,  #7
        sqrdmulh        v20.8h,  v6.8h,   v16.8h  // ((top-bottom)*weights_ver + 128) >> 8
        sqrdmulh        v21.8h,  v6.8h,   v18.8h
        add             v20.8h,  v20.8h,  v4.8h   // += bottom
        add             v21.8h,  v21.8h,  v4.8h
        st1             {v20.d}[0], [x0], x1      // row 0
        st1             {v20.d}[1], [x6], x1      // row 1
        subs            w4,  w4,  #4
        st1             {v21.d}[0], [x0], x1      // row 2
        st1             {v21.d}[1], [x6], x1      // row 3
        b.gt            4b
        ret
80:
        // width == 8: one row per vector, four rows per iteration
        AARCH64_VALID_JUMP_TARGET
        ld1             {v6.8h}, [x2]             // top
        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
8:
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
        ushll           v16.8h,  v16.8b,  #7      // weights_ver << 7
        ushll           v17.8h,  v17.8b,  #7
        ushll           v18.8h,  v18.8b,  #7
        ushll           v19.8h,  v19.8b,  #7
        sqrdmulh        v20.8h,  v6.8h,   v16.8h  // ((top-bottom)*weights_ver + 128) >> 8
        sqrdmulh        v21.8h,  v6.8h,   v17.8h
        sqrdmulh        v22.8h,  v6.8h,   v18.8h
        sqrdmulh        v23.8h,  v6.8h,   v19.8h
        add             v20.8h,  v20.8h,  v4.8h   // += bottom
        add             v21.8h,  v21.8h,  v4.8h
        add             v22.8h,  v22.8h,  v4.8h
        add             v23.8h,  v23.8h,  v4.8h
        st1             {v20.8h}, [x0], x1        // row 0
        st1             {v21.8h}, [x6], x1        // row 1
        subs            w4,  w4,  #4
        st1             {v22.8h}, [x0], x1        // row 2
        st1             {v23.8h}, [x6], x1        // row 3
        b.gt            8b
        ret
160:
320:
640:
        // width >= 16: four rows in parallel, 16 pixels per inner iteration
        AARCH64_VALID_JUMP_TARGET
        // Set up pointers for four rows in parallel; x0, x6, x5, x8
        // (x8 is free again; bottom has already been loaded into v4)
        add             x5,  x0,  x1
        add             x8,  x6,  x1
        lsl             x1,  x1,  #1              // stride *= 4 (already doubled above)
        sub             x1,  x1,  w3, uxtw #1     // minus the bytes written per row
        mov             w9,  w3                   // keep the width for restarting each row group

1:
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
        ushll           v16.8h,  v16.8b,  #7      // weights_ver << 7
        ushll           v17.8h,  v17.8b,  #7
        ushll           v18.8h,  v18.8b,  #7
        ushll           v19.8h,  v19.8b,  #7
2:
        ld1             {v2.8h, v3.8h}, [x2], #32 // top
        sub             v2.8h,   v2.8h,   v4.8h   // top-bottom
        sub             v3.8h,   v3.8h,   v4.8h
        sqrdmulh        v20.8h,  v2.8h,   v16.8h  // ((top-bottom)*weights_ver + 128) >> 8
        sqrdmulh        v21.8h,  v3.8h,   v16.8h
        sqrdmulh        v22.8h,  v2.8h,   v17.8h
        sqrdmulh        v23.8h,  v3.8h,   v17.8h
        sqrdmulh        v24.8h,  v2.8h,   v18.8h
        sqrdmulh        v25.8h,  v3.8h,   v18.8h
        sqrdmulh        v26.8h,  v2.8h,   v19.8h
        sqrdmulh        v27.8h,  v3.8h,   v19.8h
        add             v20.8h,  v20.8h,  v4.8h   // += bottom
        add             v21.8h,  v21.8h,  v4.8h
        add             v22.8h,  v22.8h,  v4.8h
        add             v23.8h,  v23.8h,  v4.8h
        add             v24.8h,  v24.8h,  v4.8h
        add             v25.8h,  v25.8h,  v4.8h
        add             v26.8h,  v26.8h,  v4.8h
        add             v27.8h,  v27.8h,  v4.8h
        subs            w3,  w3,  #16
        st1             {v20.8h, v21.8h}, [x0], #32 // row 0
        st1             {v22.8h, v23.8h}, [x6], #32 // row 1
        st1             {v24.8h, v25.8h}, [x5], #32 // row 2
        st1             {v26.8h, v27.8h}, [x8], #32 // row 3
        b.gt            2b
        subs            w4,  w4,  #4
        b.le            9f
        sub             x2,  x2,  w9, uxtw #1     // rewind top to the start of the row
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        add             x5,  x5,  x1
        add             x8,  x8,  x1
        mov             w3,  w9                   // restore the width counter
        b               1b
9:
        ret

// Offsets from the table back to each entry point, indexed by clz(width) - 25
L(ipred_smooth_v_tbl):
        .hword L(ipred_smooth_v_tbl) - 640b
        .hword L(ipred_smooth_v_tbl) - 320b
        .hword L(ipred_smooth_v_tbl) - 160b
        .hword L(ipred_smooth_v_tbl) -  80b
        .hword L(ipred_smooth_v_tbl) -  40b
endfunc
1275
// void ipred_smooth_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                                const pixel *const topleft,
//                                const int width, const int height, const int a,
//                                const int max_width, const int max_height);
//
// Horizontal smooth prediction for 16 bit pixels. Each output pixel is
//   right + (((left[y]-right)*weights_hor[x] + 128) >> 8)
// where right is the pixel at topleft[width] and weights_hor is read from
// sm_weights + width (one byte per weight). The rounded multiply is done
// with sqrdmulh on weights pre-shifted left by 7:
//   (2*d*(w << 7) + 0x8000) >> 16  ==  (d*w + 128) >> 8
//
// Arguments used (AAPCS64): x0 = dst, x1 = stride (bytes), x2 = topleft,
// w3 = width, w4 = height. The remaining arguments are not read.
// The left edge is read backwards starting one pixel before topleft, four
// pixels (i.e. four output rows) per load, so the loaded registers are
// consumed in reverse ("flipped").
function ipred_smooth_h_16bpc_neon, export=1
        movrel          x8,  X(sm_weights)
        add             x8,  x8,  w3, uxtw        // x8 = weights_hor
        clz             w9,  w3
        adr             x5,  L(ipred_smooth_h_tbl)
        add             x12, x2,  w3, uxtw #1     // x12 = &topleft[width]
        sub             w9,  w9,  #25             // table index: 0 for width 64 ... 4 for width 4
        ldrh            w9,  [x5, w9, uxtw #1]
        ld1r            {v5.8h},  [x12] // right
        sub             x5,  x5,  w9, uxtw        // table entries are (table - target)
        add             x6,  x0,  x1              // x6 = second row
        lsl             x1,  x1,  #1              // stride *= 2
        br              x5
40:
        // width == 4: two rows per vector, four rows per iteration
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v7.2s}, [x8]             // weights_hor
        sub             x2,  x2,  #8              // x2 = the four left pixels preceding topleft
        mov             x7,  #-8                  // left pointer step: four pixels backwards
        ushll           v7.8h,   v7.8b,   #7      // weights_hor << 7
4:
        ld4r            {v0.4h, v1.4h, v2.4h, v3.4h},  [x2], x7 // left
        zip1            v1.2d,   v1.2d,   v0.2d   // left, flipped
        zip1            v0.2d,   v3.2d,   v2.2d
        sub             v0.8h,   v0.8h,   v5.8h   // left-right
        sub             v1.8h,   v1.8h,   v5.8h
        sqrdmulh        v20.8h,  v0.8h,   v7.8h   // ((left-right)*weights_hor + 128) >> 8
        sqrdmulh        v21.8h,  v1.8h,   v7.8h
        add             v20.8h,  v20.8h,  v5.8h   // += right
        add             v21.8h,  v21.8h,  v5.8h
        st1             {v20.d}[0], [x0], x1      // row 0
        st1             {v20.d}[1], [x6], x1      // row 1
        subs            w4,  w4,  #4
        st1             {v21.d}[0], [x0], x1      // row 2
        st1             {v21.d}[1], [x6], x1      // row 3
        b.gt            4b
        ret
80:
        // width == 8: one row per vector, four rows per iteration
        AARCH64_VALID_JUMP_TARGET
        ld1             {v7.8b}, [x8]             // weights_hor
        sub             x2,  x2,  #8              // x2 = the four left pixels preceding topleft
        mov             x7,  #-8                  // left pointer step: four pixels backwards
        ushll           v7.8h,   v7.8b,   #7      // weights_hor << 7
8:
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7 // left
        sub             v3.8h,   v3.8h,   v5.8h   // left-right
        sub             v2.8h,   v2.8h,   v5.8h
        sub             v1.8h,   v1.8h,   v5.8h
        sub             v0.8h,   v0.8h,   v5.8h
        sqrdmulh        v20.8h,  v3.8h,   v7.8h   // ((left-right)*weights_hor + 128) >> 8
        sqrdmulh        v21.8h,  v2.8h,   v7.8h   // (left flipped)
        sqrdmulh        v22.8h,  v1.8h,   v7.8h
        sqrdmulh        v23.8h,  v0.8h,   v7.8h
        add             v20.8h,  v20.8h,  v5.8h   // += right
        add             v21.8h,  v21.8h,  v5.8h
        add             v22.8h,  v22.8h,  v5.8h
        add             v23.8h,  v23.8h,  v5.8h
        st1             {v20.8h}, [x0], x1        // row 0
        st1             {v21.8h}, [x6], x1        // row 1
        subs            w4,  w4,  #4
        st1             {v22.8h}, [x0], x1        // row 2
        st1             {v23.8h}, [x6], x1        // row 3
        b.gt            8b
        ret
160:
320:
640:
        // width >= 16: four rows in parallel, 16 pixels per inner iteration
        AARCH64_VALID_JUMP_TARGET
        sub             x2,  x2,  #8              // x2 = the four left pixels preceding topleft
        mov             x7,  #-8                  // left pointer step: four pixels backwards
        // Set up pointers for four rows in parallel; x0, x6, x5, x10
        add             x5,  x0,  x1
        add             x10, x6,  x1
        lsl             x1,  x1,  #1              // stride *= 4 (already doubled above)
        sub             x1,  x1,  w3, uxtw #1     // minus the bytes written per row
        mov             w9,  w3                   // keep the width for restarting each row group

1:
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},   [x2],  x7 // left
        sub             v0.8h,   v0.8h,   v5.8h   // left-right
        sub             v1.8h,   v1.8h,   v5.8h
        sub             v2.8h,   v2.8h,   v5.8h
        sub             v3.8h,   v3.8h,   v5.8h
2:
        ld1             {v7.16b}, [x8],   #16     // weights_hor
        ushll           v6.8h,   v7.8b,   #7      // weights_hor << 7
        ushll2          v7.8h,   v7.16b,  #7
        sqrdmulh        v20.8h,  v3.8h,   v6.8h   // ((left-right)*weights_hor + 128) >> 8
        sqrdmulh        v21.8h,  v3.8h,   v7.8h   // (left flipped)
        sqrdmulh        v22.8h,  v2.8h,   v6.8h
        sqrdmulh        v23.8h,  v2.8h,   v7.8h
        sqrdmulh        v24.8h,  v1.8h,   v6.8h
        sqrdmulh        v25.8h,  v1.8h,   v7.8h
        sqrdmulh        v26.8h,  v0.8h,   v6.8h
        sqrdmulh        v27.8h,  v0.8h,   v7.8h
        add             v20.8h,  v20.8h,  v5.8h   // += right
        add             v21.8h,  v21.8h,  v5.8h
        add             v22.8h,  v22.8h,  v5.8h
        add             v23.8h,  v23.8h,  v5.8h
        add             v24.8h,  v24.8h,  v5.8h
        add             v25.8h,  v25.8h,  v5.8h
        add             v26.8h,  v26.8h,  v5.8h
        add             v27.8h,  v27.8h,  v5.8h
        subs            w3,  w3,  #16
        st1             {v20.8h, v21.8h}, [x0],  #32 // row 0
        st1             {v22.8h, v23.8h}, [x6],  #32 // row 1
        st1             {v24.8h, v25.8h}, [x5],  #32 // row 2
        st1             {v26.8h, v27.8h}, [x10], #32 // row 3
        b.gt            2b
        subs            w4,  w4,  #4
        b.le            9f
        sub             x8,  x8,  w9, uxtw        // rewind weights_hor (one byte per weight)
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        add             x5,  x5,  x1
        add             x10, x10, x1
        mov             w3,  w9                   // restore the width counter
        b               1b
9:
        ret

// Offsets from the table back to each entry point, indexed by clz(width) - 25
L(ipred_smooth_h_tbl):
        .hword L(ipred_smooth_h_tbl) - 640b
        .hword L(ipred_smooth_h_tbl) - 320b
        .hword L(ipred_smooth_h_tbl) - 160b
        .hword L(ipred_smooth_h_tbl) -  80b
        .hword L(ipred_smooth_h_tbl) -  40b
endfunc
1407
1408// void ipred_filter_16bpc_neon(pixel *dst, const ptrdiff_t stride,
1409//                              const pixel *const topleft,
1410//                              const int width, const int height, const int filt_idx,
1411//                              const int max_width, const int max_height,
1412//                              const int bitdepth_max);
// filter_fn bpc: emits the non-exported function ipred_filter_<bpc>bpc_neon,
// the filter intra predictor body for one bit depth class.
//
// Registers on entry (argument order per the prototype comment above, with
// w8 loaded by the ipred_filter_16bpc_neon dispatcher before branching here):
//   x0 = dst, x1 = stride in bytes, x2 = topleft, w3 = width, w4 = height,
//   w5 = filt_idx, w8 = bitdepth_max
//
// Constant vectors after the prologue:
//   v16-v22 = filter(0)..filter(6), 8 signed taps each, widened to 16 bit
//   v31     = bitdepth_max in every lane (upper clamp)
//   v30     = 0 (lower clamp, 10 bpc variant only)
//
// Every 8 lane result is one 4x2 output block: lanes 0-3 go to the first row
// (x0) and lanes 4-7 to the second row (x6). Each loop iteration produces two
// rows, and the second row is reused as the top edge of the next iteration.
//
// bpc == 10: products are accumulated in 16 bit lanes (mul/mla), rounded with
//            srshr and clamped with smax/smin.
// otherwise: products are accumulated in 32 bit lanes (smull/smlal);
//            sqrshrun rounds, narrows and saturates negative sums to 0, so
//            only the upper clamp (smin) is applied afterwards.
// NOTE(review): presumably the 16 bit accumulation is only safe for sample
// values up to 10 bits - confirm against the tap magnitudes.
1413.macro filter_fn bpc
1414function ipred_filter_\bpc\()bpc_neon
        // Keep the low 9 bits of filt_idx and scale by 64: x6 then points at
        // the tap set for this filter inside filter_intra_taps.
1415        and             w5,  w5,  #511
1416        movrel          x6,  X(filter_intra_taps)
1417        lsl             w5,  w5,  #6
1418        add             x6,  x6,  w5, uxtw
        // Load 7 x 8 signed byte taps (32 + 24 bytes) into v16-v22.
1419        ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
        // clz(width) - 26 is the jump table index: 0 for width 32 up to 3 for
        // width 4. The table stores (table address - target) as halfwords.
1420        clz             w9,  w3
1421        adr             x5,  L(ipred_filter\bpc\()_tbl)
1422        ld1             {v20.8b, v21.8b, v22.8b}, [x6]
1423        sub             w9,  w9,  #26
1424        ldrh            w9,  [x5, w9, uxtw #1]
        // Sign extend the taps to 16 bit (interleaved with address setup).
1425        sxtl            v16.8h,  v16.8b
1426        sxtl            v17.8h,  v17.8b
1427        sub             x5,  x5,  w9, uxtw
1428        sxtl            v18.8h,  v18.8b
1429        sxtl            v19.8h,  v19.8b
        // x6 = second output row, x1 = 2 * stride (two rows per iteration).
1430        add             x6,  x0,  x1
1431        lsl             x1,  x1,  #1
1432        sxtl            v20.8h,  v20.8b
1433        sxtl            v21.8h,  v21.8b
1434        sxtl            v22.8h,  v22.8b
1435        dup             v31.8h,  w8
1436.if \bpc == 10
1437        movi            v30.8h,  #0
1438.endif
1439        br              x5
        // ---- width 4: one 4x2 block per iteration ----
144040:
1441        AARCH64_VALID_JUMP_TARGET
1442        ldur            d0,  [x2, #2]             // top (0-3)
        // x2 now points two pixels below topleft; each ld1 below reads
        // {left[1], left[0], topleft, (unused)} and steps x2 back by two
        // pixels (x7 = -4 bytes) for the next pair of rows.
1443        sub             x2,  x2,  #4
1444        mov             x7,  #-4
14454:
1446        ld1             {v1.4h}, [x2], x7         // left (0-1) + topleft (2)
1447.if \bpc == 10
1448        mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
1449        mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
1450        mla             v2.8h,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
1451        mla             v2.8h,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
1452        mla             v2.8h,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
1453        mla             v2.8h,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
1454        mla             v2.8h,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
1455        srshr           v2.8h,   v2.8h,   #4
1456        smax            v2.8h,   v2.8h,   v30.8h
1457.else
        // Low four lanes (first row) in v2, high four lanes (second row) in v3.
1458        smull           v2.4s,   v17.4h,  v0.h[0] // p1(top[0]) * filter(1)
1459        smlal           v2.4s,   v18.4h,  v0.h[1] // p2(top[1]) * filter(2)
1460        smlal           v2.4s,   v19.4h,  v0.h[2] // p3(top[2]) * filter(3)
1461        smlal           v2.4s,   v20.4h,  v0.h[3] // p4(top[3]) * filter(4)
1462        smlal           v2.4s,   v16.4h,  v1.h[2] // p0(topleft) * filter(0)
1463        smlal           v2.4s,   v21.4h,  v1.h[1] // p5(left[0]) * filter(5)
1464        smlal           v2.4s,   v22.4h,  v1.h[0] // p6(left[1]) * filter(6)
1465        smull2          v3.4s,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
1466        smlal2          v3.4s,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
1467        smlal2          v3.4s,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
1468        smlal2          v3.4s,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
1469        smlal2          v3.4s,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
1470        smlal2          v3.4s,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
1471        smlal2          v3.4s,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
1472        sqrshrun        v2.4h,   v2.4s,   #4
1473        sqrshrun2       v2.8h,   v3.4s,   #4
1474.endif
1475        smin            v2.8h,   v2.8h,   v31.8h
1476        subs            w4,  w4,  #2
1477        st1             {v2.d}[0], [x0], x1
1478        ext             v0.16b,  v2.16b,  v2.16b, #8 // move top from [4-7] to [0-3]
1479        st1             {v2.d}[1], [x6], x1
1480        b.gt            4b
1481        ret
        // ---- width 8: two 4x2 blocks per iteration ----
        // The right block (v3) takes its topleft from top[3] and its left
        // pixels from lanes 3 and 7 of the clamped left block (v2), so the
        // start of the v3 sum is interleaved with finishing v2.
148280:
1483        AARCH64_VALID_JUMP_TARGET
1484        ldur            q0,  [x2, #2]             // top (0-7)
1485        sub             x2,  x2,  #4
1486        mov             x7,  #-4
14878:
1488        ld1             {v1.4h}, [x2], x7         // left (0-1) + topleft (2)
1489.if \bpc == 10
1490        mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
1491        mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
1492        mla             v2.8h,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
1493        mla             v2.8h,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
1494        mla             v2.8h,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
1495        mla             v2.8h,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
1496        mla             v2.8h,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
1497        mul             v3.8h,   v17.8h,  v0.h[4] // p1(top[0]) * filter(1)
1498        mla             v3.8h,   v18.8h,  v0.h[5] // p2(top[1]) * filter(2)
1499        mla             v3.8h,   v19.8h,  v0.h[6] // p3(top[2]) * filter(3)
1500        srshr           v2.8h,   v2.8h,   #4
1501        smax            v2.8h,   v2.8h,   v30.8h
1502        smin            v2.8h,   v2.8h,   v31.8h
1503        mla             v3.8h,   v20.8h,  v0.h[7] // p4(top[3]) * filter(4)
1504        mla             v3.8h,   v16.8h,  v0.h[3] // p0(topleft) * filter(0)
1505        mla             v3.8h,   v21.8h,  v2.h[3] // p5(left[0]) * filter(5)
1506        mla             v3.8h,   v22.8h,  v2.h[7] // p6(left[1]) * filter(6)
1507        srshr           v3.8h,   v3.8h,   #4
1508        smax            v3.8h,   v3.8h,   v30.8h
1509.else
1510        smull           v2.4s,   v17.4h,  v0.h[0] // p1(top[0]) * filter(1)
1511        smlal           v2.4s,   v18.4h,  v0.h[1] // p2(top[1]) * filter(2)
1512        smlal           v2.4s,   v19.4h,  v0.h[2] // p3(top[2]) * filter(3)
1513        smlal           v2.4s,   v20.4h,  v0.h[3] // p4(top[3]) * filter(4)
1514        smlal           v2.4s,   v16.4h,  v1.h[2] // p0(topleft) * filter(0)
1515        smlal           v2.4s,   v21.4h,  v1.h[1] // p5(left[0]) * filter(5)
1516        smlal           v2.4s,   v22.4h,  v1.h[0] // p6(left[1]) * filter(6)
1517        smull2          v3.4s,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
1518        smlal2          v3.4s,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
1519        smlal2          v3.4s,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
1520        smlal2          v3.4s,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
1521        smlal2          v3.4s,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
1522        smlal2          v3.4s,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
1523        smlal2          v3.4s,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
1524        smull           v4.4s,   v17.4h,  v0.h[4] // p1(top[0]) * filter(1)
1525        smlal           v4.4s,   v18.4h,  v0.h[5] // p2(top[1]) * filter(2)
1526        smlal           v4.4s,   v19.4h,  v0.h[6] // p3(top[2]) * filter(3)
1527        sqrshrun        v2.4h,   v2.4s,   #4
1528        sqrshrun2       v2.8h,   v3.4s,   #4
1529        smin            v2.8h,   v2.8h,   v31.8h
1530        smlal           v4.4s,   v20.4h,  v0.h[7] // p4(top[3]) * filter(4)
1531        smlal           v4.4s,   v16.4h,  v0.h[3] // p0(topleft) * filter(0)
1532        smlal           v4.4s,   v21.4h,  v2.h[3] // p5(left[0]) * filter(5)
1533        smlal           v4.4s,   v22.4h,  v2.h[7] // p6(left[1]) * filter(6)
1534        smull2          v5.4s,   v17.8h,  v0.h[4] // p1(top[0]) * filter(1)
1535        smlal2          v5.4s,   v18.8h,  v0.h[5] // p2(top[1]) * filter(2)
1536        smlal2          v5.4s,   v19.8h,  v0.h[6] // p3(top[2]) * filter(3)
1537        smlal2          v5.4s,   v20.8h,  v0.h[7] // p4(top[3]) * filter(4)
1538        smlal2          v5.4s,   v16.8h,  v0.h[3] // p0(topleft) * filter(0)
1539        smlal2          v5.4s,   v21.8h,  v2.h[3] // p5(left[0]) * filter(5)
1540        smlal2          v5.4s,   v22.8h,  v2.h[7] // p6(left[1]) * filter(6)
1541        sqrshrun        v3.4h,   v4.4s,   #4
1542        sqrshrun2       v3.8h,   v5.4s,   #4
1543.endif
1544        smin            v3.8h,   v3.8h,   v31.8h
1545        subs            w4,  w4,  #2
        // st2 on 64 bit elements writes v2.d[i] followed by v3.d[i], i.e. the
        // eight pixels of one row. zip2 collects the second row as new top.
1546        st2             {v2.d, v3.d}[0], [x0], x1
1547        zip2            v0.2d,   v2.2d,   v3.2d
1548        st2             {v2.d, v3.d}[1], [x6], x1
1549        b.gt            8b
1550        ret
        // ---- width 16 and 32: four chained 4x2 blocks per inner iteration ----
        //   x8 = read pointer for the top row (starts at topleft + 1 pixel)
        //   x1 = 2 * stride - 2 * width, since the stores below already
        //        advance x0/x6 across the row
        //   w9 = saved width, w3 = remaining width in the current row pair
        // Loop 1 runs once per pair of rows, loop 2 once per 16 pixels.
1551160:
1552320:
1553        AARCH64_VALID_JUMP_TARGET
1554        add             x8,  x2,  #2
1555        sub             x2,  x2,  #4
1556        mov             x7,  #-4
1557        sub             x1,  x1,  w3, uxtw #1
1558        mov             w9,  w3
1559
15601:
1561        ld1             {v0.4h}, [x2], x7         // left (0-1) + topleft (2)
15622:
1563        ld1             {v1.8h, v2.8h}, [x8], #32 // top(0-15)
1564.if \bpc == 10
        // Results: v3, v4, v5, v6 = the four 4x2 blocks from left to right.
        // Each block after the first uses lanes 3 and 7 of the previous
        // clamped block as its left pixels.
1565        mul             v3.8h,   v16.8h,  v0.h[2] // p0(topleft) * filter(0)
1566        mla             v3.8h,   v21.8h,  v0.h[1] // p5(left[0]) * filter(5)
1567        mla             v3.8h,   v22.8h,  v0.h[0] // p6(left[1]) * filter(6)
1568        mla             v3.8h,   v17.8h,  v1.h[0] // p1(top[0]) * filter(1)
1569        mla             v3.8h,   v18.8h,  v1.h[1] // p2(top[1]) * filter(2)
1570        mla             v3.8h,   v19.8h,  v1.h[2] // p3(top[2]) * filter(3)
1571        mla             v3.8h,   v20.8h,  v1.h[3] // p4(top[3]) * filter(4)
1572
1573        mul             v4.8h,   v17.8h,  v1.h[4] // p1(top[0]) * filter(1)
1574        mla             v4.8h,   v18.8h,  v1.h[5] // p2(top[1]) * filter(2)
1575        mla             v4.8h,   v19.8h,  v1.h[6] // p3(top[2]) * filter(3)
1576        srshr           v3.8h,   v3.8h,   #4
1577        smax            v3.8h,   v3.8h,   v30.8h
1578        smin            v3.8h,   v3.8h,   v31.8h
1579        mla             v4.8h,   v20.8h,  v1.h[7] // p4(top[3]) * filter(4)
1580        mla             v4.8h,   v16.8h,  v1.h[3] // p0(topleft) * filter(0)
1581        mla             v4.8h,   v21.8h,  v3.h[3] // p5(left[0]) * filter(5)
1582        mla             v4.8h,   v22.8h,  v3.h[7] // p6(left[1]) * filter(6)
1583
1584        mul             v5.8h,   v17.8h,  v2.h[0] // p1(top[0]) * filter(1)
1585        mla             v5.8h,   v18.8h,  v2.h[1] // p2(top[1]) * filter(2)
1586        mla             v5.8h,   v19.8h,  v2.h[2] // p3(top[2]) * filter(3)
1587        srshr           v4.8h,   v4.8h,   #4
1588        smax            v4.8h,   v4.8h,   v30.8h
1589        smin            v4.8h,   v4.8h,   v31.8h
1590        mla             v5.8h,   v20.8h,  v2.h[3] // p4(top[3]) * filter(4)
1591        mla             v5.8h,   v16.8h,  v1.h[7] // p0(topleft) * filter(0)
1592        mla             v5.8h,   v21.8h,  v4.h[3] // p5(left[0]) * filter(5)
1593        mla             v5.8h,   v22.8h,  v4.h[7] // p6(left[1]) * filter(6)
1594
1595        mul             v6.8h,   v17.8h,  v2.h[4] // p1(top[0]) * filter(1)
1596        mla             v6.8h,   v18.8h,  v2.h[5] // p2(top[1]) * filter(2)
1597        mla             v6.8h,   v19.8h,  v2.h[6] // p3(top[2]) * filter(3)
1598        srshr           v5.8h,   v5.8h,   #4
1599        smax            v5.8h,   v5.8h,   v30.8h
1600        smin            v5.8h,   v5.8h,   v31.8h
1601        mla             v6.8h,   v20.8h,  v2.h[7] // p4(top[3]) * filter(4)
1602        mla             v6.8h,   v16.8h,  v2.h[3] // p0(topleft) * filter(0)
1603        mla             v6.8h,   v21.8h,  v5.h[3] // p5(left[0]) * filter(5)
1604        mla             v6.8h,   v22.8h,  v5.h[7] // p6(left[1]) * filter(6)
1605
1606        subs            w3,  w3,  #16
1607        srshr           v6.8h,   v6.8h,   #4
1608        smax            v6.8h,   v6.8h,   v30.8h
1609.else
        // Same chain with 32 bit sums: (v3,v4), (v5,v6), (v24,v25), (v26,v27)
        // are narrowed into the results v3, v4, v5, v6 respectively.
1610        smull           v3.4s,   v16.4h,  v0.h[2] // p0(topleft) * filter(0)
1611        smlal           v3.4s,   v21.4h,  v0.h[1] // p5(left[0]) * filter(5)
1612        smlal           v3.4s,   v22.4h,  v0.h[0] // p6(left[1]) * filter(6)
1613        smlal           v3.4s,   v17.4h,  v1.h[0] // p1(top[0]) * filter(1)
1614        smlal           v3.4s,   v18.4h,  v1.h[1] // p2(top[1]) * filter(2)
1615        smlal           v3.4s,   v19.4h,  v1.h[2] // p3(top[2]) * filter(3)
1616        smlal           v3.4s,   v20.4h,  v1.h[3] // p4(top[3]) * filter(4)
1617        smull2          v4.4s,   v16.8h,  v0.h[2] // p0(topleft) * filter(0)
1618        smlal2          v4.4s,   v21.8h,  v0.h[1] // p5(left[0]) * filter(5)
1619        smlal2          v4.4s,   v22.8h,  v0.h[0] // p6(left[1]) * filter(6)
1620        smlal2          v4.4s,   v17.8h,  v1.h[0] // p1(top[0]) * filter(1)
1621        smlal2          v4.4s,   v18.8h,  v1.h[1] // p2(top[1]) * filter(2)
1622        smlal2          v4.4s,   v19.8h,  v1.h[2] // p3(top[2]) * filter(3)
1623        smlal2          v4.4s,   v20.8h,  v1.h[3] // p4(top[3]) * filter(4)
1624
1625        smull           v5.4s,   v17.4h,  v1.h[4] // p1(top[0]) * filter(1)
1626        smlal           v5.4s,   v18.4h,  v1.h[5] // p2(top[1]) * filter(2)
1627        smlal           v5.4s,   v19.4h,  v1.h[6] // p3(top[2]) * filter(3)
1628        sqrshrun        v3.4h,   v3.4s,   #4
1629        sqrshrun2       v3.8h,   v4.4s,   #4
1630        smin            v3.8h,   v3.8h,   v31.8h
1631        smlal           v5.4s,   v20.4h,  v1.h[7] // p4(top[3]) * filter(4)
1632        smlal           v5.4s,   v16.4h,  v1.h[3] // p0(topleft) * filter(0)
1633        smlal           v5.4s,   v21.4h,  v3.h[3] // p5(left[0]) * filter(5)
1634        smlal           v5.4s,   v22.4h,  v3.h[7] // p6(left[1]) * filter(6)
1635        smull2          v6.4s,   v17.8h,  v1.h[4] // p1(top[0]) * filter(1)
1636        smlal2          v6.4s,   v18.8h,  v1.h[5] // p2(top[1]) * filter(2)
1637        smlal2          v6.4s,   v19.8h,  v1.h[6] // p3(top[2]) * filter(3)
1638        smlal2          v6.4s,   v20.8h,  v1.h[7] // p4(top[3]) * filter(4)
1639        smlal2          v6.4s,   v16.8h,  v1.h[3] // p0(topleft) * filter(0)
1640        smlal2          v6.4s,   v21.8h,  v3.h[3] // p5(left[0]) * filter(5)
1641        smlal2          v6.4s,   v22.8h,  v3.h[7] // p6(left[1]) * filter(6)
1642
1643        smull           v24.4s,  v17.4h,  v2.h[0] // p1(top[0]) * filter(1)
1644        smlal           v24.4s,  v18.4h,  v2.h[1] // p2(top[1]) * filter(2)
1645        smlal           v24.4s,  v19.4h,  v2.h[2] // p3(top[2]) * filter(3)
1646        sqrshrun        v4.4h,   v5.4s,   #4
1647        sqrshrun2       v4.8h,   v6.4s,   #4
1648        smin            v4.8h,   v4.8h,   v31.8h
1649        smlal           v24.4s,  v20.4h,  v2.h[3] // p4(top[3]) * filter(4)
1650        smlal           v24.4s,  v16.4h,  v1.h[7] // p0(topleft) * filter(0)
1651        smlal           v24.4s,  v21.4h,  v4.h[3] // p5(left[0]) * filter(5)
1652        smlal           v24.4s,  v22.4h,  v4.h[7] // p6(left[1]) * filter(6)
1653        smull2          v25.4s,  v17.8h,  v2.h[0] // p1(top[0]) * filter(1)
1654        smlal2          v25.4s,  v18.8h,  v2.h[1] // p2(top[1]) * filter(2)
1655        smlal2          v25.4s,  v19.8h,  v2.h[2] // p3(top[2]) * filter(3)
1656        smlal2          v25.4s,  v20.8h,  v2.h[3] // p4(top[3]) * filter(4)
1657        smlal2          v25.4s,  v16.8h,  v1.h[7] // p0(topleft) * filter(0)
1658        smlal2          v25.4s,  v21.8h,  v4.h[3] // p5(left[0]) * filter(5)
1659        smlal2          v25.4s,  v22.8h,  v4.h[7] // p6(left[1]) * filter(6)
1660
1661        smull           v26.4s,  v17.4h,  v2.h[4] // p1(top[0]) * filter(1)
1662        smlal           v26.4s,  v18.4h,  v2.h[5] // p2(top[1]) * filter(2)
1663        smlal           v26.4s,  v19.4h,  v2.h[6] // p3(top[2]) * filter(3)
1664        sqrshrun        v5.4h,   v24.4s,  #4
1665        sqrshrun2       v5.8h,   v25.4s,  #4
1666        smin            v5.8h,   v5.8h,   v31.8h
1667        smlal           v26.4s,  v20.4h,  v2.h[7] // p4(top[3]) * filter(4)
1668        smlal           v26.4s,  v16.4h,  v2.h[3] // p0(topleft) * filter(0)
1669        smlal           v26.4s,  v21.4h,  v5.h[3] // p5(left[0]) * filter(5)
1670        smlal           v26.4s,  v22.4h,  v5.h[7] // p6(left[1]) * filter(6)
1671        smull2          v27.4s,  v17.8h,  v2.h[4] // p1(top[0]) * filter(1)
1672        smlal2          v27.4s,  v18.8h,  v2.h[5] // p2(top[1]) * filter(2)
1673        smlal2          v27.4s,  v19.8h,  v2.h[6] // p3(top[2]) * filter(3)
1674        smlal2          v27.4s,  v20.8h,  v2.h[7] // p4(top[3]) * filter(4)
1675        smlal2          v27.4s,  v16.8h,  v2.h[3] // p0(topleft) * filter(0)
1676        smlal2          v27.4s,  v21.8h,  v5.h[3] // p5(left[0]) * filter(5)
1677        smlal2          v27.4s,  v22.8h,  v5.h[7] // p6(left[1]) * filter(6)
1678
1679        subs            w3,  w3,  #16
1680        sqrshrun        v6.4h,   v26.4s,  #4
1681        sqrshrun2       v6.8h,   v27.4s,  #4
1682.endif
1683        smin            v6.8h,   v6.8h,   v31.8h
1684
        // Prepare v0 for the next 16 pixel chunk of the same row pair:
        // topleft = top[15], left[0]/left[1] = last pixel of each row just
        // produced. The ins/st4 instructions do not modify the flags, so
        // b.gt still tests the width counter from the subs above.
        // st4 on 64 bit elements writes v3.d[i], v4.d[i], v5.d[i], v6.d[i],
        // i.e. 16 consecutive pixels of row i.
1685        ins             v0.h[2], v2.h[7]
1686        st4             {v3.d, v4.d, v5.d, v6.d}[0], [x0], #32
1687        ins             v0.h[0], v6.h[7]
1688        st4             {v3.d, v4.d, v5.d, v6.d}[1], [x6], #32
1689        ins             v0.h[1], v6.h[3]
1690        b.gt            2b
1691        subs            w4,  w4,  #2
1692        b.le            9f
        // Next row pair: the second row just written (x6 - 2 * width) becomes
        // the top row, then x0/x6 move down two rows and the width counter
        // is restored.
1693        sub             x8,  x6,  w9, uxtw #1
1694        add             x0,  x0,  x1
1695        add             x6,  x6,  x1
1696        mov             w3,  w9
1697        b               1b
16989:
1699        ret
1700
        // Jump table indexed by clz(width) - 26: widths 32, 16, 8, 4.
        // 320b and 160b are the same code location.
1701L(ipred_filter\bpc\()_tbl):
1702        .hword L(ipred_filter\bpc\()_tbl) - 320b
1703        .hword L(ipred_filter\bpc\()_tbl) - 160b
1704        .hword L(ipred_filter\bpc\()_tbl) -  80b
1705        .hword L(ipred_filter\bpc\()_tbl) -  40b
1706endfunc
1707.endm
1708
// Instantiate both variants of the filter predictor: the bpc == 10 expansion
// accumulates in 16 bit lanes, the other expansion (12) in 32 bit lanes.
// ipred_filter_16bpc_neon selects between them at run time.
1709filter_fn 10
1710filter_fn 12
1711
// Exported entry point of the 16 bpc filter intra predictor.
// x0-x7 hold the first eight arguments unchanged; bitdepth_max is the ninth
// argument and is fetched from the stack into w8, which both variants read.
// Values above 0x3ff (signed compare) select the 12 bpc variant, everything
// else the 10 bpc variant. Both are tail calls: the variant returns directly
// to the caller.
1712function ipred_filter_16bpc_neon, export=1
1713        ldr             w8,  [sp]                 // w8 = bitdepth_max
1714        cmp             w8,  #0x3ff
1715        b.gt            ipred_filter_12bpc_neon   // 32 bit accumulation variant
1716        b               ipred_filter_10bpc_neon   // 16 bit accumulation variant
1717endfunc
1718
1719// void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride,
1720//                          const uint16_t *const pal, const uint8_t *idx,
1721//                          const int w, const int h);
// Palette prediction: every output pixel is pal[idx], with one index byte
// per pixel and 16 bit palette entries.
//
// Registers on entry (argument order per the prototype comment above):
//   x0 = dst, x1 = stride in bytes, x2 = pal, x3 = idx, w4 = w, w5 = h
//
// Method: the 8 palette entries (16 bytes) are kept in v30 and used as a
// byte table for tbl. Each index byte i is doubled and duplicated into a
// halfword (2*i, 2*i), then v31 (0x0100 per halfword) adds 1 to the upper
// byte, giving the byte offsets (2*i, 2*i + 1) of palette entry i.
// Indices are consumed linearly from x3, so they are expected to be stored
// row after row with no padding between rows.
1722function pal_pred_16bpc_neon, export=1
1723        ld1             {v30.8h}, [x2]
        // clz(w) - 25 is the jump table index: 0 for w = 64 up to 4 for w = 4.
1724        clz             w9,  w4
1725        adr             x6,  L(pal_pred_tbl)
1726        sub             w9,  w9,  #25
1727        ldrh            w9,  [x6, w9, uxtw #1]
1728        movi            v31.8h,  #1, lsl #8
1729        sub             x6,  x6,  w9, uxtw
1730        br              x6
        // ---- w = 4: 16 indices = 4 rows per iteration ----
        // x2 is free after the palette load and becomes the pointer to the
        // odd rows; x1 is doubled so x0 and x2 each step two rows.
173140:
1732        AARCH64_VALID_JUMP_TARGET
1733        add             x2,  x0,  x1
1734        lsl             x1,  x1,  #1
17354:
1736        ld1             {v1.16b}, [x3], #16
1737        subs            w5,  w5,  #4
1738        // Restructure v1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ...
1739        add             v1.16b,  v1.16b,  v1.16b
1740        zip1            v0.16b,  v1.16b,  v1.16b
1741        zip2            v1.16b,  v1.16b,  v1.16b
1742        add             v0.8h,   v0.8h,   v31.8h
1743        add             v1.8h,   v1.8h,   v31.8h
1744        tbl             v0.16b, {v30.16b}, v0.16b
1745        st1             {v0.d}[0], [x0], x1
1746        tbl             v1.16b, {v30.16b}, v1.16b
1747        st1             {v0.d}[1], [x2], x1
1748        st1             {v1.d}[0], [x0], x1
1749        st1             {v1.d}[1], [x2], x1
1750        b.gt            4b
1751        ret
        // ---- w = 8: 32 indices = 4 rows per iteration ----
175280:
1753        AARCH64_VALID_JUMP_TARGET
1754        add             x2,  x0,  x1
1755        lsl             x1,  x1,  #1
17568:
1757        ld1             {v2.16b, v3.16b}, [x3], #32
1758        subs            w5,  w5,  #4
1759        add             v2.16b,  v2.16b,  v2.16b
1760        add             v3.16b,  v3.16b,  v3.16b
1761        zip1            v0.16b,  v2.16b,  v2.16b
1762        zip2            v1.16b,  v2.16b,  v2.16b
1763        zip1            v2.16b,  v3.16b,  v3.16b
1764        zip2            v3.16b,  v3.16b,  v3.16b
1765        add             v0.8h,   v0.8h,   v31.8h
1766        add             v1.8h,   v1.8h,   v31.8h
1767        add             v2.8h,   v2.8h,   v31.8h
1768        add             v3.8h,   v3.8h,   v31.8h
1769        tbl             v0.16b, {v30.16b}, v0.16b
1770        tbl             v1.16b, {v30.16b}, v1.16b
1771        st1             {v0.8h}, [x0], x1
1772        tbl             v2.16b, {v30.16b}, v2.16b
1773        st1             {v1.8h}, [x2], x1
1774        tbl             v3.16b, {v30.16b}, v3.16b
1775        st1             {v2.8h}, [x0], x1
1776        st1             {v3.8h}, [x2], x1
1777        b.gt            8b
1778        ret
        // ---- w = 16: 64 indices = 4 rows per iteration ----
        // v4-v7 hold the doubled indices and are overwritten in place by the
        // zips once their contents have been expanded.
1779160:
1780        AARCH64_VALID_JUMP_TARGET
1781        add             x2,  x0,  x1
1782        lsl             x1,  x1,  #1
178316:
1784        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64
1785        subs            w5,  w5,  #4
1786        add             v4.16b,  v4.16b,  v4.16b
1787        add             v5.16b,  v5.16b,  v5.16b
1788        add             v6.16b,  v6.16b,  v6.16b
1789        add             v7.16b,  v7.16b,  v7.16b
1790        zip1            v0.16b,  v4.16b,  v4.16b
1791        zip2            v1.16b,  v4.16b,  v4.16b
1792        zip1            v2.16b,  v5.16b,  v5.16b
1793        zip2            v3.16b,  v5.16b,  v5.16b
1794        zip1            v4.16b,  v6.16b,  v6.16b
1795        zip2            v5.16b,  v6.16b,  v6.16b
1796        zip1            v6.16b,  v7.16b,  v7.16b
1797        zip2            v7.16b,  v7.16b,  v7.16b
1798        add             v0.8h,   v0.8h,   v31.8h
1799        add             v1.8h,   v1.8h,   v31.8h
1800        add             v2.8h,   v2.8h,   v31.8h
1801        add             v3.8h,   v3.8h,   v31.8h
1802        add             v4.8h,   v4.8h,   v31.8h
1803        tbl             v0.16b, {v30.16b}, v0.16b
1804        add             v5.8h,   v5.8h,   v31.8h
1805        tbl             v1.16b, {v30.16b}, v1.16b
1806        add             v6.8h,   v6.8h,   v31.8h
1807        tbl             v2.16b, {v30.16b}, v2.16b
1808        add             v7.8h,   v7.8h,   v31.8h
1809        tbl             v3.16b, {v30.16b}, v3.16b
1810        tbl             v4.16b, {v30.16b}, v4.16b
1811        tbl             v5.16b, {v30.16b}, v5.16b
1812        st1             {v0.8h, v1.8h}, [x0], x1
1813        tbl             v6.16b, {v30.16b}, v6.16b
1814        st1             {v2.8h, v3.8h}, [x2], x1
1815        tbl             v7.16b, {v30.16b}, v7.16b
1816        st1             {v4.8h, v5.8h}, [x0], x1
1817        st1             {v6.8h, v7.8h}, [x2], x1
1818        b.gt            16b
1819        ret
        // ---- w = 32: 64 indices = 2 rows per iteration ----
1820320:
1821        AARCH64_VALID_JUMP_TARGET
1822        add             x2,  x0,  x1
1823        lsl             x1,  x1,  #1
182432:
1825        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64
1826        subs            w5,  w5,  #2
1827        add             v4.16b,  v4.16b,  v4.16b
1828        add             v5.16b,  v5.16b,  v5.16b
1829        add             v6.16b,  v6.16b,  v6.16b
1830        add             v7.16b,  v7.16b,  v7.16b
1831        zip1            v0.16b,  v4.16b,  v4.16b
1832        zip2            v1.16b,  v4.16b,  v4.16b
1833        zip1            v2.16b,  v5.16b,  v5.16b
1834        zip2            v3.16b,  v5.16b,  v5.16b
1835        zip1            v4.16b,  v6.16b,  v6.16b
1836        zip2            v5.16b,  v6.16b,  v6.16b
1837        zip1            v6.16b,  v7.16b,  v7.16b
1838        zip2            v7.16b,  v7.16b,  v7.16b
1839        add             v0.8h,   v0.8h,   v31.8h
1840        add             v1.8h,   v1.8h,   v31.8h
1841        add             v2.8h,   v2.8h,   v31.8h
1842        add             v3.8h,   v3.8h,   v31.8h
1843        add             v4.8h,   v4.8h,   v31.8h
1844        tbl             v0.16b, {v30.16b}, v0.16b
1845        add             v5.8h,   v5.8h,   v31.8h
1846        tbl             v1.16b, {v30.16b}, v1.16b
1847        add             v6.8h,   v6.8h,   v31.8h
1848        tbl             v2.16b, {v30.16b}, v2.16b
1849        add             v7.8h,   v7.8h,   v31.8h
1850        tbl             v3.16b, {v30.16b}, v3.16b
1851        tbl             v4.16b, {v30.16b}, v4.16b
1852        tbl             v5.16b, {v30.16b}, v5.16b
1853        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
1854        tbl             v6.16b, {v30.16b}, v6.16b
1855        tbl             v7.16b, {v30.16b}, v7.16b
1856        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1
1857        b.gt            32b
1858        ret
        // ---- w = 64: 64 indices = 1 row per iteration ----
        // x2 points at the right half of the row (dst + 64 bytes = 32
        // pixels); x1 stays the plain stride since only one row is written.
1859640:
1860        AARCH64_VALID_JUMP_TARGET
1861        add             x2,  x0,  #64
186264:
1863        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64
1864        subs            w5,  w5,  #1
1865        add             v4.16b,  v4.16b,  v4.16b
1866        add             v5.16b,  v5.16b,  v5.16b
1867        add             v6.16b,  v6.16b,  v6.16b
1868        add             v7.16b,  v7.16b,  v7.16b
1869        zip1            v0.16b,  v4.16b,  v4.16b
1870        zip2            v1.16b,  v4.16b,  v4.16b
1871        zip1            v2.16b,  v5.16b,  v5.16b
1872        zip2            v3.16b,  v5.16b,  v5.16b
1873        zip1            v4.16b,  v6.16b,  v6.16b
1874        zip2            v5.16b,  v6.16b,  v6.16b
1875        zip1            v6.16b,  v7.16b,  v7.16b
1876        zip2            v7.16b,  v7.16b,  v7.16b
1877        add             v0.8h,   v0.8h,   v31.8h
1878        add             v1.8h,   v1.8h,   v31.8h
1879        add             v2.8h,   v2.8h,   v31.8h
1880        add             v3.8h,   v3.8h,   v31.8h
1881        add             v4.8h,   v4.8h,   v31.8h
1882        tbl             v0.16b, {v30.16b}, v0.16b
1883        add             v5.8h,   v5.8h,   v31.8h
1884        tbl             v1.16b, {v30.16b}, v1.16b
1885        add             v6.8h,   v6.8h,   v31.8h
1886        tbl             v2.16b, {v30.16b}, v2.16b
1887        add             v7.8h,   v7.8h,   v31.8h
1888        tbl             v3.16b, {v30.16b}, v3.16b
1889        tbl             v4.16b, {v30.16b}, v4.16b
1890        tbl             v5.16b, {v30.16b}, v5.16b
1891        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
1892        tbl             v6.16b, {v30.16b}, v6.16b
1893        tbl             v7.16b, {v30.16b}, v7.16b
1894        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1
1895        b.gt            64b
1896        ret
1897
        // Jump table indexed by clz(w) - 25: widths 64, 32, 16, 8, 4.
        // Each entry is (table address - target) as a halfword.
1898L(pal_pred_tbl):
1899        .hword L(pal_pred_tbl) - 640b
1900        .hword L(pal_pred_tbl) - 320b
1901        .hword L(pal_pred_tbl) - 160b
1902        .hword L(pal_pred_tbl) -  80b
1903        .hword L(pal_pred_tbl) -  40b
1904endfunc
1905
1906// void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
1907//                               const pixel *const topleft,
1908//                               const int width, const int height,
1909//                               const int16_t *ac, const int alpha,
1910//                               const int bitdepth_max);
// CfL prediction with a fixed mid-range dc value:
//   dst = clamp(dc + apply_sign((ac * alpha), rounded >> 6), 0, bitdepth_max)
// with dc = (bitdepth_max + 1) >> 1.
//
// Registers on entry (argument order per the prototype comment above):
//   x0 = dst, x1 = stride in bytes, x2 = topleft (not read in this function),
//   w3 = width, w4 = height, x5 = ac, w6 = alpha, w7 = bitdepth_max
//
// State expected by the L(ipred_cfl_splat_w*) loops below:
//   v0 = dc, v1 = alpha, v30 = 0, v31 = bitdepth_max (all 8 x 16 bit lanes),
//   x6 = second output row, x1 = 2 * stride, x5 = ac, w3 = width, w4 = height
// NOTE(review): the splat labels and the L(ipred_cfl_splat_tbl) alias are
// presumably branched to by other cfl entry points outside this block -
// confirm before changing register assignments.
1911function ipred_cfl_128_16bpc_neon, export=1
1912        dup             v31.8h,  w7   // bitdepth_max
        // w7 has been consumed; x7 is reused for the jump table address.
        // clz(width) - 26 is the table index: 0 for width 32 up to 3 for 4.
1913        clz             w9,  w3
1914        adr             x7,  L(ipred_cfl_128_tbl)
1915        sub             w9,  w9,  #26
1916        ldrh            w9,  [x7, w9, uxtw #1]
        // dc = (bitdepth_max + 1) >> 1 via the rounding shift.
1917        urshr           v0.8h,   v31.8h,  #1
1918        dup             v1.8h,   w6   // alpha
1919        sub             x7,  x7,  w9, uxtw
        // w6 has been consumed; x6 becomes the pointer to the odd rows.
1920        add             x6,  x0,  x1
1921        lsl             x1,  x1,  #1
1922        movi            v30.8h,  #0
1923        br              x7
        // ---- width 4: 16 ac values = 4 rows per iteration ----
        // cmlt yields -1 in lanes with a negative product, so adding it and
        // then rounding with rshrn gives a shift that rounds symmetrically
        // around zero.
1924L(ipred_cfl_splat_w4):
1925        AARCH64_VALID_JUMP_TARGET
1926        ld1             {v4.8h, v5.8h}, [x5], #32
1927        subs            w4,  w4,  #4
1928        smull           v2.4s,   v4.4h,   v1.4h  // diff = ac * alpha
1929        smull2          v3.4s,   v4.8h,   v1.8h
1930        smull           v4.4s,   v5.4h,   v1.4h
1931        smull2          v5.4s,   v5.8h,   v1.8h
1932        cmlt            v16.4s,  v2.4s,   #0     // sign
1933        cmlt            v17.4s,  v3.4s,   #0
1934        cmlt            v18.4s,  v4.4s,   #0
1935        cmlt            v19.4s,  v5.4s,   #0
1936        add             v2.4s,   v2.4s,   v16.4s // diff + sign
1937        add             v3.4s,   v3.4s,   v17.4s
1938        add             v4.4s,   v4.4s,   v18.4s
1939        add             v5.4s,   v5.4s,   v19.4s
1940        rshrn           v2.4h,   v2.4s,   #6     // (diff + sign + 32) >> 6 = apply_sign()
1941        rshrn2          v2.8h,   v3.4s,   #6
1942        rshrn           v3.4h,   v4.4s,   #6
1943        rshrn2          v3.8h,   v5.4s,   #6
1944        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
1945        add             v3.8h,   v3.8h,   v0.8h
1946        smax            v2.8h,   v2.8h,   v30.8h
1947        smax            v3.8h,   v3.8h,   v30.8h
1948        smin            v2.8h,   v2.8h,   v31.8h
1949        smin            v3.8h,   v3.8h,   v31.8h
1950        st1             {v2.d}[0],  [x0], x1
1951        st1             {v2.d}[1],  [x6], x1
1952        st1             {v3.d}[0],  [x0], x1
1953        st1             {v3.d}[1],  [x6], x1
1954        b.gt            L(ipred_cfl_splat_w4)
1955        ret
        // ---- width 8: 16 ac values = 2 rows per iteration ----
1956L(ipred_cfl_splat_w8):
1957        AARCH64_VALID_JUMP_TARGET
1958        ld1             {v4.8h, v5.8h}, [x5], #32
1959        subs            w4,  w4,  #2
1960        smull           v2.4s,   v4.4h,   v1.4h  // diff = ac * alpha
1961        smull2          v3.4s,   v4.8h,   v1.8h
1962        smull           v4.4s,   v5.4h,   v1.4h
1963        smull2          v5.4s,   v5.8h,   v1.8h
1964        cmlt            v16.4s,  v2.4s,   #0     // sign
1965        cmlt            v17.4s,  v3.4s,   #0
1966        cmlt            v18.4s,  v4.4s,   #0
1967        cmlt            v19.4s,  v5.4s,   #0
1968        add             v2.4s,   v2.4s,   v16.4s // diff + sign
1969        add             v3.4s,   v3.4s,   v17.4s
1970        add             v4.4s,   v4.4s,   v18.4s
1971        add             v5.4s,   v5.4s,   v19.4s
1972        rshrn           v2.4h,   v2.4s,   #6     // (diff + sign + 32) >> 6 = apply_sign()
1973        rshrn2          v2.8h,   v3.4s,   #6
1974        rshrn           v3.4h,   v4.4s,   #6
1975        rshrn2          v3.8h,   v5.4s,   #6
1976        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
1977        add             v3.8h,   v3.8h,   v0.8h
1978        smax            v2.8h,   v2.8h,   v30.8h
1979        smax            v3.8h,   v3.8h,   v30.8h
1980        smin            v2.8h,   v2.8h,   v31.8h
1981        smin            v3.8h,   v3.8h,   v31.8h
1982        st1             {v2.8h},  [x0], x1
1983        st1             {v3.8h},  [x6], x1
1984        b.gt            L(ipred_cfl_splat_w8)
1985        ret
        // ---- width 16 and 32: 16 pixels of two rows per inner iteration ----
        //   x7 = ac pointer for the second row (ac + width elements)
        //   x1 = 2 * stride - 2 * width, since the stores below already
        //        advance x0/x6 across the row
        //   w9 = saved width, w3 = remaining width in the current row pair
1986L(ipred_cfl_splat_w16):
1987        AARCH64_VALID_JUMP_TARGET
1988        add             x7,  x5,  w3, uxtw #1
1989        sub             x1,  x1,  w3, uxtw #1
1990        mov             w9,  w3
19911:
1992        ld1             {v2.8h, v3.8h}, [x5], #32
1993        ld1             {v4.8h, v5.8h}, [x7], #32
1994        subs            w3,  w3,  #16
1995        smull           v16.4s,  v2.4h,   v1.4h  // diff = ac * alpha
1996        smull2          v17.4s,  v2.8h,   v1.8h
1997        smull           v18.4s,  v3.4h,   v1.4h
1998        smull2          v19.4s,  v3.8h,   v1.8h
1999        smull           v2.4s,   v4.4h,   v1.4h
2000        smull2          v3.4s,   v4.8h,   v1.8h
2001        smull           v4.4s,   v5.4h,   v1.4h
2002        smull2          v5.4s,   v5.8h,   v1.8h
2003        cmlt            v20.4s,  v16.4s,  #0     // sign
2004        cmlt            v21.4s,  v17.4s,  #0
2005        cmlt            v22.4s,  v18.4s,  #0
2006        cmlt            v23.4s,  v19.4s,  #0
2007        cmlt            v24.4s,  v2.4s,   #0
2008        cmlt            v25.4s,  v3.4s,   #0
2009        cmlt            v26.4s,  v4.4s,   #0
2010        cmlt            v27.4s,  v5.4s,   #0
2011        add             v16.4s,  v16.4s,  v20.4s // diff + sign
2012        add             v17.4s,  v17.4s,  v21.4s
2013        add             v18.4s,  v18.4s,  v22.4s
2014        add             v19.4s,  v19.4s,  v23.4s
2015        add             v2.4s,   v2.4s,   v24.4s
2016        add             v3.4s,   v3.4s,   v25.4s
2017        add             v4.4s,   v4.4s,   v26.4s
2018        add             v5.4s,   v5.4s,   v27.4s
2019        rshrn           v16.4h,  v16.4s,  #6     // (diff + sign + 32) >> 6 = apply_sign()
2020        rshrn2          v16.8h,  v17.4s,  #6
2021        rshrn           v17.4h,  v18.4s,  #6
2022        rshrn2          v17.8h,  v19.4s,  #6
2023        rshrn           v6.4h,   v2.4s,   #6
2024        rshrn2          v6.8h,   v3.4s,   #6
2025        rshrn           v7.4h,   v4.4s,   #6
2026        rshrn2          v7.8h,   v5.4s,   #6
2027        add             v2.8h,   v16.8h,  v0.8h  // dc + apply_sign()
2028        add             v3.8h,   v17.8h,  v0.8h
2029        add             v4.8h,   v6.8h,   v0.8h
2030        add             v5.8h,   v7.8h,   v0.8h
2031        smax            v2.8h,   v2.8h,   v30.8h
2032        smax            v3.8h,   v3.8h,   v30.8h
2033        smax            v4.8h,   v4.8h,   v30.8h
2034        smax            v5.8h,   v5.8h,   v30.8h
2035        smin            v2.8h,   v2.8h,   v31.8h
2036        smin            v3.8h,   v3.8h,   v31.8h
2037        smin            v4.8h,   v4.8h,   v31.8h
2038        smin            v5.8h,   v5.8h,   v31.8h
2039        st1             {v2.8h, v3.8h},  [x0], #32
2040        st1             {v4.8h, v5.8h},  [x6], #32
2041        b.gt            1b
        // Row pair finished. The add/mov instructions below do not modify
        // the flags, so the final b.gt tests the height counter from this
        // subs. Each ac pointer skips the row the other pointer just read.
2042        subs            w4,  w4,  #2
2043        add             x5,  x5,  w9, uxtw #1
2044        add             x7,  x7,  w9, uxtw #1
2045        add             x0,  x0,  x1
2046        add             x6,  x6,  x1
2047        mov             w3,  w9
2048        b.gt            1b
2049        ret
2050
        // Jump table indexed by clz(width) - 26: widths 32, 16, 8, 4.
        // Widths 32 and 16 share the w16 loop. Each entry is
        // (table address - target) as a halfword.
2051L(ipred_cfl_128_tbl):
2052L(ipred_cfl_splat_tbl):
2053        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
2054        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
2055        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8)
2056        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4)
2057endfunc
2058
// void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const topleft,
//                               const int width, const int height,
//                               const int16_t *ac, const int alpha,
//                               const int bitdepth_max);
//
// cfl prediction whose dc term is the rounded average of `width` pixels,
// read starting one pixel past topleft. After computing dc, control is
// handed to the shared L(ipred_cfl_splat_w*) code of ipred_cfl_128.
//
// State handed to the splat code:
//   x0, x6  dst pointers for two consecutive rows
//   x1      2 * stride
//   w3, w4  width, height (untouched here)
//   x5      ac (untouched here)
//   v0.8h   dc, broadcast to all lanes
//   v1.8h   alpha
//   v30.8h  0
//   v31.8h  bitdepth_max
function ipred_cfl_top_16bpc_neon, export=1
        dup             v31.8h,  w7   // bitdepth_max
        clz             w9,  w3
        adr             x7,  L(ipred_cfl_top_tbl)
        sub             w9,  w9,  #26 // table index: width 32 -> 0, 16 -> 1, 8 -> 2, 4 -> 3
        ldrh            w9,  [x7, w9, uxtw #1]
        dup             v1.8h,   w6   // alpha
        add             x2,  x2,  #2  // step past the topleft pixel itself
        sub             x7,  x7,  w9, uxtw // table entries hold (table - target)
        add             x6,  x0,  x1  // second output row
        lsl             x1,  x1,  #1  // advance two rows per step
        movi            v30.8h,  #0
        br              x7
4:      // width 4: dc = (sum of 4 pixels + 2) >> 2
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.4h},  [x2]
        addv            h0,      v0.4h
        urshr           v0.4h,   v0.4h,   #2
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w4)
8:      // width 8: dc = (sum of 8 pixels + 4) >> 3
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h},  [x2]
        addv            h0,      v0.8h
        urshr           v0.4h,   v0.4h,   #3
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w8)
16:     // width 16: dc = (sum of 16 pixels + 8) >> 4
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h}, [x2]
        addp            v0.8h,   v2.8h,   v3.8h
        addv            h0,      v0.8h
        urshr           v0.4h,   v0.4h,   #4
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w16)
32:     // width 32: dc = (sum of 32 pixels + 16) >> 5
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v0.8h,   v2.8h,   v4.8h
        uaddlv          s0,      v0.8h          // final reduction widened to 32 bits
        rshrn           v0.4h,   v0.4s,   #5
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w16)  // width 32 shares the w16 splat entry

        // Halfword offsets, indexed by clz(width) - 26.
L(ipred_cfl_top_tbl):
        .hword L(ipred_cfl_top_tbl) - 32b
        .hword L(ipred_cfl_top_tbl) - 16b
        .hword L(ipred_cfl_top_tbl) -  8b
        .hword L(ipred_cfl_top_tbl) -  4b
endfunc
2116
// void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                                const pixel *const topleft,
//                                const int width, const int height,
//                                const int16_t *ac, const int alpha,
//                                const int bitdepth_max);
//
// cfl prediction whose dc term is the rounded average of the `height`
// pixels stored immediately before topleft. Two table lookups are done up
// front: x7 selects the averaging code by height, x9 selects the shared
// splat entry (in ipred_cfl_128) by width. Each averaging routine finishes
// with an indirect branch to x9.
//
// State handed to the splat code:
//   x0, x6  dst pointers for two consecutive rows
//   x1      2 * stride
//   w3, w4  width, height (untouched here)
//   x5      ac (untouched here)
//   v0.8h   dc, broadcast to all lanes
//   v1.8h   alpha
//   v30.8h  0
//   v31.8h  bitdepth_max
function ipred_cfl_left_16bpc_neon, export=1
        dup             v31.8h,  w7   // bitdepth_max
        sub             x2,  x2,  w4, uxtw #1   // x2 = topleft - height pixels
        clz             w9,  w3
        clz             w8,  w4
        adr             x10, L(ipred_cfl_splat_tbl)
        adr             x7,  L(ipred_cfl_left_tbl)
        sub             w9,  w9,  #26           // width index:  32 -> 0 ... 4 -> 3
        sub             w8,  w8,  #26           // height index: 32 -> 0 ... 4 -> 3
        ldrh            w9,  [x10, w9, uxtw #1]
        ldrh            w8,  [x7,  w8, uxtw #1]
        dup             v1.8h,   w6   // alpha
        sub             x9,  x10, w9, uxtw      // x9 = splat entry for this width
        sub             x7,  x7,  w8, uxtw      // x7 = averaging code for this height
        add             x6,  x0,  x1            // second output row
        lsl             x1,  x1,  #1            // advance two rows per step
        movi            v30.8h,  #0
        br              x7

L(ipred_cfl_left_h4):   // dc = (sum of 4 pixels + 2) >> 2
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.4h},  [x2]
        addv            h0,      v0.4h
        urshr           v0.4h,   v0.4h,   #2
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_h8):   // dc = (sum of 8 pixels + 4) >> 3
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h},  [x2]
        addv            h0,      v0.8h
        urshr           v0.4h,   v0.4h,   #3
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_h16):  // dc = (sum of 16 pixels + 8) >> 4
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h}, [x2]
        addp            v0.8h,   v2.8h,   v3.8h
        addv            h0,      v0.8h
        urshr           v0.4h,   v0.4h,   #4
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_h32):  // dc = (sum of 32 pixels + 16) >> 5
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v0.8h,   v2.8h,   v4.8h
        uaddlv          s0,      v0.8h          // final reduction widened to 32 bits
        rshrn           v0.4h,   v0.4s,   #5
        dup             v0.8h,   v0.h[0]
        br              x9

        // Halfword offsets, indexed by clz(height) - 26.
L(ipred_cfl_left_tbl):
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32)
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16)
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8)
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4)
endfunc
2183
// void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                           const pixel *const topleft,
//                           const int width, const int height,
//                           const int16_t *ac, const int alpha,
//                           const int bitdepth_max);
//
// cfl prediction whose dc term is the rounded average of the `height`
// pixels before topleft and the `width` pixels after it:
//   dc = (sum + ((width + height) >> 1)) / (width + height)
//
// Dispatch happens in two stages through one table:
//   1. L(ipred_cfl_hN), chosen by height (x7): sums the pixels before
//      topleft into s0, moves x2 to one pixel past topleft, then branches
//      to x9.
//   2. L(ipred_cfl_wN), chosen by width (x9): adds the pixels after
//      topleft and the rounding bias, then divides.
//
// The division is a right shift by ctz(width + height). When width differs
// from height, the remaining factor of 3 or 5 is divided out by a multiply
// and a shift by 17: 0xAAAB / 2^17 ~= 1/3, 0x6667 / 2^17 ~= 1/5.
//
// State handed to the splat code (in ipred_cfl_128):
//   x0, x6  dst pointers for two consecutive rows
//   x1      2 * stride
//   w3, w4  width, height (untouched here)
//   x5      ac (untouched here)
//   v0.8h   dc, broadcast to all lanes
//   v1.8h   alpha
//   v30.8h  0
//   v31.8h  bitdepth_max
function ipred_cfl_16bpc_neon, export=1
        dup             v31.8h,  w7              // bitdepth_max
        sub             x2,  x2,  w4, uxtw #1    // x2 = topleft - height pixels
        add             w8,  w3,  w4             // width + height
        dup             v1.8h,   w6              // alpha
        clz             w9,  w3
        clz             w6,  w4
        dup             v16.4s, w8               // width + height
        adr             x7,  L(ipred_cfl_tbl)
        rbit            w8,  w8                  // rbit(width + height)
        sub             w9,  w9,  #22            // 26 leading bits, minus table offset 4
        sub             w6,  w6,  #26
        clz             w8,  w8                  // ctz(width + height)
        ldrh            w9,  [x7, w9, uxtw #1]
        ldrh            w6,  [x7, w6, uxtw #1]
        neg             w8,  w8                  // -ctz(width + height)
        sub             x9,  x7,  w9, uxtw       // x9 = second stage (by width)
        sub             x7,  x7,  w6, uxtw       // x7 = first stage (by height)
        ushr            v16.4s,  v16.4s,  #1     // (width + height) >> 1
        dup             v17.4s,  w8              // -ctz(width + height)
        add             x6,  x0,  x1             // second output row
        lsl             x1,  x1,  #1             // advance two rows per step
        movi            v30.8h,  #0
        br              x7

L(ipred_cfl_h4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.4h},  [x2], #8
        uaddlv          s0,      v0.4h
        add             x2,  x2,  #2             // step past the topleft pixel itself
        br              x9
L(ipred_cfl_w4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.4h},  [x2]
        add             v0.2s,   v0.2s,   v16.2s // + rounding bias
        uaddlv          s2,      v2.4h
        cmp             w4,  #4
        add             v0.2s,   v0.2s,   v2.2s
        ushl            v0.2s,   v0.2s,   v17.2s // negative count: >> ctz(width + height)
        b.eq            1f
        // h = 8/16
        cmp             w4,  #16
        mov             w16, #0x6667
        mov             w17, #0xAAAB
        csel            w16, w16, w17, eq        // h == 16: divide by 5, else by 3
        dup             v16.2s,  w16
        mul             v0.2s,   v0.2s,   v16.2s
        ushr            v0.2s,   v0.2s,   #17
1:
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w4)

L(ipred_cfl_h8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h},  [x2], #16
        uaddlv          s0,      v0.8h
        add             x2,  x2,  #2             // step past the topleft pixel itself
        br              x9
L(ipred_cfl_w8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h},  [x2]
        add             v0.2s,   v0.2s,   v16.2s // + rounding bias
        uaddlv          s2,      v2.8h
        cmp             w4,  #8
        add             v0.2s,   v0.2s,   v2.2s
        ushl            v0.2s,   v0.2s,   v17.2s // negative count: >> ctz(width + height)
        b.eq            1f
        // h = 4/16/32
        cmp             w4,  #32
        mov             w16, #0x6667
        mov             w17, #0xAAAB
        csel            w16, w16, w17, eq        // h == 32: divide by 5, else by 3
        dup             v16.2s,  w16
        mul             v0.2s,   v0.2s,   v16.2s
        ushr            v0.2s,   v0.2s,   #17
1:
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w8)

L(ipred_cfl_h16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h}, [x2], #32
        addp            v0.8h,   v2.8h,   v3.8h
        add             x2,  x2,  #2             // step past the topleft pixel itself
        uaddlv          s0,      v0.8h
        br              x9
L(ipred_cfl_w16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h}, [x2]
        add             v0.2s,   v0.2s,   v16.2s // + rounding bias
        addp            v2.8h,   v2.8h,   v3.8h
        uaddlv          s2,      v2.8h
        cmp             w4,  #16
        add             v0.2s,   v0.2s,   v2.2s
        ushl            v0.2s,   v0.2s,   v17.2s // negative count: >> ctz(width + height)
        b.eq            1f
        // h = 4/8/32
        tst             w4,  #(32+16+8) // 16 added to make a consecutive bitmask
        mov             w16, #0x6667
        mov             w17, #0xAAAB
        csel            w16, w16, w17, eq        // h == 4: divide by 5, else by 3
        dup             v16.2s,  w16
        mul             v0.2s,   v0.2s,   v16.2s
        ushr            v0.2s,   v0.2s,   #17
1:
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w16)

L(ipred_cfl_h32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2], #64
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v0.8h,   v2.8h,   v4.8h
        add             x2,  x2,  #2             // step past the topleft pixel itself
        uaddlv          s0,      v0.8h
        br              x9
L(ipred_cfl_w32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
        add             v0.4s,   v0.4s,   v16.4s // + rounding bias
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v2.8h,   v2.8h,   v4.8h
        cmp             w4,  #32
        uaddlv          s2,      v2.8h
        add             v0.2s,   v0.2s,   v2.2s
        ushl            v0.2s,   v0.2s,   v17.2s // negative count: >> ctz(width + height)
        b.eq            1f
        // h = 8/16
        cmp             w4,  #8
        mov             w16, #0x6667
        mov             w17, #0xAAAB
        csel            w16, w16, w17, eq        // h == 8: divide by 5, else by 3
        dup             v16.2s,  w16
        mul             v0.2s,   v0.2s,   v16.2s
        ushr            v0.2s,   v0.2s,   #17
1:
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w16)   // width 32 shares the w16 splat entry

        // Halfword offsets. Entries 0-3 are indexed by clz(height) - 26,
        // entries 4-7 by clz(width) - 26 + 4.
L(ipred_cfl_tbl):
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4)
endfunc
2340
// void ipred_cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx,
//                                  const ptrdiff_t stride, const int w_pad,
//                                  const int h_pad, const int cw, const int ch);
//
// Builds the ac buffer for 4:2:0 input: every output value is the sum of a
// 2x2 block of input pixels, shifted left by 1. Columns and rows covered by
// w_pad / h_pad are not read from ypx; they replicate the last computed
// column / row instead. Finally the rounded mean of the whole cw x ch
// buffer is subtracted from every entry.
//
// Register use after the prologue:
//   x0       ac write pointer
//   x1, x10  two consecutive input rows; x2 = 2 * stride
//   w3       w_pad (selects the horizontal padding variant)
//   w4       h_pad * 4, counted down as padding rows are written
//   w6       ch; rescaled for cw = 8/16 so that w6 * 4 is the number of
//            int16 entries in the buffer
//   w8       ch - h_pad * 4, counted down as rows are computed
//   v24-v27  32-bit partial sums of everything written
//   v31.4s   -log2(cw * ch), used as shift count for the mean
function ipred_cfl_ac_420_16bpc_neon, export=1
        clz             w8,  w5
        lsl             w4,  w4,  #2
        adr             x7,  L(ipred_cfl_ac_420_tbl)
        sub             w8,  w8,  #27        // table index: cw 16 -> 0, 8 -> 1, 4 -> 2
        ldrh            w8,  [x7, w8, uxtw #1]
        movi            v24.4s,  #0
        movi            v25.4s,  #0
        movi            v26.4s,  #0
        movi            v27.4s,  #0
        sub             x7,  x7,  w8, uxtw   // table entries hold (table - target)
        sub             w8,  w6,  w4         // height - h_pad
        rbit            w9,  w5              // rbit(width)
        rbit            w10, w6              // rbit(height)
        clz             w9,  w9              // ctz(width)
        clz             w10, w10             // ctz(height)
        add             w9,  w9,  w10        // log2sz
        add             x10, x1,  x2         // second input row
        dup             v31.4s,  w9
        lsl             x2,  x2,  #1         // advance two input rows per step
        neg             v31.4s,  v31.4s      // -log2sz
        br              x7

L(ipred_cfl_ac_420_w4):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input
        // Four input rows of 8 pixels become two output rows of 4.
        ld1             {v0.8h}, [x1],  x2
        ld1             {v1.8h}, [x10], x2
        ld1             {v2.8h}, [x1],  x2
        ld1             {v3.8h}, [x10], x2
        addp            v0.8h,   v0.8h,   v2.8h
        addp            v1.8h,   v1.8h,   v3.8h
        add             v0.8h,   v0.8h,   v1.8h
        shl             v0.8h,   v0.8h,   #1
        subs            w8,  w8,  #2
        st1             {v0.8h}, [x0], #16
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        b.gt            1b
        // Replicate the last output row into both halves of v0 and v1.
        trn2            v1.2d,   v0.2d,   v0.2d
        trn2            v0.2d,   v0.2d,   v0.2d
L(ipred_cfl_ac_420_w4_hpad):
        cbz             w4,  3f
2:      // Vertical padding (h_pad > 0)
        // Four rows of 4 per iteration.
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h}, [x0], #32
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        uaddw           v26.4s,  v26.4s,  v1.4h
        uaddw2          v27.4s,  v27.4s,  v1.8h
        b.gt            2b
3:
L(ipred_cfl_ac_420_w4_calc_subtract_dc):
        // Aggregate the sums
        add             v24.4s,  v24.4s,  v25.4s
        add             v26.4s,  v26.4s,  v27.4s
        add             v0.4s,   v24.4s,  v26.4s
        addv            s0,  v0.4s                // sum
        sub             x0,  x0,  w6, uxtw #3     // rewind by w6 * 8 bytes to the buffer start
        urshl           v4.2s,   v0.2s,   v31.2s  // (sum + (1 << (log2sz - 1)))  >>= log2sz
        dup             v4.8h,   v4.h[0]
6:      // Subtract dc from ac
        // 16 entries per iteration, updated in place.
        ld1             {v0.8h, v1.8h}, [x0]
        subs            w6,  w6,  #4
        sub             v0.8h,   v0.8h,   v4.8h
        sub             v1.8h,   v1.8h,   v4.8h
        st1             {v0.8h, v1.8h}, [x0], #32
        b.gt            6b
        ret

L(ipred_cfl_ac_420_w8):
        AARCH64_VALID_JUMP_TARGET
        cbnz            w3,  L(ipred_cfl_ac_420_w8_wpad)
1:      // Copy and subsample input, without padding
        // Four input rows of 16 pixels become two output rows of 8.
        ld1             {v0.8h, v1.8h}, [x1],  x2
        ld1             {v2.8h, v3.8h}, [x10], x2
        ld1             {v4.8h, v5.8h}, [x1],  x2
        addp            v0.8h,   v0.8h,   v1.8h
        ld1             {v6.8h, v7.8h}, [x10], x2
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v6.8h,   v6.8h,   v7.8h
        add             v0.8h,   v0.8h,   v2.8h
        add             v4.8h,   v4.8h,   v6.8h
        shl             v0.8h,   v0.8h,   #1
        shl             v1.8h,   v4.8h,   #1
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h}, [x0], #32
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        uaddw           v26.4s,  v26.4s,  v1.4h
        uaddw2          v27.4s,  v27.4s,  v1.8h
        b.gt            1b
        mov             v0.16b,  v1.16b           // v0 = v1 = last output row
        b               L(ipred_cfl_ac_420_w8_hpad)

L(ipred_cfl_ac_420_w8_wpad):
1:      // Copy and subsample input, padding 4
        // Only 8 input pixels per row are read; the last computed value
        // of each output row fills its right half.
        ld1             {v0.8h}, [x1],  x2
        ld1             {v1.8h}, [x10], x2
        ld1             {v2.8h}, [x1],  x2
        ld1             {v3.8h}, [x10], x2
        addp            v0.8h,   v0.8h,   v2.8h
        addp            v1.8h,   v1.8h,   v3.8h
        add             v0.8h,   v0.8h,   v1.8h
        shl             v0.8h,   v0.8h,   #1
        dup             v1.4h,   v0.h[3]
        dup             v3.4h,   v0.h[7]
        trn2            v2.2d,   v0.2d,   v0.2d
        subs            w8,  w8,  #2
        st1             {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw           v25.4s,  v25.4s,  v1.4h
        uaddw           v26.4s,  v26.4s,  v2.4h
        uaddw           v27.4s,  v27.4s,  v3.4h
        b.gt            1b
        // v0 = v1 = last output row (computed half + padded half)
        trn1            v0.2d,   v2.2d,   v3.2d
        trn1            v1.2d,   v2.2d,   v3.2d

L(ipred_cfl_ac_420_w8_hpad):
        cbz             w4,  3f
2:      // Vertical padding (h_pad > 0)
        // Four rows of 8 per iteration, written as two stores of two rows.
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h}, [x0], #32
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        uaddw           v26.4s,  v26.4s,  v1.4h
        uaddw2          v27.4s,  v27.4s,  v1.8h
        st1             {v0.8h, v1.8h}, [x0], #32
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        uaddw           v26.4s,  v26.4s,  v1.4h
        uaddw2          v27.4s,  v27.4s,  v1.8h
        b.gt            2b
3:

        // Double the height and reuse the w4 summing/subtracting
        lsl             w6,  w6,  #1
        b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)

L(ipred_cfl_ac_420_w16):
        AARCH64_VALID_JUMP_TARGET
        // Second dispatch, indexed directly by w_pad.
        adr             x7,  L(ipred_cfl_ac_420_w16_tbl)
        ldrh            w3,  [x7, w3, uxtw #1]
        sub             x7,  x7,  w3, uxtw
        br              x7

L(ipred_cfl_ac_420_w16_wpad0):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, without padding
        // Four input rows of 32 pixels become two output rows of 16.
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x1],  x2
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2
        addp            v0.8h,   v0.8h,   v1.8h
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v6.8h,   v6.8h,   v7.8h
        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x1],  x2
        add             v0.8h,   v0.8h,   v4.8h
        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x10], x2
        add             v2.8h,   v2.8h,   v6.8h
        addp            v16.8h,  v16.8h,  v17.8h
        addp            v18.8h,  v18.8h,  v19.8h
        addp            v20.8h,  v20.8h,  v21.8h
        addp            v22.8h,  v22.8h,  v23.8h
        add             v16.8h,  v16.8h,  v20.8h
        add             v18.8h,  v18.8h,  v22.8h
        shl             v0.8h,   v0.8h,   #1
        shl             v1.8h,   v2.8h,   #1
        shl             v2.8h,   v16.8h,  #1
        shl             v3.8h,   v18.8h,  #1
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        uaddw           v26.4s,  v26.4s,  v1.4h
        uaddw2          v27.4s,  v27.4s,  v1.8h
        uaddw           v24.4s,  v24.4s,  v2.4h
        uaddw2          v25.4s,  v25.4s,  v2.8h
        uaddw           v26.4s,  v26.4s,  v3.4h
        uaddw2          v27.4s,  v27.4s,  v3.8h
        b.gt            1b
        // v0:v1 = v2:v3 = last output row
        mov             v0.16b,  v2.16b
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_420_w16_wpad1):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, padding 4
        // 24 input pixels per row are read: 12 computed outputs, and the
        // last of them fills the remaining 4.
        ldr             q2,  [x1,  #32]
        ld1             {v0.8h, v1.8h}, [x1],  x2
        ldr             q5,  [x10, #32]
        ld1             {v3.8h, v4.8h}, [x10], x2
        addp            v2.8h,   v2.8h,   v2.8h
        addp            v0.8h,   v0.8h,   v1.8h
        addp            v5.8h,   v5.8h,   v5.8h
        addp            v3.8h,   v3.8h,   v4.8h
        ldr             q18, [x1,  #32]
        add             v2.4h,   v2.4h,   v5.4h
        ld1             {v16.8h, v17.8h}, [x1],  x2
        add             v0.8h,   v0.8h,   v3.8h
        ldr             q21, [x10, #32]
        ld1             {v19.8h, v20.8h}, [x10], x2
        addp            v18.8h,  v18.8h,  v18.8h
        addp            v16.8h,  v16.8h,  v17.8h
        addp            v21.8h,  v21.8h,  v21.8h
        addp            v19.8h,  v19.8h,  v20.8h
        add             v18.4h,  v18.4h,  v21.4h
        add             v16.8h,  v16.8h,  v19.8h
        shl             v1.4h,   v2.4h,   #1
        shl             v0.8h,   v0.8h,   #1
        shl             v3.4h,   v18.4h,  #1
        shl             v2.8h,   v16.8h,  #1
        dup             v4.4h,   v1.h[3]
        dup             v5.4h,   v3.h[3]
        trn1            v1.2d,   v1.2d,   v4.2d
        trn1            v3.2d,   v3.2d,   v5.2d
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        uaddw           v26.4s,  v26.4s,  v1.4h
        uaddw2          v27.4s,  v27.4s,  v1.8h
        uaddw           v24.4s,  v24.4s,  v2.4h
        uaddw2          v25.4s,  v25.4s,  v2.8h
        uaddw           v26.4s,  v26.4s,  v3.4h
        uaddw2          v27.4s,  v27.4s,  v3.8h
        b.gt            1b
        // v0:v1 = v2:v3 = last output row
        mov             v0.16b,  v2.16b
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_420_w16_wpad2):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, padding 8
        // 16 input pixels per row are read: 8 computed outputs, and the
        // last of them fills the remaining 8.
        ld1             {v0.8h, v1.8h}, [x1],  x2
        ld1             {v2.8h, v3.8h}, [x10], x2
        ld1             {v4.8h, v5.8h}, [x1],  x2
        addp            v0.8h,   v0.8h,   v1.8h
        ld1             {v6.8h, v7.8h}, [x10], x2
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v6.8h,   v6.8h,   v7.8h
        add             v0.8h,   v0.8h,   v2.8h
        add             v4.8h,   v4.8h,   v6.8h
        shl             v0.8h,   v0.8h,   #1
        shl             v2.8h,   v4.8h,   #1
        dup             v1.8h,   v0.h[7]
        dup             v3.8h,   v2.h[7]
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        uaddw           v26.4s,  v26.4s,  v1.4h
        uaddw2          v27.4s,  v27.4s,  v1.8h
        uaddw           v24.4s,  v24.4s,  v2.4h
        uaddw2          v25.4s,  v25.4s,  v2.8h
        uaddw           v26.4s,  v26.4s,  v3.4h
        uaddw2          v27.4s,  v27.4s,  v3.8h
        b.gt            1b
        // v0:v1 = v2:v3 = last output row
        mov             v0.16b,  v2.16b
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_420_w16_wpad3):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, padding 12
        // 8 input pixels per row are read: 4 computed outputs, and the
        // last of them fills the remaining 12.
        ld1             {v0.8h}, [x1],  x2
        ld1             {v2.8h}, [x10], x2
        ld1             {v4.8h}, [x1],  x2
        ld1             {v6.8h}, [x10], x2
        addp            v0.8h,   v0.8h,   v4.8h
        addp            v2.8h,   v2.8h,   v6.8h
        add             v0.8h,   v0.8h,   v2.8h
        shl             v0.8h,   v0.8h,   #1
        dup             v1.8h,   v0.h[3]
        dup             v3.8h,   v0.h[7]
        trn2            v2.2d,   v0.2d,   v3.2d
        trn1            v0.2d,   v0.2d,   v1.2d
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        uaddw           v26.4s,  v26.4s,  v1.4h
        uaddw2          v27.4s,  v27.4s,  v1.8h
        uaddw           v24.4s,  v24.4s,  v2.4h
        uaddw2          v25.4s,  v25.4s,  v2.8h
        uaddw           v26.4s,  v26.4s,  v3.4h
        uaddw2          v27.4s,  v27.4s,  v3.8h
        b.gt            1b
        // v0:v1 = v2:v3 = last output row
        mov             v0.16b,  v2.16b
        mov             v1.16b,  v3.16b

L(ipred_cfl_ac_420_w16_hpad):
        cbz             w4,  3f
2:      // Vertical padding (h_pad > 0)
        // Four rows of 16 per iteration, written as two stores of two rows.
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        uaddw           v26.4s,  v26.4s,  v1.4h
        uaddw2          v27.4s,  v27.4s,  v1.8h
        uaddw           v24.4s,  v24.4s,  v2.4h
        uaddw2          v25.4s,  v25.4s,  v2.8h
        uaddw           v26.4s,  v26.4s,  v3.4h
        uaddw2          v27.4s,  v27.4s,  v3.8h
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        uaddw           v26.4s,  v26.4s,  v1.4h
        uaddw2          v27.4s,  v27.4s,  v1.8h
        uaddw           v24.4s,  v24.4s,  v2.4h
        uaddw2          v25.4s,  v25.4s,  v2.8h
        uaddw           v26.4s,  v26.4s,  v3.4h
        uaddw2          v27.4s,  v27.4s,  v3.8h
        b.gt            2b
3:

        // Quadruple the height and reuse the w4 summing/subtracting
        lsl             w6,  w6,  #2
        b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)

        // Halfword offsets, indexed by clz(cw) - 27. The fourth entry is
        // a zero filler.
L(ipred_cfl_ac_420_tbl):
        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16)
        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8)
        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4)
        .hword 0

        // Halfword offsets, indexed by w_pad (0-3).
L(ipred_cfl_ac_420_w16_tbl):
        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0)
        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1)
        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2)
        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3)
endfunc
2677
2678// void cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx,
2679//                            const ptrdiff_t stride, const int w_pad,
2680//                            const int h_pad, const int cw, const int ch);
2681function ipred_cfl_ac_422_16bpc_neon, export=1
2682        clz             w8,  w5
2683        lsl             w4,  w4,  #2
2684        adr             x7,  L(ipred_cfl_ac_422_tbl)
2685        sub             w8,  w8,  #27
2686        ldrh            w8,  [x7, w8, uxtw #1]
2687        movi            v24.4s,  #0
2688        movi            v25.4s,  #0
2689        movi            v26.4s,  #0
2690        movi            v27.4s,  #0
2691        sub             x7,  x7,  w8, uxtw
2692        sub             w8,  w6,  w4         // height - h_pad
2693        rbit            w9,  w5              // rbit(width)
2694        rbit            w10, w6              // rbit(height)
2695        clz             w9,  w9              // ctz(width)
2696        clz             w10, w10             // ctz(height)
2697        add             w9,  w9,  w10        // log2sz
2698        add             x10, x1,  x2
2699        dup             v31.4s,  w9
2700        lsl             x2,  x2,  #1
2701        neg             v31.4s,  v31.4s      // -log2sz
2702        br              x7
2703
2704L(ipred_cfl_ac_422_w4):
2705        AARCH64_VALID_JUMP_TARGET
27061:      // Copy and subsample input
2707        ld1             {v0.8h}, [x1],  x2
2708        ld1             {v1.8h}, [x10], x2
2709        ld1             {v2.8h}, [x1],  x2
2710        ld1             {v3.8h}, [x10], x2
2711        addp            v0.8h,   v0.8h,   v1.8h
2712        addp            v2.8h,   v2.8h,   v3.8h
2713        shl             v0.8h,   v0.8h,   #2
2714        shl             v1.8h,   v2.8h,   #2
2715        subs            w8,  w8,  #4
2716        st1             {v0.8h, v1.8h}, [x0], #32
2717        uaddw           v24.4s,  v24.4s,  v0.4h
2718        uaddw2          v25.4s,  v25.4s,  v0.8h
2719        uaddw           v26.4s,  v26.4s,  v1.4h
2720        uaddw2          v27.4s,  v27.4s,  v1.8h
2721        b.gt            1b
2722        trn2            v0.2d,   v1.2d,   v1.2d
2723        trn2            v1.2d,   v1.2d,   v1.2d
2724        b               L(ipred_cfl_ac_420_w4_hpad)
2725
2726L(ipred_cfl_ac_422_w8):
2727        AARCH64_VALID_JUMP_TARGET
2728        cbnz            w3,  L(ipred_cfl_ac_422_w8_wpad)
27291:      // Copy and subsample input, without padding
2730        ld1             {v0.8h, v1.8h}, [x1],  x2
2731        ld1             {v2.8h, v3.8h}, [x10], x2
2732        ld1             {v4.8h, v5.8h}, [x1],  x2
2733        addp            v0.8h,   v0.8h,   v1.8h
2734        ld1             {v6.8h, v7.8h}, [x10], x2
2735        addp            v2.8h,   v2.8h,   v3.8h
2736        addp            v4.8h,   v4.8h,   v5.8h
2737        addp            v6.8h,   v6.8h,   v7.8h
2738        shl             v0.8h,   v0.8h,   #2
2739        shl             v1.8h,   v2.8h,   #2
2740        shl             v2.8h,   v4.8h,   #2
2741        shl             v3.8h,   v6.8h,   #2
2742        subs            w8,  w8,  #4
2743        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2744        uaddw           v24.4s,  v24.4s,  v0.4h
2745        uaddw2          v25.4s,  v25.4s,  v0.8h
2746        uaddw           v26.4s,  v26.4s,  v1.4h
2747        uaddw2          v27.4s,  v27.4s,  v1.8h
2748        uaddw           v24.4s,  v24.4s,  v2.4h
2749        uaddw2          v25.4s,  v25.4s,  v2.8h
2750        uaddw           v26.4s,  v26.4s,  v3.4h
2751        uaddw2          v27.4s,  v27.4s,  v3.8h
2752        b.gt            1b
2753        mov             v0.16b,  v3.16b
2754        mov             v1.16b,  v3.16b
2755        b               L(ipred_cfl_ac_420_w8_hpad)
2756
2757L(ipred_cfl_ac_422_w8_wpad):
27581:      // Copy and subsample input, padding 4
2759        ld1             {v0.8h}, [x1],  x2
2760        ld1             {v1.8h}, [x10], x2
2761        ld1             {v2.8h}, [x1],  x2
2762        ld1             {v3.8h}, [x10], x2
2763        addp            v0.8h,   v0.8h,   v1.8h
2764        addp            v2.8h,   v2.8h,   v3.8h
2765        shl             v0.8h,   v0.8h,   #2
2766        shl             v2.8h,   v2.8h,   #2
2767        dup             v4.4h,   v0.h[3]
2768        dup             v5.8h,   v0.h[7]
2769        dup             v6.4h,   v2.h[3]
2770        dup             v7.8h,   v2.h[7]
2771        trn2            v1.2d,   v0.2d,   v5.2d
2772        trn1            v0.2d,   v0.2d,   v4.2d
2773        trn2            v3.2d,   v2.2d,   v7.2d
2774        trn1            v2.2d,   v2.2d,   v6.2d
2775        subs            w8,  w8,  #4
2776        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2777        uaddw           v24.4s,  v24.4s,  v0.4h
2778        uaddw2          v25.4s,  v25.4s,  v0.8h
2779        uaddw           v26.4s,  v26.4s,  v1.4h
2780        uaddw2          v27.4s,  v27.4s,  v1.8h
2781        uaddw           v24.4s,  v24.4s,  v2.4h
2782        uaddw2          v25.4s,  v25.4s,  v2.8h
2783        uaddw           v26.4s,  v26.4s,  v3.4h
2784        uaddw2          v27.4s,  v27.4s,  v3.8h
2785        b.gt            1b
2786        mov             v0.16b,  v3.16b
2787        mov             v1.16b,  v3.16b
2788        b               L(ipred_cfl_ac_420_w8_hpad)
2789
2790L(ipred_cfl_ac_422_w16):
2791        AARCH64_VALID_JUMP_TARGET
2792        adr             x7,  L(ipred_cfl_ac_422_w16_tbl)
2793        ldrh            w3,  [x7, w3, uxtw #1]
2794        sub             x7,  x7,  w3, uxtw
2795        br              x7
2796
2797L(ipred_cfl_ac_422_w16_wpad0):
2798        AARCH64_VALID_JUMP_TARGET
27991:      // Copy and subsample input, without padding
2800        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x1],  x2
2801        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2
2802        addp            v0.8h,   v0.8h,   v1.8h
2803        addp            v2.8h,   v2.8h,   v3.8h
2804        addp            v4.8h,   v4.8h,   v5.8h
2805        addp            v6.8h,   v6.8h,   v7.8h
2806        shl             v0.8h,   v0.8h,   #2
2807        shl             v1.8h,   v2.8h,   #2
2808        shl             v2.8h,   v4.8h,   #2
2809        shl             v3.8h,   v6.8h,   #2
2810        subs            w8,  w8,  #2
2811        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2812        uaddw           v24.4s,  v24.4s,  v0.4h
2813        uaddw2          v25.4s,  v25.4s,  v0.8h
2814        uaddw           v26.4s,  v26.4s,  v1.4h
2815        uaddw2          v27.4s,  v27.4s,  v1.8h
2816        uaddw           v24.4s,  v24.4s,  v2.4h
2817        uaddw2          v25.4s,  v25.4s,  v2.8h
2818        uaddw           v26.4s,  v26.4s,  v3.4h
2819        uaddw2          v27.4s,  v27.4s,  v3.8h
2820        b.gt            1b
2821        mov             v0.16b,  v2.16b
2822        mov             v1.16b,  v3.16b
2823        b               L(ipred_cfl_ac_420_w16_hpad)
2824
2825L(ipred_cfl_ac_422_w16_wpad1):
2826        AARCH64_VALID_JUMP_TARGET
28271:      // Copy and subsample input, padding 4
2828        ldr             q2,  [x1,  #32]
2829        ld1             {v0.8h, v1.8h}, [x1],  x2
2830        ldr             q6,  [x10, #32]
2831        ld1             {v4.8h, v5.8h}, [x10], x2
2832        addp            v2.8h,   v2.8h,   v2.8h
2833        addp            v0.8h,   v0.8h,   v1.8h
2834        addp            v6.8h,   v6.8h,   v6.8h
2835        addp            v4.8h,   v4.8h,   v5.8h
2836        shl             v1.4h,   v2.4h,   #2
2837        shl             v0.8h,   v0.8h,   #2
2838        shl             v3.4h,   v6.4h,   #2
2839        shl             v2.8h,   v4.8h,   #2
2840        dup             v4.4h,   v1.h[3]
2841        dup             v5.4h,   v3.h[3]
2842        trn1            v1.2d,   v1.2d,   v4.2d
2843        trn1            v3.2d,   v3.2d,   v5.2d
2844        subs            w8,  w8,  #2
2845        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2846        uaddw           v24.4s,  v24.4s,  v0.4h
2847        uaddw2          v25.4s,  v25.4s,  v0.8h
2848        uaddw           v26.4s,  v26.4s,  v1.4h
2849        uaddw2          v27.4s,  v27.4s,  v1.8h
2850        uaddw           v24.4s,  v24.4s,  v2.4h
2851        uaddw2          v25.4s,  v25.4s,  v2.8h
2852        uaddw           v26.4s,  v26.4s,  v3.4h
2853        uaddw2          v27.4s,  v27.4s,  v3.8h
2854        b.gt            1b
2855        mov             v0.16b,  v2.16b
2856        mov             v1.16b,  v3.16b
2857        b               L(ipred_cfl_ac_420_w16_hpad)
2858
2859L(ipred_cfl_ac_422_w16_wpad2):
2860        AARCH64_VALID_JUMP_TARGET
28611:      // Copy and subsample input, padding 8
2862        ld1             {v0.8h, v1.8h}, [x1],  x2
2863        ld1             {v2.8h, v3.8h}, [x10], x2
2864        addp            v0.8h,   v0.8h,   v1.8h
2865        addp            v2.8h,   v2.8h,   v3.8h
2866        shl             v0.8h,   v0.8h,   #2
2867        shl             v2.8h,   v2.8h,   #2
2868        dup             v1.8h,   v0.h[7]
2869        dup             v3.8h,   v2.h[7]
2870        subs            w8,  w8,  #2
2871        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2872        uaddw           v24.4s,  v24.4s,  v0.4h
2873        uaddw2          v25.4s,  v25.4s,  v0.8h
2874        uaddw           v26.4s,  v26.4s,  v1.4h
2875        uaddw2          v27.4s,  v27.4s,  v1.8h
2876        uaddw           v24.4s,  v24.4s,  v2.4h
2877        uaddw2          v25.4s,  v25.4s,  v2.8h
2878        uaddw           v26.4s,  v26.4s,  v3.4h
2879        uaddw2          v27.4s,  v27.4s,  v3.8h
2880        b.gt            1b
2881        mov             v0.16b,  v2.16b
2882        mov             v1.16b,  v3.16b
2883        b               L(ipred_cfl_ac_420_w16_hpad)
2884
2885L(ipred_cfl_ac_422_w16_wpad3):
2886        AARCH64_VALID_JUMP_TARGET
28871:      // Copy and subsample input, padding 12
2888        ld1             {v0.8h}, [x1],  x2
2889        ld1             {v2.8h}, [x10], x2
2890        addp            v0.8h,   v0.8h,   v0.8h
2891        addp            v2.8h,   v2.8h,   v2.8h
2892        shl             v0.4h,   v0.4h,   #2
2893        shl             v2.4h,   v2.4h,   #2
2894        dup             v1.8h,   v0.h[3]
2895        dup             v3.8h,   v2.h[3]
2896        trn1            v0.2d,   v0.2d,   v1.2d
2897        trn1            v2.2d,   v2.2d,   v3.2d
2898        subs            w8,  w8,  #2
2899        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2900        uaddw           v24.4s,  v24.4s,  v0.4h
2901        uaddw2          v25.4s,  v25.4s,  v0.8h
2902        uaddw           v26.4s,  v26.4s,  v1.4h
2903        uaddw2          v27.4s,  v27.4s,  v1.8h
2904        uaddw           v24.4s,  v24.4s,  v2.4h
2905        uaddw2          v25.4s,  v25.4s,  v2.8h
2906        uaddw           v26.4s,  v26.4s,  v3.4h
2907        uaddw2          v27.4s,  v27.4s,  v3.8h
2908        b.gt            1b
2909        mov             v0.16b,  v2.16b
2910        mov             v1.16b,  v3.16b
2911        b               L(ipred_cfl_ac_420_w16_hpad)
2912
2913L(ipred_cfl_ac_422_tbl):
2914        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16)
2915        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8)
2916        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4)
2917        .hword 0
2918
2919L(ipred_cfl_ac_422_w16_tbl):
2920        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0)
2921        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1)
2922        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2)
2923        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3)
2924endfunc
2925
2926// void ipred_cfl_ac_444_16bpc_neon(int16_t *const ac, const pixel *const ypx,
2927//                                  const ptrdiff_t stride, const int w_pad,
2928//                                  const int h_pad, const int cw, const int ch);
//
// Registers on entry (argument order of the prototype above):
//   x0 = ac      output pointer, post-incremented by every store
//   x1 = ypx     input pointer
//   x2 = stride  added to the input pointers unscaled, so presumably a byte
//                stride - TODO confirm against the caller
//   w3 = w_pad   selects the horizontal padding variant (w16 and w32 only)
//   w4 = h_pad   multiplied by 4 below and then used as a count of padded rows
//   w5 = cw      width, selects the w4/w8/w16/w32 path
//   w6 = ch      height
//
// Every loaded 16-bit input value is shifted left by 3 and stored to ac,
// while v24-v27 accumulate the stored values as 32-bit sums. Horizontal
// padding replicates the last loaded value of a row; vertical padding
// replicates the last row. The w4/w8/w16 paths hand over to the vertical
// padding code at L(ipred_cfl_ac_420_*_hpad) and the w32 path jumps to
// L(ipred_cfl_ac_420_w4_calc_subtract_dc); both are defined outside this
// function.
// NOTE(review): v31 (-log2sz) is not consumed in this function; presumably
// the shared tail uses it to turn the sum into an average - confirm there.
2929function ipred_cfl_ac_444_16bpc_neon, export=1
2930        clz             w8,  w5              // clz(width)
2931        lsl             w4,  w4,  #2         // h_pad *= 4, a row count from here on
2932        adr             x7,  L(ipred_cfl_ac_444_tbl)
2933        sub             w8,  w8,  #26        // table index: width 32 -> 0, 16 -> 1, 8 -> 2, 4 -> 3
2934        ldrh            w8,  [x7, w8, uxtw #1]
        // Clear the four 32-bit sum accumulators
2935        movi            v24.4s,  #0
2936        movi            v25.4s,  #0
2937        movi            v26.4s,  #0
2938        movi            v27.4s,  #0
2939        sub             x7,  x7,  w8, uxtw   // entries hold (table - target), hence the subtraction
2940        sub             w8,  w6,  w4         // height - h_pad
2941        rbit            w9,  w5              // rbit(width)
2942        rbit            w10, w6              // rbit(height)
2943        clz             w9,  w9              // ctz(width)
2944        clz             w10, w10             // ctz(height)
2945        add             w9,  w9,  w10        // log2sz
2946        add             x10, x1,  x2         // x10 = second input row
2947        dup             v31.4s,  w9
2948        lsl             x2,  x2,  #1         // x1 and x10 each advance two rows per load
2949        neg             v31.4s,  v31.4s      // -log2sz
2950        br              x7
2951
2952L(ipred_cfl_ac_444_w4):
2953        AARCH64_VALID_JUMP_TARGET
        // Four rows per iteration, two rows packed into each register:
        // the row from x1 fills the low half, the row from x10 the high half.
29541:      // Copy and expand input
2955        ld1             {v0.4h},   [x1],  x2
2956        ld1             {v0.d}[1], [x10], x2
2957        ld1             {v1.4h},   [x1],  x2
2958        ld1             {v1.d}[1], [x10], x2
2959        shl             v0.8h,   v0.8h,   #3
2960        shl             v1.8h,   v1.8h,   #3
2961        subs            w8,  w8,  #4         // four rows consumed
2962        st1             {v0.8h, v1.8h}, [x0], #32
2963        uaddw           v24.4s,  v24.4s,  v0.4h
2964        uaddw2          v25.4s,  v25.4s,  v0.8h
2965        uaddw           v26.4s,  v26.4s,  v1.4h
2966        uaddw2          v27.4s,  v27.4s,  v1.8h
2967        b.gt            1b
        // Replicate the last row (upper half of v1) into both halves of
        // v0 and v1 before handing over to the vertical padding code.
2968        trn2            v0.2d,   v1.2d,   v1.2d
2969        trn2            v1.2d,   v1.2d,   v1.2d
2970        b               L(ipred_cfl_ac_420_w4_hpad)
2971
2972L(ipred_cfl_ac_444_w8):
2973        AARCH64_VALID_JUMP_TARGET
        // Four rows per iteration, one row per register (v0-v3).
29741:      // Copy and expand input
2975        ld1             {v0.8h}, [x1],  x2
2976        ld1             {v1.8h}, [x10], x2
2977        ld1             {v2.8h}, [x1],  x2
2978        shl             v0.8h,   v0.8h,   #3
2979        ld1             {v3.8h}, [x10], x2
2980        shl             v1.8h,   v1.8h,   #3
2981        shl             v2.8h,   v2.8h,   #3
2982        shl             v3.8h,   v3.8h,   #3
2983        subs            w8,  w8,  #4         // four rows consumed
2984        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2985        uaddw           v24.4s,  v24.4s,  v0.4h
2986        uaddw2          v25.4s,  v25.4s,  v0.8h
2987        uaddw           v26.4s,  v26.4s,  v1.4h
2988        uaddw2          v27.4s,  v27.4s,  v1.8h
2989        uaddw           v24.4s,  v24.4s,  v2.4h
2990        uaddw2          v25.4s,  v25.4s,  v2.8h
2991        uaddw           v26.4s,  v26.4s,  v3.4h
2992        uaddw2          v27.4s,  v27.4s,  v3.8h
2993        b.gt            1b
        // Leave the last row (v3) in both v0 and v1 for the padding code.
2994        mov             v0.16b,  v3.16b
2995        mov             v1.16b,  v3.16b
2996        b               L(ipred_cfl_ac_420_w8_hpad)
2997
2998L(ipred_cfl_ac_444_w16):
2999        AARCH64_VALID_JUMP_TARGET
3000        cbnz            w3,  L(ipred_cfl_ac_444_w16_wpad) // any non-zero w_pad takes the padded loop
        // Two rows per iteration: v0/v1 hold the row from x1, v2/v3 the row from x10.
30011:      // Copy and expand input, without padding
3002        ld1             {v0.8h, v1.8h}, [x1],  x2
3003        ld1             {v2.8h, v3.8h}, [x10], x2
3004        shl             v0.8h,   v0.8h,   #3
3005        shl             v1.8h,   v1.8h,   #3
3006        shl             v2.8h,   v2.8h,   #3
3007        shl             v3.8h,   v3.8h,   #3
3008        subs            w8,  w8,  #2         // two rows consumed
3009        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
3010        uaddw           v24.4s,  v24.4s,  v0.4h
3011        uaddw2          v25.4s,  v25.4s,  v0.8h
3012        uaddw           v26.4s,  v26.4s,  v1.4h
3013        uaddw2          v27.4s,  v27.4s,  v1.8h
3014        uaddw           v24.4s,  v24.4s,  v2.4h
3015        uaddw2          v25.4s,  v25.4s,  v2.8h
3016        uaddw           v26.4s,  v26.4s,  v3.4h
3017        uaddw2          v27.4s,  v27.4s,  v3.8h
3018        b.gt            1b
        // Leave the last row (v2/v3) in v0/v1 for the padding code.
3019        mov             v0.16b,  v2.16b
3020        mov             v1.16b,  v3.16b
3021        b               L(ipred_cfl_ac_420_w16_hpad)
3022
3023L(ipred_cfl_ac_444_w16_wpad):
        // Only the left 8 values of each row are loaded; the right 8 are
        // filled with a copy of the last loaded (already shifted) value.
30241:      // Copy and expand input, padding 8
3025        ld1             {v0.8h}, [x1],  x2
3026        ld1             {v2.8h}, [x10], x2
3027        shl             v0.8h,   v0.8h,   #3
3028        shl             v2.8h,   v2.8h,   #3
3029        dup             v1.8h,   v0.h[7]
3030        dup             v3.8h,   v2.h[7]
3031        subs            w8,  w8,  #2         // two rows consumed
3032        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
3033        uaddw           v24.4s,  v24.4s,  v0.4h
3034        uaddw2          v25.4s,  v25.4s,  v0.8h
3035        uaddw           v26.4s,  v26.4s,  v1.4h
3036        uaddw2          v27.4s,  v27.4s,  v1.8h
3037        uaddw           v24.4s,  v24.4s,  v2.4h
3038        uaddw2          v25.4s,  v25.4s,  v2.8h
3039        uaddw           v26.4s,  v26.4s,  v3.4h
3040        uaddw2          v27.4s,  v27.4s,  v3.8h
3041        b.gt            1b
        // Leave the last row (v2/v3) in v0/v1 for the padding code.
3042        mov             v0.16b,  v2.16b
3043        mov             v1.16b,  v3.16b
3044        b               L(ipred_cfl_ac_420_w16_hpad)
3045
3046L(ipred_cfl_ac_444_w32):
3047        AARCH64_VALID_JUMP_TARGET
        // Second-level dispatch on w_pad. w3 is used unscaled as the byte
        // offset into a table of 2-byte entries, so it indexes entry w3/2;
        // the entries are labelled for w_pad = 0, 2, 4 and 6.
3048        adr             x7,  L(ipred_cfl_ac_444_w32_tbl)
3049        ldrh            w3,  [x7, w3, uxtw] // (w3>>1) << 1
3050        lsr             x2,  x2,  #1 // Restore the stride to one line increments
3051        sub             x7,  x7,  w3, uxtw
3052        br              x7
3053
        // All w32 loops below read one row per iteration through x1 only
        // and leave that row in v0-v3 when they exit.
3054L(ipred_cfl_ac_444_w32_wpad0):
3055        AARCH64_VALID_JUMP_TARGET
30561:      // Copy and expand input, without padding
3057        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x1],  x2
3058        shl             v0.8h,   v0.8h,   #3
3059        shl             v1.8h,   v1.8h,   #3
3060        shl             v2.8h,   v2.8h,   #3
3061        shl             v3.8h,   v3.8h,   #3
3062        subs            w8,  w8,  #1         // one row consumed
3063        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
3064        uaddw           v24.4s,  v24.4s,  v0.4h
3065        uaddw2          v25.4s,  v25.4s,  v0.8h
3066        uaddw           v26.4s,  v26.4s,  v1.4h
3067        uaddw2          v27.4s,  v27.4s,  v1.8h
3068        uaddw           v24.4s,  v24.4s,  v2.4h
3069        uaddw2          v25.4s,  v25.4s,  v2.8h
3070        uaddw           v26.4s,  v26.4s,  v3.4h
3071        uaddw2          v27.4s,  v27.4s,  v3.8h
3072        b.gt            1b
3073        b               L(ipred_cfl_ac_444_w32_hpad)
3074
3075L(ipred_cfl_ac_444_w32_wpad2):
3076        AARCH64_VALID_JUMP_TARGET
30771:      // Copy and expand input, padding 8
3078        ld1             {v0.8h, v1.8h, v2.8h}, [x1],  x2
3079        shl             v2.8h,   v2.8h,   #3
3080        shl             v0.8h,   v0.8h,   #3
3081        shl             v1.8h,   v1.8h,   #3
3082        dup             v3.8h,   v2.h[7]     // last loaded value fills the final 8 lanes
3083        subs            w8,  w8,  #1         // one row consumed
3084        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
3085        uaddw           v24.4s,  v24.4s,  v0.4h
3086        uaddw2          v25.4s,  v25.4s,  v0.8h
3087        uaddw           v26.4s,  v26.4s,  v1.4h
3088        uaddw2          v27.4s,  v27.4s,  v1.8h
3089        uaddw           v24.4s,  v24.4s,  v2.4h
3090        uaddw2          v25.4s,  v25.4s,  v2.8h
3091        uaddw           v26.4s,  v26.4s,  v3.4h
3092        uaddw2          v27.4s,  v27.4s,  v3.8h
3093        b.gt            1b
3094        b               L(ipred_cfl_ac_444_w32_hpad)
3095
3096L(ipred_cfl_ac_444_w32_wpad4):
3097        AARCH64_VALID_JUMP_TARGET
30981:      // Copy and expand input, padding 16
3099        ld1             {v0.8h, v1.8h}, [x1],  x2
3100        shl             v1.8h,   v1.8h,   #3
3101        shl             v0.8h,   v0.8h,   #3
3102        dup             v2.8h,   v1.h[7]     // last loaded value fills the final 16 lanes
3103        dup             v3.8h,   v1.h[7]
3104        subs            w8,  w8,  #1         // one row consumed
3105        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
3106        uaddw           v24.4s,  v24.4s,  v0.4h
3107        uaddw2          v25.4s,  v25.4s,  v0.8h
3108        uaddw           v26.4s,  v26.4s,  v1.4h
3109        uaddw2          v27.4s,  v27.4s,  v1.8h
3110        uaddw           v24.4s,  v24.4s,  v2.4h
3111        uaddw2          v25.4s,  v25.4s,  v2.8h
3112        uaddw           v26.4s,  v26.4s,  v3.4h
3113        uaddw2          v27.4s,  v27.4s,  v3.8h
3114        b.gt            1b
3115        b               L(ipred_cfl_ac_444_w32_hpad)
3116
3117L(ipred_cfl_ac_444_w32_wpad6):
3118        AARCH64_VALID_JUMP_TARGET
31191:      // Copy and expand input, padding 24
3120        ld1             {v0.8h}, [x1],  x2
3121        shl             v0.8h,   v0.8h,   #3
3122        dup             v1.8h,   v0.h[7]     // last loaded value fills the final 24 lanes
3123        dup             v2.8h,   v0.h[7]
3124        dup             v3.8h,   v0.h[7]
3125        subs            w8,  w8,  #1         // one row consumed
3126        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
3127        uaddw           v24.4s,  v24.4s,  v0.4h
3128        uaddw2          v25.4s,  v25.4s,  v0.8h
3129        uaddw           v26.4s,  v26.4s,  v1.4h
3130        uaddw2          v27.4s,  v27.4s,  v1.8h
3131        uaddw           v24.4s,  v24.4s,  v2.4h
3132        uaddw2          v25.4s,  v25.4s,  v2.8h
3133        uaddw           v26.4s,  v26.4s,  v3.4h
3134        uaddw2          v27.4s,  v27.4s,  v3.8h
3135        b.gt            1b
        // Falls through into the vertical padding code below.
3136
3137L(ipred_cfl_ac_444_w32_hpad):
3138        cbz             w4,  3f              // no padded rows requested
        // v0-v3 still hold the last row written above. Each iteration stores
        // that row twice, adds it to the sums twice and consumes two rows of w4.
31392:      // Vertical padding (h_pad > 0)
3140        subs            w4,  w4,  #2
3141        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
3142        uaddw           v24.4s,  v24.4s,  v0.4h
3143        uaddw2          v25.4s,  v25.4s,  v0.8h
3144        uaddw           v26.4s,  v26.4s,  v1.4h
3145        uaddw2          v27.4s,  v27.4s,  v1.8h
3146        uaddw           v24.4s,  v24.4s,  v2.4h
3147        uaddw2          v25.4s,  v25.4s,  v2.8h
3148        uaddw           v26.4s,  v26.4s,  v3.4h
3149        uaddw2          v27.4s,  v27.4s,  v3.8h
3150        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
3151        uaddw           v24.4s,  v24.4s,  v0.4h
3152        uaddw2          v25.4s,  v25.4s,  v0.8h
3153        uaddw           v26.4s,  v26.4s,  v1.4h
3154        uaddw2          v27.4s,  v27.4s,  v1.8h
3155        uaddw           v24.4s,  v24.4s,  v2.4h
3156        uaddw2          v25.4s,  v25.4s,  v2.8h
3157        uaddw           v26.4s,  v26.4s,  v3.4h
3158        uaddw2          v27.4s,  v27.4s,  v3.8h
3159        b.gt            2b
31603:
3161
3162        // Multiply the height by eight and reuse the w4 subtracting
        // code. A 32-wide row holds eight times as many values as a 4-wide
        // row, so presumably the scaled height makes the w4 code (defined
        // outside this function) cover the whole buffer - confirm there.
3163        lsl             w6,  w6,  #3
3164        b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)
3165
// Width dispatch table, indexed by clz(width) - 26. Each 16-bit entry is the
// distance from the table back to its target label.
3166L(ipred_cfl_ac_444_tbl):
3167        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32)
3168        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16)
3169        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8)
3170        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4)
3171
// w32 padding dispatch table, addressed with w_pad as the byte offset
// (entry w_pad / 2). Same (table - target) encoding as above.
3172L(ipred_cfl_ac_444_w32_tbl):
3173        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0)
3174        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2)
3175        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4)
3176        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6)
3177endfunc
3178