1/*
2 * Copyright © 2018, VideoLAN and dav1d authors
3 * Copyright © 2019, Martin Storsjo
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright notice, this
10 *    list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright notice,
13 *    this list of conditions and the following disclaimer in the documentation
14 *    and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include "src/arm/asm.S"
29#include "util.S"
30
31// void ipred_dc_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
32//                              const pixel *const topleft,
33//                              const int width, const int height, const int a,
34//                              const int max_width, const int max_height,
35//                              const int bitdepth_max);
36function ipred_dc_128_16bpc_neon, export=1
37        ldr             w8,  [sp]
38        clz             w3,  w3
39        adr             x5,  L(ipred_dc_128_tbl)
40        sub             w3,  w3,  #25
41        ldrh            w3,  [x5, w3, uxtw #1]
42        dup             v0.8h,   w8
43        sub             x5,  x5,  w3, uxtw
44        add             x6,  x0,  x1
45        lsl             x1,  x1,  #1
46        urshr           v0.8h,   v0.8h,  #1
47        br              x5
484:
49        st1             {v0.4h},  [x0], x1
50        st1             {v0.4h},  [x6], x1
51        subs            w4,  w4,  #4
52        st1             {v0.4h},  [x0], x1
53        st1             {v0.4h},  [x6], x1
54        b.gt            4b
55        ret
568:
57        st1             {v0.8h},  [x0], x1
58        st1             {v0.8h},  [x6], x1
59        subs            w4,  w4,  #4
60        st1             {v0.8h},  [x0], x1
61        st1             {v0.8h},  [x6], x1
62        b.gt            8b
63        ret
64160:
65        mov             v1.16b,  v0.16b
6616:
67        st1             {v0.8h, v1.8h}, [x0], x1
68        st1             {v0.8h, v1.8h}, [x6], x1
69        subs            w4,  w4,  #4
70        st1             {v0.8h, v1.8h}, [x0], x1
71        st1             {v0.8h, v1.8h}, [x6], x1
72        b.gt            16b
73        ret
74320:
75        mov             v1.16b,  v0.16b
76        mov             v2.16b,  v0.16b
77        mov             v3.16b,  v0.16b
7832:
79        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
80        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
81        subs            w4,  w4,  #4
82        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
83        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
84        b.gt            32b
85        ret
86640:
87        mov             v1.16b,  v0.16b
88        mov             v2.16b,  v0.16b
89        mov             v3.16b,  v0.16b
90        sub             x1,  x1,  #64
9164:
92        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
93        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
94        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
95        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
96        subs            w4,  w4,  #4
97        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
98        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
99        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
100        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
101        b.gt            64b
102        ret
103
104L(ipred_dc_128_tbl):
105        .hword L(ipred_dc_128_tbl) - 640b
106        .hword L(ipred_dc_128_tbl) - 320b
107        .hword L(ipred_dc_128_tbl) - 160b
108        .hword L(ipred_dc_128_tbl) -   8b
109        .hword L(ipred_dc_128_tbl) -   4b
110endfunc
111
112// void ipred_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
113//                         const pixel *const topleft,
114//                         const int width, const int height, const int a,
115//                         const int max_width, const int max_height);
116function ipred_v_16bpc_neon, export=1
117        clz             w3,  w3
118        adr             x5,  L(ipred_v_tbl)
119        sub             w3,  w3,  #25
120        ldrh            w3,  [x5, w3, uxtw #1]
121        add             x2,  x2,  #2
122        sub             x5,  x5,  w3, uxtw
123        add             x6,  x0,  x1
124        lsl             x1,  x1,  #1
125        br              x5
12640:
127        ld1             {v0.4h},  [x2]
1284:
129        st1             {v0.4h},  [x0], x1
130        st1             {v0.4h},  [x6], x1
131        subs            w4,  w4,  #4
132        st1             {v0.4h},  [x0], x1
133        st1             {v0.4h},  [x6], x1
134        b.gt            4b
135        ret
13680:
137        ld1             {v0.8h},  [x2]
1388:
139        st1             {v0.8h},  [x0], x1
140        st1             {v0.8h},  [x6], x1
141        subs            w4,  w4,  #4
142        st1             {v0.8h},  [x0], x1
143        st1             {v0.8h},  [x6], x1
144        b.gt            8b
145        ret
146160:
147        ld1             {v0.8h, v1.8h}, [x2]
14816:
149        st1             {v0.8h, v1.8h}, [x0], x1
150        st1             {v0.8h, v1.8h}, [x6], x1
151        subs            w4,  w4,  #4
152        st1             {v0.8h, v1.8h}, [x0], x1
153        st1             {v0.8h, v1.8h}, [x6], x1
154        b.gt            16b
155        ret
156320:
157        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
15832:
159        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
160        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
161        subs            w4,  w4,  #4
162        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
163        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
164        b.gt            32b
165        ret
166640:
167        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
168        sub             x1,  x1,  #64
169        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
17064:
171        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
172        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
173        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
174        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
175        subs            w4,  w4,  #4
176        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
177        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
178        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
179        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
180        b.gt            64b
181        ret
182
183L(ipred_v_tbl):
184        .hword L(ipred_v_tbl) - 640b
185        .hword L(ipred_v_tbl) - 320b
186        .hword L(ipred_v_tbl) - 160b
187        .hword L(ipred_v_tbl) -  80b
188        .hword L(ipred_v_tbl) -  40b
189endfunc
190
191// void ipred_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
192//                         const pixel *const topleft,
193//                         const int width, const int height, const int a,
194//                         const int max_width, const int max_height);
195function ipred_h_16bpc_neon, export=1
196        clz             w3,  w3
197        adr             x5,  L(ipred_h_tbl)
198        sub             w3,  w3,  #25
199        ldrh            w3,  [x5, w3, uxtw #1]
200        sub             x2,  x2,  #8
201        sub             x5,  x5,  w3, uxtw
202        mov             x7,  #-8
203        add             x6,  x0,  x1
204        lsl             x1,  x1,  #1
205        br              x5
2064:
207        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
208        st1             {v3.4h},  [x0], x1
209        st1             {v2.4h},  [x6], x1
210        subs            w4,  w4,  #4
211        st1             {v1.4h},  [x0], x1
212        st1             {v0.4h},  [x6], x1
213        b.gt            4b
214        ret
2158:
216        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
217        st1             {v3.8h},  [x0], x1
218        st1             {v2.8h},  [x6], x1
219        subs            w4,  w4,  #4
220        st1             {v1.8h},  [x0], x1
221        st1             {v0.8h},  [x6], x1
222        b.gt            8b
223        ret
22416:
225        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
226        str             q3,  [x0, #16]
227        str             q2,  [x6, #16]
228        st1             {v3.8h}, [x0], x1
229        st1             {v2.8h}, [x6], x1
230        subs            w4,  w4,  #4
231        str             q1,  [x0, #16]
232        str             q0,  [x6, #16]
233        st1             {v1.8h}, [x0], x1
234        st1             {v0.8h}, [x6], x1
235        b.gt            16b
236        ret
23732:
238        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
239        str             q3,  [x0, #16]
240        str             q2,  [x6, #16]
241        stp             q3,  q3,  [x0, #32]
242        stp             q2,  q2,  [x6, #32]
243        st1             {v3.8h}, [x0], x1
244        st1             {v2.8h}, [x6], x1
245        subs            w4,  w4,  #4
246        str             q1,  [x0, #16]
247        str             q0,  [x6, #16]
248        stp             q1,  q1,  [x0, #32]
249        stp             q0,  q0,  [x6, #32]
250        st1             {v1.8h}, [x0], x1
251        st1             {v0.8h}, [x6], x1
252        b.gt            32b
253        ret
25464:
255        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
256        str             q3,  [x0, #16]
257        str             q2,  [x6, #16]
258        stp             q3,  q3,  [x0, #32]
259        stp             q2,  q2,  [x6, #32]
260        stp             q3,  q3,  [x0, #64]
261        stp             q2,  q2,  [x6, #64]
262        stp             q3,  q3,  [x0, #96]
263        stp             q2,  q2,  [x6, #96]
264        st1             {v3.8h}, [x0], x1
265        st1             {v2.8h}, [x6], x1
266        subs            w4,  w4,  #4
267        str             q1,  [x0, #16]
268        str             q0,  [x6, #16]
269        stp             q1,  q1,  [x0, #32]
270        stp             q0,  q0,  [x6, #32]
271        stp             q1,  q1,  [x0, #64]
272        stp             q0,  q0,  [x6, #64]
273        stp             q1,  q1,  [x0, #96]
274        stp             q0,  q0,  [x6, #96]
275        st1             {v1.8h}, [x0], x1
276        st1             {v0.8h}, [x6], x1
277        b.gt            64b
278        ret
279
280L(ipred_h_tbl):
281        .hword L(ipred_h_tbl) - 64b
282        .hword L(ipred_h_tbl) - 32b
283        .hword L(ipred_h_tbl) - 16b
284        .hword L(ipred_h_tbl) -  8b
285        .hword L(ipred_h_tbl) -  4b
286endfunc
287
288// void ipred_dc_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
289//                              const pixel *const topleft,
290//                              const int width, const int height, const int a,
291//                              const int max_width, const int max_height);
292function ipred_dc_top_16bpc_neon, export=1
293        clz             w3,  w3
294        adr             x5,  L(ipred_dc_top_tbl)
295        sub             w3,  w3,  #25
296        ldrh            w3,  [x5, w3, uxtw #1]
297        add             x2,  x2,  #2
298        sub             x5,  x5,  w3, uxtw
299        add             x6,  x0,  x1
300        lsl             x1,  x1,  #1
301        br              x5
30240:
303        ld1             {v0.4h},  [x2]
304        addv            h0,      v0.4h
305        urshr           v0.4h,   v0.4h,   #2
306        dup             v0.4h,   v0.h[0]
3074:
308        st1             {v0.4h},  [x0], x1
309        st1             {v0.4h},  [x6], x1
310        subs            w4,  w4,  #4
311        st1             {v0.4h},  [x0], x1
312        st1             {v0.4h},  [x6], x1
313        b.gt            4b
314        ret
31580:
316        ld1             {v0.8h},  [x2]
317        addv            h0,      v0.8h
318        urshr           v0.4h,   v0.4h,   #3
319        dup             v0.8h,   v0.h[0]
3208:
321        st1             {v0.8h},  [x0], x1
322        st1             {v0.8h},  [x6], x1
323        subs            w4,  w4,  #4
324        st1             {v0.8h},  [x0], x1
325        st1             {v0.8h},  [x6], x1
326        b.gt            8b
327        ret
328160:
329        ld1             {v0.8h, v1.8h}, [x2]
330        addp            v0.8h,   v0.8h,   v1.8h
331        addv            h0,      v0.8h
332        urshr           v2.4h,   v0.4h,   #4
333        dup             v0.8h,   v2.h[0]
334        dup             v1.8h,   v2.h[0]
33516:
336        st1             {v0.8h, v1.8h}, [x0], x1
337        st1             {v0.8h, v1.8h}, [x6], x1
338        subs            w4,  w4,  #4
339        st1             {v0.8h, v1.8h}, [x0], x1
340        st1             {v0.8h, v1.8h}, [x6], x1
341        b.gt            16b
342        ret
343320:
344        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
345        addp            v0.8h,   v0.8h,   v1.8h
346        addp            v2.8h,   v2.8h,   v3.8h
347        addp            v0.8h,   v0.8h,   v2.8h
348        uaddlv          s0,      v0.8h
349        rshrn           v4.4h,   v0.4s,   #5
350        dup             v0.8h,   v4.h[0]
351        dup             v1.8h,   v4.h[0]
352        dup             v2.8h,   v4.h[0]
353        dup             v3.8h,   v4.h[0]
35432:
355        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
356        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
357        subs            w4,  w4,  #4
358        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
359        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
360        b.gt            32b
361        ret
362640:
363        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
364        addp            v0.8h,   v0.8h,   v1.8h
365        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
366        addp            v2.8h,   v2.8h,   v3.8h
367        addp            v4.8h,   v4.8h,   v5.8h
368        addp            v6.8h,   v6.8h,   v7.8h
369        addp            v0.8h,   v0.8h,   v2.8h
370        addp            v4.8h,   v4.8h,   v6.8h
371        addp            v0.8h,   v0.8h,   v4.8h
372        uaddlv          s0,      v0.8h
373        rshrn           v4.4h,   v0.4s,   #6
374        sub             x1,  x1,  #64
375        dup             v0.8h,   v4.h[0]
376        dup             v1.8h,   v4.h[0]
377        dup             v2.8h,   v4.h[0]
378        dup             v3.8h,   v4.h[0]
37964:
380        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
381        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
382        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
383        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
384        subs            w4,  w4,  #4
385        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
386        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
387        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
388        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
389        b.gt            64b
390        ret
391
392L(ipred_dc_top_tbl):
393        .hword L(ipred_dc_top_tbl) - 640b
394        .hword L(ipred_dc_top_tbl) - 320b
395        .hword L(ipred_dc_top_tbl) - 160b
396        .hword L(ipred_dc_top_tbl) -  80b
397        .hword L(ipred_dc_top_tbl) -  40b
398endfunc
399
400// void ipred_dc_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
401//                               const pixel *const topleft,
402//                               const int width, const int height, const int a,
403//                               const int max_width, const int max_height);
404function ipred_dc_left_16bpc_neon, export=1
405        sub             x2,  x2,  w4, uxtw #1
406        clz             w3,  w3
407        clz             w7,  w4
408        adr             x5,  L(ipred_dc_left_tbl)
409        sub             w3,  w3,  #20 // 25 leading bits, minus table offset 5
410        sub             w7,  w7,  #25
411        ldrh            w3,  [x5, w3, uxtw #1]
412        ldrh            w7,  [x5, w7, uxtw #1]
413        sub             x3,  x5,  w3, uxtw
414        sub             x5,  x5,  w7, uxtw
415        add             x6,  x0,  x1
416        lsl             x1,  x1,  #1
417        br              x5
418
419L(ipred_dc_left_h4):
420        ld1             {v0.4h},  [x2]
421        addv            h0,      v0.4h
422        urshr           v0.4h,   v0.4h,   #2
423        dup             v0.8h,   v0.h[0]
424        br              x3
425L(ipred_dc_left_w4):
426        st1             {v0.4h},  [x0], x1
427        st1             {v0.4h},  [x6], x1
428        subs            w4,  w4,  #4
429        st1             {v0.4h},  [x0], x1
430        st1             {v0.4h},  [x6], x1
431        b.gt            L(ipred_dc_left_w4)
432        ret
433
434L(ipred_dc_left_h8):
435        ld1             {v0.8h},  [x2]
436        addv            h0,      v0.8h
437        urshr           v0.4h,   v0.4h,   #3
438        dup             v0.8h,   v0.h[0]
439        br              x3
440L(ipred_dc_left_w8):
441        st1             {v0.8h},  [x0], x1
442        st1             {v0.8h},  [x6], x1
443        subs            w4,  w4,  #4
444        st1             {v0.8h},  [x0], x1
445        st1             {v0.8h},  [x6], x1
446        b.gt            L(ipred_dc_left_w8)
447        ret
448
449L(ipred_dc_left_h16):
450        ld1             {v0.8h, v1.8h}, [x2]
451        addp            v0.8h,   v0.8h,   v1.8h
452        addv            h0,      v0.8h
453        urshr           v2.4h,   v0.4h,   #4
454        dup             v0.8h,   v2.h[0]
455        dup             v1.8h,   v2.h[0]
456        br              x3
457L(ipred_dc_left_w16):
458        mov             v1.16b,  v0.16b
4591:
460        st1             {v0.8h, v1.8h}, [x0], x1
461        st1             {v0.8h, v1.8h}, [x6], x1
462        subs            w4,  w4,  #4
463        st1             {v0.8h, v1.8h}, [x0], x1
464        st1             {v0.8h, v1.8h}, [x6], x1
465        b.gt            1b
466        ret
467
468L(ipred_dc_left_h32):
469        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
470        addp            v0.8h,   v0.8h,   v1.8h
471        addp            v2.8h,   v2.8h,   v3.8h
472        addp            v0.8h,   v0.8h,   v2.8h
473        uaddlp          v0.4s,   v0.8h
474        addv            s0,      v0.4s
475        rshrn           v4.4h,   v0.4s,   #5
476        dup             v0.8h,   v4.h[0]
477        br              x3
478L(ipred_dc_left_w32):
479        mov             v1.16b,  v0.16b
480        mov             v2.16b,  v0.16b
481        mov             v3.16b,  v0.16b
4821:
483        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
484        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
485        subs            w4,  w4,  #4
486        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
487        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
488        b.gt            1b
489        ret
490
491L(ipred_dc_left_h64):
492        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
493        addp            v0.8h,   v0.8h,   v1.8h
494        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
495        addp            v2.8h,   v2.8h,   v3.8h
496        addp            v4.8h,   v4.8h,   v5.8h
497        addp            v6.8h,   v6.8h,   v7.8h
498        addp            v0.8h,   v0.8h,   v2.8h
499        addp            v4.8h,   v4.8h,   v6.8h
500        addp            v0.8h,   v0.8h,   v4.8h
501        uaddlv          s0,      v0.8h
502        rshrn           v4.4h,   v0.4s,   #6
503        dup             v0.8h,   v4.h[0]
504        br              x3
505L(ipred_dc_left_w64):
506        mov             v1.16b,  v0.16b
507        mov             v2.16b,  v0.16b
508        mov             v3.16b,  v0.16b
509        sub             x1,  x1,  #64
5101:
511        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
512        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
513        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
514        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
515        subs            w4,  w4,  #4
516        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
517        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
518        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
519        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
520        b.gt            1b
521        ret
522
523L(ipred_dc_left_tbl):
524        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64)
525        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32)
526        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16)
527        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8)
528        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4)
529        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64)
530        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32)
531        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16)
532        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8)
533        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4)
534endfunc
535
536// void ipred_dc_16bpc_neon(pixel *dst, const ptrdiff_t stride,
537//                          const pixel *const topleft,
538//                          const int width, const int height, const int a,
539//                          const int max_width, const int max_height);
540function ipred_dc_16bpc_neon, export=1
541        sub             x2,  x2,  w4, uxtw #1
542        add             w7,  w3,  w4             // width + height
543        clz             w3,  w3
544        clz             w6,  w4
545        dup             v16.4s, w7               // width + height
546        adr             x5,  L(ipred_dc_tbl)
547        rbit            w7,  w7                  // rbit(width + height)
548        sub             w3,  w3,  #20            // 25 leading bits, minus table offset 5
549        sub             w6,  w6,  #25
550        clz             w7,  w7                  // ctz(width + height)
551        ldrh            w3,  [x5, w3, uxtw #1]
552        ldrh            w6,  [x5, w6, uxtw #1]
553        neg             w7,  w7                  // -ctz(width + height)
554        sub             x3,  x5,  w3, uxtw
555        sub             x5,  x5,  w6, uxtw
556        ushr            v16.4s,  v16.4s,  #1     // (width + height) >> 1
557        dup             v17.4s,  w7              // -ctz(width + height)
558        add             x6,  x0,  x1
559        lsl             x1,  x1,  #1
560        br              x5
561
562L(ipred_dc_h4):
563        ld1             {v0.4h},  [x2], #8
564        uaddlv          s0,      v0.4h
565        add             x2,  x2,  #2
566        br              x3
567L(ipred_dc_w4):
568        ld1             {v1.4h},  [x2]
569        add             v0.2s,   v0.2s,   v16.2s
570        uaddlv          s1,      v1.4h
571        cmp             w4,  #4
572        add             v0.2s,   v0.2s,   v1.2s
573        ushl            v0.2s,   v0.2s,   v17.2s
574        b.eq            1f
575        // h = 8/16
576        cmp             w4,  #16
577        mov             w16, #0x6667
578        mov             w17, #0xAAAB
579        csel            w16, w16, w17, eq
580        dup             v16.2s,  w16
581        mul             v0.2s,   v0.2s,   v16.2s
582        ushr            v0.2s,   v0.2s,   #17
5831:
584        dup             v0.4h,   v0.h[0]
5852:
586        st1             {v0.4h},  [x0], x1
587        st1             {v0.4h},  [x6], x1
588        subs            w4,  w4,  #4
589        st1             {v0.4h},  [x0], x1
590        st1             {v0.4h},  [x6], x1
591        b.gt            2b
592        ret
593
594L(ipred_dc_h8):
595        ld1             {v0.8h},  [x2], #16
596        uaddlv          s0,      v0.8h
597        add             x2,  x2,  #2
598        br              x3
599L(ipred_dc_w8):
600        ld1             {v1.8h},  [x2]
601        add             v0.2s,   v0.2s,   v16.2s
602        uaddlv          s1,      v1.8h
603        cmp             w4,  #8
604        add             v0.2s,   v0.2s,   v1.2s
605        ushl            v0.2s,   v0.2s,   v17.2s
606        b.eq            1f
607        // h = 4/16/32
608        cmp             w4,  #32
609        mov             w16, #0x6667
610        mov             w17, #0xAAAB
611        csel            w16, w16, w17, eq
612        dup             v16.2s,  w16
613        mul             v0.2s,   v0.2s,   v16.2s
614        ushr            v0.2s,   v0.2s,   #17
6151:
616        dup             v0.8h,   v0.h[0]
6172:
618        st1             {v0.8h},  [x0], x1
619        st1             {v0.8h},  [x6], x1
620        subs            w4,  w4,  #4
621        st1             {v0.8h},  [x0], x1
622        st1             {v0.8h},  [x6], x1
623        b.gt            2b
624        ret
625
626L(ipred_dc_h16):
627        ld1             {v0.8h, v1.8h}, [x2], #32
628        addp            v0.8h,   v0.8h,   v1.8h
629        add             x2,  x2,  #2
630        uaddlv          s0,      v0.8h
631        br              x3
632L(ipred_dc_w16):
633        ld1             {v1.8h, v2.8h}, [x2]
634        add             v0.2s,   v0.2s,   v16.2s
635        addp            v1.8h,   v1.8h,   v2.8h
636        uaddlv          s1,      v1.8h
637        cmp             w4,  #16
638        add             v0.2s,   v0.2s,   v1.2s
639        ushl            v4.2s,   v0.2s,   v17.2s
640        b.eq            1f
641        // h = 4/8/32/64
642        tst             w4,  #(32+16+8) // 16 added to make a consecutive bitmask
643        mov             w16, #0x6667
644        mov             w17, #0xAAAB
645        csel            w16, w16, w17, eq
646        dup             v16.2s,  w16
647        mul             v4.2s,   v4.2s,   v16.2s
648        ushr            v4.2s,   v4.2s,   #17
6491:
650        dup             v0.8h,   v4.h[0]
651        dup             v1.8h,   v4.h[0]
6522:
653        st1             {v0.8h, v1.8h}, [x0], x1
654        st1             {v0.8h, v1.8h}, [x6], x1
655        subs            w4,  w4,  #4
656        st1             {v0.8h, v1.8h}, [x0], x1
657        st1             {v0.8h, v1.8h}, [x6], x1
658        b.gt            2b
659        ret
660
661L(ipred_dc_h32):
662        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
663        addp            v0.8h,   v0.8h,   v1.8h
664        addp            v2.8h,   v2.8h,   v3.8h
665        addp            v0.8h,   v0.8h,   v2.8h
666        add             x2,  x2,  #2
667        uaddlv          s0,      v0.8h
668        br              x3
669L(ipred_dc_w32):
670        ld1             {v1.8h, v2.8h, v3.8h, v4.8h}, [x2]
671        add             v0.2s,   v0.2s,   v16.2s
672        addp            v1.8h,   v1.8h,   v2.8h
673        addp            v3.8h,   v3.8h,   v4.8h
674        addp            v1.8h,   v1.8h,   v3.8h
675        uaddlv          s1,      v1.8h
676        cmp             w4,  #32
677        add             v0.2s,   v0.2s,   v1.2s
678        ushl            v4.2s,   v0.2s,   v17.2s
679        b.eq            1f
680        // h = 8/16/64
681        cmp             w4,  #8
682        mov             w16, #0x6667
683        mov             w17, #0xAAAB
684        csel            w16, w16, w17, eq
685        dup             v16.2s,  w16
686        mul             v4.2s,   v4.2s,   v16.2s
687        ushr            v4.2s,   v4.2s,   #17
6881:
689        dup             v0.8h,   v4.h[0]
690        dup             v1.8h,   v4.h[0]
691        dup             v2.8h,   v4.h[0]
692        dup             v3.8h,   v4.h[0]
6932:
694        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
695        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
696        subs            w4,  w4,  #4
697        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
698        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
699        b.gt            2b
700        ret
701
702L(ipred_dc_h64):
703        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
704        addp            v0.8h,   v0.8h,   v1.8h
705        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
706        addp            v2.8h,   v2.8h,   v3.8h
707        addp            v4.8h,   v4.8h,   v5.8h
708        addp            v6.8h,   v6.8h,   v7.8h
709        addp            v0.8h,   v0.8h,   v2.8h
710        addp            v4.8h,   v4.8h,   v6.8h
711        addp            v0.8h,   v0.8h,   v4.8h
712        add             x2,  x2,  #2
713        uaddlv          s0,      v0.8h
714        br              x3
715L(ipred_dc_w64):
716        ld1             {v1.8h, v2.8h, v3.8h, v4.8h}, [x2], #64
717        add             v0.2s,   v0.2s,   v16.2s
718        addp            v1.8h,   v1.8h,   v2.8h
719        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x2]
720        addp            v3.8h,   v3.8h,   v4.8h
721        addp            v20.8h,  v20.8h,  v21.8h
722        addp            v22.8h,  v22.8h,  v23.8h
723        addp            v1.8h,   v1.8h,   v3.8h
724        addp            v20.8h,  v20.8h,  v22.8h
725        addp            v1.8h,   v1.8h,   v20.8h
726        uaddlv          s1,      v1.8h
727        cmp             w4,  #64
728        add             v0.2s,   v0.2s,   v1.2s
729        ushl            v4.2s,   v0.2s,   v17.2s
730        b.eq            1f
731        // h = 16/32
732        cmp             w4,  #16
733        mov             w16, #0x6667
734        mov             w17, #0xAAAB
735        csel            w16, w16, w17, eq
736        dup             v16.2s,  w16
737        mul             v4.2s,   v4.2s,   v16.2s
738        ushr            v4.2s,   v4.2s,   #17
7391:
740        sub             x1,  x1,  #64
741        dup             v0.8h,   v4.h[0]
742        dup             v1.8h,   v4.h[0]
743        dup             v2.8h,   v4.h[0]
744        dup             v3.8h,   v4.h[0]
7452:
746        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
747        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
748        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
749        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
750        subs            w4,  w4,  #4
751        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
752        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
753        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
754        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
755        b.gt            2b
756        ret
757
758L(ipred_dc_tbl):
759        .hword L(ipred_dc_tbl) - L(ipred_dc_h64)
760        .hword L(ipred_dc_tbl) - L(ipred_dc_h32)
761        .hword L(ipred_dc_tbl) - L(ipred_dc_h16)
762        .hword L(ipred_dc_tbl) - L(ipred_dc_h8)
763        .hword L(ipred_dc_tbl) - L(ipred_dc_h4)
764        .hword L(ipred_dc_tbl) - L(ipred_dc_w64)
765        .hword L(ipred_dc_tbl) - L(ipred_dc_w32)
766        .hword L(ipred_dc_tbl) - L(ipred_dc_w16)
767        .hword L(ipred_dc_tbl) - L(ipred_dc_w8)
768        .hword L(ipred_dc_tbl) - L(ipred_dc_w4)
769endfunc
770
771// void ipred_paeth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
772//                             const pixel *const topleft,
773//                             const int width, const int height, const int a,
774//                             const int max_width, const int max_height);
775function ipred_paeth_16bpc_neon, export=1
776        clz             w9,  w3
777        adr             x5,  L(ipred_paeth_tbl)
778        sub             w9,  w9,  #25
779        ldrh            w9,  [x5, w9, uxtw #1]
780        ld1r            {v4.8h},  [x2]
781        add             x8,  x2,  #2
782        sub             x2,  x2,  #8
783        sub             x5,  x5,  w9, uxtw
784        mov             x7,  #-8
785        add             x6,  x0,  x1
786        lsl             x1,  x1,  #1
787        br              x5
78840:
789        ld1r            {v5.2d},  [x8]
790        sub             v6.8h,   v5.8h,   v4.8h   // top - topleft
7914:
792        ld4r            {v0.4h, v1.4h, v2.4h, v3.4h},  [x2], x7
793        zip1            v0.2d,   v0.2d,   v1.2d
794        zip1            v2.2d,   v2.2d,   v3.2d
795        add             v16.8h,  v6.8h,   v0.8h   // base
796        add             v17.8h,  v6.8h,   v2.8h
797        sabd            v20.8h,  v5.8h,   v16.8h  // tdiff
798        sabd            v21.8h,  v5.8h,   v17.8h
799        sabd            v22.8h,  v4.8h,   v16.8h  // tldiff
800        sabd            v23.8h,  v4.8h,   v17.8h
801        sabd            v16.8h,  v0.8h,   v16.8h  // ldiff
802        sabd            v17.8h,  v2.8h,   v17.8h
803        umin            v18.8h,  v20.8h,  v22.8h  // min(tdiff, tldiff)
804        umin            v19.8h,  v21.8h,  v23.8h
805        cmge            v20.8h,  v22.8h,  v20.8h  // tldiff >= tdiff
806        cmge            v21.8h,  v23.8h,  v21.8h
807        cmge            v16.8h,  v18.8h,  v16.8h  // min(tdiff, tldiff) >= ldiff
808        cmge            v17.8h,  v19.8h,  v17.8h
809        bsl             v21.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
810        bsl             v20.16b, v5.16b,  v4.16b
811        bit             v21.16b, v2.16b,  v17.16b // ldiff <= min ? left : ...
812        bit             v20.16b, v0.16b,  v16.16b
813        st1             {v21.d}[1], [x0], x1
814        st1             {v21.d}[0], [x6], x1
815        subs            w4,  w4,  #4
816        st1             {v20.d}[1], [x0], x1
817        st1             {v20.d}[0], [x6], x1
818        b.gt            4b
819        ret
82080:
821160:
822320:
823640:
824        ld1             {v5.8h},  [x8], #16
825        mov             w9,  w3
826        // Set up pointers for four rows in parallel; x0, x6, x5, x10
827        add             x5,  x0,  x1
828        add             x10, x6,  x1
829        lsl             x1,  x1,  #1
830        sub             x1,  x1,  w3, uxtw #1
8311:
832        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
8332:
834        sub             v6.8h,   v5.8h,   v4.8h   // top - topleft
835        add             v16.8h,  v6.8h,   v0.8h   // base
836        add             v17.8h,  v6.8h,   v1.8h
837        add             v18.8h,  v6.8h,   v2.8h
838        add             v19.8h,  v6.8h,   v3.8h
839        sabd            v20.8h,  v5.8h,   v16.8h  // tdiff
840        sabd            v21.8h,  v5.8h,   v17.8h
841        sabd            v22.8h,  v5.8h,   v18.8h
842        sabd            v23.8h,  v5.8h,   v19.8h
843        sabd            v24.8h,  v4.8h,   v16.8h  // tldiff
844        sabd            v25.8h,  v4.8h,   v17.8h
845        sabd            v26.8h,  v4.8h,   v18.8h
846        sabd            v27.8h,  v4.8h,   v19.8h
847        sabd            v16.8h,  v0.8h,   v16.8h  // ldiff
848        sabd            v17.8h,  v1.8h,   v17.8h
849        sabd            v18.8h,  v2.8h,   v18.8h
850        sabd            v19.8h,  v3.8h,   v19.8h
851        umin            v28.8h,  v20.8h,  v24.8h  // min(tdiff, tldiff)
852        umin            v29.8h,  v21.8h,  v25.8h
853        umin            v30.8h,  v22.8h,  v26.8h
854        umin            v31.8h,  v23.8h,  v27.8h
855        cmge            v20.8h,  v24.8h,  v20.8h  // tldiff >= tdiff
856        cmge            v21.8h,  v25.8h,  v21.8h
857        cmge            v22.8h,  v26.8h,  v22.8h
858        cmge            v23.8h,  v27.8h,  v23.8h
859        cmge            v16.8h,  v28.8h,  v16.8h  // min(tdiff, tldiff) >= ldiff
860        cmge            v17.8h,  v29.8h,  v17.8h
861        cmge            v18.8h,  v30.8h,  v18.8h
862        cmge            v19.8h,  v31.8h,  v19.8h
863        bsl             v23.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
864        bsl             v22.16b, v5.16b,  v4.16b
865        bsl             v21.16b, v5.16b,  v4.16b
866        bsl             v20.16b, v5.16b,  v4.16b
867        bit             v23.16b, v3.16b,  v19.16b // ldiff <= min ? left : ...
868        bit             v22.16b, v2.16b,  v18.16b
869        bit             v21.16b, v1.16b,  v17.16b
870        bit             v20.16b, v0.16b,  v16.16b
871        st1             {v23.8h}, [x0], #16
872        st1             {v22.8h}, [x6], #16
873        subs            w3,  w3,  #8
874        st1             {v21.8h}, [x5], #16
875        st1             {v20.8h}, [x10], #16
876        b.le            8f
877        ld1             {v5.8h},  [x8], #16
878        b               2b
8798:
880        subs            w4,  w4,  #4
881        b.le            9f
882        // End of horizontal loop, move pointers to next four rows
883        sub             x8,  x8,  w9, uxtw #1
884        add             x0,  x0,  x1
885        add             x6,  x6,  x1
886        // Load the top row as early as possible
887        ld1             {v5.8h},  [x8], #16
888        add             x5,  x5,  x1
889        add             x10, x10, x1
890        mov             w3,  w9
891        b               1b
8929:
893        ret
894
895L(ipred_paeth_tbl):
896        .hword L(ipred_paeth_tbl) - 640b
897        .hword L(ipred_paeth_tbl) - 320b
898        .hword L(ipred_paeth_tbl) - 160b
899        .hword L(ipred_paeth_tbl) -  80b
900        .hword L(ipred_paeth_tbl) -  40b
901endfunc
902
903// void ipred_smooth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
904//                              const pixel *const topleft,
905//                              const int width, const int height, const int a,
906//                              const int max_width, const int max_height);
907function ipred_smooth_16bpc_neon, export=1
908        movrel          x10, X(sm_weights)
909        add             x11, x10, w4, uxtw
910        add             x10, x10, w3, uxtw
911        clz             w9,  w3
912        adr             x5,  L(ipred_smooth_tbl)
913        sub             x12, x2,  w4, uxtw #1
914        sub             w9,  w9,  #25
915        ldrh            w9,  [x5, w9, uxtw #1]
916        ld1r            {v4.8h},  [x12] // bottom
917        add             x8,  x2,  #2
918        sub             x5,  x5,  w9, uxtw
919        add             x6,  x0,  x1
920        lsl             x1,  x1,  #1
921        br              x5
92240:
923        ld1r            {v6.2d}, [x8]             // top
924        ld1r            {v7.2s}, [x10]            // weights_hor
925        sub             x2,  x2,  #8
926        mov             x7,  #-8
927        dup             v5.8h,   v6.h[3]          // right
928        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
929        uxtl            v7.8h,   v7.8b            // weights_hor
930        add             v31.4h,  v4.4h,   v5.4h   // bottom+right
9314:
932        ld4r            {v0.4h, v1.4h, v2.4h, v3.4h},  [x2], x7 // left
933        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x11], #4 // weights_ver
934        ushll           v20.4s,  v31.4h,  #8      // (bottom+right)*256
935        ushll           v21.4s,  v31.4h,  #8
936        ushll           v22.4s,  v31.4h,  #8
937        ushll           v23.4s,  v31.4h,  #8
938        zip1            v1.2d,   v1.2d,   v0.2d   // left, flipped
939        zip1            v0.2d,   v3.2d,   v2.2d
940        zip1            v16.2s,  v16.2s,  v17.2s  // weights_ver
941        zip1            v18.2s,  v18.2s,  v19.2s
942        sub             v0.8h,   v0.8h,   v5.8h   // left-right
943        sub             v1.8h,   v1.8h,   v5.8h
944        uxtl            v16.8h,  v16.8b           // weights_ver
945        uxtl            v18.8h,  v18.8b
946        smlal           v20.4s,  v0.4h,   v7.4h   // += (left-right)*weights_hor
947        smlal2          v21.4s,  v0.8h,   v7.8h
948        smlal           v22.4s,  v1.4h,   v7.4h
949        smlal2          v23.4s,  v1.8h,   v7.8h
950        smlal           v20.4s,  v6.4h,   v16.4h  // += (top-bottom)*weights_ver
951        smlal2          v21.4s,  v6.8h,   v16.8h
952        smlal           v22.4s,  v6.4h,   v18.4h
953        smlal2          v23.4s,  v6.8h,   v18.8h
954        rshrn           v20.4h,  v20.4s,  #9
955        rshrn           v21.4h,  v21.4s,  #9
956        rshrn           v22.4h,  v22.4s,  #9
957        rshrn           v23.4h,  v23.4s,  #9
958        st1             {v20.4h}, [x0], x1
959        st1             {v21.4h}, [x6], x1
960        subs            w4,  w4,  #4
961        st1             {v22.4h}, [x0], x1
962        st1             {v23.4h}, [x6], x1
963        b.gt            4b
964        ret
96580:
966        ld1             {v6.8h}, [x8]             // top
967        ld1             {v7.8b}, [x10]            // weights_hor
968        sub             x2,  x2,  #8
969        mov             x7,  #-8
970        dup             v5.8h,   v6.h[7]          // right
971        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
972        uxtl            v7.8h,   v7.8b            // weights_hor
973        add             v31.4h,  v4.4h,   v5.4h   // bottom+right
9748:
975        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7 // left
976        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x11], #4 // weights_ver
977        ushll           v20.4s,  v31.4h,  #8      // (bottom+right)*256
978        ushll           v21.4s,  v31.4h,  #8
979        ushll           v22.4s,  v31.4h,  #8
980        ushll           v23.4s,  v31.4h,  #8
981        ushll           v24.4s,  v31.4h,  #8
982        ushll           v25.4s,  v31.4h,  #8
983        ushll           v26.4s,  v31.4h,  #8
984        ushll           v27.4s,  v31.4h,  #8
985        sub             v0.8h,   v0.8h,   v5.8h   // left-right
986        sub             v1.8h,   v1.8h,   v5.8h
987        sub             v2.8h,   v2.8h,   v5.8h
988        sub             v3.8h,   v3.8h,   v5.8h
989        uxtl            v16.8h,  v16.8b           // weights_ver
990        uxtl            v17.8h,  v17.8b
991        uxtl            v18.8h,  v18.8b
992        uxtl            v19.8h,  v19.8b
993        smlal           v20.4s,  v3.4h,   v7.4h   // += (left-right)*weights_hor
994        smlal2          v21.4s,  v3.8h,   v7.8h   // (left flipped)
995        smlal           v22.4s,  v2.4h,   v7.4h
996        smlal2          v23.4s,  v2.8h,   v7.8h
997        smlal           v24.4s,  v1.4h,   v7.4h
998        smlal2          v25.4s,  v1.8h,   v7.8h
999        smlal           v26.4s,  v0.4h,   v7.4h
1000        smlal2          v27.4s,  v0.8h,   v7.8h
1001        smlal           v20.4s,  v6.4h,   v16.4h  // += (top-bottom)*weights_ver
1002        smlal2          v21.4s,  v6.8h,   v16.8h
1003        smlal           v22.4s,  v6.4h,   v17.4h
1004        smlal2          v23.4s,  v6.8h,   v17.8h
1005        smlal           v24.4s,  v6.4h,   v18.4h
1006        smlal2          v25.4s,  v6.8h,   v18.8h
1007        smlal           v26.4s,  v6.4h,   v19.4h
1008        smlal2          v27.4s,  v6.8h,   v19.8h
1009        rshrn           v20.4h,  v20.4s,  #9
1010        rshrn2          v20.8h,  v21.4s,  #9
1011        rshrn           v21.4h,  v22.4s,  #9
1012        rshrn2          v21.8h,  v23.4s,  #9
1013        rshrn           v22.4h,  v24.4s,  #9
1014        rshrn2          v22.8h,  v25.4s,  #9
1015        rshrn           v23.4h,  v26.4s,  #9
1016        rshrn2          v23.8h,  v27.4s,  #9
1017        st1             {v20.8h}, [x0], x1
1018        st1             {v21.8h}, [x6], x1
1019        subs            w4,  w4,  #4
1020        st1             {v22.8h}, [x0], x1
1021        st1             {v23.8h}, [x6], x1
1022        b.gt            8b
1023        ret
1024160:
1025320:
1026640:
1027        add             x12, x2,  w3, uxtw #1
1028        sub             x1,  x1,  w3, uxtw #1
1029        ld1r            {v5.8h}, [x12]            // right
1030        sub             x2,  x2,  #4
1031        mov             x7,  #-4
1032        mov             w9,  w3
1033        add             v31.4h,  v4.4h,   v5.4h   // bottom+right
1034
10351:
1036        ld2r            {v0.8h, v1.8h},   [x2],  x7 // left
1037        ld2r            {v16.8b, v17.8b}, [x11], #2 // weights_ver
1038        sub             v0.8h,   v0.8h,   v5.8h   // left-right
1039        sub             v1.8h,   v1.8h,   v5.8h
1040        uxtl            v16.8h,  v16.8b           // weights_ver
1041        uxtl            v17.8h,  v17.8b
10422:
1043        ld1             {v7.16b}, [x10],  #16     // weights_hor
1044        ld1             {v2.8h, v3.8h}, [x8], #32 // top
1045        ushll           v20.4s,  v31.4h,  #8      // (bottom+right)*256
1046        ushll           v21.4s,  v31.4h,  #8
1047        ushll           v22.4s,  v31.4h,  #8
1048        ushll           v23.4s,  v31.4h,  #8
1049        ushll           v24.4s,  v31.4h,  #8
1050        ushll           v25.4s,  v31.4h,  #8
1051        ushll           v26.4s,  v31.4h,  #8
1052        ushll           v27.4s,  v31.4h,  #8
1053        uxtl            v6.8h,   v7.8b            // weights_hor
1054        uxtl2           v7.8h,   v7.16b
1055        sub             v2.8h,   v2.8h,   v4.8h   // top-bottom
1056        sub             v3.8h,   v3.8h,   v4.8h
1057        smlal           v20.4s,  v1.4h,   v6.4h   // += (left-right)*weights_hor
1058        smlal2          v21.4s,  v1.8h,   v6.8h   // (left flipped)
1059        smlal           v22.4s,  v1.4h,   v7.4h
1060        smlal2          v23.4s,  v1.8h,   v7.8h
1061        smlal           v24.4s,  v0.4h,   v6.4h
1062        smlal2          v25.4s,  v0.8h,   v6.8h
1063        smlal           v26.4s,  v0.4h,   v7.4h
1064        smlal2          v27.4s,  v0.8h,   v7.8h
1065        smlal           v20.4s,  v2.4h,   v16.4h  // += (top-bottom)*weights_ver
1066        smlal2          v21.4s,  v2.8h,   v16.8h
1067        smlal           v22.4s,  v3.4h,   v16.4h
1068        smlal2          v23.4s,  v3.8h,   v16.8h
1069        smlal           v24.4s,  v2.4h,   v17.4h
1070        smlal2          v25.4s,  v2.8h,   v17.8h
1071        smlal           v26.4s,  v3.4h,   v17.4h
1072        smlal2          v27.4s,  v3.8h,   v17.8h
1073        rshrn           v20.4h,  v20.4s,  #9
1074        rshrn2          v20.8h,  v21.4s,  #9
1075        rshrn           v21.4h,  v22.4s,  #9
1076        rshrn2          v21.8h,  v23.4s,  #9
1077        rshrn           v22.4h,  v24.4s,  #9
1078        rshrn2          v22.8h,  v25.4s,  #9
1079        rshrn           v23.4h,  v26.4s,  #9
1080        rshrn2          v23.8h,  v27.4s,  #9
1081        subs            w3,  w3,  #16
1082        st1             {v20.8h, v21.8h}, [x0], #32
1083        st1             {v22.8h, v23.8h}, [x6], #32
1084        b.gt            2b
1085        subs            w4,  w4,  #2
1086        b.le            9f
1087        sub             x8,  x8,  w9, uxtw #1
1088        sub             x10, x10, w9, uxtw
1089        add             x0,  x0,  x1
1090        add             x6,  x6,  x1
1091        mov             w3,  w9
1092        b               1b
10939:
1094        ret
1095
1096L(ipred_smooth_tbl):
1097        .hword L(ipred_smooth_tbl) - 640b
1098        .hword L(ipred_smooth_tbl) - 320b
1099        .hword L(ipred_smooth_tbl) - 160b
1100        .hword L(ipred_smooth_tbl) -  80b
1101        .hword L(ipred_smooth_tbl) -  40b
1102endfunc
1103
1104// void ipred_smooth_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
1105//                                const pixel *const topleft,
1106//                                const int width, const int height, const int a,
1107//                                const int max_width, const int max_height);
1108function ipred_smooth_v_16bpc_neon, export=1
1109        movrel          x7,  X(sm_weights)
1110        add             x7,  x7,  w4, uxtw
1111        clz             w9,  w3
1112        adr             x5,  L(ipred_smooth_v_tbl)
1113        sub             x8,  x2,  w4, uxtw #1
1114        sub             w9,  w9,  #25
1115        ldrh            w9,  [x5, w9, uxtw #1]
1116        ld1r            {v4.8h},  [x8] // bottom
1117        add             x2,  x2,  #2
1118        sub             x5,  x5,  w9, uxtw
1119        add             x6,  x0,  x1
1120        lsl             x1,  x1,  #1
1121        br              x5
112240:
1123        ld1r            {v6.2d}, [x2]             // top
1124        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
11254:
1126        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
1127        zip1            v16.2s,  v16.2s,  v17.2s  // weights_ver
1128        zip1            v18.2s,  v18.2s,  v19.2s
1129        ushll           v16.8h,  v16.8b,  #7      // weights_ver << 7
1130        ushll           v18.8h,  v18.8b,  #7
1131        sqrdmulh        v20.8h,  v6.8h,   v16.8h  // ((top-bottom)*weights_ver + 128) >> 8
1132        sqrdmulh        v21.8h,  v6.8h,   v18.8h
1133        add             v20.8h,  v20.8h,  v4.8h
1134        add             v21.8h,  v21.8h,  v4.8h
1135        st1             {v20.d}[0], [x0], x1
1136        st1             {v20.d}[1], [x6], x1
1137        subs            w4,  w4,  #4
1138        st1             {v21.d}[0], [x0], x1
1139        st1             {v21.d}[1], [x6], x1
1140        b.gt            4b
1141        ret
114280:
1143        ld1             {v6.8h}, [x2]             // top
1144        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
11458:
1146        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
1147        ushll           v16.8h,  v16.8b,  #7      // weights_ver << 7
1148        ushll           v17.8h,  v17.8b,  #7
1149        ushll           v18.8h,  v18.8b,  #7
1150        ushll           v19.8h,  v19.8b,  #7
1151        sqrdmulh        v20.8h,  v6.8h,   v16.8h  // ((top-bottom)*weights_ver + 128) >> 8
1152        sqrdmulh        v21.8h,  v6.8h,   v17.8h
1153        sqrdmulh        v22.8h,  v6.8h,   v18.8h
1154        sqrdmulh        v23.8h,  v6.8h,   v19.8h
1155        add             v20.8h,  v20.8h,  v4.8h
1156        add             v21.8h,  v21.8h,  v4.8h
1157        add             v22.8h,  v22.8h,  v4.8h
1158        add             v23.8h,  v23.8h,  v4.8h
1159        st1             {v20.8h}, [x0], x1
1160        st1             {v21.8h}, [x6], x1
1161        subs            w4,  w4,  #4
1162        st1             {v22.8h}, [x0], x1
1163        st1             {v23.8h}, [x6], x1
1164        b.gt            8b
1165        ret
1166160:
1167320:
1168640:
1169        // Set up pointers for four rows in parallel; x0, x6, x5, x8
1170        add             x5,  x0,  x1
1171        add             x8,  x6,  x1
1172        lsl             x1,  x1,  #1
1173        sub             x1,  x1,  w3, uxtw #1
1174        mov             w9,  w3
1175
11761:
1177        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
1178        ushll           v16.8h,  v16.8b,  #7      // weights_ver << 7
1179        ushll           v17.8h,  v17.8b,  #7
1180        ushll           v18.8h,  v18.8b,  #7
1181        ushll           v19.8h,  v19.8b,  #7
11822:
1183        ld1             {v2.8h, v3.8h}, [x2], #32 // top
1184        sub             v2.8h,   v2.8h,   v4.8h   // top-bottom
1185        sub             v3.8h,   v3.8h,   v4.8h
1186        sqrdmulh        v20.8h,  v2.8h,   v16.8h  // ((top-bottom)*weights_ver + 128) >> 8
1187        sqrdmulh        v21.8h,  v3.8h,   v16.8h
1188        sqrdmulh        v22.8h,  v2.8h,   v17.8h
1189        sqrdmulh        v23.8h,  v3.8h,   v17.8h
1190        sqrdmulh        v24.8h,  v2.8h,   v18.8h
1191        sqrdmulh        v25.8h,  v3.8h,   v18.8h
1192        sqrdmulh        v26.8h,  v2.8h,   v19.8h
1193        sqrdmulh        v27.8h,  v3.8h,   v19.8h
1194        add             v20.8h,  v20.8h,  v4.8h
1195        add             v21.8h,  v21.8h,  v4.8h
1196        add             v22.8h,  v22.8h,  v4.8h
1197        add             v23.8h,  v23.8h,  v4.8h
1198        add             v24.8h,  v24.8h,  v4.8h
1199        add             v25.8h,  v25.8h,  v4.8h
1200        add             v26.8h,  v26.8h,  v4.8h
1201        add             v27.8h,  v27.8h,  v4.8h
1202        subs            w3,  w3,  #16
1203        st1             {v20.8h, v21.8h}, [x0], #32
1204        st1             {v22.8h, v23.8h}, [x6], #32
1205        st1             {v24.8h, v25.8h}, [x5], #32
1206        st1             {v26.8h, v27.8h}, [x8], #32
1207        b.gt            2b
1208        subs            w4,  w4,  #4
1209        b.le            9f
1210        sub             x2,  x2,  w9, uxtw #1
1211        add             x0,  x0,  x1
1212        add             x6,  x6,  x1
1213        add             x5,  x5,  x1
1214        add             x8,  x8,  x1
1215        mov             w3,  w9
1216        b               1b
12179:
1218        ret
1219
1220L(ipred_smooth_v_tbl):
1221        .hword L(ipred_smooth_v_tbl) - 640b
1222        .hword L(ipred_smooth_v_tbl) - 320b
1223        .hword L(ipred_smooth_v_tbl) - 160b
1224        .hword L(ipred_smooth_v_tbl) -  80b
1225        .hword L(ipred_smooth_v_tbl) -  40b
1226endfunc
1227
1228// void ipred_smooth_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
1229//                                const pixel *const topleft,
1230//                                const int width, const int height, const int a,
1231//                                const int max_width, const int max_height);
1232function ipred_smooth_h_16bpc_neon, export=1
1233        movrel          x8,  X(sm_weights)
1234        add             x8,  x8,  w3, uxtw
1235        clz             w9,  w3
1236        adr             x5,  L(ipred_smooth_h_tbl)
1237        add             x12, x2,  w3, uxtw #1
1238        sub             w9,  w9,  #25
1239        ldrh            w9,  [x5, w9, uxtw #1]
1240        ld1r            {v5.8h},  [x12] // right
1241        sub             x5,  x5,  w9, uxtw
1242        add             x6,  x0,  x1
1243        lsl             x1,  x1,  #1
1244        br              x5
124540:
1246        ld1r            {v7.2s}, [x8]             // weights_hor
1247        sub             x2,  x2,  #8
1248        mov             x7,  #-8
1249        ushll           v7.8h,   v7.8b,   #7      // weights_hor << 7
12504:
1251        ld4r            {v0.4h, v1.4h, v2.4h, v3.4h},  [x2], x7 // left
1252        zip1            v1.2d,   v1.2d,   v0.2d   // left, flipped
1253        zip1            v0.2d,   v3.2d,   v2.2d
1254        sub             v0.8h,   v0.8h,   v5.8h   // left-right
1255        sub             v1.8h,   v1.8h,   v5.8h
1256        sqrdmulh        v20.8h,  v0.8h,   v7.8h   // ((left-right)*weights_hor + 128) >> 8
1257        sqrdmulh        v21.8h,  v1.8h,   v7.8h
1258        add             v20.8h,  v20.8h,  v5.8h
1259        add             v21.8h,  v21.8h,  v5.8h
1260        st1             {v20.d}[0], [x0], x1
1261        st1             {v20.d}[1], [x6], x1
1262        subs            w4,  w4,  #4
1263        st1             {v21.d}[0], [x0], x1
1264        st1             {v21.d}[1], [x6], x1
1265        b.gt            4b
1266        ret
126780:
1268        ld1             {v7.8b}, [x8]             // weights_hor
1269        sub             x2,  x2,  #8
1270        mov             x7,  #-8
1271        ushll           v7.8h,   v7.8b,   #7      // weights_hor << 7
12728:
1273        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7 // left
1274        sub             v3.8h,   v3.8h,   v5.8h   // left-right
1275        sub             v2.8h,   v2.8h,   v5.8h
1276        sub             v1.8h,   v1.8h,   v5.8h
1277        sub             v0.8h,   v0.8h,   v5.8h
1278        sqrdmulh        v20.8h,  v3.8h,   v7.8h   // ((left-right)*weights_hor + 128) >> 8
1279        sqrdmulh        v21.8h,  v2.8h,   v7.8h   // (left flipped)
1280        sqrdmulh        v22.8h,  v1.8h,   v7.8h
1281        sqrdmulh        v23.8h,  v0.8h,   v7.8h
1282        add             v20.8h,  v20.8h,  v5.8h
1283        add             v21.8h,  v21.8h,  v5.8h
1284        add             v22.8h,  v22.8h,  v5.8h
1285        add             v23.8h,  v23.8h,  v5.8h
1286        st1             {v20.8h}, [x0], x1
1287        st1             {v21.8h}, [x6], x1
1288        subs            w4,  w4,  #4
1289        st1             {v22.8h}, [x0], x1
1290        st1             {v23.8h}, [x6], x1
1291        b.gt            8b
1292        ret
1293160:
1294320:
1295640:
1296        sub             x2,  x2,  #8
1297        mov             x7,  #-8
1298        // Set up pointers for four rows in parallel; x0, x6, x5, x10
1299        add             x5,  x0,  x1
1300        add             x10, x6,  x1
1301        lsl             x1,  x1,  #1
1302        sub             x1,  x1,  w3, uxtw #1
1303        mov             w9,  w3
1304
13051:
1306        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},   [x2],  x7 // left
1307        sub             v0.8h,   v0.8h,   v5.8h   // left-right
1308        sub             v1.8h,   v1.8h,   v5.8h
1309        sub             v2.8h,   v2.8h,   v5.8h
1310        sub             v3.8h,   v3.8h,   v5.8h
13112:
1312        ld1             {v7.16b}, [x8],   #16     // weights_hor
1313        ushll           v6.8h,   v7.8b,   #7      // weights_hor << 7
1314        ushll2          v7.8h,   v7.16b,  #7
1315        sqrdmulh        v20.8h,  v3.8h,   v6.8h   // ((left-right)*weights_hor + 128) >> 8
1316        sqrdmulh        v21.8h,  v3.8h,   v7.8h   // (left flipped)
1317        sqrdmulh        v22.8h,  v2.8h,   v6.8h
1318        sqrdmulh        v23.8h,  v2.8h,   v7.8h
1319        sqrdmulh        v24.8h,  v1.8h,   v6.8h
1320        sqrdmulh        v25.8h,  v1.8h,   v7.8h
1321        sqrdmulh        v26.8h,  v0.8h,   v6.8h
1322        sqrdmulh        v27.8h,  v0.8h,   v7.8h
1323        add             v20.8h,  v20.8h,  v5.8h
1324        add             v21.8h,  v21.8h,  v5.8h
1325        add             v22.8h,  v22.8h,  v5.8h
1326        add             v23.8h,  v23.8h,  v5.8h
1327        add             v24.8h,  v24.8h,  v5.8h
1328        add             v25.8h,  v25.8h,  v5.8h
1329        add             v26.8h,  v26.8h,  v5.8h
1330        add             v27.8h,  v27.8h,  v5.8h
1331        subs            w3,  w3,  #16
1332        st1             {v20.8h, v21.8h}, [x0],  #32
1333        st1             {v22.8h, v23.8h}, [x6],  #32
1334        st1             {v24.8h, v25.8h}, [x5],  #32
1335        st1             {v26.8h, v27.8h}, [x10], #32
1336        b.gt            2b
1337        subs            w4,  w4,  #4
1338        b.le            9f
1339        sub             x8,  x8,  w9, uxtw
1340        add             x0,  x0,  x1
1341        add             x6,  x6,  x1
1342        add             x5,  x5,  x1
1343        add             x10, x10, x1
1344        mov             w3,  w9
1345        b               1b
13469:
1347        ret
1348
1349L(ipred_smooth_h_tbl):
1350        .hword L(ipred_smooth_h_tbl) - 640b
1351        .hword L(ipred_smooth_h_tbl) - 320b
1352        .hword L(ipred_smooth_h_tbl) - 160b
1353        .hword L(ipred_smooth_h_tbl) -  80b
1354        .hword L(ipred_smooth_h_tbl) -  40b
1355endfunc
1356
1357// void ipred_filter_16bpc_neon(pixel *dst, const ptrdiff_t stride,
1358//                              const pixel *const topleft,
1359//                              const int width, const int height, const int filt_idx,
1360//                              const int max_width, const int max_height,
1361//                              const int bitdepth_max);
1362.macro filter_fn bpc
1363function ipred_filter_\bpc\()bpc_neon
1364        and             w5,  w5,  #511
1365        movrel          x6,  X(filter_intra_taps)
1366        lsl             w5,  w5,  #6
1367        add             x6,  x6,  w5, uxtw
1368        ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
1369        clz             w9,  w3
1370        adr             x5,  L(ipred_filter\bpc\()_tbl)
1371        ld1             {v20.8b, v21.8b, v22.8b}, [x6]
1372        sub             w9,  w9,  #26
1373        ldrh            w9,  [x5, w9, uxtw #1]
1374        sxtl            v16.8h,  v16.8b
1375        sxtl            v17.8h,  v17.8b
1376        sub             x5,  x5,  w9, uxtw
1377        sxtl            v18.8h,  v18.8b
1378        sxtl            v19.8h,  v19.8b
1379        add             x6,  x0,  x1
1380        lsl             x1,  x1,  #1
1381        sxtl            v20.8h,  v20.8b
1382        sxtl            v21.8h,  v21.8b
1383        sxtl            v22.8h,  v22.8b
1384        dup             v31.8h,  w8
1385.if \bpc == 10
1386        movi            v30.8h,  #0
1387.endif
1388        br              x5
138940:
1390        ldur            d0,  [x2, #2]             // top (0-3)
1391        sub             x2,  x2,  #4
1392        mov             x7,  #-4
13934:
1394        ld1             {v1.4h}, [x2], x7         // left (0-1) + topleft (2)
1395.if \bpc == 10
1396        mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
1397        mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
1398        mla             v2.8h,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
1399        mla             v2.8h,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
1400        mla             v2.8h,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
1401        mla             v2.8h,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
1402        mla             v2.8h,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
1403        srshr           v2.8h,   v2.8h,   #4
1404        smax            v2.8h,   v2.8h,   v30.8h
1405.else
1406        smull           v2.4s,   v17.4h,  v0.h[0] // p1(top[0]) * filter(1)
1407        smlal           v2.4s,   v18.4h,  v0.h[1] // p2(top[1]) * filter(2)
1408        smlal           v2.4s,   v19.4h,  v0.h[2] // p3(top[2]) * filter(3)
1409        smlal           v2.4s,   v20.4h,  v0.h[3] // p4(top[3]) * filter(4)
1410        smlal           v2.4s,   v16.4h,  v1.h[2] // p0(topleft) * filter(0)
1411        smlal           v2.4s,   v21.4h,  v1.h[1] // p5(left[0]) * filter(5)
1412        smlal           v2.4s,   v22.4h,  v1.h[0] // p6(left[1]) * filter(6)
1413        smull2          v3.4s,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
1414        smlal2          v3.4s,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
1415        smlal2          v3.4s,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
1416        smlal2          v3.4s,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
1417        smlal2          v3.4s,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
1418        smlal2          v3.4s,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
1419        smlal2          v3.4s,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
1420        sqrshrun        v2.4h,   v2.4s,   #4
1421        sqrshrun2       v2.8h,   v3.4s,   #4
1422.endif
1423        smin            v2.8h,   v2.8h,   v31.8h
1424        subs            w4,  w4,  #2
1425        st1             {v2.d}[0], [x0], x1
1426        ext             v0.16b,  v2.16b,  v2.16b, #8 // move top from [4-7] to [0-3]
1427        st1             {v2.d}[1], [x6], x1
1428        b.gt            4b
1429        ret
143080:
1431        ldur            q0,  [x2, #2]             // top (0-7)
1432        sub             x2,  x2,  #4
1433        mov             x7,  #-4
14348:
1435        ld1             {v1.4h}, [x2], x7         // left (0-1) + topleft (2)
1436.if \bpc == 10
1437        mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
1438        mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
1439        mla             v2.8h,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
1440        mla             v2.8h,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
1441        mla             v2.8h,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
1442        mla             v2.8h,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
1443        mla             v2.8h,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
1444        mul             v3.8h,   v17.8h,  v0.h[4] // p1(top[0]) * filter(1)
1445        mla             v3.8h,   v18.8h,  v0.h[5] // p2(top[1]) * filter(2)
1446        mla             v3.8h,   v19.8h,  v0.h[6] // p3(top[2]) * filter(3)
1447        srshr           v2.8h,   v2.8h,   #4
1448        smax            v2.8h,   v2.8h,   v30.8h
1449        smin            v2.8h,   v2.8h,   v31.8h
1450        mla             v3.8h,   v20.8h,  v0.h[7] // p4(top[3]) * filter(4)
1451        mla             v3.8h,   v16.8h,  v0.h[3] // p0(topleft) * filter(0)
1452        mla             v3.8h,   v21.8h,  v2.h[3] // p5(left[0]) * filter(5)
1453        mla             v3.8h,   v22.8h,  v2.h[7] // p6(left[1]) * filter(6)
1454        srshr           v3.8h,   v3.8h,   #4
1455        smax            v3.8h,   v3.8h,   v30.8h
1456.else
1457        smull           v2.4s,   v17.4h,  v0.h[0] // p1(top[0]) * filter(1)
1458        smlal           v2.4s,   v18.4h,  v0.h[1] // p2(top[1]) * filter(2)
1459        smlal           v2.4s,   v19.4h,  v0.h[2] // p3(top[2]) * filter(3)
1460        smlal           v2.4s,   v20.4h,  v0.h[3] // p4(top[3]) * filter(4)
1461        smlal           v2.4s,   v16.4h,  v1.h[2] // p0(topleft) * filter(0)
1462        smlal           v2.4s,   v21.4h,  v1.h[1] // p5(left[0]) * filter(5)
1463        smlal           v2.4s,   v22.4h,  v1.h[0] // p6(left[1]) * filter(6)
1464        smull2          v3.4s,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
1465        smlal2          v3.4s,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
1466        smlal2          v3.4s,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
1467        smlal2          v3.4s,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
1468        smlal2          v3.4s,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
1469        smlal2          v3.4s,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
1470        smlal2          v3.4s,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
1471        smull           v4.4s,   v17.4h,  v0.h[4] // p1(top[0]) * filter(1)
1472        smlal           v4.4s,   v18.4h,  v0.h[5] // p2(top[1]) * filter(2)
1473        smlal           v4.4s,   v19.4h,  v0.h[6] // p3(top[2]) * filter(3)
1474        sqrshrun        v2.4h,   v2.4s,   #4
1475        sqrshrun2       v2.8h,   v3.4s,   #4
1476        smin            v2.8h,   v2.8h,   v31.8h
1477        smlal           v4.4s,   v20.4h,  v0.h[7] // p4(top[3]) * filter(4)
1478        smlal           v4.4s,   v16.4h,  v0.h[3] // p0(topleft) * filter(0)
1479        smlal           v4.4s,   v21.4h,  v2.h[3] // p5(left[0]) * filter(5)
1480        smlal           v4.4s,   v22.4h,  v2.h[7] // p6(left[1]) * filter(6)
1481        smull2          v5.4s,   v17.8h,  v0.h[4] // p1(top[0]) * filter(1)
1482        smlal2          v5.4s,   v18.8h,  v0.h[5] // p2(top[1]) * filter(2)
1483        smlal2          v5.4s,   v19.8h,  v0.h[6] // p3(top[2]) * filter(3)
1484        smlal2          v5.4s,   v20.8h,  v0.h[7] // p4(top[3]) * filter(4)
1485        smlal2          v5.4s,   v16.8h,  v0.h[3] // p0(topleft) * filter(0)
1486        smlal2          v5.4s,   v21.8h,  v2.h[3] // p5(left[0]) * filter(5)
1487        smlal2          v5.4s,   v22.8h,  v2.h[7] // p6(left[1]) * filter(6)
1488        sqrshrun        v3.4h,   v4.4s,   #4
1489        sqrshrun2       v3.8h,   v5.4s,   #4
1490.endif
1491        smin            v3.8h,   v3.8h,   v31.8h
1492        subs            w4,  w4,  #2
1493        st2             {v2.d, v3.d}[0], [x0], x1
1494        zip2            v0.2d,   v2.2d,   v3.2d
1495        st2             {v2.d, v3.d}[1], [x6], x1
1496        b.gt            8b
1497        ret
1498160:
1499320:
1500        add             x8,  x2,  #2
1501        sub             x2,  x2,  #4
1502        mov             x7,  #-4
1503        sub             x1,  x1,  w3, uxtw #1
1504        mov             w9,  w3
1505
15061:
1507        ld1             {v0.4h}, [x2], x7         // left (0-1) + topleft (2)
15082:
1509        ld1             {v1.8h, v2.8h}, [x8], #32 // top(0-15)
1510.if \bpc == 10
1511        mul             v3.8h,   v16.8h,  v0.h[2] // p0(topleft) * filter(0)
1512        mla             v3.8h,   v21.8h,  v0.h[1] // p5(left[0]) * filter(5)
1513        mla             v3.8h,   v22.8h,  v0.h[0] // p6(left[1]) * filter(6)
1514        mla             v3.8h,   v17.8h,  v1.h[0] // p1(top[0]) * filter(1)
1515        mla             v3.8h,   v18.8h,  v1.h[1] // p2(top[1]) * filter(2)
1516        mla             v3.8h,   v19.8h,  v1.h[2] // p3(top[2]) * filter(3)
1517        mla             v3.8h,   v20.8h,  v1.h[3] // p4(top[3]) * filter(4)
1518
1519        mul             v4.8h,   v17.8h,  v1.h[4] // p1(top[0]) * filter(1)
1520        mla             v4.8h,   v18.8h,  v1.h[5] // p2(top[1]) * filter(2)
1521        mla             v4.8h,   v19.8h,  v1.h[6] // p3(top[2]) * filter(3)
1522        srshr           v3.8h,   v3.8h,   #4
1523        smax            v3.8h,   v3.8h,   v30.8h
1524        smin            v3.8h,   v3.8h,   v31.8h
1525        mla             v4.8h,   v20.8h,  v1.h[7] // p4(top[3]) * filter(4)
1526        mla             v4.8h,   v16.8h,  v1.h[3] // p0(topleft) * filter(0)
1527        mla             v4.8h,   v21.8h,  v3.h[3] // p5(left[0]) * filter(5)
1528        mla             v4.8h,   v22.8h,  v3.h[7] // p6(left[1]) * filter(6)
1529
1530        mul             v5.8h,   v17.8h,  v2.h[0] // p1(top[0]) * filter(1)
1531        mla             v5.8h,   v18.8h,  v2.h[1] // p2(top[1]) * filter(2)
1532        mla             v5.8h,   v19.8h,  v2.h[2] // p3(top[2]) * filter(3)
1533        srshr           v4.8h,   v4.8h,   #4
1534        smax            v4.8h,   v4.8h,   v30.8h
1535        smin            v4.8h,   v4.8h,   v31.8h
1536        mla             v5.8h,   v20.8h,  v2.h[3] // p4(top[3]) * filter(4)
1537        mla             v5.8h,   v16.8h,  v1.h[7] // p0(topleft) * filter(0)
1538        mla             v5.8h,   v21.8h,  v4.h[3] // p5(left[0]) * filter(5)
1539        mla             v5.8h,   v22.8h,  v4.h[7] // p6(left[1]) * filter(6)
1540
1541        mul             v6.8h,   v17.8h,  v2.h[4] // p1(top[0]) * filter(1)
1542        mla             v6.8h,   v18.8h,  v2.h[5] // p2(top[1]) * filter(2)
1543        mla             v6.8h,   v19.8h,  v2.h[6] // p3(top[2]) * filter(3)
1544        srshr           v5.8h,   v5.8h,   #4
1545        smax            v5.8h,   v5.8h,   v30.8h
1546        smin            v5.8h,   v5.8h,   v31.8h
1547        mla             v6.8h,   v20.8h,  v2.h[7] // p4(top[3]) * filter(4)
1548        mla             v6.8h,   v16.8h,  v2.h[3] // p0(topleft) * filter(0)
1549        mla             v6.8h,   v21.8h,  v5.h[3] // p5(left[0]) * filter(5)
1550        mla             v6.8h,   v22.8h,  v5.h[7] // p6(left[1]) * filter(6)
1551
1552        subs            w3,  w3,  #16
1553        srshr           v6.8h,   v6.8h,   #4
1554        smax            v6.8h,   v6.8h,   v30.8h
1555.else
1556        smull           v3.4s,   v16.4h,  v0.h[2] // p0(topleft) * filter(0)
1557        smlal           v3.4s,   v21.4h,  v0.h[1] // p5(left[0]) * filter(5)
1558        smlal           v3.4s,   v22.4h,  v0.h[0] // p6(left[1]) * filter(6)
1559        smlal           v3.4s,   v17.4h,  v1.h[0] // p1(top[0]) * filter(1)
1560        smlal           v3.4s,   v18.4h,  v1.h[1] // p2(top[1]) * filter(2)
1561        smlal           v3.4s,   v19.4h,  v1.h[2] // p3(top[2]) * filter(3)
1562        smlal           v3.4s,   v20.4h,  v1.h[3] // p4(top[3]) * filter(4)
1563        smull2          v4.4s,   v16.8h,  v0.h[2] // p0(topleft) * filter(0)
1564        smlal2          v4.4s,   v21.8h,  v0.h[1] // p5(left[0]) * filter(5)
1565        smlal2          v4.4s,   v22.8h,  v0.h[0] // p6(left[1]) * filter(6)
1566        smlal2          v4.4s,   v17.8h,  v1.h[0] // p1(top[0]) * filter(1)
1567        smlal2          v4.4s,   v18.8h,  v1.h[1] // p2(top[1]) * filter(2)
1568        smlal2          v4.4s,   v19.8h,  v1.h[2] // p3(top[2]) * filter(3)
1569        smlal2          v4.4s,   v20.8h,  v1.h[3] // p4(top[3]) * filter(4)
1570
1571        smull           v5.4s,   v17.4h,  v1.h[4] // p1(top[0]) * filter(1)
1572        smlal           v5.4s,   v18.4h,  v1.h[5] // p2(top[1]) * filter(2)
1573        smlal           v5.4s,   v19.4h,  v1.h[6] // p3(top[2]) * filter(3)
1574        sqrshrun        v3.4h,   v3.4s,   #4
1575        sqrshrun2       v3.8h,   v4.4s,   #4
1576        smin            v3.8h,   v3.8h,   v31.8h
1577        smlal           v5.4s,   v20.4h,  v1.h[7] // p4(top[3]) * filter(4)
1578        smlal           v5.4s,   v16.4h,  v1.h[3] // p0(topleft) * filter(0)
1579        smlal           v5.4s,   v21.4h,  v3.h[3] // p5(left[0]) * filter(5)
1580        smlal           v5.4s,   v22.4h,  v3.h[7] // p6(left[1]) * filter(6)
1581        smull2          v6.4s,   v17.8h,  v1.h[4] // p1(top[0]) * filter(1)
1582        smlal2          v6.4s,   v18.8h,  v1.h[5] // p2(top[1]) * filter(2)
1583        smlal2          v6.4s,   v19.8h,  v1.h[6] // p3(top[2]) * filter(3)
1584        smlal2          v6.4s,   v20.8h,  v1.h[7] // p4(top[3]) * filter(4)
1585        smlal2          v6.4s,   v16.8h,  v1.h[3] // p0(topleft) * filter(0)
1586        smlal2          v6.4s,   v21.8h,  v3.h[3] // p5(left[0]) * filter(5)
1587        smlal2          v6.4s,   v22.8h,  v3.h[7] // p6(left[1]) * filter(6)
1588
1589        smull           v24.4s,  v17.4h,  v2.h[0] // p1(top[0]) * filter(1)
1590        smlal           v24.4s,  v18.4h,  v2.h[1] // p2(top[1]) * filter(2)
1591        smlal           v24.4s,  v19.4h,  v2.h[2] // p3(top[2]) * filter(3)
1592        sqrshrun        v4.4h,   v5.4s,   #4
1593        sqrshrun2       v4.8h,   v6.4s,   #4
1594        smin            v4.8h,   v4.8h,   v31.8h
1595        smlal           v24.4s,  v20.4h,  v2.h[3] // p4(top[3]) * filter(4)
1596        smlal           v24.4s,  v16.4h,  v1.h[7] // p0(topleft) * filter(0)
1597        smlal           v24.4s,  v21.4h,  v4.h[3] // p5(left[0]) * filter(5)
1598        smlal           v24.4s,  v22.4h,  v4.h[7] // p6(left[1]) * filter(6)
1599        smull2          v25.4s,  v17.8h,  v2.h[0] // p1(top[0]) * filter(1)
1600        smlal2          v25.4s,  v18.8h,  v2.h[1] // p2(top[1]) * filter(2)
1601        smlal2          v25.4s,  v19.8h,  v2.h[2] // p3(top[2]) * filter(3)
1602        smlal2          v25.4s,  v20.8h,  v2.h[3] // p4(top[3]) * filter(4)
1603        smlal2          v25.4s,  v16.8h,  v1.h[7] // p0(topleft) * filter(0)
1604        smlal2          v25.4s,  v21.8h,  v4.h[3] // p5(left[0]) * filter(5)
1605        smlal2          v25.4s,  v22.8h,  v4.h[7] // p6(left[1]) * filter(6)
1606
1607        smull           v26.4s,  v17.4h,  v2.h[4] // p1(top[0]) * filter(1)
1608        smlal           v26.4s,  v18.4h,  v2.h[5] // p2(top[1]) * filter(2)
1609        smlal           v26.4s,  v19.4h,  v2.h[6] // p3(top[2]) * filter(3)
1610        sqrshrun        v5.4h,   v24.4s,  #4
1611        sqrshrun2       v5.8h,   v25.4s,  #4
1612        smin            v5.8h,   v5.8h,   v31.8h
1613        smlal           v26.4s,  v20.4h,  v2.h[7] // p4(top[3]) * filter(4)
1614        smlal           v26.4s,  v16.4h,  v2.h[3] // p0(topleft) * filter(0)
1615        smlal           v26.4s,  v21.4h,  v5.h[3] // p5(left[0]) * filter(5)
1616        smlal           v26.4s,  v22.4h,  v5.h[7] // p6(left[1]) * filter(6)
1617        smull2          v27.4s,  v17.8h,  v2.h[4] // p1(top[0]) * filter(1)
1618        smlal2          v27.4s,  v18.8h,  v2.h[5] // p2(top[1]) * filter(2)
1619        smlal2          v27.4s,  v19.8h,  v2.h[6] // p3(top[2]) * filter(3)
1620        smlal2          v27.4s,  v20.8h,  v2.h[7] // p4(top[3]) * filter(4)
1621        smlal2          v27.4s,  v16.8h,  v2.h[3] // p0(topleft) * filter(0)
1622        smlal2          v27.4s,  v21.8h,  v5.h[3] // p5(left[0]) * filter(5)
1623        smlal2          v27.4s,  v22.8h,  v5.h[7] // p6(left[1]) * filter(6)
1624
1625        subs            w3,  w3,  #16
1626        sqrshrun        v6.4h,   v26.4s,  #4
1627        sqrshrun2       v6.8h,   v27.4s,  #4
1628.endif
1629        smin            v6.8h,   v6.8h,   v31.8h
1630
1631        ins             v0.h[2], v2.h[7]
1632        st4             {v3.d, v4.d, v5.d, v6.d}[0], [x0], #32
1633        ins             v0.h[0], v6.h[7]
1634        st4             {v3.d, v4.d, v5.d, v6.d}[1], [x6], #32
1635        ins             v0.h[1], v6.h[3]
1636        b.gt            2b
1637        subs            w4,  w4,  #2
1638        b.le            9f
1639        sub             x8,  x6,  w9, uxtw #1
1640        add             x0,  x0,  x1
1641        add             x6,  x6,  x1
1642        mov             w3,  w9
1643        b               1b
16449:
1645        ret
1646
1647L(ipred_filter\bpc\()_tbl):
1648        .hword L(ipred_filter\bpc\()_tbl) - 320b
1649        .hword L(ipred_filter\bpc\()_tbl) - 160b
1650        .hword L(ipred_filter\bpc\()_tbl) -  80b
1651        .hword L(ipred_filter\bpc\()_tbl) -  40b
1652endfunc
1653.endm
1654
1655filter_fn 10
1656filter_fn 12
1657
1658function ipred_filter_16bpc_neon, export=1
1659        ldr             w8,  [sp]
1660        cmp             w8,  0x3ff
1661        b.le            ipred_filter_10bpc_neon
1662        b               ipred_filter_12bpc_neon
1663endfunc
1664
1665// void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride,
1666//                          const uint16_t *const pal, const uint8_t *idx,
1667//                          const int w, const int h);
1668function pal_pred_16bpc_neon, export=1
1669        ld1             {v30.8h}, [x2]
1670        clz             w9,  w4
1671        adr             x6,  L(pal_pred_tbl)
1672        sub             w9,  w9,  #25
1673        ldrh            w9,  [x6, w9, uxtw #1]
1674        movi            v31.8h,  #1, lsl #8
1675        sub             x6,  x6,  w9, uxtw
1676        br              x6
167740:
1678        add             x2,  x0,  x1
1679        lsl             x1,  x1,  #1
16804:
1681        ld1             {v1.16b}, [x3], #16
1682        subs            w5,  w5,  #4
1683        // Restructure v1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ...
1684        add             v1.16b,  v1.16b,  v1.16b
1685        zip1            v0.16b,  v1.16b,  v1.16b
1686        zip2            v1.16b,  v1.16b,  v1.16b
1687        add             v0.8h,   v0.8h,   v31.8h
1688        add             v1.8h,   v1.8h,   v31.8h
1689        tbl             v0.16b, {v30.16b}, v0.16b
1690        st1             {v0.d}[0], [x0], x1
1691        tbl             v1.16b, {v30.16b}, v1.16b
1692        st1             {v0.d}[1], [x2], x1
1693        st1             {v1.d}[0], [x0], x1
1694        st1             {v1.d}[1], [x2], x1
1695        b.gt            4b
1696        ret
169780:
1698        add             x2,  x0,  x1
1699        lsl             x1,  x1,  #1
17008:
1701        ld1             {v2.16b, v3.16b}, [x3], #32
1702        subs            w5,  w5,  #4
1703        add             v2.16b,  v2.16b,  v2.16b
1704        add             v3.16b,  v3.16b,  v3.16b
1705        zip1            v0.16b,  v2.16b,  v2.16b
1706        zip2            v1.16b,  v2.16b,  v2.16b
1707        zip1            v2.16b,  v3.16b,  v3.16b
1708        zip2            v3.16b,  v3.16b,  v3.16b
1709        add             v0.8h,   v0.8h,   v31.8h
1710        add             v1.8h,   v1.8h,   v31.8h
1711        add             v2.8h,   v2.8h,   v31.8h
1712        add             v3.8h,   v3.8h,   v31.8h
1713        tbl             v0.16b, {v30.16b}, v0.16b
1714        tbl             v1.16b, {v30.16b}, v1.16b
1715        st1             {v0.8h}, [x0], x1
1716        tbl             v2.16b, {v30.16b}, v2.16b
1717        st1             {v1.8h}, [x2], x1
1718        tbl             v3.16b, {v30.16b}, v3.16b
1719        st1             {v2.8h}, [x0], x1
1720        st1             {v3.8h}, [x2], x1
1721        b.gt            8b
1722        ret
1723160:
1724        add             x2,  x0,  x1
1725        lsl             x1,  x1,  #1
172616:
1727        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64
1728        subs            w5,  w5,  #4
1729        add             v4.16b,  v4.16b,  v4.16b
1730        add             v5.16b,  v5.16b,  v5.16b
1731        add             v6.16b,  v6.16b,  v6.16b
1732        add             v7.16b,  v7.16b,  v7.16b
1733        zip1            v0.16b,  v4.16b,  v4.16b
1734        zip2            v1.16b,  v4.16b,  v4.16b
1735        zip1            v2.16b,  v5.16b,  v5.16b
1736        zip2            v3.16b,  v5.16b,  v5.16b
1737        zip1            v4.16b,  v6.16b,  v6.16b
1738        zip2            v5.16b,  v6.16b,  v6.16b
1739        zip1            v6.16b,  v7.16b,  v7.16b
1740        zip2            v7.16b,  v7.16b,  v7.16b
1741        add             v0.8h,   v0.8h,   v31.8h
1742        add             v1.8h,   v1.8h,   v31.8h
1743        add             v2.8h,   v2.8h,   v31.8h
1744        add             v3.8h,   v3.8h,   v31.8h
1745        add             v4.8h,   v4.8h,   v31.8h
1746        tbl             v0.16b, {v30.16b}, v0.16b
1747        add             v5.8h,   v5.8h,   v31.8h
1748        tbl             v1.16b, {v30.16b}, v1.16b
1749        add             v6.8h,   v6.8h,   v31.8h
1750        tbl             v2.16b, {v30.16b}, v2.16b
1751        add             v7.8h,   v7.8h,   v31.8h
1752        tbl             v3.16b, {v30.16b}, v3.16b
1753        tbl             v4.16b, {v30.16b}, v4.16b
1754        tbl             v5.16b, {v30.16b}, v5.16b
1755        st1             {v0.8h, v1.8h}, [x0], x1
1756        tbl             v6.16b, {v30.16b}, v6.16b
1757        st1             {v2.8h, v3.8h}, [x2], x1
1758        tbl             v7.16b, {v30.16b}, v7.16b
1759        st1             {v4.8h, v5.8h}, [x0], x1
1760        st1             {v6.8h, v7.8h}, [x2], x1
1761        b.gt            16b
1762        ret
1763320:
1764        add             x2,  x0,  x1
1765        lsl             x1,  x1,  #1
176632:
1767        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64
1768        subs            w5,  w5,  #2
1769        add             v4.16b,  v4.16b,  v4.16b
1770        add             v5.16b,  v5.16b,  v5.16b
1771        add             v6.16b,  v6.16b,  v6.16b
1772        add             v7.16b,  v7.16b,  v7.16b
1773        zip1            v0.16b,  v4.16b,  v4.16b
1774        zip2            v1.16b,  v4.16b,  v4.16b
1775        zip1            v2.16b,  v5.16b,  v5.16b
1776        zip2            v3.16b,  v5.16b,  v5.16b
1777        zip1            v4.16b,  v6.16b,  v6.16b
1778        zip2            v5.16b,  v6.16b,  v6.16b
1779        zip1            v6.16b,  v7.16b,  v7.16b
1780        zip2            v7.16b,  v7.16b,  v7.16b
1781        add             v0.8h,   v0.8h,   v31.8h
1782        add             v1.8h,   v1.8h,   v31.8h
1783        add             v2.8h,   v2.8h,   v31.8h
1784        add             v3.8h,   v3.8h,   v31.8h
1785        add             v4.8h,   v4.8h,   v31.8h
1786        tbl             v0.16b, {v30.16b}, v0.16b
1787        add             v5.8h,   v5.8h,   v31.8h
1788        tbl             v1.16b, {v30.16b}, v1.16b
1789        add             v6.8h,   v6.8h,   v31.8h
1790        tbl             v2.16b, {v30.16b}, v2.16b
1791        add             v7.8h,   v7.8h,   v31.8h
1792        tbl             v3.16b, {v30.16b}, v3.16b
1793        tbl             v4.16b, {v30.16b}, v4.16b
1794        tbl             v5.16b, {v30.16b}, v5.16b
1795        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
1796        tbl             v6.16b, {v30.16b}, v6.16b
1797        tbl             v7.16b, {v30.16b}, v7.16b
1798        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1
1799        b.gt            32b
1800        ret
1801640:
1802        add             x2,  x0,  #64
180364:
1804        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64
1805        subs            w5,  w5,  #1
1806        add             v4.16b,  v4.16b,  v4.16b
1807        add             v5.16b,  v5.16b,  v5.16b
1808        add             v6.16b,  v6.16b,  v6.16b
1809        add             v7.16b,  v7.16b,  v7.16b
1810        zip1            v0.16b,  v4.16b,  v4.16b
1811        zip2            v1.16b,  v4.16b,  v4.16b
1812        zip1            v2.16b,  v5.16b,  v5.16b
1813        zip2            v3.16b,  v5.16b,  v5.16b
1814        zip1            v4.16b,  v6.16b,  v6.16b
1815        zip2            v5.16b,  v6.16b,  v6.16b
1816        zip1            v6.16b,  v7.16b,  v7.16b
1817        zip2            v7.16b,  v7.16b,  v7.16b
1818        add             v0.8h,   v0.8h,   v31.8h
1819        add             v1.8h,   v1.8h,   v31.8h
1820        add             v2.8h,   v2.8h,   v31.8h
1821        add             v3.8h,   v3.8h,   v31.8h
1822        add             v4.8h,   v4.8h,   v31.8h
1823        tbl             v0.16b, {v30.16b}, v0.16b
1824        add             v5.8h,   v5.8h,   v31.8h
1825        tbl             v1.16b, {v30.16b}, v1.16b
1826        add             v6.8h,   v6.8h,   v31.8h
1827        tbl             v2.16b, {v30.16b}, v2.16b
1828        add             v7.8h,   v7.8h,   v31.8h
1829        tbl             v3.16b, {v30.16b}, v3.16b
1830        tbl             v4.16b, {v30.16b}, v4.16b
1831        tbl             v5.16b, {v30.16b}, v5.16b
1832        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
1833        tbl             v6.16b, {v30.16b}, v6.16b
1834        tbl             v7.16b, {v30.16b}, v7.16b
1835        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1
1836        b.gt            64b
1837        ret
1838
1839L(pal_pred_tbl):
1840        .hword L(pal_pred_tbl) - 640b
1841        .hword L(pal_pred_tbl) - 320b
1842        .hword L(pal_pred_tbl) - 160b
1843        .hword L(pal_pred_tbl) -  80b
1844        .hword L(pal_pred_tbl) -  40b
1845endfunc
1846
1847// void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
1848//                               const pixel *const topleft,
1849//                               const int width, const int height,
1850//                               const int16_t *ac, const int alpha,
1851//                               const int bitdepth_max);
1852function ipred_cfl_128_16bpc_neon, export=1
1853        dup             v31.8h,  w7   // bitdepth_max
1854        clz             w9,  w3
1855        adr             x7,  L(ipred_cfl_128_tbl)
1856        sub             w9,  w9,  #26
1857        ldrh            w9,  [x7, w9, uxtw #1]
1858        urshr           v0.8h,   v31.8h,  #1
1859        dup             v1.8h,   w6   // alpha
1860        sub             x7,  x7,  w9, uxtw
1861        add             x6,  x0,  x1
1862        lsl             x1,  x1,  #1
1863        movi            v30.8h,  #0
1864        br              x7
1865L(ipred_cfl_splat_w4):
1866        ld1             {v4.8h, v5.8h}, [x5], #32
1867        subs            w4,  w4,  #4
1868        smull           v2.4s,   v4.4h,   v1.4h  // diff = ac * alpha
1869        smull2          v3.4s,   v4.8h,   v1.8h
1870        smull           v4.4s,   v5.4h,   v1.4h
1871        smull2          v5.4s,   v5.8h,   v1.8h
1872        sshr            v16.4s,  v2.4s,   #31    // sign = diff >> 31
1873        sshr            v17.4s,  v3.4s,   #31
1874        sshr            v18.4s,  v4.4s,   #31
1875        sshr            v19.4s,  v5.4s,   #31
1876        add             v2.4s,   v2.4s,   v16.4s // diff + sign
1877        add             v3.4s,   v3.4s,   v17.4s
1878        add             v4.4s,   v4.4s,   v18.4s
1879        add             v5.4s,   v5.4s,   v19.4s
1880        rshrn           v2.4h,   v2.4s,   #6     // (diff + sign + 32) >> 6 = apply_sign()
1881        rshrn2          v2.8h,   v3.4s,   #6
1882        rshrn           v3.4h,   v4.4s,   #6
1883        rshrn2          v3.8h,   v5.4s,   #6
1884        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
1885        add             v3.8h,   v3.8h,   v0.8h
1886        smax            v2.8h,   v2.8h,   v30.8h
1887        smax            v3.8h,   v3.8h,   v30.8h
1888        smin            v2.8h,   v2.8h,   v31.8h
1889        smin            v3.8h,   v3.8h,   v31.8h
1890        st1             {v2.d}[0],  [x0], x1
1891        st1             {v2.d}[1],  [x6], x1
1892        st1             {v3.d}[0],  [x0], x1
1893        st1             {v3.d}[1],  [x6], x1
1894        b.gt            L(ipred_cfl_splat_w4)
1895        ret
1896L(ipred_cfl_splat_w8):
1897        ld1             {v4.8h, v5.8h}, [x5], #32
1898        subs            w4,  w4,  #2
1899        smull           v2.4s,   v4.4h,   v1.4h  // diff = ac * alpha
1900        smull2          v3.4s,   v4.8h,   v1.8h
1901        smull           v4.4s,   v5.4h,   v1.4h
1902        smull2          v5.4s,   v5.8h,   v1.8h
1903        sshr            v16.4s,  v2.4s,   #31    // sign = diff >> 31
1904        sshr            v17.4s,  v3.4s,   #31
1905        sshr            v18.4s,  v4.4s,   #31
1906        sshr            v19.4s,  v5.4s,   #31
1907        add             v2.4s,   v2.4s,   v16.4s // diff + sign
1908        add             v3.4s,   v3.4s,   v17.4s
1909        add             v4.4s,   v4.4s,   v18.4s
1910        add             v5.4s,   v5.4s,   v19.4s
1911        rshrn           v2.4h,   v2.4s,   #6     // (diff + sign + 32) >> 6 = apply_sign()
1912        rshrn2          v2.8h,   v3.4s,   #6
1913        rshrn           v3.4h,   v4.4s,   #6
1914        rshrn2          v3.8h,   v5.4s,   #6
1915        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
1916        add             v3.8h,   v3.8h,   v0.8h
1917        smax            v2.8h,   v2.8h,   v30.8h
1918        smax            v3.8h,   v3.8h,   v30.8h
1919        smin            v2.8h,   v2.8h,   v31.8h
1920        smin            v3.8h,   v3.8h,   v31.8h
1921        st1             {v2.8h},  [x0], x1
1922        st1             {v3.8h},  [x6], x1
1923        b.gt            L(ipred_cfl_splat_w8)
1924        ret
1925L(ipred_cfl_splat_w16):
1926        add             x7,  x5,  w3, uxtw #1
1927        sub             x1,  x1,  w3, uxtw #1
1928        mov             w9,  w3
19291:
1930        ld1             {v2.8h, v3.8h}, [x5], #32
1931        ld1             {v4.8h, v5.8h}, [x7], #32
1932        subs            w3,  w3,  #16
1933        smull           v16.4s,  v2.4h,   v1.4h  // diff = ac * alpha
1934        smull2          v17.4s,  v2.8h,   v1.8h
1935        smull           v18.4s,  v3.4h,   v1.4h
1936        smull2          v19.4s,  v3.8h,   v1.8h
1937        smull           v2.4s,   v4.4h,   v1.4h
1938        smull2          v3.4s,   v4.8h,   v1.8h
1939        smull           v4.4s,   v5.4h,   v1.4h
1940        smull2          v5.4s,   v5.8h,   v1.8h
1941        sshr            v20.4s,  v16.4s,  #31    // sign = diff >> 31
1942        sshr            v21.4s,  v17.4s,  #31
1943        sshr            v22.4s,  v18.4s,  #31
1944        sshr            v23.4s,  v19.4s,  #31
1945        sshr            v24.4s,  v2.4s,   #31
1946        sshr            v25.4s,  v3.4s,   #31
1947        sshr            v26.4s,  v4.4s,   #31
1948        sshr            v27.4s,  v5.4s,   #31
1949        add             v16.4s,  v16.4s,  v20.4s // diff + sign
1950        add             v17.4s,  v17.4s,  v21.4s
1951        add             v18.4s,  v18.4s,  v22.4s
1952        add             v19.4s,  v19.4s,  v23.4s
1953        add             v2.4s,   v2.4s,   v24.4s
1954        add             v3.4s,   v3.4s,   v25.4s
1955        add             v4.4s,   v4.4s,   v26.4s
1956        add             v5.4s,   v5.4s,   v27.4s
1957        rshrn           v16.4h,  v16.4s,  #6     // (diff + sign + 32) >> 6 = apply_sign()
1958        rshrn2          v16.8h,  v17.4s,  #6
1959        rshrn           v17.4h,  v18.4s,  #6
1960        rshrn2          v17.8h,  v19.4s,  #6
1961        rshrn           v6.4h,   v2.4s,   #6
1962        rshrn2          v6.8h,   v3.4s,   #6
1963        rshrn           v7.4h,   v4.4s,   #6
1964        rshrn2          v7.8h,   v5.4s,   #6
1965        add             v2.8h,   v16.8h,  v0.8h  // dc + apply_sign()
1966        add             v3.8h,   v17.8h,  v0.8h
1967        add             v4.8h,   v6.8h,   v0.8h
1968        add             v5.8h,   v7.8h,   v0.8h
1969        smax            v2.8h,   v2.8h,   v30.8h
1970        smax            v3.8h,   v3.8h,   v30.8h
1971        smax            v4.8h,   v4.8h,   v30.8h
1972        smax            v5.8h,   v5.8h,   v30.8h
1973        smin            v2.8h,   v2.8h,   v31.8h
1974        smin            v3.8h,   v3.8h,   v31.8h
1975        smin            v4.8h,   v4.8h,   v31.8h
1976        smin            v5.8h,   v5.8h,   v31.8h
1977        st1             {v2.8h, v3.8h},  [x0], #32
1978        st1             {v4.8h, v5.8h},  [x6], #32
1979        b.gt            1b
1980        subs            w4,  w4,  #2
1981        add             x5,  x5,  w9, uxtw #1
1982        add             x7,  x7,  w9, uxtw #1
1983        add             x0,  x0,  x1
1984        add             x6,  x6,  x1
1985        mov             w3,  w9
1986        b.gt            1b
1987        ret
1988
1989L(ipred_cfl_128_tbl):
1990L(ipred_cfl_splat_tbl):
1991        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
1992        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
1993        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8)
1994        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4)
1995endfunc
1996
1997// void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
1998//                               const pixel *const topleft,
1999//                               const int width, const int height,
2000//                               const int16_t *ac, const int alpha,
2001//                               const int bitdepth_max);
2002function ipred_cfl_top_16bpc_neon, export=1
2003        dup             v31.8h,  w7   // bitdepth_max
2004        clz             w9,  w3
2005        adr             x7,  L(ipred_cfl_top_tbl)
2006        sub             w9,  w9,  #26
2007        ldrh            w9,  [x7, w9, uxtw #1]
2008        dup             v1.8h,   w6   // alpha
2009        add             x2,  x2,  #2
2010        sub             x7,  x7,  w9, uxtw
2011        add             x6,  x0,  x1
2012        lsl             x1,  x1,  #1
2013        movi            v30.8h,  #0
2014        br              x7
20154:
2016        ld1             {v0.4h},  [x2]
2017        addv            h0,      v0.4h
2018        urshr           v0.4h,   v0.4h,   #2
2019        dup             v0.8h,   v0.h[0]
2020        b               L(ipred_cfl_splat_w4)
20218:
2022        ld1             {v0.8h},  [x2]
2023        addv            h0,      v0.8h
2024        urshr           v0.4h,   v0.4h,   #3
2025        dup             v0.8h,   v0.h[0]
2026        b               L(ipred_cfl_splat_w8)
202716:
2028        ld1             {v2.8h, v3.8h}, [x2]
2029        addp            v0.8h,   v2.8h,   v3.8h
2030        addv            h0,      v0.8h
2031        urshr           v0.4h,   v0.4h,   #4
2032        dup             v0.8h,   v0.h[0]
2033        b               L(ipred_cfl_splat_w16)
203432:
2035        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
2036        addp            v2.8h,   v2.8h,   v3.8h
2037        addp            v4.8h,   v4.8h,   v5.8h
2038        addp            v0.8h,   v2.8h,   v4.8h
2039        uaddlv          s0,      v0.8h
2040        rshrn           v0.4h,   v0.4s,   #5
2041        dup             v0.8h,   v0.h[0]
2042        b               L(ipred_cfl_splat_w16)
2043
2044L(ipred_cfl_top_tbl):
2045        .hword L(ipred_cfl_top_tbl) - 32b
2046        .hword L(ipred_cfl_top_tbl) - 16b
2047        .hword L(ipred_cfl_top_tbl) -  8b
2048        .hword L(ipred_cfl_top_tbl) -  4b
2049endfunc
2050
2051// void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
2052//                                const pixel *const topleft,
2053//                                const int width, const int height,
2054//                                const int16_t *ac, const int alpha,
2055//                                const int bitdepth_max);
2056function ipred_cfl_left_16bpc_neon, export=1
2057        dup             v31.8h,  w7   // bitdepth_max
2058        sub             x2,  x2,  w4, uxtw #1
2059        clz             w9,  w3
2060        clz             w8,  w4
2061        adr             x10, L(ipred_cfl_splat_tbl)
2062        adr             x7,  L(ipred_cfl_left_tbl)
2063        sub             w9,  w9,  #26
2064        sub             w8,  w8,  #26
2065        ldrh            w9,  [x10, w9, uxtw #1]
2066        ldrh            w8,  [x7,  w8, uxtw #1]
2067        dup             v1.8h,   w6   // alpha
2068        sub             x9,  x10, w9, uxtw
2069        sub             x7,  x7,  w8, uxtw
2070        add             x6,  x0,  x1
2071        lsl             x1,  x1,  #1
2072        movi            v30.8h,  #0
2073        br              x7
2074
2075L(ipred_cfl_left_h4):
2076        ld1             {v0.4h},  [x2]
2077        addv            h0,      v0.4h
2078        urshr           v0.4h,   v0.4h,   #2
2079        dup             v0.8h,   v0.h[0]
2080        br              x9
2081
2082L(ipred_cfl_left_h8):
2083        ld1             {v0.8h},  [x2]
2084        addv            h0,      v0.8h
2085        urshr           v0.4h,   v0.4h,   #3
2086        dup             v0.8h,   v0.h[0]
2087        br              x9
2088
2089L(ipred_cfl_left_h16):
2090        ld1             {v2.8h, v3.8h}, [x2]
2091        addp            v0.8h,   v2.8h,   v3.8h
2092        addv            h0,      v0.8h
2093        urshr           v0.4h,   v0.4h,   #4
2094        dup             v0.8h,   v0.h[0]
2095        br              x9
2096
2097L(ipred_cfl_left_h32):
2098        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
2099        addp            v2.8h,   v2.8h,   v3.8h
2100        addp            v4.8h,   v4.8h,   v5.8h
2101        addp            v0.8h,   v2.8h,   v4.8h
2102        uaddlv          s0,      v0.8h
2103        rshrn           v0.4h,   v0.4s,   #5
2104        dup             v0.8h,   v0.h[0]
2105        br              x9
2106
2107L(ipred_cfl_left_tbl):
2108        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32)
2109        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16)
2110        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8)
2111        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4)
2112endfunc
2113
2114// void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t stride,
2115//                           const pixel *const topleft,
2116//                           const int width, const int height,
2117//                           const int16_t *ac, const int alpha,
2118//                           const int bitdepth_max);
2119function ipred_cfl_16bpc_neon, export=1
2120        dup             v31.8h,  w7              // bitdepth_max
2121        sub             x2,  x2,  w4, uxtw #1
2122        add             w8,  w3,  w4             // width + height
2123        dup             v1.8h,   w6              // alpha
2124        clz             w9,  w3
2125        clz             w6,  w4
2126        dup             v16.4s, w8               // width + height
2127        adr             x7,  L(ipred_cfl_tbl)
2128        rbit            w8,  w8                  // rbit(width + height)
2129        sub             w9,  w9,  #22            // 26 leading bits, minus table offset 4
2130        sub             w6,  w6,  #26
2131        clz             w8,  w8                  // ctz(width + height)
2132        ldrh            w9,  [x7, w9, uxtw #1]
2133        ldrh            w6,  [x7, w6, uxtw #1]
2134        neg             w8,  w8                  // -ctz(width + height)
2135        sub             x9,  x7,  w9, uxtw
2136        sub             x7,  x7,  w6, uxtw
2137        ushr            v16.4s,  v16.4s,  #1     // (width + height) >> 1
2138        dup             v17.4s,  w8              // -ctz(width + height)
2139        add             x6,  x0,  x1
2140        lsl             x1,  x1,  #1
2141        movi            v30.8h,  #0
2142        br              x7
2143
2144L(ipred_cfl_h4):
2145        ld1             {v0.4h},  [x2], #8
2146        uaddlv          s0,      v0.4h
2147        add             x2,  x2,  #2
2148        br              x9
2149L(ipred_cfl_w4):
2150        ld1             {v2.4h},  [x2]
2151        add             v0.2s,   v0.2s,   v16.2s
2152        uaddlv          s2,      v2.4h
2153        cmp             w4,  #4
2154        add             v0.2s,   v0.2s,   v2.2s
2155        ushl            v0.2s,   v0.2s,   v17.2s
2156        b.eq            1f
2157        // h = 8/16
2158        cmp             w4,  #16
2159        mov             w16, #0x6667
2160        mov             w17, #0xAAAB
2161        csel            w16, w16, w17, eq
2162        dup             v16.2s,  w16
2163        mul             v0.2s,   v0.2s,   v16.2s
2164        ushr            v0.2s,   v0.2s,   #17
21651:
2166        dup             v0.8h,   v0.h[0]
2167        b               L(ipred_cfl_splat_w4)
2168
2169L(ipred_cfl_h8):
2170        ld1             {v0.8h},  [x2], #16
2171        uaddlv          s0,      v0.8h
2172        add             x2,  x2,  #2
2173        br              x9
2174L(ipred_cfl_w8):
2175        ld1             {v2.8h},  [x2]
2176        add             v0.2s,   v0.2s,   v16.2s
2177        uaddlv          s2,      v2.8h
2178        cmp             w4,  #8
2179        add             v0.2s,   v0.2s,   v2.2s
2180        ushl            v0.2s,   v0.2s,   v17.2s
2181        b.eq            1f
2182        // h = 4/16/32
2183        cmp             w4,  #32
2184        mov             w16, #0x6667
2185        mov             w17, #0xAAAB
2186        csel            w16, w16, w17, eq
2187        dup             v16.2s,  w16
2188        mul             v0.2s,   v0.2s,   v16.2s
2189        ushr            v0.2s,   v0.2s,   #17
21901:
2191        dup             v0.8h,   v0.h[0]
2192        b               L(ipred_cfl_splat_w8)
2193
2194L(ipred_cfl_h16):
2195        ld1             {v2.8h, v3.8h}, [x2], #32
2196        addp            v0.8h,   v2.8h,   v3.8h
2197        add             x2,  x2,  #2
2198        uaddlv          s0,      v0.8h
2199        br              x9
2200L(ipred_cfl_w16):
2201        ld1             {v2.8h, v3.8h}, [x2]
2202        add             v0.2s,   v0.2s,   v16.2s
2203        addp            v2.8h,   v2.8h,   v3.8h
2204        uaddlv          s2,      v2.8h
2205        cmp             w4,  #16
2206        add             v0.2s,   v0.2s,   v2.2s
2207        ushl            v0.2s,   v0.2s,   v17.2s
2208        b.eq            1f
2209        // h = 4/8/32
2210        tst             w4,  #(32+16+8) // 16 added to make a consecutive bitmask
2211        mov             w16, #0x6667
2212        mov             w17, #0xAAAB
2213        csel            w16, w16, w17, eq
2214        dup             v16.2s,  w16
2215        mul             v0.2s,   v0.2s,   v16.2s
2216        ushr            v0.2s,   v0.2s,   #17
22171:
2218        dup             v0.8h,   v0.h[0]
2219        b               L(ipred_cfl_splat_w16)
2220
2221L(ipred_cfl_h32):
2222        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2], #64
2223        addp            v2.8h,   v2.8h,   v3.8h
2224        addp            v4.8h,   v4.8h,   v5.8h
2225        addp            v0.8h,   v2.8h,   v4.8h
2226        add             x2,  x2,  #2
2227        uaddlv          s0,      v0.8h
2228        br              x9
2229L(ipred_cfl_w32):
2230        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
2231        add             v0.4s,   v0.4s,   v16.4s
2232        addp            v2.8h,   v2.8h,   v3.8h
2233        addp            v4.8h,   v4.8h,   v5.8h
2234        addp            v2.8h,   v2.8h,   v4.8h
2235        cmp             w4,  #32
2236        uaddlv          s2,      v2.8h
2237        add             v0.2s,   v0.2s,   v2.2s
2238        ushl            v0.2s,   v0.2s,   v17.2s
2239        b.eq            1f
2240        // h = 8/16
2241        cmp             w4,  #8
2242        mov             w16, #0x6667
2243        mov             w17, #0xAAAB
2244        csel            w16, w16, w17, eq
2245        dup             v16.2s,  w16
2246        mul             v0.2s,   v0.2s,   v16.2s
2247        ushr            v0.2s,   v0.2s,   #17
22481:
2249        dup             v0.8h,   v0.h[0]
2250        b               L(ipred_cfl_splat_w16)
2251
2252L(ipred_cfl_tbl):
2253        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32)
2254        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16)
2255        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8)
2256        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4)
2257        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32)
2258        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16)
2259        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8)
2260        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4)
2261endfunc
2262
2263// void cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx,
2264//                            const ptrdiff_t stride, const int w_pad,
2265//                            const int h_pad, const int cw, const int ch);
2266function ipred_cfl_ac_420_16bpc_neon, export=1
2267        clz             w8,  w5
2268        lsl             w4,  w4,  #2
2269        adr             x7,  L(ipred_cfl_ac_420_tbl)
2270        sub             w8,  w8,  #27
2271        ldrh            w8,  [x7, w8, uxtw #1]
2272        movi            v24.4s,  #0
2273        movi            v25.4s,  #0
2274        movi            v26.4s,  #0
2275        movi            v27.4s,  #0
2276        sub             x7,  x7,  w8, uxtw
2277        sub             w8,  w6,  w4         // height - h_pad
2278        rbit            w9,  w5              // rbit(width)
2279        rbit            w10, w6              // rbit(height)
2280        clz             w9,  w9              // ctz(width)
2281        clz             w10, w10             // ctz(height)
2282        add             w9,  w9,  w10        // log2sz
2283        add             x10, x1,  x2
2284        dup             v31.4s,  w9
2285        lsl             x2,  x2,  #1
2286        neg             v31.4s,  v31.4s      // -log2sz
2287        br              x7
2288
2289L(ipred_cfl_ac_420_w4):
22901:      // Copy and subsample input
2291        ld1             {v0.8h}, [x1],  x2
2292        ld1             {v1.8h}, [x10], x2
2293        ld1             {v2.8h}, [x1],  x2
2294        ld1             {v3.8h}, [x10], x2
2295        addp            v0.8h,   v0.8h,   v2.8h
2296        addp            v1.8h,   v1.8h,   v3.8h
2297        add             v0.8h,   v0.8h,   v1.8h
2298        shl             v0.8h,   v0.8h,   #1
2299        subs            w8,  w8,  #2
2300        st1             {v0.8h}, [x0], #16
2301        uaddw           v24.4s,  v24.4s,  v0.4h
2302        uaddw2          v25.4s,  v25.4s,  v0.8h
2303        b.gt            1b
2304        trn2            v1.2d,   v0.2d,   v0.2d
2305        trn2            v0.2d,   v0.2d,   v0.2d
2306L(ipred_cfl_ac_420_w4_hpad):
2307        cbz             w4,  3f
23082:      // Vertical padding (h_pad > 0)
2309        subs            w4,  w4,  #4
2310        st1             {v0.8h, v1.8h}, [x0], #32
2311        uaddw           v24.4s,  v24.4s,  v0.4h
2312        uaddw2          v25.4s,  v25.4s,  v0.8h
2313        uaddw           v26.4s,  v26.4s,  v1.4h
2314        uaddw2          v27.4s,  v27.4s,  v1.8h
2315        b.gt            2b
23163:
2317L(ipred_cfl_ac_420_w4_calc_subtract_dc):
2318        // Aggregate the sums
2319        add             v24.4s,  v24.4s,  v25.4s
2320        add             v26.4s,  v26.4s,  v27.4s
2321        add             v0.4s,   v24.4s,  v26.4s
2322        addv            s0,  v0.4s                // sum
2323        sub             x0,  x0,  w6, uxtw #3
2324        urshl           v4.2s,   v0.2s,   v31.2s  // (sum + (1 << (log2sz - 1)))  >>= log2sz
2325        dup             v4.8h,   v4.h[0]
23266:      // Subtract dc from ac
2327        ld1             {v0.8h, v1.8h}, [x0]
2328        subs            w6,  w6,  #4
2329        sub             v0.8h,   v0.8h,   v4.8h
2330        sub             v1.8h,   v1.8h,   v4.8h
2331        st1             {v0.8h, v1.8h}, [x0], #32
2332        b.gt            6b
2333        ret
2334
2335L(ipred_cfl_ac_420_w8):
2336        cbnz            w3,  L(ipred_cfl_ac_420_w8_wpad)
23371:      // Copy and subsample input, without padding
2338        ld1             {v0.8h, v1.8h}, [x1],  x2
2339        ld1             {v2.8h, v3.8h}, [x10], x2
2340        ld1             {v4.8h, v5.8h}, [x1],  x2
2341        addp            v0.8h,   v0.8h,   v1.8h
2342        ld1             {v6.8h, v7.8h}, [x10], x2
2343        addp            v2.8h,   v2.8h,   v3.8h
2344        addp            v4.8h,   v4.8h,   v5.8h
2345        addp            v6.8h,   v6.8h,   v7.8h
2346        add             v0.8h,   v0.8h,   v2.8h
2347        add             v4.8h,   v4.8h,   v6.8h
2348        shl             v0.8h,   v0.8h,   #1
2349        shl             v1.8h,   v4.8h,   #1
2350        subs            w8,  w8,  #2
2351        st1             {v0.8h, v1.8h}, [x0], #32
2352        uaddw           v24.4s,  v24.4s,  v0.4h
2353        uaddw2          v25.4s,  v25.4s,  v0.8h
2354        uaddw           v26.4s,  v26.4s,  v1.4h
2355        uaddw2          v27.4s,  v27.4s,  v1.8h
2356        b.gt            1b
2357        mov             v0.16b,  v1.16b
2358        b               L(ipred_cfl_ac_420_w8_hpad)
2359
2360L(ipred_cfl_ac_420_w8_wpad):
23611:      // Copy and subsample input, padding 4
2362        ld1             {v0.8h}, [x1],  x2
2363        ld1             {v1.8h}, [x10], x2
2364        ld1             {v2.8h}, [x1],  x2
2365        ld1             {v3.8h}, [x10], x2
2366        addp            v0.8h,   v0.8h,   v2.8h
2367        addp            v1.8h,   v1.8h,   v3.8h
2368        add             v0.8h,   v0.8h,   v1.8h
2369        shl             v0.8h,   v0.8h,   #1
2370        dup             v1.4h,   v0.h[3]
2371        dup             v3.4h,   v0.h[7]
2372        trn2            v2.2d,   v0.2d,   v0.2d
2373        subs            w8,  w8,  #2
2374        st1             {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32
2375        uaddw           v24.4s,  v24.4s,  v0.4h
2376        uaddw           v25.4s,  v25.4s,  v1.4h
2377        uaddw           v26.4s,  v26.4s,  v2.4h
2378        uaddw           v27.4s,  v27.4s,  v3.4h
2379        b.gt            1b
2380        trn1            v0.2d,   v2.2d,   v3.2d
2381        trn1            v1.2d,   v2.2d,   v3.2d
2382
2383L(ipred_cfl_ac_420_w8_hpad):
2384        cbz             w4,  3f
23852:      // Vertical padding (h_pad > 0)
2386        subs            w4,  w4,  #4
2387        st1             {v0.8h, v1.8h}, [x0], #32
2388        uaddw           v24.4s,  v24.4s,  v0.4h
2389        uaddw2          v25.4s,  v25.4s,  v0.8h
2390        uaddw           v26.4s,  v26.4s,  v1.4h
2391        uaddw2          v27.4s,  v27.4s,  v1.8h
2392        st1             {v0.8h, v1.8h}, [x0], #32
2393        uaddw           v24.4s,  v24.4s,  v0.4h
2394        uaddw2          v25.4s,  v25.4s,  v0.8h
2395        uaddw           v26.4s,  v26.4s,  v1.4h
2396        uaddw2          v27.4s,  v27.4s,  v1.8h
2397        b.gt            2b
23983:
2399
2400        // Double the height and reuse the w4 summing/subtracting
2401        lsl             w6,  w6,  #1
2402        b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)
2403
2404L(ipred_cfl_ac_420_w16):
2405        adr             x7,  L(ipred_cfl_ac_420_w16_tbl)
2406        ldrh            w3,  [x7, w3, uxtw #1]
2407        sub             x7,  x7,  w3, uxtw
2408        br              x7
2409
2410L(ipred_cfl_ac_420_w16_wpad0):
24111:      // Copy and subsample input, without padding
2412        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x1],  x2
2413        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2
2414        addp            v0.8h,   v0.8h,   v1.8h
2415        addp            v2.8h,   v2.8h,   v3.8h
2416        addp            v4.8h,   v4.8h,   v5.8h
2417        addp            v6.8h,   v6.8h,   v7.8h
2418        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x1],  x2
2419        add             v0.8h,   v0.8h,   v4.8h
2420        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x10], x2
2421        add             v2.8h,   v2.8h,   v6.8h
2422        addp            v16.8h,  v16.8h,  v17.8h
2423        addp            v18.8h,  v18.8h,  v19.8h
2424        addp            v20.8h,  v20.8h,  v21.8h
2425        addp            v22.8h,  v22.8h,  v23.8h
2426        add             v16.8h,  v16.8h,  v20.8h
2427        add             v18.8h,  v18.8h,  v22.8h
2428        shl             v0.8h,   v0.8h,   #1
2429        shl             v1.8h,   v2.8h,   #1
2430        shl             v2.8h,   v16.8h,  #1
2431        shl             v3.8h,   v18.8h,  #1
2432        subs            w8,  w8,  #2
2433        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2434        uaddw           v24.4s,  v24.4s,  v0.4h
2435        uaddw2          v25.4s,  v25.4s,  v0.8h
2436        uaddw           v26.4s,  v26.4s,  v1.4h
2437        uaddw2          v27.4s,  v27.4s,  v1.8h
2438        uaddw           v24.4s,  v24.4s,  v2.4h
2439        uaddw2          v25.4s,  v25.4s,  v2.8h
2440        uaddw           v26.4s,  v26.4s,  v3.4h
2441        uaddw2          v27.4s,  v27.4s,  v3.8h
2442        b.gt            1b
2443        mov             v0.16b,  v2.16b
2444        mov             v1.16b,  v3.16b
2445        b               L(ipred_cfl_ac_420_w16_hpad)
2446
2447L(ipred_cfl_ac_420_w16_wpad1):
24481:      // Copy and subsample input, padding 4
2449        ldr             q2,  [x1,  #32]
2450        ld1             {v0.8h, v1.8h}, [x1],  x2
2451        ldr             q5,  [x10, #32]
2452        ld1             {v3.8h, v4.8h}, [x10], x2
2453        addp            v2.8h,   v2.8h,   v2.8h
2454        addp            v0.8h,   v0.8h,   v1.8h
2455        addp            v5.8h,   v5.8h,   v5.8h
2456        addp            v3.8h,   v3.8h,   v4.8h
2457        ldr             q18, [x1,  #32]
2458        add             v2.4h,   v2.4h,   v5.4h
2459        ld1             {v16.8h, v17.8h}, [x1],  x2
2460        add             v0.8h,   v0.8h,   v3.8h
2461        ldr             q21, [x10, #32]
2462        ld1             {v19.8h, v20.8h}, [x10], x2
2463        addp            v18.8h,  v18.8h,  v18.8h
2464        addp            v16.8h,  v16.8h,  v17.8h
2465        addp            v21.8h,  v21.8h,  v21.8h
2466        addp            v19.8h,  v19.8h,  v20.8h
2467        add             v18.4h,  v18.4h,  v21.4h
2468        add             v16.8h,  v16.8h,  v19.8h
2469        shl             v1.4h,   v2.4h,   #1
2470        shl             v0.8h,   v0.8h,   #1
2471        shl             v3.4h,   v18.4h,  #1
2472        shl             v2.8h,   v16.8h,  #1
2473        dup             v4.4h,   v1.h[3]
2474        dup             v5.4h,   v3.h[3]
2475        trn1            v1.2d,   v1.2d,   v4.2d
2476        trn1            v3.2d,   v3.2d,   v5.2d
2477        subs            w8,  w8,  #2
2478        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2479        uaddw           v24.4s,  v24.4s,  v0.4h
2480        uaddw2          v25.4s,  v25.4s,  v0.8h
2481        uaddw           v26.4s,  v26.4s,  v1.4h
2482        uaddw2          v27.4s,  v27.4s,  v1.8h
2483        uaddw           v24.4s,  v24.4s,  v2.4h
2484        uaddw2          v25.4s,  v25.4s,  v2.8h
2485        uaddw           v26.4s,  v26.4s,  v3.4h
2486        uaddw2          v27.4s,  v27.4s,  v3.8h
2487        b.gt            1b
2488        mov             v0.16b,  v2.16b
2489        mov             v1.16b,  v3.16b
2490        b               L(ipred_cfl_ac_420_w16_hpad)
2491
2492L(ipred_cfl_ac_420_w16_wpad2):
24931:      // Copy and subsample input, padding 8
2494        ld1             {v0.8h, v1.8h}, [x1],  x2
2495        ld1             {v2.8h, v3.8h}, [x10], x2
2496        ld1             {v4.8h, v5.8h}, [x1],  x2
2497        addp            v0.8h,   v0.8h,   v1.8h
2498        ld1             {v6.8h, v7.8h}, [x10], x2
2499        addp            v2.8h,   v2.8h,   v3.8h
2500        addp            v4.8h,   v4.8h,   v5.8h
2501        addp            v6.8h,   v6.8h,   v7.8h
2502        add             v0.8h,   v0.8h,   v2.8h
2503        add             v4.8h,   v4.8h,   v6.8h
2504        shl             v0.8h,   v0.8h,   #1
2505        shl             v2.8h,   v4.8h,   #1
2506        dup             v1.8h,   v0.h[7]
2507        dup             v3.8h,   v2.h[7]
2508        subs            w8,  w8,  #2
2509        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2510        uaddw           v24.4s,  v24.4s,  v0.4h
2511        uaddw2          v25.4s,  v25.4s,  v0.8h
2512        uaddw           v26.4s,  v26.4s,  v1.4h
2513        uaddw2          v27.4s,  v27.4s,  v1.8h
2514        uaddw           v24.4s,  v24.4s,  v2.4h
2515        uaddw2          v25.4s,  v25.4s,  v2.8h
2516        uaddw           v26.4s,  v26.4s,  v3.4h
2517        uaddw2          v27.4s,  v27.4s,  v3.8h
2518        b.gt            1b
2519        mov             v0.16b,  v2.16b
2520        mov             v1.16b,  v3.16b
2521        b               L(ipred_cfl_ac_420_w16_hpad)
2522
2523L(ipred_cfl_ac_420_w16_wpad3):
25241:      // Copy and subsample input, padding 12
2525        ld1             {v0.8h}, [x1],  x2
2526        ld1             {v2.8h}, [x10], x2
2527        ld1             {v4.8h}, [x1],  x2
2528        ld1             {v6.8h}, [x10], x2
2529        addp            v0.8h,   v0.8h,   v4.8h
2530        addp            v2.8h,   v2.8h,   v6.8h
2531        add             v0.8h,   v0.8h,   v2.8h
2532        shl             v0.8h,   v0.8h,   #1
2533        dup             v1.8h,   v0.h[3]
2534        dup             v3.8h,   v0.h[7]
2535        trn2            v2.2d,   v0.2d,   v3.2d
2536        trn1            v0.2d,   v0.2d,   v1.2d
2537        subs            w8,  w8,  #2
2538        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2539        uaddw           v24.4s,  v24.4s,  v0.4h
2540        uaddw2          v25.4s,  v25.4s,  v0.8h
2541        uaddw           v26.4s,  v26.4s,  v1.4h
2542        uaddw2          v27.4s,  v27.4s,  v1.8h
2543        uaddw           v24.4s,  v24.4s,  v2.4h
2544        uaddw2          v25.4s,  v25.4s,  v2.8h
2545        uaddw           v26.4s,  v26.4s,  v3.4h
2546        uaddw2          v27.4s,  v27.4s,  v3.8h
2547        b.gt            1b
2548        mov             v0.16b,  v2.16b
2549        mov             v1.16b,  v3.16b
2550
2551L(ipred_cfl_ac_420_w16_hpad):
2552        cbz             w4,  3f
25532:      // Vertical padding (h_pad > 0)
2554        subs            w4,  w4,  #4
2555        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2556        uaddw           v24.4s,  v24.4s,  v0.4h
2557        uaddw2          v25.4s,  v25.4s,  v0.8h
2558        uaddw           v26.4s,  v26.4s,  v1.4h
2559        uaddw2          v27.4s,  v27.4s,  v1.8h
2560        uaddw           v24.4s,  v24.4s,  v2.4h
2561        uaddw2          v25.4s,  v25.4s,  v2.8h
2562        uaddw           v26.4s,  v26.4s,  v3.4h
2563        uaddw2          v27.4s,  v27.4s,  v3.8h
2564        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2565        uaddw           v24.4s,  v24.4s,  v0.4h
2566        uaddw2          v25.4s,  v25.4s,  v0.8h
2567        uaddw           v26.4s,  v26.4s,  v1.4h
2568        uaddw2          v27.4s,  v27.4s,  v1.8h
2569        uaddw           v24.4s,  v24.4s,  v2.4h
2570        uaddw2          v25.4s,  v25.4s,  v2.8h
2571        uaddw           v26.4s,  v26.4s,  v3.4h
2572        uaddw2          v27.4s,  v27.4s,  v3.8h
2573        b.gt            2b
25743:
2575
2576        // Quadruple the height and reuse the w4 summing/subtracting
2577        lsl             w6,  w6,  #2
2578        b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)
2579
2580L(ipred_cfl_ac_420_tbl):
2581        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16)
2582        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8)
2583        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4)
2584        .hword 0
2585
2586L(ipred_cfl_ac_420_w16_tbl):
2587        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0)
2588        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1)
2589        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2)
2590        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3)
2591endfunc
2592
2593// void cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx,
2594//                            const ptrdiff_t stride, const int w_pad,
2595//                            const int h_pad, const int cw, const int ch);
2596function ipred_cfl_ac_422_16bpc_neon, export=1
2597        clz             w8,  w5
2598        lsl             w4,  w4,  #2
2599        adr             x7,  L(ipred_cfl_ac_422_tbl)
2600        sub             w8,  w8,  #27
2601        ldrh            w8,  [x7, w8, uxtw #1]
2602        movi            v24.4s,  #0
2603        movi            v25.4s,  #0
2604        movi            v26.4s,  #0
2605        movi            v27.4s,  #0
2606        sub             x7,  x7,  w8, uxtw
2607        sub             w8,  w6,  w4         // height - h_pad
2608        rbit            w9,  w5              // rbit(width)
2609        rbit            w10, w6              // rbit(height)
2610        clz             w9,  w9              // ctz(width)
2611        clz             w10, w10             // ctz(height)
2612        add             w9,  w9,  w10        // log2sz
2613        add             x10, x1,  x2
2614        dup             v31.4s,  w9
2615        lsl             x2,  x2,  #1
2616        neg             v31.4s,  v31.4s      // -log2sz
2617        br              x7
2618
2619L(ipred_cfl_ac_422_w4):
26201:      // Copy and subsample input
2621        ld1             {v0.8h}, [x1],  x2
2622        ld1             {v1.8h}, [x10], x2
2623        ld1             {v2.8h}, [x1],  x2
2624        ld1             {v3.8h}, [x10], x2
2625        addp            v0.8h,   v0.8h,   v1.8h
2626        addp            v2.8h,   v2.8h,   v3.8h
2627        shl             v0.8h,   v0.8h,   #2
2628        shl             v1.8h,   v2.8h,   #2
2629        subs            w8,  w8,  #4
2630        st1             {v0.8h, v1.8h}, [x0], #32
2631        uaddw           v24.4s,  v24.4s,  v0.4h
2632        uaddw2          v25.4s,  v25.4s,  v0.8h
2633        uaddw           v26.4s,  v26.4s,  v1.4h
2634        uaddw2          v27.4s,  v27.4s,  v1.8h
2635        b.gt            1b
2636        trn2            v0.2d,   v1.2d,   v1.2d
2637        trn2            v1.2d,   v1.2d,   v1.2d
2638        b               L(ipred_cfl_ac_420_w4_hpad)
2639
2640L(ipred_cfl_ac_422_w8):
2641        cbnz            w3,  L(ipred_cfl_ac_422_w8_wpad)
26421:      // Copy and subsample input, without padding
2643        ld1             {v0.8h, v1.8h}, [x1],  x2
2644        ld1             {v2.8h, v3.8h}, [x10], x2
2645        ld1             {v4.8h, v5.8h}, [x1],  x2
2646        addp            v0.8h,   v0.8h,   v1.8h
2647        ld1             {v6.8h, v7.8h}, [x10], x2
2648        addp            v2.8h,   v2.8h,   v3.8h
2649        addp            v4.8h,   v4.8h,   v5.8h
2650        addp            v6.8h,   v6.8h,   v7.8h
2651        shl             v0.8h,   v0.8h,   #2
2652        shl             v1.8h,   v2.8h,   #2
2653        shl             v2.8h,   v4.8h,   #2
2654        shl             v3.8h,   v6.8h,   #2
2655        subs            w8,  w8,  #4
2656        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2657        uaddw           v24.4s,  v24.4s,  v0.4h
2658        uaddw2          v25.4s,  v25.4s,  v0.8h
2659        uaddw           v26.4s,  v26.4s,  v1.4h
2660        uaddw2          v27.4s,  v27.4s,  v1.8h
2661        uaddw           v24.4s,  v24.4s,  v2.4h
2662        uaddw2          v25.4s,  v25.4s,  v2.8h
2663        uaddw           v26.4s,  v26.4s,  v3.4h
2664        uaddw2          v27.4s,  v27.4s,  v3.8h
2665        b.gt            1b
2666        mov             v0.16b,  v3.16b
2667        mov             v1.16b,  v3.16b
2668        b               L(ipred_cfl_ac_420_w8_hpad)
2669
2670L(ipred_cfl_ac_422_w8_wpad):
26711:      // Copy and subsample input, padding 4
2672        ld1             {v0.8h}, [x1],  x2
2673        ld1             {v1.8h}, [x10], x2
2674        ld1             {v2.8h}, [x1],  x2
2675        ld1             {v3.8h}, [x10], x2
2676        addp            v0.8h,   v0.8h,   v1.8h
2677        addp            v2.8h,   v2.8h,   v3.8h
2678        shl             v0.8h,   v0.8h,   #2
2679        shl             v2.8h,   v2.8h,   #2
2680        dup             v4.4h,   v0.h[3]
2681        dup             v5.8h,   v0.h[7]
2682        dup             v6.4h,   v2.h[3]
2683        dup             v7.8h,   v2.h[7]
2684        trn2            v1.2d,   v0.2d,   v5.2d
2685        trn1            v0.2d,   v0.2d,   v4.2d
2686        trn2            v3.2d,   v2.2d,   v7.2d
2687        trn1            v2.2d,   v2.2d,   v6.2d
2688        subs            w8,  w8,  #4
2689        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2690        uaddw           v24.4s,  v24.4s,  v0.4h
2691        uaddw2          v25.4s,  v25.4s,  v0.8h
2692        uaddw           v26.4s,  v26.4s,  v1.4h
2693        uaddw2          v27.4s,  v27.4s,  v1.8h
2694        uaddw           v24.4s,  v24.4s,  v2.4h
2695        uaddw2          v25.4s,  v25.4s,  v2.8h
2696        uaddw           v26.4s,  v26.4s,  v3.4h
2697        uaddw2          v27.4s,  v27.4s,  v3.8h
2698        b.gt            1b
2699        mov             v0.16b,  v3.16b
2700        mov             v1.16b,  v3.16b
2701        b               L(ipred_cfl_ac_420_w8_hpad)
2702
2703L(ipred_cfl_ac_422_w16):
2704        adr             x7,  L(ipred_cfl_ac_422_w16_tbl)
2705        ldrh            w3,  [x7, w3, uxtw #1]
2706        sub             x7,  x7,  w3, uxtw
2707        br              x7
2708
2709L(ipred_cfl_ac_422_w16_wpad0):
27101:      // Copy and subsample input, without padding
2711        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x1],  x2
2712        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2
2713        addp            v0.8h,   v0.8h,   v1.8h
2714        addp            v2.8h,   v2.8h,   v3.8h
2715        addp            v4.8h,   v4.8h,   v5.8h
2716        addp            v6.8h,   v6.8h,   v7.8h
2717        shl             v0.8h,   v0.8h,   #2
2718        shl             v1.8h,   v2.8h,   #2
2719        shl             v2.8h,   v4.8h,   #2
2720        shl             v3.8h,   v6.8h,   #2
2721        subs            w8,  w8,  #2
2722        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2723        uaddw           v24.4s,  v24.4s,  v0.4h
2724        uaddw2          v25.4s,  v25.4s,  v0.8h
2725        uaddw           v26.4s,  v26.4s,  v1.4h
2726        uaddw2          v27.4s,  v27.4s,  v1.8h
2727        uaddw           v24.4s,  v24.4s,  v2.4h
2728        uaddw2          v25.4s,  v25.4s,  v2.8h
2729        uaddw           v26.4s,  v26.4s,  v3.4h
2730        uaddw2          v27.4s,  v27.4s,  v3.8h
2731        b.gt            1b
2732        mov             v0.16b,  v2.16b
2733        mov             v1.16b,  v3.16b
2734        b               L(ipred_cfl_ac_420_w16_hpad)
2735
2736L(ipred_cfl_ac_422_w16_wpad1):
27371:      // Copy and subsample input, padding 4
2738        ldr             q2,  [x1,  #32]
2739        ld1             {v0.8h, v1.8h}, [x1],  x2
2740        ldr             q6,  [x10, #32]
2741        ld1             {v4.8h, v5.8h}, [x10], x2
2742        addp            v2.8h,   v2.8h,   v2.8h
2743        addp            v0.8h,   v0.8h,   v1.8h
2744        addp            v6.8h,   v6.8h,   v6.8h
2745        addp            v4.8h,   v4.8h,   v5.8h
2746        shl             v1.4h,   v2.4h,   #2
2747        shl             v0.8h,   v0.8h,   #2
2748        shl             v3.4h,   v6.4h,   #2
2749        shl             v2.8h,   v4.8h,   #2
2750        dup             v4.4h,   v1.h[3]
2751        dup             v5.4h,   v3.h[3]
2752        trn1            v1.2d,   v1.2d,   v4.2d
2753        trn1            v3.2d,   v3.2d,   v5.2d
2754        subs            w8,  w8,  #2
2755        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2756        uaddw           v24.4s,  v24.4s,  v0.4h
2757        uaddw2          v25.4s,  v25.4s,  v0.8h
2758        uaddw           v26.4s,  v26.4s,  v1.4h
2759        uaddw2          v27.4s,  v27.4s,  v1.8h
2760        uaddw           v24.4s,  v24.4s,  v2.4h
2761        uaddw2          v25.4s,  v25.4s,  v2.8h
2762        uaddw           v26.4s,  v26.4s,  v3.4h
2763        uaddw2          v27.4s,  v27.4s,  v3.8h
2764        b.gt            1b
2765        mov             v0.16b,  v2.16b
2766        mov             v1.16b,  v3.16b
2767        b               L(ipred_cfl_ac_420_w16_hpad)
2768
2769L(ipred_cfl_ac_422_w16_wpad2):
27701:      // Copy and subsample input, padding 8
2771        ld1             {v0.8h, v1.8h}, [x1],  x2
2772        ld1             {v2.8h, v3.8h}, [x10], x2
2773        addp            v0.8h,   v0.8h,   v1.8h
2774        addp            v2.8h,   v2.8h,   v3.8h
2775        shl             v0.8h,   v0.8h,   #2
2776        shl             v2.8h,   v2.8h,   #2
2777        dup             v1.8h,   v0.h[7]
2778        dup             v3.8h,   v2.h[7]
2779        subs            w8,  w8,  #2
2780        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2781        uaddw           v24.4s,  v24.4s,  v0.4h
2782        uaddw2          v25.4s,  v25.4s,  v0.8h
2783        uaddw           v26.4s,  v26.4s,  v1.4h
2784        uaddw2          v27.4s,  v27.4s,  v1.8h
2785        uaddw           v24.4s,  v24.4s,  v2.4h
2786        uaddw2          v25.4s,  v25.4s,  v2.8h
2787        uaddw           v26.4s,  v26.4s,  v3.4h
2788        uaddw2          v27.4s,  v27.4s,  v3.8h
2789        b.gt            1b
2790        mov             v0.16b,  v2.16b
2791        mov             v1.16b,  v3.16b
2792        b               L(ipred_cfl_ac_420_w16_hpad)
2793
2794L(ipred_cfl_ac_422_w16_wpad3):
27951:      // Copy and subsample input, padding 12
2796        ld1             {v0.8h}, [x1],  x2
2797        ld1             {v2.8h}, [x10], x2
2798        addp            v0.8h,   v0.8h,   v0.8h
2799        addp            v2.8h,   v2.8h,   v2.8h
2800        shl             v0.4h,   v0.4h,   #2
2801        shl             v2.4h,   v2.4h,   #2
2802        dup             v1.8h,   v0.h[3]
2803        dup             v3.8h,   v2.h[3]
2804        trn1            v0.2d,   v0.2d,   v1.2d
2805        trn1            v2.2d,   v2.2d,   v3.2d
2806        subs            w8,  w8,  #2
2807        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2808        uaddw           v24.4s,  v24.4s,  v0.4h
2809        uaddw2          v25.4s,  v25.4s,  v0.8h
2810        uaddw           v26.4s,  v26.4s,  v1.4h
2811        uaddw2          v27.4s,  v27.4s,  v1.8h
2812        uaddw           v24.4s,  v24.4s,  v2.4h
2813        uaddw2          v25.4s,  v25.4s,  v2.8h
2814        uaddw           v26.4s,  v26.4s,  v3.4h
2815        uaddw2          v27.4s,  v27.4s,  v3.8h
2816        b.gt            1b
2817        mov             v0.16b,  v2.16b
2818        mov             v1.16b,  v3.16b
2819        b               L(ipred_cfl_ac_420_w16_hpad)
2820
2821L(ipred_cfl_ac_422_tbl):
2822        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16)
2823        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8)
2824        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4)
2825        .hword 0
2826
2827L(ipred_cfl_ac_422_w16_tbl):
2828        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0)
2829        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1)
2830        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2)
2831        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3)
2832endfunc
2833
2834// void cfl_ac_444_16bpc_neon(int16_t *const ac, const pixel *const ypx,
2835//                            const ptrdiff_t stride, const int w_pad,
2836//                            const int h_pad, const int cw, const int ch);
2837function ipred_cfl_ac_444_16bpc_neon, export=1
2838        clz             w8,  w5
2839        lsl             w4,  w4,  #2
2840        adr             x7,  L(ipred_cfl_ac_444_tbl)
2841        sub             w8,  w8,  #26
2842        ldrh            w8,  [x7, w8, uxtw #1]
2843        movi            v24.4s,  #0
2844        movi            v25.4s,  #0
2845        movi            v26.4s,  #0
2846        movi            v27.4s,  #0
2847        sub             x7,  x7,  w8, uxtw
2848        sub             w8,  w6,  w4         // height - h_pad
2849        rbit            w9,  w5              // rbit(width)
2850        rbit            w10, w6              // rbit(height)
2851        clz             w9,  w9              // ctz(width)
2852        clz             w10, w10             // ctz(height)
2853        add             w9,  w9,  w10        // log2sz
2854        add             x10, x1,  x2
2855        dup             v31.4s,  w9
2856        lsl             x2,  x2,  #1
2857        neg             v31.4s,  v31.4s      // -log2sz
2858        br              x7
2859
2860L(ipred_cfl_ac_444_w4):
28611:      // Copy and expand input
2862        ld1             {v0.4h},   [x1],  x2
2863        ld1             {v0.d}[1], [x10], x2
2864        ld1             {v1.4h},   [x1],  x2
2865        ld1             {v1.d}[1], [x10], x2
2866        shl             v0.8h,   v0.8h,   #3
2867        shl             v1.8h,   v1.8h,   #3
2868        subs            w8,  w8,  #4
2869        st1             {v0.8h, v1.8h}, [x0], #32
2870        uaddw           v24.4s,  v24.4s,  v0.4h
2871        uaddw2          v25.4s,  v25.4s,  v0.8h
2872        uaddw           v26.4s,  v26.4s,  v1.4h
2873        uaddw2          v27.4s,  v27.4s,  v1.8h
2874        b.gt            1b
2875        trn2            v0.2d,   v1.2d,   v1.2d
2876        trn2            v1.2d,   v1.2d,   v1.2d
2877        b               L(ipred_cfl_ac_420_w4_hpad)
2878
2879L(ipred_cfl_ac_444_w8):
28801:      // Copy and expand input
2881        ld1             {v0.8h}, [x1],  x2
2882        ld1             {v1.8h}, [x10], x2
2883        ld1             {v2.8h}, [x1],  x2
2884        shl             v0.8h,   v0.8h,   #3
2885        ld1             {v3.8h}, [x10], x2
2886        shl             v1.8h,   v1.8h,   #3
2887        shl             v2.8h,   v2.8h,   #3
2888        shl             v3.8h,   v3.8h,   #3
2889        subs            w8,  w8,  #4
2890        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2891        uaddw           v24.4s,  v24.4s,  v0.4h
2892        uaddw2          v25.4s,  v25.4s,  v0.8h
2893        uaddw           v26.4s,  v26.4s,  v1.4h
2894        uaddw2          v27.4s,  v27.4s,  v1.8h
2895        uaddw           v24.4s,  v24.4s,  v2.4h
2896        uaddw2          v25.4s,  v25.4s,  v2.8h
2897        uaddw           v26.4s,  v26.4s,  v3.4h
2898        uaddw2          v27.4s,  v27.4s,  v3.8h
2899        b.gt            1b
2900        mov             v0.16b,  v3.16b
2901        mov             v1.16b,  v3.16b
2902        b               L(ipred_cfl_ac_420_w8_hpad)
2903
2904L(ipred_cfl_ac_444_w16):
2905        cbnz            w3,  L(ipred_cfl_ac_444_w16_wpad)
29061:      // Copy and expand input, without padding
2907        ld1             {v0.8h, v1.8h}, [x1],  x2
2908        ld1             {v2.8h, v3.8h}, [x10], x2
2909        shl             v0.8h,   v0.8h,   #3
2910        shl             v1.8h,   v1.8h,   #3
2911        shl             v2.8h,   v2.8h,   #3
2912        shl             v3.8h,   v3.8h,   #3
2913        subs            w8,  w8,  #2
2914        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2915        uaddw           v24.4s,  v24.4s,  v0.4h
2916        uaddw2          v25.4s,  v25.4s,  v0.8h
2917        uaddw           v26.4s,  v26.4s,  v1.4h
2918        uaddw2          v27.4s,  v27.4s,  v1.8h
2919        uaddw           v24.4s,  v24.4s,  v2.4h
2920        uaddw2          v25.4s,  v25.4s,  v2.8h
2921        uaddw           v26.4s,  v26.4s,  v3.4h
2922        uaddw2          v27.4s,  v27.4s,  v3.8h
2923        b.gt            1b
2924        mov             v0.16b,  v2.16b
2925        mov             v1.16b,  v3.16b
2926        b               L(ipred_cfl_ac_420_w16_hpad)
2927
2928L(ipred_cfl_ac_444_w16_wpad):
29291:      // Copy and expand input, padding 8
2930        ld1             {v0.8h}, [x1],  x2
2931        ld1             {v2.8h}, [x10], x2
2932        shl             v0.8h,   v0.8h,   #3
2933        shl             v2.8h,   v2.8h,   #3
2934        dup             v1.8h,   v0.h[7]
2935        dup             v3.8h,   v2.h[7]
2936        subs            w8,  w8,  #2
2937        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2938        uaddw           v24.4s,  v24.4s,  v0.4h
2939        uaddw2          v25.4s,  v25.4s,  v0.8h
2940        uaddw           v26.4s,  v26.4s,  v1.4h
2941        uaddw2          v27.4s,  v27.4s,  v1.8h
2942        uaddw           v24.4s,  v24.4s,  v2.4h
2943        uaddw2          v25.4s,  v25.4s,  v2.8h
2944        uaddw           v26.4s,  v26.4s,  v3.4h
2945        uaddw2          v27.4s,  v27.4s,  v3.8h
2946        b.gt            1b
2947        mov             v0.16b,  v2.16b
2948        mov             v1.16b,  v3.16b
2949        b               L(ipred_cfl_ac_420_w16_hpad)
2950
2951L(ipred_cfl_ac_444_w32):
2952        adr             x7,  L(ipred_cfl_ac_444_w32_tbl)
2953        ldrh            w3,  [x7, w3, uxtw] // (w3>>1) << 1
2954        lsr             x2,  x2,  #1 // Restore the stride to one line increments
2955        sub             x7,  x7,  w3, uxtw
2956        br              x7
2957
2958L(ipred_cfl_ac_444_w32_wpad0):
29591:      // Copy and expand input, without padding
2960        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x1],  x2
2961        shl             v0.8h,   v0.8h,   #3
2962        shl             v1.8h,   v1.8h,   #3
2963        shl             v2.8h,   v2.8h,   #3
2964        shl             v3.8h,   v3.8h,   #3
2965        subs            w8,  w8,  #1
2966        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2967        uaddw           v24.4s,  v24.4s,  v0.4h
2968        uaddw2          v25.4s,  v25.4s,  v0.8h
2969        uaddw           v26.4s,  v26.4s,  v1.4h
2970        uaddw2          v27.4s,  v27.4s,  v1.8h
2971        uaddw           v24.4s,  v24.4s,  v2.4h
2972        uaddw2          v25.4s,  v25.4s,  v2.8h
2973        uaddw           v26.4s,  v26.4s,  v3.4h
2974        uaddw2          v27.4s,  v27.4s,  v3.8h
2975        b.gt            1b
2976        b               L(ipred_cfl_ac_444_w32_hpad)
2977
2978L(ipred_cfl_ac_444_w32_wpad2):
29791:      // Copy and expand input, padding 8
2980        ld1             {v0.8h, v1.8h, v2.8h}, [x1],  x2
2981        shl             v2.8h,   v2.8h,   #3
2982        shl             v0.8h,   v0.8h,   #3
2983        shl             v1.8h,   v1.8h,   #3
2984        dup             v3.8h,   v2.h[7]
2985        subs            w8,  w8,  #1
2986        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2987        uaddw           v24.4s,  v24.4s,  v0.4h
2988        uaddw2          v25.4s,  v25.4s,  v0.8h
2989        uaddw           v26.4s,  v26.4s,  v1.4h
2990        uaddw2          v27.4s,  v27.4s,  v1.8h
2991        uaddw           v24.4s,  v24.4s,  v2.4h
2992        uaddw2          v25.4s,  v25.4s,  v2.8h
2993        uaddw           v26.4s,  v26.4s,  v3.4h
2994        uaddw2          v27.4s,  v27.4s,  v3.8h
2995        b.gt            1b
2996        b               L(ipred_cfl_ac_444_w32_hpad)
2997
2998L(ipred_cfl_ac_444_w32_wpad4):
29991:      // Copy and expand input, padding 16
3000        ld1             {v0.8h, v1.8h}, [x1],  x2
3001        shl             v1.8h,   v1.8h,   #3
3002        shl             v0.8h,   v0.8h,   #3
3003        dup             v2.8h,   v1.h[7]
3004        dup             v3.8h,   v1.h[7]
3005        subs            w8,  w8,  #1
3006        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
3007        uaddw           v24.4s,  v24.4s,  v0.4h
3008        uaddw2          v25.4s,  v25.4s,  v0.8h
3009        uaddw           v26.4s,  v26.4s,  v1.4h
3010        uaddw2          v27.4s,  v27.4s,  v1.8h
3011        uaddw           v24.4s,  v24.4s,  v2.4h
3012        uaddw2          v25.4s,  v25.4s,  v2.8h
3013        uaddw           v26.4s,  v26.4s,  v3.4h
3014        uaddw2          v27.4s,  v27.4s,  v3.8h
3015        b.gt            1b
3016        b               L(ipred_cfl_ac_444_w32_hpad)
3017
3018L(ipred_cfl_ac_444_w32_wpad6):
30191:      // Copy and expand input, padding 24
3020        ld1             {v0.8h}, [x1],  x2
3021        shl             v0.8h,   v0.8h,   #3
3022        dup             v1.8h,   v0.h[7]
3023        dup             v2.8h,   v0.h[7]
3024        dup             v3.8h,   v0.h[7]
3025        subs            w8,  w8,  #1
3026        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
3027        uaddw           v24.4s,  v24.4s,  v0.4h
3028        uaddw2          v25.4s,  v25.4s,  v0.8h
3029        uaddw           v26.4s,  v26.4s,  v1.4h
3030        uaddw2          v27.4s,  v27.4s,  v1.8h
3031        uaddw           v24.4s,  v24.4s,  v2.4h
3032        uaddw2          v25.4s,  v25.4s,  v2.8h
3033        uaddw           v26.4s,  v26.4s,  v3.4h
3034        uaddw2          v27.4s,  v27.4s,  v3.8h
3035        b.gt            1b
3036
3037L(ipred_cfl_ac_444_w32_hpad):
3038        cbz             w4,  3f
30392:      // Vertical padding (h_pad > 0)
3040        subs            w4,  w4,  #2
3041        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
3042        uaddw           v24.4s,  v24.4s,  v0.4h
3043        uaddw2          v25.4s,  v25.4s,  v0.8h
3044        uaddw           v26.4s,  v26.4s,  v1.4h
3045        uaddw2          v27.4s,  v27.4s,  v1.8h
3046        uaddw           v24.4s,  v24.4s,  v2.4h
3047        uaddw2          v25.4s,  v25.4s,  v2.8h
3048        uaddw           v26.4s,  v26.4s,  v3.4h
3049        uaddw2          v27.4s,  v27.4s,  v3.8h
3050        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
3051        uaddw           v24.4s,  v24.4s,  v0.4h
3052        uaddw2          v25.4s,  v25.4s,  v0.8h
3053        uaddw           v26.4s,  v26.4s,  v1.4h
3054        uaddw2          v27.4s,  v27.4s,  v1.8h
3055        uaddw           v24.4s,  v24.4s,  v2.4h
3056        uaddw2          v25.4s,  v25.4s,  v2.8h
3057        uaddw           v26.4s,  v26.4s,  v3.4h
3058        uaddw2          v27.4s,  v27.4s,  v3.8h
3059        b.gt            2b
30603:
3061
3062        //  Multiply the height by eight and reuse the w4 subtracting
3063        lsl             w6,  w6,  #3
3064        b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)
3065
3066L(ipred_cfl_ac_444_tbl):
3067        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32)
3068        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16)
3069        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8)
3070        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4)
3071
3072L(ipred_cfl_ac_444_w32_tbl):
3073        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0)
3074        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2)
3075        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4)
3076        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6)
3077endfunc
3078