1/*
2 * Copyright © 2018, VideoLAN and dav1d authors
3 * Copyright © 2019, Martin Storsjo
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright notice, this
10 *    list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright notice,
13 *    this list of conditions and the following disclaimer in the documentation
14 *    and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include "src/arm/asm.S"
29#include "util.S"
30
// void ipred_dc_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                              const pixel *const topleft,
//                              const int width, const int height, const int a,
//                              const int max_width, const int max_height,
//                              const int bitdepth_max);
//
// Fills the block at dst with the constant (bitdepth_max + 1) >> 1.
//
// In:  x0 = dst, x1 = stride in bytes, w3 = width, w4 = height,
//      [sp] = bitdepth_max (ninth argument, passed on the stack).
// The store loop is selected through a table of 16-bit offsets indexed by
// clz(width) - 25. x0 and x6 address alternating rows, each stepping by
// 2 * stride, and every loop iteration writes four rows.
function ipred_dc_128_16bpc_neon, export=1
        adr             x7,  L(ipred_dc_128_tbl)
        clz             w9,  w3
        ldr             w10, [sp]                // bitdepth_max
        sub             w9,  w9,  #25            // index into the offset table
        add             x6,  x0,  x1             // x6 = dst + stride
        ldrh            w9,  [x7, w9, uxtw #1]
        dup             v0.8h,   w10
        lsl             x1,  x1,  #1             // both pointers advance two rows
        urshr           v0.8h,   v0.8h,  #1      // (bitdepth_max + 1) >> 1
        sub             x7,  x7,  w9, uxtw       // entries hold (table - target)
        br              x7

L(ipred_dc_128_w4):
        subs            w4,  w4,  #4
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        b.gt            L(ipred_dc_128_w4)
        ret

L(ipred_dc_128_w8):
        subs            w4,  w4,  #4
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            L(ipred_dc_128_w8)
        ret

L(ipred_dc_128_w16):
        mov             v1.16b,  v0.16b
1:
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        b.gt            1b
        ret

L(ipred_dc_128_w32):
        mov             v1.16b,  v0.16b
        mov             v2.16b,  v0.16b
        mov             v3.16b,  v0.16b
1:
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            1b
        ret

L(ipred_dc_128_w64):
        // Each row is written as two 64-byte halves; the first store
        // post-increments by 64, so the row step is reduced to match.
        sub             x1,  x1,  #64
        mov             v1.16b,  v0.16b
        mov             v2.16b,  v0.16b
        mov             v3.16b,  v0.16b
1:
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            1b
        ret

L(ipred_dc_128_tbl):
        .hword L(ipred_dc_128_tbl) - L(ipred_dc_128_w64)
        .hword L(ipred_dc_128_tbl) - L(ipred_dc_128_w32)
        .hword L(ipred_dc_128_tbl) - L(ipred_dc_128_w16)
        .hword L(ipred_dc_128_tbl) - L(ipred_dc_128_w8)
        .hword L(ipred_dc_128_tbl) - L(ipred_dc_128_w4)
endfunc
111
// void ipred_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                         const pixel *const topleft,
//                         const int width, const int height, const int a,
//                         const int max_width, const int max_height);
//
// Copies the `width` pixels that start one pixel after topleft into every
// row of the block.
//
// In:  x0 = dst, x1 = stride in bytes, x2 = topleft, w3 = width, w4 = height.
// The path is selected through a table of 16-bit offsets indexed by
// clz(width) - 25. Each path loads the source row once and then stores it
// four rows per iteration, x0 and x6 addressing alternating rows.
function ipred_v_16bpc_neon, export=1
        adr             x7,  L(ipred_v_tbl)
        clz             w9,  w3
        add             x2,  x2,  #2             // step over the topleft pixel
        sub             w9,  w9,  #25            // index into the offset table
        add             x6,  x0,  x1             // x6 = dst + stride
        ldrh            w9,  [x7, w9, uxtw #1]
        lsl             x1,  x1,  #1             // both pointers advance two rows
        sub             x7,  x7,  w9, uxtw       // entries hold (table - target)
        br              x7

L(ipred_v_w4):
        ld1             {v0.4h},  [x2]
1:
        subs            w4,  w4,  #4
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        b.gt            1b
        ret

L(ipred_v_w8):
        ld1             {v0.8h},  [x2]
1:
        subs            w4,  w4,  #4
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            1b
        ret

L(ipred_v_w16):
        ld1             {v0.8h, v1.8h}, [x2]
1:
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        b.gt            1b
        ret

L(ipred_v_w32):
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
1:
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            1b
        ret

L(ipred_v_w64):
        // The row is held in v0-v7 and written as two 64-byte halves; the
        // first store post-increments by 64, so the row step is reduced.
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
        sub             x1,  x1,  #64
1:
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
        b.gt            1b
        ret

L(ipred_v_tbl):
        .hword L(ipred_v_tbl) - L(ipred_v_w64)
        .hword L(ipred_v_tbl) - L(ipred_v_w32)
        .hword L(ipred_v_tbl) - L(ipred_v_w16)
        .hword L(ipred_v_tbl) - L(ipred_v_w8)
        .hword L(ipred_v_tbl) - L(ipred_v_w4)
endfunc
190
// void ipred_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                         const pixel *const topleft,
//                         const int width, const int height, const int a,
//                         const int max_width, const int max_height);
//
// Fills each row of the block with a single pixel read from the memory
// just before topleft: row 0 uses the pixel directly before topleft, row 1
// the one before that, and so on.
//
// In:  x0 = dst, x1 = stride in bytes, x2 = topleft, w3 = width, w4 = height.
// Every iteration loads four source pixels with ld4r (each one replicated
// across its own register, v3 being the one nearest topleft) and writes
// four rows, x0 and x6 addressing alternating rows.
function ipred_h_16bpc_neon, export=1
        adr             x10, L(ipred_h_tbl)
        clz             w11, w3
        sub             x2,  x2,  #8             // four pixels before topleft
        sub             w11, w11, #25            // index into the offset table
        mov             x9,  #-8                 // source walks backwards, four pixels at a time
        ldrh            w11, [x10, w11, uxtw #1]
        add             x6,  x0,  x1             // x6 = dst + stride
        lsl             x1,  x1,  #1             // both pointers advance two rows
        sub             x10, x10, w11, uxtw      // entries hold (table - target)
        br              x10

L(ipred_h_w4):
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x9
        subs            w4,  w4,  #4
        st1             {v3.4h},  [x0], x1
        st1             {v2.4h},  [x6], x1
        st1             {v1.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        b.gt            L(ipred_h_w4)
        ret

L(ipred_h_w8):
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x9
        subs            w4,  w4,  #4
        st1             {v3.8h},  [x0], x1
        st1             {v2.8h},  [x6], x1
        st1             {v1.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            L(ipred_h_w8)
        ret

L(ipred_h_w16):
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x9
        subs            w4,  w4,  #4
        // The tail of each row is stored at a fixed offset first; the
        // post-incrementing st1 then writes the head and moves to the next row.
        str             q3,  [x0, #16]
        str             q2,  [x6, #16]
        st1             {v3.8h}, [x0], x1
        st1             {v2.8h}, [x6], x1
        str             q1,  [x0, #16]
        str             q0,  [x6, #16]
        st1             {v1.8h}, [x0], x1
        st1             {v0.8h}, [x6], x1
        b.gt            L(ipred_h_w16)
        ret

L(ipred_h_w32):
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x9
        subs            w4,  w4,  #4
        str             q3,  [x0, #16]
        str             q2,  [x6, #16]
        stp             q3,  q3,  [x0, #32]
        stp             q2,  q2,  [x6, #32]
        st1             {v3.8h}, [x0], x1
        st1             {v2.8h}, [x6], x1
        str             q1,  [x0, #16]
        str             q0,  [x6, #16]
        stp             q1,  q1,  [x0, #32]
        stp             q0,  q0,  [x6, #32]
        st1             {v1.8h}, [x0], x1
        st1             {v0.8h}, [x6], x1
        b.gt            L(ipred_h_w32)
        ret

L(ipred_h_w64):
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x9
        subs            w4,  w4,  #4
        str             q3,  [x0, #16]
        str             q2,  [x6, #16]
        stp             q3,  q3,  [x0, #32]
        stp             q2,  q2,  [x6, #32]
        stp             q3,  q3,  [x0, #64]
        stp             q2,  q2,  [x6, #64]
        stp             q3,  q3,  [x0, #96]
        stp             q2,  q2,  [x6, #96]
        st1             {v3.8h}, [x0], x1
        st1             {v2.8h}, [x6], x1
        str             q1,  [x0, #16]
        str             q0,  [x6, #16]
        stp             q1,  q1,  [x0, #32]
        stp             q0,  q0,  [x6, #32]
        stp             q1,  q1,  [x0, #64]
        stp             q0,  q0,  [x6, #64]
        stp             q1,  q1,  [x0, #96]
        stp             q0,  q0,  [x6, #96]
        st1             {v1.8h}, [x0], x1
        st1             {v0.8h}, [x6], x1
        b.gt            L(ipred_h_w64)
        ret

L(ipred_h_tbl):
        .hword L(ipred_h_tbl) - L(ipred_h_w64)
        .hword L(ipred_h_tbl) - L(ipred_h_w32)
        .hword L(ipred_h_tbl) - L(ipred_h_w16)
        .hword L(ipred_h_tbl) - L(ipred_h_w8)
        .hword L(ipred_h_tbl) - L(ipred_h_w4)
endfunc
287
// void ipred_dc_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                              const pixel *const topleft,
//                              const int width, const int height, const int a,
//                              const int max_width, const int max_height);
//
// Fills the block with the rounded average of the `width` pixels that
// start one pixel after topleft.
//
// In:  x0 = dst, x1 = stride in bytes, x2 = topleft, w3 = width, w4 = height.
// The path is selected through a table of 16-bit offsets indexed by
// clz(width) - 25. Each path reduces the source pixels to one value,
// replicates it, and stores four rows per iteration with x0 and x6
// addressing alternating rows.
function ipred_dc_top_16bpc_neon, export=1
        adr             x7,  L(ipred_dc_top_tbl)
        clz             w9,  w3
        add             x2,  x2,  #2             // step over the topleft pixel
        sub             w9,  w9,  #25            // index into the offset table
        add             x6,  x0,  x1             // x6 = dst + stride
        ldrh            w9,  [x7, w9, uxtw #1]
        lsl             x1,  x1,  #1             // both pointers advance two rows
        sub             x7,  x7,  w9, uxtw       // entries hold (table - target)
        br              x7

L(ipred_dc_top_w4):
        ld1             {v0.4h},  [x2]
        addv            h0,      v0.4h
        urshr           v0.4h,   v0.4h,   #2     // (sum + 2) >> 2
        dup             v0.4h,   v0.h[0]
1:
        subs            w4,  w4,  #4
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        b.gt            1b
        ret

L(ipred_dc_top_w8):
        ld1             {v0.8h},  [x2]
        addv            h0,      v0.8h
        urshr           v0.4h,   v0.4h,   #3     // (sum + 4) >> 3
        dup             v0.8h,   v0.h[0]
1:
        subs            w4,  w4,  #4
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            1b
        ret

L(ipred_dc_top_w16):
        ld1             {v0.8h, v1.8h}, [x2]
        addp            v0.8h,   v0.8h,   v1.8h
        addv            h0,      v0.8h
        urshr           v2.4h,   v0.4h,   #4     // (sum + 8) >> 4
        dup             v0.8h,   v2.h[0]
        dup             v1.8h,   v2.h[0]
1:
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        b.gt            1b
        ret

L(ipred_dc_top_w32):
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
        addp            v0.8h,   v0.8h,   v1.8h
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v0.8h,   v0.8h,   v2.8h
        uaddlv          s0,      v0.8h           // final reduction widened to 32 bits
        rshrn           v4.4h,   v0.4s,   #5     // (sum + 16) >> 5
        dup             v0.8h,   v4.h[0]
        dup             v1.8h,   v4.h[0]
        dup             v2.8h,   v4.h[0]
        dup             v3.8h,   v4.h[0]
1:
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            1b
        ret

L(ipred_dc_top_w64):
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
        // Rows are written as two 64-byte halves; the first store
        // post-increments by 64, so the row step is reduced to match.
        sub             x1,  x1,  #64
        addp            v0.8h,   v0.8h,   v1.8h
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v6.8h,   v6.8h,   v7.8h
        addp            v0.8h,   v0.8h,   v2.8h
        addp            v4.8h,   v4.8h,   v6.8h
        addp            v0.8h,   v0.8h,   v4.8h
        uaddlv          s0,      v0.8h           // final reduction widened to 32 bits
        rshrn           v4.4h,   v0.4s,   #6     // (sum + 32) >> 6
        dup             v0.8h,   v4.h[0]
        dup             v1.8h,   v4.h[0]
        dup             v2.8h,   v4.h[0]
        dup             v3.8h,   v4.h[0]
1:
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            1b
        ret

L(ipred_dc_top_tbl):
        .hword L(ipred_dc_top_tbl) - L(ipred_dc_top_w64)
        .hword L(ipred_dc_top_tbl) - L(ipred_dc_top_w32)
        .hword L(ipred_dc_top_tbl) - L(ipred_dc_top_w16)
        .hword L(ipred_dc_top_tbl) - L(ipred_dc_top_w8)
        .hword L(ipred_dc_top_tbl) - L(ipred_dc_top_w4)
endfunc
399
// void ipred_dc_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const topleft,
//                               const int width, const int height, const int a,
//                               const int max_width, const int max_height);
//
// Fills the block with the rounded average of the `height` pixels stored
// immediately before topleft.
//
// In:  x0 = dst, x1 = stride in bytes, x2 = topleft, w3 = width, w4 = height.
//
// Dispatch is done in two stages through a single offset table:
//   - entries 0-4, indexed by clz(height) - 25, select the h* code that
//     averages the source pixels and leaves the result replicated in v0.8h;
//   - entries 5-9, indexed by clz(width) - 25 + 5, select the w* store
//     loop, which the h* code reaches through x3.
// The w16/w32/w64 loops replicate v0 into v1-v3 themselves, so the h* code
// only has to produce v0.
function ipred_dc_left_16bpc_neon, export=1
        sub             x2,  x2,  w4, uxtw #1    // x2 = topleft - height pixels
        clz             w3,  w3
        clz             w7,  w4
        adr             x5,  L(ipred_dc_left_tbl)
        sub             w3,  w3,  #20 // 25 leading bits, minus table offset 5
        sub             w7,  w7,  #25
        ldrh            w3,  [x5, w3, uxtw #1]
        ldrh            w7,  [x5, w7, uxtw #1]
        sub             x3,  x5,  w3, uxtw       // x3 = store loop for this width
        sub             x5,  x5,  w7, uxtw       // x5 = averaging code for this height
        add             x6,  x0,  x1             // x6 = dst + stride
        lsl             x1,  x1,  #1             // both pointers advance two rows
        br              x5

L(ipred_dc_left_h4):
        ld1             {v0.4h},  [x2]
        addv            h0,      v0.4h
        urshr           v0.4h,   v0.4h,   #2     // (sum + 2) >> 2
        dup             v0.8h,   v0.h[0]
        br              x3
L(ipred_dc_left_w4):
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        b.gt            L(ipred_dc_left_w4)
        ret

L(ipred_dc_left_h8):
        ld1             {v0.8h},  [x2]
        addv            h0,      v0.8h
        urshr           v0.4h,   v0.4h,   #3     // (sum + 4) >> 3
        dup             v0.8h,   v0.h[0]
        br              x3
L(ipred_dc_left_w8):
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            L(ipred_dc_left_w8)
        ret

L(ipred_dc_left_h16):
        ld1             {v0.8h, v1.8h}, [x2]
        addp            v0.8h,   v0.8h,   v1.8h
        addv            h0,      v0.8h
        urshr           v2.4h,   v0.4h,   #4     // (sum + 8) >> 4
        dup             v0.8h,   v2.h[0]
        // v1 is deliberately not filled here: no store loop reads it
        // before overwriting it from v0.
        br              x3
L(ipred_dc_left_w16):
        mov             v1.16b,  v0.16b
1:
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        b.gt            1b
        ret

L(ipred_dc_left_h32):
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
        addp            v0.8h,   v0.8h,   v1.8h
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v0.8h,   v0.8h,   v2.8h
        uaddlv          s0,      v0.8h           // final reduction widened to 32 bits
        rshrn           v4.4h,   v0.4s,   #5     // (sum + 16) >> 5
        dup             v0.8h,   v4.h[0]
        br              x3
L(ipred_dc_left_w32):
        mov             v1.16b,  v0.16b
        mov             v2.16b,  v0.16b
        mov             v3.16b,  v0.16b
1:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            1b
        ret

L(ipred_dc_left_h64):
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
        addp            v0.8h,   v0.8h,   v1.8h
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v6.8h,   v6.8h,   v7.8h
        addp            v0.8h,   v0.8h,   v2.8h
        addp            v4.8h,   v4.8h,   v6.8h
        addp            v0.8h,   v0.8h,   v4.8h
        uaddlv          s0,      v0.8h           // final reduction widened to 32 bits
        rshrn           v4.4h,   v0.4s,   #6     // (sum + 32) >> 6
        dup             v0.8h,   v4.h[0]
        br              x3
L(ipred_dc_left_w64):
        mov             v1.16b,  v0.16b
        mov             v2.16b,  v0.16b
        mov             v3.16b,  v0.16b
        // Rows are written as two 64-byte halves; the first store
        // post-increments by 64, so the row step is reduced to match.
        sub             x1,  x1,  #64
1:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            1b
        ret

L(ipred_dc_left_tbl):
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4)
endfunc
535
// void ipred_dc_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                          const pixel *const topleft,
//                          const int width, const int height, const int a,
//                          const int max_width, const int max_height);
//
// Fills the block with the rounded average of the `height` pixels stored
// immediately before topleft and the `width` pixels stored immediately
// after it: (sum + ((width + height) >> 1)) / (width + height).
//
// In:  x0 = dst, x1 = stride in bytes, x2 = topleft, w3 = width, w4 = height.
//
// Dispatch is done in two stages through a single offset table:
//   - entries 0-4, indexed by clz(height) - 25, select the h* code, which
//     sums the pixels before topleft into s0 and leaves x2 == topleft;
//   - entries 5-9, indexed by clz(width) - 25 + 5, select the w* code
//     (reached through x3), which sums the pixels after topleft, finishes
//     the division and stores the block.
//
// The division is split in two: a right shift by ctz(width + height),
// followed - only when width != height - by a multiplication with a
// fixed-point reciprocal of the remaining factor (3 or 5) and a right
// shift by 17. 0xAAAB is 2^17/3 rounded up, 0x6667 is 2^17/5 rounded up.
//
// Register roles after the prologue:
//   v16 = (width + height) >> 1   rounding bias (later reused for the multiplier)
//   v17 = -ctz(width + height)    negative count, so ushl shifts right
//   x6  = dst + stride, x1 = 2 * stride (x0/x6 address alternating rows)
function ipred_dc_16bpc_neon, export=1
        sub             x2,  x2,  w4, uxtw #1    // x2 = topleft - height pixels
        add             w7,  w3,  w4             // width + height
        clz             w3,  w3
        clz             w6,  w4
        dup             v16.4s, w7               // width + height
        adr             x5,  L(ipred_dc_tbl)
        rbit            w7,  w7                  // rbit(width + height)
        sub             w3,  w3,  #20            // 25 leading bits, minus table offset 5
        sub             w6,  w6,  #25
        clz             w7,  w7                  // ctz(width + height)
        ldrh            w3,  [x5, w3, uxtw #1]
        ldrh            w6,  [x5, w6, uxtw #1]
        neg             w7,  w7                  // -ctz(width + height)
        sub             x3,  x5,  w3, uxtw       // x3 = second stage (width)
        sub             x5,  x5,  w6, uxtw       // x5 = first stage (height)
        ushr            v16.4s,  v16.4s,  #1     // (width + height) >> 1
        dup             v17.4s,  w7              // -ctz(width + height)
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x5

L(ipred_dc_h4):
        ld1             {v0.4h},  [x2], #8       // post-increment leaves x2 == topleft
        uaddlv          s0,      v0.4h           // s0 = sum of the 4 pixels before topleft
        br              x3
L(ipred_dc_w4):
        add             x2,  x2,  #2             // step over the topleft pixel
        ld1             {v1.4h},  [x2]
        add             v0.2s,   v0.2s,   v16.2s // add the rounding bias
        uaddlv          s1,      v1.4h
        cmp             w4,  #4
        add             v0.2s,   v0.2s,   v1.2s
        ushl            v0.2s,   v0.2s,   v17.2s // >> ctz(width + height)
        b.eq            1f                       // width == height: the shift was the whole division
        // h = 8/16
        cmp             w4,  #16                 // h == 16: 20 >> 2 = 5 remains, else 12 >> 2 = 3
        mov             w16, #0x6667
        mov             w17, #0xAAAB
        csel            w16, w16, w17, eq
        dup             v16.2s,  w16
        mul             v0.2s,   v0.2s,   v16.2s
        ushr            v0.2s,   v0.2s,   #17
1:
        dup             v0.4h,   v0.h[0]
2:
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h8):
        ld1             {v0.8h},  [x2], #16      // post-increment leaves x2 == topleft
        uaddlv          s0,      v0.8h
        br              x3
L(ipred_dc_w8):
        add             x2,  x2,  #2             // step over the topleft pixel
        ld1             {v1.8h},  [x2]
        add             v0.2s,   v0.2s,   v16.2s // add the rounding bias
        uaddlv          s1,      v1.8h
        cmp             w4,  #8
        add             v0.2s,   v0.2s,   v1.2s
        ushl            v0.2s,   v0.2s,   v17.2s // >> ctz(width + height)
        b.eq            1f                       // width == height: done
        // h = 4/16/32
        cmp             w4,  #32                 // h == 32: 40 >> 3 = 5 remains, else 3
        mov             w16, #0x6667
        mov             w17, #0xAAAB
        csel            w16, w16, w17, eq
        dup             v16.2s,  w16
        mul             v0.2s,   v0.2s,   v16.2s
        ushr            v0.2s,   v0.2s,   #17
1:
        dup             v0.8h,   v0.h[0]
2:
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h16):
        ld1             {v0.8h, v1.8h}, [x2], #32 // post-increment leaves x2 == topleft
        addp            v0.8h,   v0.8h,   v1.8h
        uaddlv          s0,      v0.8h
        br              x3
L(ipred_dc_w16):
        add             x2,  x2,  #2             // step over the topleft pixel
        ld1             {v1.8h, v2.8h}, [x2]
        add             v0.2s,   v0.2s,   v16.2s // add the rounding bias
        addp            v1.8h,   v1.8h,   v2.8h
        uaddlv          s1,      v1.8h
        cmp             w4,  #16
        add             v0.2s,   v0.2s,   v1.2s
        ushl            v4.2s,   v0.2s,   v17.2s // >> ctz(width + height)
        b.eq            1f                       // width == height: done
        // h = 4/8/32/64
        // eq (no tested bit set) means h is 4 or 64, where a factor of 5
        // remains (20 >> 2, 80 >> 4); h = 8/32 leave a factor of 3.
        tst             w4,  #(32+16+8) // 16 added to make a consecutive bitmask
        mov             w16, #0x6667
        mov             w17, #0xAAAB
        csel            w16, w16, w17, eq
        dup             v16.2s,  w16
        mul             v4.2s,   v4.2s,   v16.2s
        ushr            v4.2s,   v4.2s,   #17
1:
        dup             v0.8h,   v4.h[0]
        dup             v1.8h,   v4.h[0]
2:
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h32):
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 // leaves x2 == topleft
        addp            v0.8h,   v0.8h,   v1.8h
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v0.8h,   v0.8h,   v2.8h
        uaddlv          s0,      v0.8h
        br              x3
L(ipred_dc_w32):
        add             x2,  x2,  #2             // step over the topleft pixel
        ld1             {v1.8h, v2.8h, v3.8h, v4.8h}, [x2]
        add             v0.2s,   v0.2s,   v16.2s // add the rounding bias
        addp            v1.8h,   v1.8h,   v2.8h
        addp            v3.8h,   v3.8h,   v4.8h
        addp            v1.8h,   v1.8h,   v3.8h
        uaddlv          s1,      v1.8h
        cmp             w4,  #32
        add             v0.2s,   v0.2s,   v1.2s
        ushl            v4.2s,   v0.2s,   v17.2s // >> ctz(width + height)
        b.eq            1f                       // width == height: done
        // h = 8/16/64
        cmp             w4,  #8                  // h == 8: 40 >> 3 = 5 remains, else 3
        mov             w16, #0x6667
        mov             w17, #0xAAAB
        csel            w16, w16, w17, eq
        dup             v16.2s,  w16
        mul             v4.2s,   v4.2s,   v16.2s
        ushr            v4.2s,   v4.2s,   #17
1:
        dup             v0.8h,   v4.h[0]
        dup             v1.8h,   v4.h[0]
        dup             v2.8h,   v4.h[0]
        dup             v3.8h,   v4.h[0]
2:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h64):
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
        addp            v0.8h,   v0.8h,   v1.8h
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 // leaves x2 == topleft
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v6.8h,   v6.8h,   v7.8h
        addp            v0.8h,   v0.8h,   v2.8h
        addp            v4.8h,   v4.8h,   v6.8h
        addp            v0.8h,   v0.8h,   v4.8h
        uaddlv          s0,      v0.8h
        br              x3
L(ipred_dc_w64):
        add             x2,  x2,  #2             // step over the topleft pixel
        ld1             {v1.8h, v2.8h, v3.8h, v4.8h}, [x2], #64
        add             v0.2s,   v0.2s,   v16.2s // add the rounding bias
        addp            v1.8h,   v1.8h,   v2.8h
        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x2]
        addp            v3.8h,   v3.8h,   v4.8h
        addp            v20.8h,  v20.8h,  v21.8h
        addp            v22.8h,  v22.8h,  v23.8h
        addp            v1.8h,   v1.8h,   v3.8h
        addp            v20.8h,  v20.8h,  v22.8h
        addp            v1.8h,   v1.8h,   v20.8h
        uaddlv          s1,      v1.8h
        cmp             w4,  #64
        add             v0.2s,   v0.2s,   v1.2s
        ushl            v4.2s,   v0.2s,   v17.2s // >> ctz(width + height)
        b.eq            1f                       // width == height: done
        // h = 16/32
        cmp             w4,  #16                 // h == 16: 80 >> 4 = 5 remains, else 96 >> 5 = 3
        mov             w16, #0x6667
        mov             w17, #0xAAAB
        csel            w16, w16, w17, eq
        dup             v16.2s,  w16
        mul             v4.2s,   v4.2s,   v16.2s
        ushr            v4.2s,   v4.2s,   #17
1:
        // Rows are written as two 64-byte halves; the first store
        // post-increments by 64, so the row step is reduced to match.
        sub             x1,  x1,  #64
        dup             v0.8h,   v4.h[0]
        dup             v1.8h,   v4.h[0]
        dup             v2.8h,   v4.h[0]
        dup             v3.8h,   v4.h[0]
2:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            2b
        ret

// Offsets are stored as (table - target); the prologue subtracts the loaded
// value from the table address. Height handlers first, width handlers next.
L(ipred_dc_tbl):
        .hword L(ipred_dc_tbl) - L(ipred_dc_h64)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h32)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h16)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h8)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h4)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w64)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w32)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w16)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w8)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w4)
endfunc
770
771// void ipred_paeth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
772//                             const pixel *const topleft,
773//                             const int width, const int height, const int a,
774//                             const int max_width, const int max_height);
775function ipred_paeth_16bpc_neon, export=1
        // Paeth intra prediction for 16-bit pixels.
        // Arguments (see the prototype above): x0 = dst, x1 = stride (added
        // directly to the dst byte pointer), x2 = topleft, w3 = width,
        // w4 = height.
        // For each output pixel, base = left + top - topleft is formed and the
        // one of left/top/topleft with the smallest absolute difference to
        // base is stored (left wins ties, then top).
        // NOTE(review): the table lookup assumes width is a power of two in
        // 4..64 - confirm against the callers.
776        clz             w9,  w3
777        adr             x5,  L(ipred_paeth_tbl)
778        sub             w9,  w9,  #25             // table index = clz(width) - 25
779        ldrh            w9,  [x5, w9, uxtw #1]    // 16-bit entry = table - target
780        ld1r            {v4.8h},  [x2]            // v4 = topleft in every lane
781        add             x8,  x2,  #2              // x8 = &topleft[1], start of the top row
782        sub             x2,  x2,  #8              // x2 = &topleft[-4], left pixels of four rows
783        sub             x5,  x5,  w9, uxtw        // x5 = address of the selected path
784        mov             x7,  #-8                  // left pointer steps back 4 pixels per pass
785        add             x6,  x0,  x1              // x6 = dst + stride (second row)
786        lsl             x1,  x1,  #1              // x1 = 2 * stride
787        br              x5
78840:
        // Width 4: two rows share one vector, four rows per loop iteration.
789        ld1r            {v5.2d},  [x8]            // v5 = top[0..3], repeated in both halves
790        sub             v6.8h,   v5.8h,   v4.8h   // top - topleft
7914:
        // v0..v3 = topleft[-4], [-3], [-2], [-1], each broadcast to 4 lanes
792        ld4r            {v0.4h, v1.4h, v2.4h, v3.4h},  [x2], x7
793        zip1            v0.2d,   v0.2d,   v1.2d   // v0 = left of rows 3 (low) and 2 (high)
794        zip1            v2.2d,   v2.2d,   v3.2d   // v2 = left of rows 1 (low) and 0 (high)
795        add             v16.8h,  v6.8h,   v0.8h   // base
796        add             v17.8h,  v6.8h,   v2.8h
797        sabd            v20.8h,  v5.8h,   v16.8h  // tdiff
798        sabd            v21.8h,  v5.8h,   v17.8h
799        sabd            v22.8h,  v4.8h,   v16.8h  // tldiff
800        sabd            v23.8h,  v4.8h,   v17.8h
801        sabd            v16.8h,  v0.8h,   v16.8h  // ldiff
802        sabd            v17.8h,  v2.8h,   v17.8h
803        umin            v18.8h,  v20.8h,  v22.8h  // min(tdiff, tldiff)
804        umin            v19.8h,  v21.8h,  v23.8h
805        cmge            v20.8h,  v22.8h,  v20.8h  // tldiff >= tdiff
806        cmge            v21.8h,  v23.8h,  v21.8h
807        cmge            v16.8h,  v18.8h,  v16.8h  // min(tdiff, tldiff) >= ldiff
808        cmge            v17.8h,  v19.8h,  v17.8h
809        bsl             v21.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
810        bsl             v20.16b, v5.16b,  v4.16b
811        bit             v21.16b, v2.16b,  v17.16b // ldiff <= min ? left : ...
812        bit             v20.16b, v0.16b,  v16.16b
813        st1             {v21.d}[1], [x0], x1      // row 0
814        st1             {v21.d}[0], [x6], x1      // row 1
815        subs            w4,  w4,  #4              // four rows done
816        st1             {v20.d}[1], [x0], x1      // row 2
817        st1             {v20.d}[0], [x6], x1      // row 3
818        b.gt            4b
819        ret
82080:
821160:
822320:
823640:
        // Widths 8..64 share this path: four rows per pass (label 1),
        // 8 pixels of each row per inner iteration (label 2).
824        ld1             {v5.8h},  [x8], #16       // v5 = first 8 top pixels
825        mov             w9,  w3                   // keep width for rewinding after each row group
826        // Set up pointers for four rows in parallel; x0, x6, x5, x10
827        add             x5,  x0,  x1
828        add             x10, x6,  x1
829        lsl             x1,  x1,  #1              // x1 = 4 * stride
830        sub             x1,  x1,  w3, uxtw #1     // minus the bytes written per row (2 * width)
8311:
        // v0..v3 = topleft[-4], [-3], [-2], [-1] (left of rows 3, 2, 1, 0)
832        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
8332:
834        sub             v6.8h,   v5.8h,   v4.8h   // top - topleft
835        add             v16.8h,  v6.8h,   v0.8h   // base
836        add             v17.8h,  v6.8h,   v1.8h
837        add             v18.8h,  v6.8h,   v2.8h
838        add             v19.8h,  v6.8h,   v3.8h
839        sabd            v20.8h,  v5.8h,   v16.8h  // tdiff
840        sabd            v21.8h,  v5.8h,   v17.8h
841        sabd            v22.8h,  v5.8h,   v18.8h
842        sabd            v23.8h,  v5.8h,   v19.8h
843        sabd            v24.8h,  v4.8h,   v16.8h  // tldiff
844        sabd            v25.8h,  v4.8h,   v17.8h
845        sabd            v26.8h,  v4.8h,   v18.8h
846        sabd            v27.8h,  v4.8h,   v19.8h
847        sabd            v16.8h,  v0.8h,   v16.8h  // ldiff
848        sabd            v17.8h,  v1.8h,   v17.8h
849        sabd            v18.8h,  v2.8h,   v18.8h
850        sabd            v19.8h,  v3.8h,   v19.8h
851        umin            v28.8h,  v20.8h,  v24.8h  // min(tdiff, tldiff)
852        umin            v29.8h,  v21.8h,  v25.8h
853        umin            v30.8h,  v22.8h,  v26.8h
854        umin            v31.8h,  v23.8h,  v27.8h
855        cmge            v20.8h,  v24.8h,  v20.8h  // tldiff >= tdiff
856        cmge            v21.8h,  v25.8h,  v21.8h
857        cmge            v22.8h,  v26.8h,  v22.8h
858        cmge            v23.8h,  v27.8h,  v23.8h
859        cmge            v16.8h,  v28.8h,  v16.8h  // min(tdiff, tldiff) >= ldiff
860        cmge            v17.8h,  v29.8h,  v17.8h
861        cmge            v18.8h,  v30.8h,  v18.8h
862        cmge            v19.8h,  v31.8h,  v19.8h
863        bsl             v23.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
864        bsl             v22.16b, v5.16b,  v4.16b
865        bsl             v21.16b, v5.16b,  v4.16b
866        bsl             v20.16b, v5.16b,  v4.16b
867        bit             v23.16b, v3.16b,  v19.16b // ldiff <= min ? left : ...
868        bit             v22.16b, v2.16b,  v18.16b
869        bit             v21.16b, v1.16b,  v17.16b
870        bit             v20.16b, v0.16b,  v16.16b
871        st1             {v23.8h}, [x0], #16       // row 0
872        st1             {v22.8h}, [x6], #16       // row 1
873        subs            w3,  w3,  #8              // 8 pixels of each row done
874        st1             {v21.8h}, [x5], #16       // row 2
875        st1             {v20.8h}, [x10], #16      // row 3
876        b.le            8f                        // end of this row group
877        ld1             {v5.8h},  [x8], #16       // next 8 top pixels
878        b               2b
8798:
880        subs            w4,  w4,  #4              // four rows done
881        b.le            9f
882        // End of horizontal loop, move pointers to next four rows
883        sub             x8,  x8,  w9, uxtw #1     // rewind the top pointer by 2 * width bytes
884        add             x0,  x0,  x1
885        add             x6,  x6,  x1
886        // Load the top row as early as possible
887        ld1             {v5.8h},  [x8], #16
888        add             x5,  x5,  x1
889        add             x10, x10, x1
890        mov             w3,  w9                   // restore the horizontal counter
891        b               1b
8929:
893        ret
894
895L(ipred_paeth_tbl):
        // Offsets (table - target), indexed by clz(width) - 25:
        // index 0 is reached for width 64, index 4 for width 4.
896        .hword L(ipred_paeth_tbl) - 640b
897        .hword L(ipred_paeth_tbl) - 320b
898        .hword L(ipred_paeth_tbl) - 160b
899        .hword L(ipred_paeth_tbl) -  80b
900        .hword L(ipred_paeth_tbl) -  40b
901endfunc
902
903// void ipred_smooth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
904//                              const pixel *const topleft,
905//                              const int width, const int height, const int a,
906//                              const int max_width, const int max_height);
907function ipred_smooth_16bpc_neon, export=1
        // Smooth intra prediction for 16-bit pixels.
        // Arguments (see the prototype above): x0 = dst, x1 = stride (added
        // directly to the dst byte pointer), x2 = topleft, w3 = width,
        // w4 = height.
        // Each output pixel is computed as
        //   (256*(bottom+right) + (left-right)*weights_hor
        //                       + (top-bottom)*weights_ver + 256) >> 9
        // with bottom = topleft[-height] and right = the last pixel of the
        // top row. The weights are bytes read from sm_weights at offsets
        // width (horizontal) and height (vertical).
        // NOTE(review): the table lookup assumes width is a power of two in
        // 4..64 - confirm against the callers.
908        movrel          x10, X(sm_weights)
909        add             x11, x10, w4, uxtw        // x11 = sm_weights + height (weights_ver)
910        add             x10, x10, w3, uxtw        // x10 = sm_weights + width (weights_hor)
911        clz             w9,  w3
912        adr             x5,  L(ipred_smooth_tbl)
913        sub             x12, x2,  w4, uxtw #1     // x12 = &topleft[-height]
914        sub             w9,  w9,  #25             // table index = clz(width) - 25
915        ldrh            w9,  [x5, w9, uxtw #1]    // 16-bit entry = table - target
916        ld1r            {v4.8h},  [x12] // bottom
917        add             x8,  x2,  #2              // x8 = &topleft[1], start of the top row
918        sub             x5,  x5,  w9, uxtw        // x5 = address of the selected path
919        add             x6,  x0,  x1              // x6 = dst + stride (second row)
920        lsl             x1,  x1,  #1              // x1 = 2 * stride
921        br              x5
92240:
        // Width 4: four rows per loop iteration.
923        ld1r            {v6.2d}, [x8]             // top
924        ld1r            {v7.2s}, [x10]            // weights_hor
925        sub             x2,  x2,  #8              // x2 = &topleft[-4], left pixels of four rows
926        mov             x7,  #-8                  // left pointer steps back 4 pixels per pass
927        dup             v5.8h,   v6.h[3]          // right
928        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
929        uxtl            v7.8h,   v7.8b            // weights_hor
930        add             v31.4h,  v4.4h,   v5.4h   // bottom+right
9314:
932        ld4r            {v0.4h, v1.4h, v2.4h, v3.4h},  [x2], x7 // left
933        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x11], #4 // weights_ver
934        ushll           v20.4s,  v31.4h,  #8      // (bottom+right)*256
935        ushll           v21.4s,  v31.4h,  #8
936        ushll           v22.4s,  v31.4h,  #8
937        ushll           v23.4s,  v31.4h,  #8
        // ld4r returned topleft[-4..-1] in v0..v3; after the zips v0 holds
        // the left pixels of rows 0 (low) and 1 (high), v1 rows 2 and 3.
938        zip1            v1.2d,   v1.2d,   v0.2d   // left, flipped
939        zip1            v0.2d,   v3.2d,   v2.2d
940        zip1            v16.2s,  v16.2s,  v17.2s  // weights_ver
941        zip1            v18.2s,  v18.2s,  v19.2s
942        sub             v0.8h,   v0.8h,   v5.8h   // left-right
943        sub             v1.8h,   v1.8h,   v5.8h
944        uxtl            v16.8h,  v16.8b           // weights_ver
945        uxtl            v18.8h,  v18.8b
946        smlal           v20.4s,  v0.4h,   v7.4h   // += (left-right)*weights_hor
947        smlal2          v21.4s,  v0.8h,   v7.8h
948        smlal           v22.4s,  v1.4h,   v7.4h
949        smlal2          v23.4s,  v1.8h,   v7.8h
950        smlal           v20.4s,  v6.4h,   v16.4h  // += (top-bottom)*weights_ver
951        smlal2          v21.4s,  v6.8h,   v16.8h
952        smlal           v22.4s,  v6.4h,   v18.4h
953        smlal2          v23.4s,  v6.8h,   v18.8h
954        rshrn           v20.4h,  v20.4s,  #9      // rounding shift: (sum + 256) >> 9
955        rshrn           v21.4h,  v21.4s,  #9
956        rshrn           v22.4h,  v22.4s,  #9
957        rshrn           v23.4h,  v23.4s,  #9
958        st1             {v20.4h}, [x0], x1        // row 0
959        st1             {v21.4h}, [x6], x1        // row 1
960        subs            w4,  w4,  #4              // four rows done
961        st1             {v22.4h}, [x0], x1        // row 2
962        st1             {v23.4h}, [x6], x1        // row 3
963        b.gt            4b
964        ret
96580:
        // Width 8: four rows per loop iteration, one full vector per row.
966        ld1             {v6.8h}, [x8]             // top
967        ld1             {v7.8b}, [x10]            // weights_hor
968        sub             x2,  x2,  #8              // x2 = &topleft[-4], left pixels of four rows
969        mov             x7,  #-8                  // left pointer steps back 4 pixels per pass
970        dup             v5.8h,   v6.h[7]          // right
971        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
972        uxtl            v7.8h,   v7.8b            // weights_hor
973        add             v31.4h,  v4.4h,   v5.4h   // bottom+right
9748:
975        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7 // left
976        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x11], #4 // weights_ver
977        ushll           v20.4s,  v31.4h,  #8      // (bottom+right)*256
978        ushll           v21.4s,  v31.4h,  #8
979        ushll           v22.4s,  v31.4h,  #8
980        ushll           v23.4s,  v31.4h,  #8
981        ushll           v24.4s,  v31.4h,  #8
982        ushll           v25.4s,  v31.4h,  #8
983        ushll           v26.4s,  v31.4h,  #8
984        ushll           v27.4s,  v31.4h,  #8
985        sub             v0.8h,   v0.8h,   v5.8h   // left-right
986        sub             v1.8h,   v1.8h,   v5.8h
987        sub             v2.8h,   v2.8h,   v5.8h
988        sub             v3.8h,   v3.8h,   v5.8h
989        uxtl            v16.8h,  v16.8b           // weights_ver
990        uxtl            v17.8h,  v17.8b
991        uxtl            v18.8h,  v18.8b
992        uxtl            v19.8h,  v19.8b
993        smlal           v20.4s,  v3.4h,   v7.4h   // += (left-right)*weights_hor
994        smlal2          v21.4s,  v3.8h,   v7.8h   // (left flipped)
995        smlal           v22.4s,  v2.4h,   v7.4h
996        smlal2          v23.4s,  v2.8h,   v7.8h
997        smlal           v24.4s,  v1.4h,   v7.4h
998        smlal2          v25.4s,  v1.8h,   v7.8h
999        smlal           v26.4s,  v0.4h,   v7.4h
1000        smlal2          v27.4s,  v0.8h,   v7.8h
1001        smlal           v20.4s,  v6.4h,   v16.4h  // += (top-bottom)*weights_ver
1002        smlal2          v21.4s,  v6.8h,   v16.8h
1003        smlal           v22.4s,  v6.4h,   v17.4h
1004        smlal2          v23.4s,  v6.8h,   v17.8h
1005        smlal           v24.4s,  v6.4h,   v18.4h
1006        smlal2          v25.4s,  v6.8h,   v18.8h
1007        smlal           v26.4s,  v6.4h,   v19.4h
1008        smlal2          v27.4s,  v6.8h,   v19.8h
1009        rshrn           v20.4h,  v20.4s,  #9      // rounding shift: (sum + 256) >> 9
1010        rshrn2          v20.8h,  v21.4s,  #9
1011        rshrn           v21.4h,  v22.4s,  #9
1012        rshrn2          v21.8h,  v23.4s,  #9
1013        rshrn           v22.4h,  v24.4s,  #9
1014        rshrn2          v22.8h,  v25.4s,  #9
1015        rshrn           v23.4h,  v26.4s,  #9
1016        rshrn2          v23.8h,  v27.4s,  #9
1017        st1             {v20.8h}, [x0], x1        // row 0
1018        st1             {v21.8h}, [x6], x1        // row 1
1019        subs            w4,  w4,  #4              // four rows done
1020        st1             {v22.8h}, [x0], x1        // row 2
1021        st1             {v23.8h}, [x6], x1        // row 3
1022        b.gt            8b
1023        ret
1024160:
1025320:
1026640:
        // Widths 16..64 share this path: two rows per pass (label 1),
        // 16 pixels of each row per inner iteration (label 2).
1027        add             x12, x2,  w3, uxtw #1     // x12 = &topleft[width]
1028        sub             x1,  x1,  w3, uxtw #1     // x1 = 2 * stride - bytes written per row
1029        ld1r            {v5.8h}, [x12]            // right
1030        sub             x2,  x2,  #4              // x2 = &topleft[-2], left pixels of two rows
1031        mov             x7,  #-4                  // left pointer steps back 2 pixels per pass
1032        mov             w9,  w3                   // keep width for rewinding after each row pair
1033        add             v31.4h,  v4.4h,   v5.4h   // bottom+right
1034
10351:
1036        ld2r            {v0.8h, v1.8h},   [x2],  x7 // left
1037        ld2r            {v16.8b, v17.8b}, [x11], #2 // weights_ver
1038        sub             v0.8h,   v0.8h,   v5.8h   // left-right
1039        sub             v1.8h,   v1.8h,   v5.8h
1040        uxtl            v16.8h,  v16.8b           // weights_ver
1041        uxtl            v17.8h,  v17.8b
10422:
1043        ld1             {v7.16b}, [x10],  #16     // weights_hor
1044        ld1             {v2.8h, v3.8h}, [x8], #32 // top
1045        ushll           v20.4s,  v31.4h,  #8      // (bottom+right)*256
1046        ushll           v21.4s,  v31.4h,  #8
1047        ushll           v22.4s,  v31.4h,  #8
1048        ushll           v23.4s,  v31.4h,  #8
1049        ushll           v24.4s,  v31.4h,  #8
1050        ushll           v25.4s,  v31.4h,  #8
1051        ushll           v26.4s,  v31.4h,  #8
1052        ushll           v27.4s,  v31.4h,  #8
1053        uxtl            v6.8h,   v7.8b            // weights_hor
1054        uxtl2           v7.8h,   v7.16b
1055        sub             v2.8h,   v2.8h,   v4.8h   // top-bottom
1056        sub             v3.8h,   v3.8h,   v4.8h
1057        smlal           v20.4s,  v1.4h,   v6.4h   // += (left-right)*weights_hor
1058        smlal2          v21.4s,  v1.8h,   v6.8h   // (left flipped)
1059        smlal           v22.4s,  v1.4h,   v7.4h
1060        smlal2          v23.4s,  v1.8h,   v7.8h
1061        smlal           v24.4s,  v0.4h,   v6.4h
1062        smlal2          v25.4s,  v0.8h,   v6.8h
1063        smlal           v26.4s,  v0.4h,   v7.4h
1064        smlal2          v27.4s,  v0.8h,   v7.8h
1065        smlal           v20.4s,  v2.4h,   v16.4h  // += (top-bottom)*weights_ver
1066        smlal2          v21.4s,  v2.8h,   v16.8h
1067        smlal           v22.4s,  v3.4h,   v16.4h
1068        smlal2          v23.4s,  v3.8h,   v16.8h
1069        smlal           v24.4s,  v2.4h,   v17.4h
1070        smlal2          v25.4s,  v2.8h,   v17.8h
1071        smlal           v26.4s,  v3.4h,   v17.4h
1072        smlal2          v27.4s,  v3.8h,   v17.8h
1073        rshrn           v20.4h,  v20.4s,  #9      // rounding shift: (sum + 256) >> 9
1074        rshrn2          v20.8h,  v21.4s,  #9
1075        rshrn           v21.4h,  v22.4s,  #9
1076        rshrn2          v21.8h,  v23.4s,  #9
1077        rshrn           v22.4h,  v24.4s,  #9
1078        rshrn2          v22.8h,  v25.4s,  #9
1079        rshrn           v23.4h,  v26.4s,  #9
1080        rshrn2          v23.8h,  v27.4s,  #9
1081        subs            w3,  w3,  #16             // 16 pixels of each row done
1082        st1             {v20.8h, v21.8h}, [x0], #32 // first row of the pair
1083        st1             {v22.8h, v23.8h}, [x6], #32 // second row of the pair
1084        b.gt            2b
1085        subs            w4,  w4,  #2              // two rows done
1086        b.le            9f
1087        sub             x8,  x8,  w9, uxtw #1     // rewind top by 2 * width bytes
1088        sub             x10, x10, w9, uxtw        // rewind weights_hor by width bytes
1089        add             x0,  x0,  x1
1090        add             x6,  x6,  x1
1091        mov             w3,  w9                   // restore the horizontal counter
1092        b               1b
10939:
1094        ret
1095
1096L(ipred_smooth_tbl):
        // Offsets (table - target), indexed by clz(width) - 25:
        // index 0 is reached for width 64, index 4 for width 4.
1097        .hword L(ipred_smooth_tbl) - 640b
1098        .hword L(ipred_smooth_tbl) - 320b
1099        .hword L(ipred_smooth_tbl) - 160b
1100        .hword L(ipred_smooth_tbl) -  80b
1101        .hword L(ipred_smooth_tbl) -  40b
1102endfunc
1103
1104// void ipred_smooth_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
1105//                                const pixel *const topleft,
1106//                                const int width, const int height, const int a,
1107//                                const int max_width, const int max_height);
1108function ipred_smooth_v_16bpc_neon, export=1
        // Vertical-only smooth intra prediction for 16-bit pixels.
        // Arguments (see the prototype above): x0 = dst, x1 = stride (added
        // directly to the dst byte pointer), x2 = topleft, w3 = width,
        // w4 = height.
        // Each output pixel is
        //   bottom + (((top-bottom)*weights_ver + 128) >> 8)
        // with bottom = topleft[-height] and weights_ver read as bytes from
        // sm_weights + height, one per row.
        // NOTE(review): the table lookup assumes width is a power of two in
        // 4..64 - confirm against the callers.
1109        movrel          x7,  X(sm_weights)
1110        add             x7,  x7,  w4, uxtw        // x7 = sm_weights + height (weights_ver)
1111        clz             w9,  w3
1112        adr             x5,  L(ipred_smooth_v_tbl)
1113        sub             x8,  x2,  w4, uxtw #1     // x8 = &topleft[-height]
1114        sub             w9,  w9,  #25             // table index = clz(width) - 25
1115        ldrh            w9,  [x5, w9, uxtw #1]    // 16-bit entry = table - target
1116        ld1r            {v4.8h},  [x8] // bottom
1117        add             x2,  x2,  #2              // x2 = &topleft[1], start of the top row
1118        sub             x5,  x5,  w9, uxtw        // x5 = address of the selected path
1119        add             x6,  x0,  x1              // x6 = dst + stride (second row)
1120        lsl             x1,  x1,  #1              // x1 = 2 * stride
1121        br              x5
112240:
        // Width 4: two rows share one vector, four rows per loop iteration.
1123        ld1r            {v6.2d}, [x2]             // top
1124        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
11254:
1126        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
1127        zip1            v16.2s,  v16.2s,  v17.2s  // weights_ver
1128        zip1            v18.2s,  v18.2s,  v19.2s
        // With the weight pre-shifted by 7, sqrdmulh's (2*a*b + 0x8000) >> 16
        // evaluates to (a*weight + 128) >> 8.
1129        ushll           v16.8h,  v16.8b,  #7      // weights_ver << 7
1130        ushll           v18.8h,  v18.8b,  #7
1131        sqrdmulh        v20.8h,  v6.8h,   v16.8h  // ((top-bottom)*weights_ver + 128) >> 8
1132        sqrdmulh        v21.8h,  v6.8h,   v18.8h
1133        add             v20.8h,  v20.8h,  v4.8h   // + bottom
1134        add             v21.8h,  v21.8h,  v4.8h
1135        st1             {v20.d}[0], [x0], x1      // row 0
1136        st1             {v20.d}[1], [x6], x1      // row 1
1137        subs            w4,  w4,  #4              // four rows done
1138        st1             {v21.d}[0], [x0], x1      // row 2
1139        st1             {v21.d}[1], [x6], x1      // row 3
1140        b.gt            4b
1141        ret
114280:
        // Width 8: four rows per loop iteration, one full vector per row.
1143        ld1             {v6.8h}, [x2]             // top
1144        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
11458:
1146        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
1147        ushll           v16.8h,  v16.8b,  #7      // weights_ver << 7
1148        ushll           v17.8h,  v17.8b,  #7
1149        ushll           v18.8h,  v18.8b,  #7
1150        ushll           v19.8h,  v19.8b,  #7
1151        sqrdmulh        v20.8h,  v6.8h,   v16.8h  // ((top-bottom)*weights_ver + 128) >> 8
1152        sqrdmulh        v21.8h,  v6.8h,   v17.8h
1153        sqrdmulh        v22.8h,  v6.8h,   v18.8h
1154        sqrdmulh        v23.8h,  v6.8h,   v19.8h
1155        add             v20.8h,  v20.8h,  v4.8h   // + bottom
1156        add             v21.8h,  v21.8h,  v4.8h
1157        add             v22.8h,  v22.8h,  v4.8h
1158        add             v23.8h,  v23.8h,  v4.8h
1159        st1             {v20.8h}, [x0], x1        // row 0
1160        st1             {v21.8h}, [x6], x1        // row 1
1161        subs            w4,  w4,  #4              // four rows done
1162        st1             {v22.8h}, [x0], x1        // row 2
1163        st1             {v23.8h}, [x6], x1        // row 3
1164        b.gt            8b
1165        ret
1166160:
1167320:
1168640:
        // Widths 16..64 share this path: four rows per pass (label 1),
        // 16 pixels of each row per inner iteration (label 2).
1169        // Set up pointers for four rows in parallel; x0, x6, x5, x8
1170        add             x5,  x0,  x1
1171        add             x8,  x6,  x1
1172        lsl             x1,  x1,  #1              // x1 = 4 * stride
1173        sub             x1,  x1,  w3, uxtw #1     // minus the bytes written per row (2 * width)
1174        mov             w9,  w3                   // keep width for rewinding after each row group
1175
11761:
1177        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
1178        ushll           v16.8h,  v16.8b,  #7      // weights_ver << 7
1179        ushll           v17.8h,  v17.8b,  #7
1180        ushll           v18.8h,  v18.8b,  #7
1181        ushll           v19.8h,  v19.8b,  #7
11822:
1183        ld1             {v2.8h, v3.8h}, [x2], #32 // top
1184        sub             v2.8h,   v2.8h,   v4.8h   // top-bottom
1185        sub             v3.8h,   v3.8h,   v4.8h
1186        sqrdmulh        v20.8h,  v2.8h,   v16.8h  // ((top-bottom)*weights_ver + 128) >> 8
1187        sqrdmulh        v21.8h,  v3.8h,   v16.8h
1188        sqrdmulh        v22.8h,  v2.8h,   v17.8h
1189        sqrdmulh        v23.8h,  v3.8h,   v17.8h
1190        sqrdmulh        v24.8h,  v2.8h,   v18.8h
1191        sqrdmulh        v25.8h,  v3.8h,   v18.8h
1192        sqrdmulh        v26.8h,  v2.8h,   v19.8h
1193        sqrdmulh        v27.8h,  v3.8h,   v19.8h
1194        add             v20.8h,  v20.8h,  v4.8h   // + bottom
1195        add             v21.8h,  v21.8h,  v4.8h
1196        add             v22.8h,  v22.8h,  v4.8h
1197        add             v23.8h,  v23.8h,  v4.8h
1198        add             v24.8h,  v24.8h,  v4.8h
1199        add             v25.8h,  v25.8h,  v4.8h
1200        add             v26.8h,  v26.8h,  v4.8h
1201        add             v27.8h,  v27.8h,  v4.8h
1202        subs            w3,  w3,  #16             // 16 pixels of each row done
1203        st1             {v20.8h, v21.8h}, [x0], #32 // row 0
1204        st1             {v22.8h, v23.8h}, [x6], #32 // row 1
1205        st1             {v24.8h, v25.8h}, [x5], #32 // row 2
1206        st1             {v26.8h, v27.8h}, [x8], #32 // row 3
1207        b.gt            2b
1208        subs            w4,  w4,  #4              // four rows done
1209        b.le            9f
1210        sub             x2,  x2,  w9, uxtw #1     // rewind top by 2 * width bytes
1211        add             x0,  x0,  x1
1212        add             x6,  x6,  x1
1213        add             x5,  x5,  x1
1214        add             x8,  x8,  x1
1215        mov             w3,  w9                   // restore the horizontal counter
1216        b               1b
12179:
1218        ret
1219
1220L(ipred_smooth_v_tbl):
        // Offsets (table - target), indexed by clz(width) - 25:
        // index 0 is reached for width 64, index 4 for width 4.
1221        .hword L(ipred_smooth_v_tbl) - 640b
1222        .hword L(ipred_smooth_v_tbl) - 320b
1223        .hword L(ipred_smooth_v_tbl) - 160b
1224        .hword L(ipred_smooth_v_tbl) -  80b
1225        .hword L(ipred_smooth_v_tbl) -  40b
1226endfunc
1227
1228// void ipred_smooth_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
1229//                                const pixel *const topleft,
1230//                                const int width, const int height, const int a,
1231//                                const int max_width, const int max_height);
1232function ipred_smooth_h_16bpc_neon, export=1
        // Horizontal-only smooth intra prediction for 16-bit pixels.
        // Arguments (see the prototype above): x0 = dst, x1 = stride (added
        // directly to the dst byte pointer), x2 = topleft, w3 = width,
        // w4 = height.
        // Each output pixel is
        //   right + (((left-right)*weights_hor + 128) >> 8)
        // with right = topleft[width] and weights_hor read as bytes from
        // sm_weights + width, one per column.
        // NOTE(review): the table lookup assumes width is a power of two in
        // 4..64 - confirm against the callers.
1233        movrel          x8,  X(sm_weights)
1234        add             x8,  x8,  w3, uxtw        // x8 = sm_weights + width (weights_hor)
1235        clz             w9,  w3
1236        adr             x5,  L(ipred_smooth_h_tbl)
1237        add             x12, x2,  w3, uxtw #1     // x12 = &topleft[width]
1238        sub             w9,  w9,  #25             // table index = clz(width) - 25
1239        ldrh            w9,  [x5, w9, uxtw #1]    // 16-bit entry = table - target
1240        ld1r            {v5.8h},  [x12] // right
1241        sub             x5,  x5,  w9, uxtw        // x5 = address of the selected path
1242        add             x6,  x0,  x1              // x6 = dst + stride (second row)
1243        lsl             x1,  x1,  #1              // x1 = 2 * stride
1244        br              x5
124540:
        // Width 4: two rows share one vector, four rows per loop iteration.
1246        ld1r            {v7.2s}, [x8]             // weights_hor
1247        sub             x2,  x2,  #8              // x2 = &topleft[-4], left pixels of four rows
1248        mov             x7,  #-8                  // left pointer steps back 4 pixels per pass
        // With the weight pre-shifted by 7, sqrdmulh's (2*a*b + 0x8000) >> 16
        // evaluates to (a*weight + 128) >> 8.
1249        ushll           v7.8h,   v7.8b,   #7      // weights_hor << 7
12504:
1251        ld4r            {v0.4h, v1.4h, v2.4h, v3.4h},  [x2], x7 // left
        // ld4r returned topleft[-4..-1] in v0..v3; after the zips v0 holds
        // the left pixels of rows 0 (low) and 1 (high), v1 rows 2 and 3.
1252        zip1            v1.2d,   v1.2d,   v0.2d   // left, flipped
1253        zip1            v0.2d,   v3.2d,   v2.2d
1254        sub             v0.8h,   v0.8h,   v5.8h   // left-right
1255        sub             v1.8h,   v1.8h,   v5.8h
1256        sqrdmulh        v20.8h,  v0.8h,   v7.8h   // ((left-right)*weights_hor + 128) >> 8
1257        sqrdmulh        v21.8h,  v1.8h,   v7.8h
1258        add             v20.8h,  v20.8h,  v5.8h   // + right
1259        add             v21.8h,  v21.8h,  v5.8h
1260        st1             {v20.d}[0], [x0], x1      // row 0
1261        st1             {v20.d}[1], [x6], x1      // row 1
1262        subs            w4,  w4,  #4              // four rows done
1263        st1             {v21.d}[0], [x0], x1      // row 2
1264        st1             {v21.d}[1], [x6], x1      // row 3
1265        b.gt            4b
1266        ret
126780:
        // Width 8: four rows per loop iteration, one full vector per row.
1268        ld1             {v7.8b}, [x8]             // weights_hor
1269        sub             x2,  x2,  #8              // x2 = &topleft[-4], left pixels of four rows
1270        mov             x7,  #-8                  // left pointer steps back 4 pixels per pass
1271        ushll           v7.8h,   v7.8b,   #7      // weights_hor << 7
12728:
1273        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7 // left
1274        sub             v3.8h,   v3.8h,   v5.8h   // left-right
1275        sub             v2.8h,   v2.8h,   v5.8h
1276        sub             v1.8h,   v1.8h,   v5.8h
1277        sub             v0.8h,   v0.8h,   v5.8h
1278        sqrdmulh        v20.8h,  v3.8h,   v7.8h   // ((left-right)*weights_hor + 128) >> 8
1279        sqrdmulh        v21.8h,  v2.8h,   v7.8h   // (left flipped)
1280        sqrdmulh        v22.8h,  v1.8h,   v7.8h
1281        sqrdmulh        v23.8h,  v0.8h,   v7.8h
1282        add             v20.8h,  v20.8h,  v5.8h   // + right
1283        add             v21.8h,  v21.8h,  v5.8h
1284        add             v22.8h,  v22.8h,  v5.8h
1285        add             v23.8h,  v23.8h,  v5.8h
1286        st1             {v20.8h}, [x0], x1        // row 0
1287        st1             {v21.8h}, [x6], x1        // row 1
1288        subs            w4,  w4,  #4              // four rows done
1289        st1             {v22.8h}, [x0], x1        // row 2
1290        st1             {v23.8h}, [x6], x1        // row 3
1291        b.gt            8b
1292        ret
1293160:
1294320:
1295640:
        // Widths 16..64 share this path: four rows per pass (label 1),
        // 16 pixels of each row per inner iteration (label 2).
1296        sub             x2,  x2,  #8              // x2 = &topleft[-4], left pixels of four rows
1297        mov             x7,  #-8                  // left pointer steps back 4 pixels per pass
1298        // Set up pointers for four rows in parallel; x0, x6, x5, x10
1299        add             x5,  x0,  x1
1300        add             x10, x6,  x1
1301        lsl             x1,  x1,  #1              // x1 = 4 * stride
1302        sub             x1,  x1,  w3, uxtw #1     // minus the bytes written per row (2 * width)
1303        mov             w9,  w3                   // keep width for rewinding after each row group
1304
13051:
1306        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},   [x2],  x7 // left
1307        sub             v0.8h,   v0.8h,   v5.8h   // left-right
1308        sub             v1.8h,   v1.8h,   v5.8h
1309        sub             v2.8h,   v2.8h,   v5.8h
1310        sub             v3.8h,   v3.8h,   v5.8h
13112:
1312        ld1             {v7.16b}, [x8],   #16     // weights_hor
1313        ushll           v6.8h,   v7.8b,   #7      // weights_hor << 7
1314        ushll2          v7.8h,   v7.16b,  #7
1315        sqrdmulh        v20.8h,  v3.8h,   v6.8h   // ((left-right)*weights_hor + 128) >> 8
1316        sqrdmulh        v21.8h,  v3.8h,   v7.8h   // (left flipped)
1317        sqrdmulh        v22.8h,  v2.8h,   v6.8h
1318        sqrdmulh        v23.8h,  v2.8h,   v7.8h
1319        sqrdmulh        v24.8h,  v1.8h,   v6.8h
1320        sqrdmulh        v25.8h,  v1.8h,   v7.8h
1321        sqrdmulh        v26.8h,  v0.8h,   v6.8h
1322        sqrdmulh        v27.8h,  v0.8h,   v7.8h
1323        add             v20.8h,  v20.8h,  v5.8h   // + right
1324        add             v21.8h,  v21.8h,  v5.8h
1325        add             v22.8h,  v22.8h,  v5.8h
1326        add             v23.8h,  v23.8h,  v5.8h
1327        add             v24.8h,  v24.8h,  v5.8h
1328        add             v25.8h,  v25.8h,  v5.8h
1329        add             v26.8h,  v26.8h,  v5.8h
1330        add             v27.8h,  v27.8h,  v5.8h
1331        subs            w3,  w3,  #16             // 16 pixels of each row done
1332        st1             {v20.8h, v21.8h}, [x0],  #32 // row 0
1333        st1             {v22.8h, v23.8h}, [x6],  #32 // row 1
1334        st1             {v24.8h, v25.8h}, [x5],  #32 // row 2
1335        st1             {v26.8h, v27.8h}, [x10], #32 // row 3
1336        b.gt            2b
1337        subs            w4,  w4,  #4              // four rows done
1338        b.le            9f
1339        sub             x8,  x8,  w9, uxtw        // rewind weights_hor by width bytes
1340        add             x0,  x0,  x1
1341        add             x6,  x6,  x1
1342        add             x5,  x5,  x1
1343        add             x10, x10, x1
1344        mov             w3,  w9                   // restore the horizontal counter
1345        b               1b
13469:
1347        ret
1348
1349L(ipred_smooth_h_tbl):
        // Offsets (table - target), indexed by clz(width) - 25:
        // index 0 is reached for width 64, index 4 for width 4.
1350        .hword L(ipred_smooth_h_tbl) - 640b
1351        .hword L(ipred_smooth_h_tbl) - 320b
1352        .hword L(ipred_smooth_h_tbl) - 160b
1353        .hword L(ipred_smooth_h_tbl) -  80b
1354        .hword L(ipred_smooth_h_tbl) -  40b
1355endfunc
1356
1357// void ipred_filter_16bpc_neon(pixel *dst, const ptrdiff_t stride,
1358//                              const pixel *const topleft,
1359//                              const int width, const int height, const int filt_idx,
1360//                              const int max_width, const int max_height,
1361//                              const int bitdepth_max);
1362.macro filter_fn bpc
1363function ipred_filter_\bpc\()bpc_neon
1364        and             w5,  w5,  #511
1365        movrel          x6,  X(filter_intra_taps)
1366        lsl             w5,  w5,  #6
1367        add             x6,  x6,  w5, uxtw
1368        ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
1369        clz             w9,  w3
1370        adr             x5,  L(ipred_filter\bpc\()_tbl)
1371        ld1             {v20.8b, v21.8b, v22.8b}, [x6]
1372        sub             w9,  w9,  #26
1373        ldrh            w9,  [x5, w9, uxtw #1]
1374        sxtl            v16.8h,  v16.8b
1375        sxtl            v17.8h,  v17.8b
1376        sub             x5,  x5,  w9, uxtw
1377        sxtl            v18.8h,  v18.8b
1378        sxtl            v19.8h,  v19.8b
1379        add             x6,  x0,  x1
1380        lsl             x1,  x1,  #1
1381        sxtl            v20.8h,  v20.8b
1382        sxtl            v21.8h,  v21.8b
1383        sxtl            v22.8h,  v22.8b
1384        dup             v31.8h,  w8
1385        movi            v30.8h,  #0
1386        br              x5
138740:
1388        ldur            d0,  [x2, #2]             // top (0-3)
1389        sub             x2,  x2,  #4
1390        mov             x7,  #-4
13914:
1392        ld1             {v1.4h}, [x2], x7         // left (0-1) + topleft (2)
1393.if \bpc == 10
1394        mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
1395        mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
1396        mla             v2.8h,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
1397        mla             v2.8h,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
1398        mla             v2.8h,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
1399        mla             v2.8h,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
1400        mla             v2.8h,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
1401        srshr           v2.8h,   v2.8h,   #4
1402        smax            v2.8h,   v2.8h,   v30.8h
1403.else
1404        smull           v2.4s,   v17.4h,  v0.h[0] // p1(top[0]) * filter(1)
1405        smlal           v2.4s,   v18.4h,  v0.h[1] // p2(top[1]) * filter(2)
1406        smlal           v2.4s,   v19.4h,  v0.h[2] // p3(top[2]) * filter(3)
1407        smlal           v2.4s,   v20.4h,  v0.h[3] // p4(top[3]) * filter(4)
1408        smlal           v2.4s,   v16.4h,  v1.h[2] // p0(topleft) * filter(0)
1409        smlal           v2.4s,   v21.4h,  v1.h[1] // p5(left[0]) * filter(5)
1410        smlal           v2.4s,   v22.4h,  v1.h[0] // p6(left[1]) * filter(6)
1411        smull2          v3.4s,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
1412        smlal2          v3.4s,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
1413        smlal2          v3.4s,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
1414        smlal2          v3.4s,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
1415        smlal2          v3.4s,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
1416        smlal2          v3.4s,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
1417        smlal2          v3.4s,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
1418        sqrshrun        v2.4h,   v2.4s,   #4
1419        sqrshrun2       v2.8h,   v3.4s,   #4
1420.endif
1421        smin            v2.8h,   v2.8h,   v31.8h
1422        subs            w4,  w4,  #2
1423        st1             {v2.d}[0], [x0], x1
1424        uxtl            v0.8h,   v2.8b
1425        ext             v0.16b,  v2.16b,  v2.16b, #8 // move top from [4-7] to [0-3]
1426        st1             {v2.d}[1], [x6], x1
1427        b.gt            4b
1428        ret
142980:
1430        ldur            q0,  [x2, #2]             // top (0-7)
1431        sub             x2,  x2,  #4
1432        mov             x7,  #-4
14338:
1434        ld1             {v1.4h}, [x2], x7         // left (0-1) + topleft (2)
1435.if \bpc == 10
1436        mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
1437        mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
1438        mla             v2.8h,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
1439        mla             v2.8h,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
1440        mla             v2.8h,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
1441        mla             v2.8h,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
1442        mla             v2.8h,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
1443        mul             v3.8h,   v17.8h,  v0.h[4] // p1(top[0]) * filter(1)
1444        mla             v3.8h,   v18.8h,  v0.h[5] // p2(top[1]) * filter(2)
1445        mla             v3.8h,   v19.8h,  v0.h[6] // p3(top[2]) * filter(3)
1446        srshr           v2.8h,   v2.8h,   #4
1447        smax            v2.8h,   v2.8h,   v30.8h
1448        smin            v2.8h,   v2.8h,   v31.8h
1449        mla             v3.8h,   v20.8h,  v0.h[7] // p4(top[3]) * filter(4)
1450        mla             v3.8h,   v16.8h,  v0.h[3] // p0(topleft) * filter(0)
1451        mla             v3.8h,   v21.8h,  v2.h[3] // p5(left[0]) * filter(5)
1452        mla             v3.8h,   v22.8h,  v2.h[7] // p6(left[1]) * filter(6)
1453        srshr           v3.8h,   v3.8h,   #4
1454        smax            v3.8h,   v3.8h,   v30.8h
1455.else
1456        smull           v2.4s,   v17.4h,  v0.h[0] // p1(top[0]) * filter(1)
1457        smlal           v2.4s,   v18.4h,  v0.h[1] // p2(top[1]) * filter(2)
1458        smlal           v2.4s,   v19.4h,  v0.h[2] // p3(top[2]) * filter(3)
1459        smlal           v2.4s,   v20.4h,  v0.h[3] // p4(top[3]) * filter(4)
1460        smlal           v2.4s,   v16.4h,  v1.h[2] // p0(topleft) * filter(0)
1461        smlal           v2.4s,   v21.4h,  v1.h[1] // p5(left[0]) * filter(5)
1462        smlal           v2.4s,   v22.4h,  v1.h[0] // p6(left[1]) * filter(6)
1463        smull2          v3.4s,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
1464        smlal2          v3.4s,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
1465        smlal2          v3.4s,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
1466        smlal2          v3.4s,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
1467        smlal2          v3.4s,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
1468        smlal2          v3.4s,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
1469        smlal2          v3.4s,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
1470        smull           v4.4s,   v17.4h,  v0.h[4] // p1(top[0]) * filter(1)
1471        smlal           v4.4s,   v18.4h,  v0.h[5] // p2(top[1]) * filter(2)
1472        smlal           v4.4s,   v19.4h,  v0.h[6] // p3(top[2]) * filter(3)
1473        sqrshrun        v2.4h,   v2.4s,   #4
1474        sqrshrun2       v2.8h,   v3.4s,   #4
1475        smin            v2.8h,   v2.8h,   v31.8h
1476        smlal           v4.4s,   v20.4h,  v0.h[7] // p4(top[3]) * filter(4)
1477        smlal           v4.4s,   v16.4h,  v0.h[3] // p0(topleft) * filter(0)
1478        smlal           v4.4s,   v21.4h,  v2.h[3] // p5(left[0]) * filter(5)
1479        smlal           v4.4s,   v22.4h,  v2.h[7] // p6(left[1]) * filter(6)
1480        smull2          v5.4s,   v17.8h,  v0.h[4] // p1(top[0]) * filter(1)
1481        smlal2          v5.4s,   v18.8h,  v0.h[5] // p2(top[1]) * filter(2)
1482        smlal2          v5.4s,   v19.8h,  v0.h[6] // p3(top[2]) * filter(3)
1483        smlal2          v5.4s,   v20.8h,  v0.h[7] // p4(top[3]) * filter(4)
1484        smlal2          v5.4s,   v16.8h,  v0.h[3] // p0(topleft) * filter(0)
1485        smlal2          v5.4s,   v21.8h,  v2.h[3] // p5(left[0]) * filter(5)
1486        smlal2          v5.4s,   v22.8h,  v2.h[7] // p6(left[1]) * filter(6)
1487        sqrshrun        v3.4h,   v4.4s,   #4
1488        sqrshrun2       v3.8h,   v5.4s,   #4
1489.endif
1490        smin            v3.8h,   v3.8h,   v31.8h
1491        subs            w4,  w4,  #2
1492        st2             {v2.d, v3.d}[0], [x0], x1
1493        zip2            v0.2d,   v2.2d,   v3.2d
1494        st2             {v2.d, v3.d}[1], [x6], x1
1495        b.gt            8b
1496        ret
1497160:
1498320:
1499        add             x8,  x2,  #2
1500        sub             x2,  x2,  #4
1501        mov             x7,  #-4
1502        sub             x1,  x1,  w3, uxtw #1
1503        mov             w9,  w3
1504
15051:
1506        ld1             {v0.4h}, [x2], x7         // left (0-1) + topleft (2)
15072:
1508        ld1             {v1.8h, v2.8h}, [x8], #32 // top(0-15)
1509.if \bpc == 10
1510        mul             v3.8h,   v16.8h,  v0.h[2] // p0(topleft) * filter(0)
1511        mla             v3.8h,   v21.8h,  v0.h[1] // p5(left[0]) * filter(5)
1512        mla             v3.8h,   v22.8h,  v0.h[0] // p6(left[1]) * filter(6)
1513        mla             v3.8h,   v17.8h,  v1.h[0] // p1(top[0]) * filter(1)
1514        mla             v3.8h,   v18.8h,  v1.h[1] // p2(top[1]) * filter(2)
1515        mla             v3.8h,   v19.8h,  v1.h[2] // p3(top[2]) * filter(3)
1516        mla             v3.8h,   v20.8h,  v1.h[3] // p4(top[3]) * filter(4)
1517
1518        mul             v4.8h,   v17.8h,  v1.h[4] // p1(top[0]) * filter(1)
1519        mla             v4.8h,   v18.8h,  v1.h[5] // p2(top[1]) * filter(2)
1520        mla             v4.8h,   v19.8h,  v1.h[6] // p3(top[2]) * filter(3)
1521        srshr           v3.8h,   v3.8h,   #4
1522        smax            v3.8h,   v3.8h,   v30.8h
1523        smin            v3.8h,   v3.8h,   v31.8h
1524        mla             v4.8h,   v20.8h,  v1.h[7] // p4(top[3]) * filter(4)
1525        mla             v4.8h,   v16.8h,  v1.h[3] // p0(topleft) * filter(0)
1526        mla             v4.8h,   v21.8h,  v3.h[3] // p5(left[0]) * filter(5)
1527        mla             v4.8h,   v22.8h,  v3.h[7] // p6(left[1]) * filter(6)
1528
1529        mul             v5.8h,   v17.8h,  v2.h[0] // p1(top[0]) * filter(1)
1530        mla             v5.8h,   v18.8h,  v2.h[1] // p2(top[1]) * filter(2)
1531        mla             v5.8h,   v19.8h,  v2.h[2] // p3(top[2]) * filter(3)
1532        srshr           v4.8h,   v4.8h,   #4
1533        smax            v4.8h,   v4.8h,   v30.8h
1534        smin            v4.8h,   v4.8h,   v31.8h
1535        mla             v5.8h,   v20.8h,  v2.h[3] // p4(top[3]) * filter(4)
1536        mla             v5.8h,   v16.8h,  v1.h[7] // p0(topleft) * filter(0)
1537        mla             v5.8h,   v21.8h,  v4.h[3] // p5(left[0]) * filter(5)
1538        mla             v5.8h,   v22.8h,  v4.h[7] // p6(left[1]) * filter(6)
1539
1540        mul             v6.8h,   v17.8h,  v2.h[4] // p1(top[0]) * filter(1)
1541        mla             v6.8h,   v18.8h,  v2.h[5] // p2(top[1]) * filter(2)
1542        mla             v6.8h,   v19.8h,  v2.h[6] // p3(top[2]) * filter(3)
1543        srshr           v5.8h,   v5.8h,   #4
1544        smax            v5.8h,   v5.8h,   v30.8h
1545        smin            v5.8h,   v5.8h,   v31.8h
1546        mla             v6.8h,   v20.8h,  v2.h[7] // p4(top[3]) * filter(4)
1547        mla             v6.8h,   v16.8h,  v2.h[3] // p0(topleft) * filter(0)
1548        mla             v6.8h,   v21.8h,  v5.h[3] // p5(left[0]) * filter(5)
1549        mla             v6.8h,   v22.8h,  v5.h[7] // p6(left[1]) * filter(6)
1550
1551        subs            w3,  w3,  #16
1552        srshr           v6.8h,   v6.8h,   #4
1553        smax            v6.8h,   v6.8h,   v30.8h
1554.else
1555        smull           v3.4s,   v16.4h,  v0.h[2] // p0(topleft) * filter(0)
1556        smlal           v3.4s,   v21.4h,  v0.h[1] // p5(left[0]) * filter(5)
1557        smlal           v3.4s,   v22.4h,  v0.h[0] // p6(left[1]) * filter(6)
1558        smlal           v3.4s,   v17.4h,  v1.h[0] // p1(top[0]) * filter(1)
1559        smlal           v3.4s,   v18.4h,  v1.h[1] // p2(top[1]) * filter(2)
1560        smlal           v3.4s,   v19.4h,  v1.h[2] // p3(top[2]) * filter(3)
1561        smlal           v3.4s,   v20.4h,  v1.h[3] // p4(top[3]) * filter(4)
1562        smull2          v4.4s,   v16.8h,  v0.h[2] // p0(topleft) * filter(0)
1563        smlal2          v4.4s,   v21.8h,  v0.h[1] // p5(left[0]) * filter(5)
1564        smlal2          v4.4s,   v22.8h,  v0.h[0] // p6(left[1]) * filter(6)
1565        smlal2          v4.4s,   v17.8h,  v1.h[0] // p1(top[0]) * filter(1)
1566        smlal2          v4.4s,   v18.8h,  v1.h[1] // p2(top[1]) * filter(2)
1567        smlal2          v4.4s,   v19.8h,  v1.h[2] // p3(top[2]) * filter(3)
1568        smlal2          v4.4s,   v20.8h,  v1.h[3] // p4(top[3]) * filter(4)
1569
1570        smull           v5.4s,   v17.4h,  v1.h[4] // p1(top[0]) * filter(1)
1571        smlal           v5.4s,   v18.4h,  v1.h[5] // p2(top[1]) * filter(2)
1572        smlal           v5.4s,   v19.4h,  v1.h[6] // p3(top[2]) * filter(3)
1573        sqrshrun        v3.4h,   v3.4s,   #4
1574        sqrshrun2       v3.8h,   v4.4s,   #4
1575        smin            v3.8h,   v3.8h,   v31.8h
1576        smlal           v5.4s,   v20.4h,  v1.h[7] // p4(top[3]) * filter(4)
1577        smlal           v5.4s,   v16.4h,  v1.h[3] // p0(topleft) * filter(0)
1578        smlal           v5.4s,   v21.4h,  v3.h[3] // p5(left[0]) * filter(5)
1579        smlal           v5.4s,   v22.4h,  v3.h[7] // p6(left[1]) * filter(6)
1580        smull2          v6.4s,   v17.8h,  v1.h[4] // p1(top[0]) * filter(1)
1581        smlal2          v6.4s,   v18.8h,  v1.h[5] // p2(top[1]) * filter(2)
1582        smlal2          v6.4s,   v19.8h,  v1.h[6] // p3(top[2]) * filter(3)
1583        smlal2          v6.4s,   v20.8h,  v1.h[7] // p4(top[3]) * filter(4)
1584        smlal2          v6.4s,   v16.8h,  v1.h[3] // p0(topleft) * filter(0)
1585        smlal2          v6.4s,   v21.8h,  v3.h[3] // p5(left[0]) * filter(5)
1586        smlal2          v6.4s,   v22.8h,  v3.h[7] // p6(left[1]) * filter(6)
1587
1588        smull           v24.4s,  v17.4h,  v2.h[0] // p1(top[0]) * filter(1)
1589        smlal           v24.4s,  v18.4h,  v2.h[1] // p2(top[1]) * filter(2)
1590        smlal           v24.4s,  v19.4h,  v2.h[2] // p3(top[2]) * filter(3)
1591        sqrshrun        v4.4h,   v5.4s,   #4
1592        sqrshrun2       v4.8h,   v6.4s,   #4
1593        smin            v4.8h,   v4.8h,   v31.8h
1594        smlal           v24.4s,  v20.4h,  v2.h[3] // p4(top[3]) * filter(4)
1595        smlal           v24.4s,  v16.4h,  v1.h[7] // p0(topleft) * filter(0)
1596        smlal           v24.4s,  v21.4h,  v4.h[3] // p5(left[0]) * filter(5)
1597        smlal           v24.4s,  v22.4h,  v4.h[7] // p6(left[1]) * filter(6)
1598        smull2          v25.4s,  v17.8h,  v2.h[0] // p1(top[0]) * filter(1)
1599        smlal2          v25.4s,  v18.8h,  v2.h[1] // p2(top[1]) * filter(2)
1600        smlal2          v25.4s,  v19.8h,  v2.h[2] // p3(top[2]) * filter(3)
1601        smlal2          v25.4s,  v20.8h,  v2.h[3] // p4(top[3]) * filter(4)
1602        smlal2          v25.4s,  v16.8h,  v1.h[7] // p0(topleft) * filter(0)
1603        smlal2          v25.4s,  v21.8h,  v4.h[3] // p5(left[0]) * filter(5)
1604        smlal2          v25.4s,  v22.8h,  v4.h[7] // p6(left[1]) * filter(6)
1605
1606        smull           v26.4s,  v17.4h,  v2.h[4] // p1(top[0]) * filter(1)
1607        smlal           v26.4s,  v18.4h,  v2.h[5] // p2(top[1]) * filter(2)
1608        smlal           v26.4s,  v19.4h,  v2.h[6] // p3(top[2]) * filter(3)
1609        sqrshrun        v5.4h,   v24.4s,  #4
1610        sqrshrun2       v5.8h,   v25.4s,  #4
1611        smin            v5.8h,   v5.8h,   v31.8h
1612        smlal           v26.4s,  v20.4h,  v2.h[7] // p4(top[3]) * filter(4)
1613        smlal           v26.4s,  v16.4h,  v2.h[3] // p0(topleft) * filter(0)
1614        smlal           v26.4s,  v21.4h,  v5.h[3] // p5(left[0]) * filter(5)
1615        smlal           v26.4s,  v22.4h,  v5.h[7] // p6(left[1]) * filter(6)
1616        smull2          v27.4s,  v17.8h,  v2.h[4] // p1(top[0]) * filter(1)
1617        smlal2          v27.4s,  v18.8h,  v2.h[5] // p2(top[1]) * filter(2)
1618        smlal2          v27.4s,  v19.8h,  v2.h[6] // p3(top[2]) * filter(3)
1619        smlal2          v27.4s,  v20.8h,  v2.h[7] // p4(top[3]) * filter(4)
1620        smlal2          v27.4s,  v16.8h,  v2.h[3] // p0(topleft) * filter(0)
1621        smlal2          v27.4s,  v21.8h,  v5.h[3] // p5(left[0]) * filter(5)
1622        smlal2          v27.4s,  v22.8h,  v5.h[7] // p6(left[1]) * filter(6)
1623
1624        subs            w3,  w3,  #16
1625        sqrshrun        v6.4h,   v26.4s,  #4
1626        sqrshrun2       v6.8h,   v27.4s,  #4
1627.endif
1628        smin            v6.8h,   v6.8h,   v31.8h
1629
1630        ins             v0.h[2], v2.h[7]
1631        st4             {v3.d, v4.d, v5.d, v6.d}[0], [x0], #32
1632        ins             v0.h[0], v6.h[7]
1633        st4             {v3.d, v4.d, v5.d, v6.d}[1], [x6], #32
1634        ins             v0.h[1], v6.h[3]
1635        b.gt            2b
1636        subs            w4,  w4,  #2
1637        b.le            9f
1638        sub             x8,  x6,  w9, uxtw #1
1639        add             x0,  x0,  x1
1640        add             x6,  x6,  x1
1641        mov             w3,  w9
1642        b               1b
16439:
1644        ret
1645
1646L(ipred_filter\bpc\()_tbl):
1647        .hword L(ipred_filter\bpc\()_tbl) - 320b
1648        .hword L(ipred_filter\bpc\()_tbl) - 160b
1649        .hword L(ipred_filter\bpc\()_tbl) -  80b
1650        .hword L(ipred_filter\bpc\()_tbl) -  40b
1651endfunc
1652.endm
1653
// Instantiate the filter_fn macro once for each value of its \bpc argument.
// Inside the macro, \bpc == 10 selects the 16-bit mul/mla accumulation, while
// any other value selects the widening smull/smlal accumulation.
// NOTE(review): these expansions presumably define ipred_filter_10bpc_neon
// and ipred_filter_12bpc_neon, the two branch targets of
// ipred_filter_16bpc_neon below - confirm against the macro header.
filter_fn 10
filter_fn 12
1656
// ipred_filter_16bpc_neon: bit-depth dispatcher for the filter predictor.
//
// Loads the first stack-passed argument into w8 and tail-branches to the
// 10 bpc implementation when that value is <= 0x3ff (signed compare), or to
// the 12 bpc implementation otherwise. Only w8 and the flags are modified,
// so x0-x7 reach the selected implementation unchanged.
// NOTE(review): [sp] is presumably bitdepth_max, as in the other 16bpc entry
// points of this file that take nine arguments - confirm against the C
// prototype.
function ipred_filter_16bpc_neon, export=1
        ldr             w8,  [sp]
        cmp             w8,  0x3ff
        b.le            ipred_filter_10bpc_neon
        b               ipred_filter_12bpc_neon
endfunc
1663
// void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                          const uint16_t *const pal, const uint8_t *idx,
//                          const int w, const int h);
//
// Palette prediction: every byte read from idx selects one of the eight
// 16-bit palette entries loaded from pal, and the selected entry is written
// to dst.
//
// Register use:
//   x0  = dst (current even row), x1 = stride in bytes (doubled in all
//         paths except w == 64)
//   x2  = pal on entry; reused afterwards as the second output pointer
//   x3  = idx, consumed linearly, one byte per pixel
//   w4  = w, only used to pick the code path
//   w5  = h, remaining rows
//   v30 = the eight palette entries, used as a 16-byte tbl table
//   v31 = 0x0100 in every halfword
//
// Index conversion: an index byte a is doubled (2*a) and duplicated into both
// bytes of a halfword with zip1/zip2; adding 0x0100 then bumps the high byte,
// giving the byte pair (2*a, 2*a+1) that tbl uses to fetch both bytes of
// palette entry a.
//
// Rows handled per loop iteration: 4 for w = 4/8/16, 2 for w = 32, 1 for
// w = 64. NOTE(review): h is assumed to be a multiple of that count and the
// index values are assumed to be 0-7 (only eight entries are loaded) -
// confirm against the caller.
function pal_pred_16bpc_neon, export=1
        ld1             {v30.8h}, [x2]
        clz             w9,  w4
        adr             x6,  L(pal_pred_tbl)
        sub             w9,  w9,  #25             // table slot: w=64 -> 0, ..., w=4 -> 4
        ldrh            w9,  [x6, w9, uxtw #1]
        movi            v31.8h,  #1, lsl #8
        sub             x6,  x6,  w9, uxtw        // table entries hold (table - target)
        br              x6
40:
        add             x2,  x0,  x1              // x2 = second row
        lsl             x1,  x1,  #1              // step two rows at a time
4:
        // 16 indices = 4 rows of 4 pixels
        ld1             {v1.16b}, [x3], #16
        subs            w5,  w5,  #4
        // Restructure v1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ...
        add             v1.16b,  v1.16b,  v1.16b
        zip1            v0.16b,  v1.16b,  v1.16b
        zip2            v1.16b,  v1.16b,  v1.16b
        add             v0.8h,   v0.8h,   v31.8h
        add             v1.8h,   v1.8h,   v31.8h
        tbl             v0.16b, {v30.16b}, v0.16b
        st1             {v0.d}[0], [x0], x1       // row 0
        tbl             v1.16b, {v30.16b}, v1.16b
        st1             {v0.d}[1], [x2], x1       // row 1
        st1             {v1.d}[0], [x0], x1       // row 2
        st1             {v1.d}[1], [x2], x1       // row 3
        b.gt            4b
        ret
80:
        add             x2,  x0,  x1              // x2 = second row
        lsl             x1,  x1,  #1              // step two rows at a time
8:
        // 32 indices = 4 rows of 8 pixels
        ld1             {v2.16b, v3.16b}, [x3], #32
        subs            w5,  w5,  #4
        add             v2.16b,  v2.16b,  v2.16b
        add             v3.16b,  v3.16b,  v3.16b
        zip1            v0.16b,  v2.16b,  v2.16b
        zip2            v1.16b,  v2.16b,  v2.16b
        zip1            v2.16b,  v3.16b,  v3.16b
        zip2            v3.16b,  v3.16b,  v3.16b
        add             v0.8h,   v0.8h,   v31.8h
        add             v1.8h,   v1.8h,   v31.8h
        add             v2.8h,   v2.8h,   v31.8h
        add             v3.8h,   v3.8h,   v31.8h
        tbl             v0.16b, {v30.16b}, v0.16b
        tbl             v1.16b, {v30.16b}, v1.16b
        st1             {v0.8h}, [x0], x1         // row 0
        tbl             v2.16b, {v30.16b}, v2.16b
        st1             {v1.8h}, [x2], x1         // row 1
        tbl             v3.16b, {v30.16b}, v3.16b
        st1             {v2.8h}, [x0], x1         // row 2
        st1             {v3.8h}, [x2], x1         // row 3
        b.gt            8b
        ret
160:
        add             x2,  x0,  x1              // x2 = second row
        lsl             x1,  x1,  #1              // step two rows at a time
16:
        // 64 indices = 4 rows of 16 pixels
        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64
        subs            w5,  w5,  #4
        add             v4.16b,  v4.16b,  v4.16b
        add             v5.16b,  v5.16b,  v5.16b
        add             v6.16b,  v6.16b,  v6.16b
        add             v7.16b,  v7.16b,  v7.16b
        zip1            v0.16b,  v4.16b,  v4.16b
        zip2            v1.16b,  v4.16b,  v4.16b
        zip1            v2.16b,  v5.16b,  v5.16b
        zip2            v3.16b,  v5.16b,  v5.16b
        zip1            v4.16b,  v6.16b,  v6.16b
        zip2            v5.16b,  v6.16b,  v6.16b
        zip1            v6.16b,  v7.16b,  v7.16b
        zip2            v7.16b,  v7.16b,  v7.16b
        add             v0.8h,   v0.8h,   v31.8h
        add             v1.8h,   v1.8h,   v31.8h
        add             v2.8h,   v2.8h,   v31.8h
        add             v3.8h,   v3.8h,   v31.8h
        add             v4.8h,   v4.8h,   v31.8h
        tbl             v0.16b, {v30.16b}, v0.16b
        add             v5.8h,   v5.8h,   v31.8h
        tbl             v1.16b, {v30.16b}, v1.16b
        add             v6.8h,   v6.8h,   v31.8h
        tbl             v2.16b, {v30.16b}, v2.16b
        add             v7.8h,   v7.8h,   v31.8h
        tbl             v3.16b, {v30.16b}, v3.16b
        tbl             v4.16b, {v30.16b}, v4.16b
        tbl             v5.16b, {v30.16b}, v5.16b
        st1             {v0.8h, v1.8h}, [x0], x1  // row 0
        tbl             v6.16b, {v30.16b}, v6.16b
        st1             {v2.8h, v3.8h}, [x2], x1  // row 1
        tbl             v7.16b, {v30.16b}, v7.16b
        st1             {v4.8h, v5.8h}, [x0], x1  // row 2
        st1             {v6.8h, v7.8h}, [x2], x1  // row 3
        b.gt            16b
        ret
320:
        add             x2,  x0,  x1              // x2 = second row
        lsl             x1,  x1,  #1              // step two rows at a time
32:
        // 64 indices = 2 rows of 32 pixels
        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64
        subs            w5,  w5,  #2
        add             v4.16b,  v4.16b,  v4.16b
        add             v5.16b,  v5.16b,  v5.16b
        add             v6.16b,  v6.16b,  v6.16b
        add             v7.16b,  v7.16b,  v7.16b
        zip1            v0.16b,  v4.16b,  v4.16b
        zip2            v1.16b,  v4.16b,  v4.16b
        zip1            v2.16b,  v5.16b,  v5.16b
        zip2            v3.16b,  v5.16b,  v5.16b
        zip1            v4.16b,  v6.16b,  v6.16b
        zip2            v5.16b,  v6.16b,  v6.16b
        zip1            v6.16b,  v7.16b,  v7.16b
        zip2            v7.16b,  v7.16b,  v7.16b
        add             v0.8h,   v0.8h,   v31.8h
        add             v1.8h,   v1.8h,   v31.8h
        add             v2.8h,   v2.8h,   v31.8h
        add             v3.8h,   v3.8h,   v31.8h
        add             v4.8h,   v4.8h,   v31.8h
        tbl             v0.16b, {v30.16b}, v0.16b
        add             v5.8h,   v5.8h,   v31.8h
        tbl             v1.16b, {v30.16b}, v1.16b
        add             v6.8h,   v6.8h,   v31.8h
        tbl             v2.16b, {v30.16b}, v2.16b
        add             v7.8h,   v7.8h,   v31.8h
        tbl             v3.16b, {v30.16b}, v3.16b
        tbl             v4.16b, {v30.16b}, v4.16b
        tbl             v5.16b, {v30.16b}, v5.16b
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 // row 0
        tbl             v6.16b, {v30.16b}, v6.16b
        tbl             v7.16b, {v30.16b}, v7.16b
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1 // row 1
        b.gt            32b
        ret
640:
        add             x2,  x0,  #64             // x2 = right half of the same row
64:
        // 64 indices = 1 row of 64 pixels; x1 stays a single stride here
        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64
        subs            w5,  w5,  #1
        add             v4.16b,  v4.16b,  v4.16b
        add             v5.16b,  v5.16b,  v5.16b
        add             v6.16b,  v6.16b,  v6.16b
        add             v7.16b,  v7.16b,  v7.16b
        zip1            v0.16b,  v4.16b,  v4.16b
        zip2            v1.16b,  v4.16b,  v4.16b
        zip1            v2.16b,  v5.16b,  v5.16b
        zip2            v3.16b,  v5.16b,  v5.16b
        zip1            v4.16b,  v6.16b,  v6.16b
        zip2            v5.16b,  v6.16b,  v6.16b
        zip1            v6.16b,  v7.16b,  v7.16b
        zip2            v7.16b,  v7.16b,  v7.16b
        add             v0.8h,   v0.8h,   v31.8h
        add             v1.8h,   v1.8h,   v31.8h
        add             v2.8h,   v2.8h,   v31.8h
        add             v3.8h,   v3.8h,   v31.8h
        add             v4.8h,   v4.8h,   v31.8h
        tbl             v0.16b, {v30.16b}, v0.16b
        add             v5.8h,   v5.8h,   v31.8h
        tbl             v1.16b, {v30.16b}, v1.16b
        add             v6.8h,   v6.8h,   v31.8h
        tbl             v2.16b, {v30.16b}, v2.16b
        add             v7.8h,   v7.8h,   v31.8h
        tbl             v3.16b, {v30.16b}, v3.16b
        tbl             v4.16b, {v30.16b}, v4.16b
        tbl             v5.16b, {v30.16b}, v5.16b
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 // pixels 0-31
        tbl             v6.16b, {v30.16b}, v6.16b
        tbl             v7.16b, {v30.16b}, v7.16b
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1 // pixels 32-63
        b.gt            64b
        ret

// Jump table indexed by clz(w) - 25; each entry is the distance from the
// table back to the code path for that width.
L(pal_pred_tbl):
        .hword L(pal_pred_tbl) - 640b
        .hword L(pal_pred_tbl) - 320b
        .hword L(pal_pred_tbl) - 160b
        .hword L(pal_pred_tbl) -  80b
        .hword L(pal_pred_tbl) -  40b
endfunc
1845
// void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const topleft,
//                               const int width, const int height,
//                               const int16_t *ac, const int alpha,
//                               const int bitdepth_max);
//
// Chroma-from-luma prediction with a fixed DC value of
// (bitdepth_max + 1) >> 1 (computed by the rounding shift below). For every
// pixel:
//   diff = ac * alpha
//   dst  = clamp(dc + ((diff + (diff >> 31) + 32) >> 6), 0, bitdepth_max)
//
// Register use:
//   x0 = dst (even rows), x6 = dst + stride (odd rows), x1 = 2 * stride
//   w3 = width, w4 = height (remaining rows), x5 = ac
//   v0 = dc, v1 = alpha, v30 = 0, v31 = bitdepth_max
// topleft (x2) is not read by this function.
//
// The L(ipred_cfl_splat_w*) labels and L(ipred_cfl_splat_tbl) are shared
// entry points: other functions in this file (e.g. ipred_cfl_top_16bpc_neon)
// compute their own dc into v0 and branch here with the same register setup.
function ipred_cfl_128_16bpc_neon, export=1
        dup             v31.8h,  w7   // bitdepth_max
        clz             w9,  w3
        adr             x7,  L(ipred_cfl_128_tbl)
        sub             w9,  w9,  #26 // table slot: width=32 -> 0, ..., width=4 -> 3
        ldrh            w9,  [x7, w9, uxtw #1]
        urshr           v0.8h,   v31.8h,  #1     // dc = (bitdepth_max + 1) >> 1
        dup             v1.8h,   w6   // alpha
        sub             x7,  x7,  w9, uxtw
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        movi            v30.8h,  #0
        br              x7
L(ipred_cfl_splat_w4):
        // 16 ac coefficients = 4 rows of 4 pixels per iteration
        ld1             {v4.8h, v5.8h}, [x5], #32
        subs            w4,  w4,  #4
        smull           v2.4s,   v4.4h,   v1.4h  // diff = ac * alpha
        smull2          v3.4s,   v4.8h,   v1.8h
        smull           v4.4s,   v5.4h,   v1.4h
        smull2          v5.4s,   v5.8h,   v1.8h
        sshr            v16.4s,  v2.4s,   #31    // sign = diff >> 31
        sshr            v17.4s,  v3.4s,   #31
        sshr            v18.4s,  v4.4s,   #31
        sshr            v19.4s,  v5.4s,   #31
        add             v2.4s,   v2.4s,   v16.4s // diff + sign
        add             v3.4s,   v3.4s,   v17.4s
        add             v4.4s,   v4.4s,   v18.4s
        add             v5.4s,   v5.4s,   v19.4s
        rshrn           v2.4h,   v2.4s,   #6     // (diff + sign + 32) >> 6 = apply_sign()
        rshrn2          v2.8h,   v3.4s,   #6
        rshrn           v3.4h,   v4.4s,   #6
        rshrn2          v3.8h,   v5.4s,   #6
        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
        add             v3.8h,   v3.8h,   v0.8h
        smax            v2.8h,   v2.8h,   v30.8h
        smax            v3.8h,   v3.8h,   v30.8h
        smin            v2.8h,   v2.8h,   v31.8h
        smin            v3.8h,   v3.8h,   v31.8h
        st1             {v2.d}[0],  [x0], x1
        st1             {v2.d}[1],  [x6], x1
        st1             {v3.d}[0],  [x0], x1
        st1             {v3.d}[1],  [x6], x1
        b.gt            L(ipred_cfl_splat_w4)
        ret
L(ipred_cfl_splat_w8):
        // 16 ac coefficients = 2 rows of 8 pixels per iteration
        ld1             {v4.8h, v5.8h}, [x5], #32
        subs            w4,  w4,  #2
        smull           v2.4s,   v4.4h,   v1.4h  // diff = ac * alpha
        smull2          v3.4s,   v4.8h,   v1.8h
        smull           v4.4s,   v5.4h,   v1.4h
        smull2          v5.4s,   v5.8h,   v1.8h
        sshr            v16.4s,  v2.4s,   #31    // sign = diff >> 31
        sshr            v17.4s,  v3.4s,   #31
        sshr            v18.4s,  v4.4s,   #31
        sshr            v19.4s,  v5.4s,   #31
        add             v2.4s,   v2.4s,   v16.4s // diff + sign
        add             v3.4s,   v3.4s,   v17.4s
        add             v4.4s,   v4.4s,   v18.4s
        add             v5.4s,   v5.4s,   v19.4s
        rshrn           v2.4h,   v2.4s,   #6     // (diff + sign + 32) >> 6 = apply_sign()
        rshrn2          v2.8h,   v3.4s,   #6
        rshrn           v3.4h,   v4.4s,   #6
        rshrn2          v3.8h,   v5.4s,   #6
        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
        add             v3.8h,   v3.8h,   v0.8h
        smax            v2.8h,   v2.8h,   v30.8h
        smax            v3.8h,   v3.8h,   v30.8h
        smin            v2.8h,   v2.8h,   v31.8h
        smin            v3.8h,   v3.8h,   v31.8h
        st1             {v2.8h},  [x0], x1
        st1             {v3.8h},  [x6], x1
        b.gt            L(ipred_cfl_splat_w8)
        ret
L(ipred_cfl_splat_w16):
        // Used for width 16 and 32: two rows are processed in parallel,
        // 16 pixels of each row per inner iteration.
        add             x7,  x5,  w3, uxtw #1    // x7 = ac for the second row
        sub             x1,  x1,  w3, uxtw #1    // compensate for the post-incremented stores
        mov             w9,  w3                  // keep width for the next row pair
1:
        ld1             {v2.8h, v3.8h}, [x5], #32
        ld1             {v4.8h, v5.8h}, [x7], #32
        subs            w3,  w3,  #16
        smull           v16.4s,  v2.4h,   v1.4h  // diff = ac * alpha
        smull2          v17.4s,  v2.8h,   v1.8h
        smull           v18.4s,  v3.4h,   v1.4h
        smull2          v19.4s,  v3.8h,   v1.8h
        smull           v2.4s,   v4.4h,   v1.4h
        smull2          v3.4s,   v4.8h,   v1.8h
        smull           v4.4s,   v5.4h,   v1.4h
        smull2          v5.4s,   v5.8h,   v1.8h
        sshr            v20.4s,  v16.4s,  #31    // sign = diff >> 31
        sshr            v21.4s,  v17.4s,  #31
        sshr            v22.4s,  v18.4s,  #31
        sshr            v23.4s,  v19.4s,  #31
        sshr            v24.4s,  v2.4s,   #31
        sshr            v25.4s,  v3.4s,   #31
        sshr            v26.4s,  v4.4s,   #31
        sshr            v27.4s,  v5.4s,   #31
        add             v16.4s,  v16.4s,  v20.4s // diff + sign
        add             v17.4s,  v17.4s,  v21.4s
        add             v18.4s,  v18.4s,  v22.4s
        add             v19.4s,  v19.4s,  v23.4s
        add             v2.4s,   v2.4s,   v24.4s
        add             v3.4s,   v3.4s,   v25.4s
        add             v4.4s,   v4.4s,   v26.4s
        add             v5.4s,   v5.4s,   v27.4s
        rshrn           v16.4h,  v16.4s,  #6     // (diff + sign + 32) >> 6 = apply_sign()
        rshrn2          v16.8h,  v17.4s,  #6
        rshrn           v17.4h,  v18.4s,  #6
        rshrn2          v17.8h,  v19.4s,  #6
        rshrn           v6.4h,   v2.4s,   #6
        rshrn2          v6.8h,   v3.4s,   #6
        rshrn           v7.4h,   v4.4s,   #6
        rshrn2          v7.8h,   v5.4s,   #6
        add             v2.8h,   v16.8h,  v0.8h  // dc + apply_sign()
        add             v3.8h,   v17.8h,  v0.8h
        add             v4.8h,   v6.8h,   v0.8h
        add             v5.8h,   v7.8h,   v0.8h
        smax            v2.8h,   v2.8h,   v30.8h
        smax            v3.8h,   v3.8h,   v30.8h
        smax            v4.8h,   v4.8h,   v30.8h
        smax            v5.8h,   v5.8h,   v30.8h
        smin            v2.8h,   v2.8h,   v31.8h
        smin            v3.8h,   v3.8h,   v31.8h
        smin            v4.8h,   v4.8h,   v31.8h
        smin            v5.8h,   v5.8h,   v31.8h
        st1             {v2.8h, v3.8h},  [x0], #32
        st1             {v4.8h, v5.8h},  [x6], #32
        b.gt            1b
        subs            w4,  w4,  #2
        // The add/mov instructions below leave the flags from the subs intact.
        add             x5,  x5,  w9, uxtw #1    // skip the row x7 just consumed
        add             x7,  x7,  w9, uxtw #1    // skip the row x5 will consume next
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        mov             w3,  w9
        b.gt            1b
        ret

// Jump table indexed by clz(width) - 26; widths 32 and 16 share the w16 path.
L(ipred_cfl_128_tbl):
L(ipred_cfl_splat_tbl):
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8)
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4)
endfunc
1995
// void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const topleft,
//                               const int width, const int height,
//                               const int16_t *ac, const int alpha,
//                               const int bitdepth_max);
//
// Chroma-from-luma prediction whose DC value is the rounded average of the
// `width` pixels that follow topleft in memory (x2 is advanced by 2 bytes
// before loading). After computing dc into every lane of v0, each path
// branches to the matching L(ipred_cfl_splat_w*) entry point, which expects:
//   v0 = dc, v1 = alpha, v30 = 0, v31 = bitdepth_max,
//   x6 = dst + stride, x1 = 2 * stride, x5 = ac, w3 = width, w4 = height
function ipred_cfl_top_16bpc_neon, export=1
        dup             v31.8h,  w7   // bitdepth_max
        clz             w9,  w3
        adr             x7,  L(ipred_cfl_top_tbl)
        sub             w9,  w9,  #26 // table slot: width=32 -> 0, ..., width=4 -> 3
        ldrh            w9,  [x7, w9, uxtw #1]
        dup             v1.8h,   w6   // alpha
        add             x2,  x2,  #2  // step past the topleft pixel
        sub             x7,  x7,  w9, uxtw
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        movi            v30.8h,  #0
        br              x7
4:
        ld1             {v0.4h},  [x2]
        addv            h0,      v0.4h
        urshr           v0.4h,   v0.4h,   #2     // dc = (sum + 2) >> 2
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w4)
8:
        ld1             {v0.8h},  [x2]
        addv            h0,      v0.8h
        urshr           v0.4h,   v0.4h,   #3     // dc = (sum + 4) >> 3
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w8)
16:
        ld1             {v2.8h, v3.8h}, [x2]
        addp            v0.8h,   v2.8h,   v3.8h
        // The sum is kept in 16 bits; this holds for pixel values up to
        // 4095 (16 * 4095 = 65520).
        addv            h0,      v0.8h
        urshr           v0.4h,   v0.4h,   #4     // dc = (sum + 8) >> 4
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w16)
32:
        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v0.8h,   v2.8h,   v4.8h  // eight 16-bit sums of 4 pixels each
        uaddlv          s0,      v0.8h           // final reduction widened to 32 bits
        rshrn           v0.4h,   v0.4s,   #5     // dc = (sum + 16) >> 5
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w16)

// Jump table indexed by clz(width) - 26.
L(ipred_cfl_top_tbl):
        .hword L(ipred_cfl_top_tbl) - 32b
        .hword L(ipred_cfl_top_tbl) - 16b
        .hword L(ipred_cfl_top_tbl) -  8b
        .hword L(ipred_cfl_top_tbl) -  4b
endfunc
2049
// void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                                const pixel *const topleft,
//                                const int width, const int height,
//                                const int16_t *ac, const int alpha,
//                                const int bitdepth_max);
//
// CfL prediction whose DC term is the rounded average of the `height`
// pixels stored immediately before `topleft`. Two dispatches are resolved
// up front: x7 selects the summing code by height, x9 selects the shared
// L(ipred_cfl_splat_w*) entry by width (via L(ipred_cfl_splat_tbl), defined
// earlier in this file). Register state left for the splat code:
//   v0  = dc in every lane        v1  = alpha in every lane
//   v30 = 0                       v31 = bitdepth_max in every lane
//   x6  = dst + stride            x1  = 2 * stride
function ipred_cfl_left_16bpc_neon, export=1
        // Capture the scalar arguments first: w7 and w6 are reused as
        // scratch/pointer registers below.
        dup             v31.8h,  w7              // bitdepth_max
        dup             v1.8h,   w6              // alpha
        movi            v30.8h,  #0

        // Both tables are indexed with clz(size) - 26 (0..3 for 32..4).
        clz             w9,  w3                  // width  -> splat entry
        clz             w8,  w4                  // height -> summing code
        adr             x10, L(ipred_cfl_splat_tbl)
        adr             x7,  L(ipred_cfl_left_tbl)
        sub             w9,  w9,  #26
        sub             w8,  w8,  #26
        ldrh            w9,  [x10, w9, uxtw #1]
        ldrh            w8,  [x7,  w8, uxtw #1]

        sub             x2,  x2,  w4, uxtw #1    // topleft - height pixels (2 bytes each)
        add             x6,  x0,  x1             // pointer to the second output row
        lsl             x1,  x1,  #1             // stride for two rows at a time
        sub             x9,  x10, w9, uxtw       // entries hold (table - target)
        sub             x7,  x7,  w8, uxtw
        br              x7

L(ipred_cfl_left_h4):
        ld1             {v0.4h},  [x2]
        addv            h0,      v0.4h           // sum of 4 pixels
        urshr           v0.4h,   v0.4h,   #2     // (sum + 2) >> 2
        dup             v0.8h,   v0.h[0]
        br              x9                       // continue in the width-specific splat

L(ipred_cfl_left_h8):
        ld1             {v0.8h},  [x2]
        addv            h0,      v0.8h           // sum of 8 pixels
        urshr           v0.4h,   v0.4h,   #3     // (sum + 4) >> 3
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_h16):
        ld1             {v2.8h, v3.8h}, [x2]
        addp            v0.8h,   v2.8h,   v3.8h  // 16 pixels -> 8 pair sums
        addv            h0,      v0.8h
        urshr           v0.4h,   v0.4h,   #4     // (sum + 8) >> 4
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_h32):
        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v0.8h,   v2.8h,   v4.8h  // 32 pixels -> 8 sums of 4
        uaddlv          s0,      v0.8h           // final reduction widened to 32 bit
        rshrn           v0.4h,   v0.4s,   #5     // (sum + 16) >> 5, narrowed to 16 bit
        dup             v0.8h,   v0.h[0]
        br              x9

        // Offsets are stored as (table address - target address).
L(ipred_cfl_left_tbl):
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32)
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16)
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8)
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4)
endfunc
2112
// void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                           const pixel *const topleft,
//                           const int width, const int height,
//                           const int16_t *ac, const int alpha,
//                           const int bitdepth_max);
//
// CfL prediction whose DC term averages both edges:
//   dc = (sum(left, height) + sum(top, width) + ((width + height) >> 1))
//        / (width + height)
// The division is done as a right shift by ctz(width + height); for
// non-square blocks the remaining factor of 3 or 5 is removed with a
// fixed-point reciprocal multiply (see the macro below).
//
// Control flow: x7 -> L(ipred_cfl_hN) sums the left edge, then `br x9`
// -> L(ipred_cfl_wN) adds the top edge, finishes the division and jumps
// into the shared L(ipred_cfl_splat_w*) code defined earlier in this file.
// Register state left for that code:
//   v0  = dc in every lane        v1  = alpha in every lane
//   v30 = 0                       v31 = bitdepth_max in every lane
//   x6  = dst + stride            x1  = 2 * stride

// Scale the low lanes of v0 by 1/5 when the flags say "eq", otherwise by
// 1/3: 0x6667 / 2^17 ~= 1/5 and 0xAAAB / 2^17 ~= 1/3.
// Clobbers w16, w17 and v16; does not modify the flags.
.macro ipred_cfl_16bpc_div_3_or_5
        mov             w16, #0x6667
        mov             w17, #0xAAAB
        csel            w16, w16, w17, eq
        dup             v16.2s,  w16
        mul             v0.2s,   v0.2s,   v16.2s
        ushr            v0.2s,   v0.2s,   #17
.endm

function ipred_cfl_16bpc_neon, export=1
        // Capture the scalar arguments first: w7 and w6 are reused as
        // scratch/pointer registers below.
        dup             v31.8h,  w7              // bitdepth_max
        dup             v1.8h,   w6              // alpha
        movi            v30.8h,  #0

        add             w8,  w3,  w4             // width + height
        clz             w9,  w3
        clz             w6,  w4                  // w6 is free, alpha already lives in v1
        adr             x7,  L(ipred_cfl_tbl)
        dup             v16.4s,  w8              // width + height
        rbit            w8,  w8                  // rbit(width + height)
        sub             w9,  w9,  #22            // clz(width) - 26, plus 4 to reach the wN entries
        sub             w6,  w6,  #26            // clz(height) - 26 selects an hN entry
        ldrh            w9,  [x7, w9, uxtw #1]
        ldrh            w6,  [x7, w6, uxtw #1]
        clz             w8,  w8                  // ctz(width + height)
        sub             x2,  x2,  w4, uxtw #1    // topleft - height pixels (2 bytes each)
        ushr            v16.4s,  v16.4s,  #1     // rounding term: (width + height) >> 1
        neg             w8,  w8                  // negative count: ushl shifts right
        sub             x9,  x7,  w9, uxtw       // x9 = top-edge stage
        sub             x7,  x7,  w6, uxtw       // x7 = left-edge stage
        dup             v17.4s,  w8              // -ctz(width + height)
        add             x6,  x0,  x1             // pointer to the second output row
        lsl             x1,  x1,  #1             // stride for two rows at a time
        br              x7

L(ipred_cfl_h4):
        ld1             {v0.4h},  [x2], #8       // post-increment leaves x2 at topleft
        uaddlv          s0,      v0.4h           // 32 bit sum of the left edge
        br              x9
L(ipred_cfl_w4):
        add             x2,  x2,  #2             // step over the corner pixel
        ld1             {v2.4h},  [x2]
        uaddlv          s2,      v2.4h           // 32 bit sum of the top edge
        add             v0.2s,   v0.2s,   v16.2s // + rounding term
        add             v0.2s,   v0.2s,   v2.2s
        ushl            v0.2s,   v0.2s,   v17.2s // >> ctz(width + height)
        cmp             w4,  #4
        b.eq            L(ipred_cfl_w4_splat)    // square block: shift alone is exact
        // h = 8/16: sizes 12 and 20 leave a factor of 3 or 5
        cmp             w4,  #16                 // eq (h = 16) selects 1/5
        ipred_cfl_16bpc_div_3_or_5
L(ipred_cfl_w4_splat):
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w4)

L(ipred_cfl_h8):
        ld1             {v0.8h},  [x2], #16      // post-increment leaves x2 at topleft
        uaddlv          s0,      v0.8h           // 32 bit sum of the left edge
        br              x9
L(ipred_cfl_w8):
        add             x2,  x2,  #2             // step over the corner pixel
        ld1             {v2.8h},  [x2]
        uaddlv          s2,      v2.8h           // 32 bit sum of the top edge
        add             v0.2s,   v0.2s,   v16.2s // + rounding term
        add             v0.2s,   v0.2s,   v2.2s
        ushl            v0.2s,   v0.2s,   v17.2s // >> ctz(width + height)
        cmp             w4,  #8
        b.eq            L(ipred_cfl_w8_splat)    // square block: shift alone is exact
        // h = 4/16/32: sizes 12, 24 and 40 leave a factor of 3 or 5
        cmp             w4,  #32                 // eq (h = 32) selects 1/5
        ipred_cfl_16bpc_div_3_or_5
L(ipred_cfl_w8_splat):
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w8)

L(ipred_cfl_h16):
        ld1             {v2.8h, v3.8h}, [x2], #32 // post-increment leaves x2 at topleft
        addp            v0.8h,   v2.8h,   v3.8h
        uaddlv          s0,      v0.8h           // 32 bit sum of the left edge
        br              x9
L(ipred_cfl_w16):
        add             x2,  x2,  #2             // step over the corner pixel
        ld1             {v2.8h, v3.8h}, [x2]
        addp            v2.8h,   v2.8h,   v3.8h
        uaddlv          s2,      v2.8h           // 32 bit sum of the top edge
        add             v0.2s,   v0.2s,   v16.2s // + rounding term
        add             v0.2s,   v0.2s,   v2.2s
        ushl            v0.2s,   v0.2s,   v17.2s // >> ctz(width + height)
        cmp             w4,  #16
        b.eq            L(ipred_cfl_w16_splat)   // square block: shift alone is exact
        // h = 4/8/32: sizes 20, 24 and 48 leave a factor of 3 or 5
        tst             w4,  #(32+16+8) // 16 added to make a consecutive bitmask
                                        // eq (no bit set, i.e. h = 4) selects 1/5
        ipred_cfl_16bpc_div_3_or_5
L(ipred_cfl_w16_splat):
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w16)

L(ipred_cfl_h32):
        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2], #64 // leaves x2 at topleft
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v0.8h,   v2.8h,   v4.8h
        uaddlv          s0,      v0.8h           // 32 bit sum of the left edge
        br              x9
L(ipred_cfl_w32):
        add             x2,  x2,  #2             // step over the corner pixel
        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v2.8h,   v2.8h,   v4.8h
        uaddlv          s2,      v2.8h           // 32 bit sum of the top edge
        add             v0.4s,   v0.4s,   v16.4s // + rounding term
        add             v0.2s,   v0.2s,   v2.2s
        ushl            v0.2s,   v0.2s,   v17.2s // >> ctz(width + height)
        cmp             w4,  #32
        b.eq            L(ipred_cfl_w32_splat)   // square block: shift alone is exact
        // h = 8/16: sizes 40 and 48 leave a factor of 5 or 3
        cmp             w4,  #8                  // eq (h = 8) selects 1/5
        ipred_cfl_16bpc_div_3_or_5
L(ipred_cfl_w32_splat):
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w16)   // width 32 uses the w16 splat entry

        // Entries 0-3: left-edge stages indexed by clz(height) - 26.
        // Entries 4-7: top-edge stages indexed by clz(width) - 22.
        // Offsets are stored as (table address - target address).
L(ipred_cfl_tbl):
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4)
endfunc

.purgem ipred_cfl_16bpc_div_3_or_5
2261
// void ipred_cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx,
//                                  const ptrdiff_t stride, const int w_pad,
//                                  const int h_pad, const int cw, const int ch);
//
// Builds the CfL AC buffer from 4:2:0 luma. Every coefficient is the sum of
// a 2x2 luma block, doubled (addp + add + shl #1). Columns covered by w_pad
// (counted in units of 4 coefficients) replicate the last computed column,
// rows covered by h_pad (units of 4 rows) replicate the last computed row.
// Finally the rounded average of the whole cw x ch buffer is subtracted
// from every coefficient.
//
// Register roles after the prologue:
//   x0      = ac write pointer          x1 / x10 = even / odd luma row
//   x2      = 2 * stride                w3       = w_pad
//   w4      = h_pad * 4 (rows)          w5 / w6  = cw / ch
//   w8      = rows still to compute     v24-v27  = 32 bit partial sums
//   v31     = -(ctz(cw) + ctz(ch)), the negated log2 of the block size when
//             both dimensions are powers of two
//
// The labels L(ipred_cfl_ac_420_w4_hpad), L(ipred_cfl_ac_420_w8_hpad) and
// L(ipred_cfl_ac_420_w16_hpad) are also branch targets of
// ipred_cfl_ac_422_16bpc_neon further down in this file.

// Add the unsigned 16 bit lanes of \ra and \rb to the running sums.
.macro ipred_cfl_ac_420_16bpc_acc ra, rb
        uaddw           v24.4s,  v24.4s,  \ra\().4h
        uaddw2          v25.4s,  v25.4s,  \ra\().8h
        uaddw           v26.4s,  v26.4s,  \rb\().4h
        uaddw2          v27.4s,  v27.4s,  \rb\().8h
.endm

// Store v0-v3 (64 bytes) to the ac buffer and add them to the running sums.
.macro ipred_cfl_ac_420_16bpc_store4_acc
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        ipred_cfl_ac_420_16bpc_acc v0, v1
        ipred_cfl_ac_420_16bpc_acc v2, v3
.endm

function ipred_cfl_ac_420_16bpc_neon, export=1
        movi            v24.4s,  #0
        movi            v25.4s,  #0
        movi            v26.4s,  #0
        movi            v27.4s,  #0
        clz             w8,  w5
        adr             x7,  L(ipred_cfl_ac_420_tbl)
        lsl             w4,  w4,  #2         // h_pad in rows
        sub             w8,  w8,  #27        // 0/1/2 for cw = 16/8/4
        ldrh            w8,  [x7, w8, uxtw #1]
        rbit            w9,  w5              // rbit(width)
        rbit            w10, w6              // rbit(height)
        clz             w9,  w9              // ctz(width)
        clz             w10, w10             // ctz(height)
        sub             x7,  x7,  w8, uxtw   // entries hold (table - target)
        sub             w8,  w6,  w4         // height - h_pad
        add             w9,  w9,  w10        // log2sz
        add             x10, x1,  x2         // odd luma rows
        lsl             x2,  x2,  #1         // step two luma rows per load
        dup             v31.4s,  w9
        neg             v31.4s,  v31.4s      // -log2sz
        br              x7

L(ipred_cfl_ac_420_w4):
        // Copy and subsample input: 4 luma rows -> 2 output rows of 4
        ld1             {v0.8h}, [x1],  x2
        ld1             {v1.8h}, [x10], x2
        ld1             {v2.8h}, [x1],  x2
        ld1             {v3.8h}, [x10], x2
        addp            v0.8h,   v0.8h,   v2.8h
        addp            v1.8h,   v1.8h,   v3.8h
        add             v0.8h,   v0.8h,   v1.8h
        shl             v0.8h,   v0.8h,   #1
        subs            w8,  w8,  #2
        st1             {v0.8h}, [x0], #16
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        b.gt            L(ipred_cfl_ac_420_w4)
        // Replicate the last output row (upper half of v0) into v0 and v1
        trn2            v1.2d,   v0.2d,   v0.2d
        trn2            v0.2d,   v0.2d,   v0.2d
L(ipred_cfl_ac_420_w4_hpad):
        cbz             w4,  L(ipred_cfl_ac_420_w4_calc_subtract_dc)
2:      // Vertical padding (h_pad > 0), 4 rows of 4 per iteration
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h}, [x0], #32
        ipred_cfl_ac_420_16bpc_acc v0, v1
        b.gt            2b
L(ipred_cfl_ac_420_w4_calc_subtract_dc):
        // Aggregate the sums
        add             v24.4s,  v24.4s,  v25.4s
        add             v26.4s,  v26.4s,  v27.4s
        add             v0.4s,   v24.4s,  v26.4s
        addv            s0,  v0.4s                // sum
        sub             x0,  x0,  w6, uxtw #3     // rewind ac by w6 * 8 bytes
        urshl           v4.2s,   v0.2s,   v31.2s  // (sum + (1 << (log2sz - 1)))  >>= log2sz
        dup             v4.8h,   v4.h[0]
6:      // Subtract dc from ac, 16 coefficients per iteration
        ld1             {v0.8h, v1.8h}, [x0]
        subs            w6,  w6,  #4
        sub             v0.8h,   v0.8h,   v4.8h
        sub             v1.8h,   v1.8h,   v4.8h
        st1             {v0.8h, v1.8h}, [x0], #32
        b.gt            6b
        ret

L(ipred_cfl_ac_420_w8):
        cbnz            w3,  L(ipred_cfl_ac_420_w8_wpad)
1:      // Copy and subsample input, without padding
        ld1             {v0.8h, v1.8h}, [x1],  x2
        ld1             {v2.8h, v3.8h}, [x10], x2
        ld1             {v4.8h, v5.8h}, [x1],  x2
        addp            v0.8h,   v0.8h,   v1.8h
        ld1             {v6.8h, v7.8h}, [x10], x2
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v6.8h,   v6.8h,   v7.8h
        add             v0.8h,   v0.8h,   v2.8h
        add             v4.8h,   v4.8h,   v6.8h
        shl             v0.8h,   v0.8h,   #1
        shl             v1.8h,   v4.8h,   #1
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h}, [x0], #32
        ipred_cfl_ac_420_16bpc_acc v0, v1
        b.gt            1b
        mov             v0.16b,  v1.16b      // v0 = v1 = last output row
        b               L(ipred_cfl_ac_420_w8_hpad)

L(ipred_cfl_ac_420_w8_wpad):
        // Copy and subsample input, padding 4
        ld1             {v0.8h}, [x1],  x2
        ld1             {v1.8h}, [x10], x2
        ld1             {v2.8h}, [x1],  x2
        ld1             {v3.8h}, [x10], x2
        addp            v0.8h,   v0.8h,   v2.8h
        addp            v1.8h,   v1.8h,   v3.8h
        add             v0.8h,   v0.8h,   v1.8h
        shl             v0.8h,   v0.8h,   #1
        dup             v1.4h,   v0.h[3]     // padding for the first row
        dup             v3.4h,   v0.h[7]     // padding for the second row
        trn2            v2.2d,   v0.2d,   v0.2d
        subs            w8,  w8,  #2
        st1             {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw           v25.4s,  v25.4s,  v1.4h
        uaddw           v26.4s,  v26.4s,  v2.4h
        uaddw           v27.4s,  v27.4s,  v3.4h
        b.gt            L(ipred_cfl_ac_420_w8_wpad)
        // v0 = v1 = last output row (4 computed + 4 padded)
        trn1            v0.2d,   v2.2d,   v3.2d
        trn1            v1.2d,   v2.2d,   v3.2d

L(ipred_cfl_ac_420_w8_hpad):
        cbz             w4,  3f
2:      // Vertical padding (h_pad > 0), 4 rows of 8 per iteration
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h}, [x0], #32
        ipred_cfl_ac_420_16bpc_acc v0, v1
        st1             {v0.8h, v1.8h}, [x0], #32
        ipred_cfl_ac_420_16bpc_acc v0, v1
        b.gt            2b
3:

        // Double the height and reuse the w4 summing/subtracting
        lsl             w6,  w6,  #1
        b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)

L(ipred_cfl_ac_420_w16):
        // Second-level dispatch on w_pad (0..3)
        adr             x7,  L(ipred_cfl_ac_420_w16_tbl)
        ldrh            w3,  [x7, w3, uxtw #1]
        sub             x7,  x7,  w3, uxtw
        br              x7

L(ipred_cfl_ac_420_w16_wpad0):
        // Copy and subsample input, without padding
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x1],  x2
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2
        addp            v0.8h,   v0.8h,   v1.8h
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v6.8h,   v6.8h,   v7.8h
        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x1],  x2
        add             v0.8h,   v0.8h,   v4.8h
        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x10], x2
        add             v2.8h,   v2.8h,   v6.8h
        addp            v16.8h,  v16.8h,  v17.8h
        addp            v18.8h,  v18.8h,  v19.8h
        addp            v20.8h,  v20.8h,  v21.8h
        addp            v22.8h,  v22.8h,  v23.8h
        add             v16.8h,  v16.8h,  v20.8h
        add             v18.8h,  v18.8h,  v22.8h
        shl             v0.8h,   v0.8h,   #1
        shl             v1.8h,   v2.8h,   #1
        shl             v2.8h,   v16.8h,  #1
        shl             v3.8h,   v18.8h,  #1
        subs            w8,  w8,  #2
        ipred_cfl_ac_420_16bpc_store4_acc
        b.gt            L(ipred_cfl_ac_420_w16_wpad0)
        mov             v0.16b,  v2.16b      // v0/v1 = v2/v3 = last output row
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_420_w16_wpad1):
        // Copy and subsample input, padding 4
        ldr             q2,  [x1,  #32]
        ld1             {v0.8h, v1.8h}, [x1],  x2
        ldr             q5,  [x10, #32]
        ld1             {v3.8h, v4.8h}, [x10], x2
        addp            v2.8h,   v2.8h,   v2.8h
        addp            v0.8h,   v0.8h,   v1.8h
        addp            v5.8h,   v5.8h,   v5.8h
        addp            v3.8h,   v3.8h,   v4.8h
        ldr             q18, [x1,  #32]
        add             v2.4h,   v2.4h,   v5.4h
        ld1             {v16.8h, v17.8h}, [x1],  x2
        add             v0.8h,   v0.8h,   v3.8h
        ldr             q21, [x10, #32]
        ld1             {v19.8h, v20.8h}, [x10], x2
        addp            v18.8h,  v18.8h,  v18.8h
        addp            v16.8h,  v16.8h,  v17.8h
        addp            v21.8h,  v21.8h,  v21.8h
        addp            v19.8h,  v19.8h,  v20.8h
        add             v18.4h,  v18.4h,  v21.4h
        add             v16.8h,  v16.8h,  v19.8h
        shl             v1.4h,   v2.4h,   #1
        shl             v0.8h,   v0.8h,   #1
        shl             v3.4h,   v18.4h,  #1
        shl             v2.8h,   v16.8h,  #1
        dup             v4.4h,   v1.h[3]     // padding for the first row
        dup             v5.4h,   v3.h[3]     // padding for the second row
        trn1            v1.2d,   v1.2d,   v4.2d
        trn1            v3.2d,   v3.2d,   v5.2d
        subs            w8,  w8,  #2
        ipred_cfl_ac_420_16bpc_store4_acc
        b.gt            L(ipred_cfl_ac_420_w16_wpad1)
        mov             v0.16b,  v2.16b      // v0/v1 = v2/v3 = last output row
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_420_w16_wpad2):
        // Copy and subsample input, padding 8
        ld1             {v0.8h, v1.8h}, [x1],  x2
        ld1             {v2.8h, v3.8h}, [x10], x2
        ld1             {v4.8h, v5.8h}, [x1],  x2
        addp            v0.8h,   v0.8h,   v1.8h
        ld1             {v6.8h, v7.8h}, [x10], x2
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v6.8h,   v6.8h,   v7.8h
        add             v0.8h,   v0.8h,   v2.8h
        add             v4.8h,   v4.8h,   v6.8h
        shl             v0.8h,   v0.8h,   #1
        shl             v2.8h,   v4.8h,   #1
        dup             v1.8h,   v0.h[7]     // padding for the first row
        dup             v3.8h,   v2.h[7]     // padding for the second row
        subs            w8,  w8,  #2
        ipred_cfl_ac_420_16bpc_store4_acc
        b.gt            L(ipred_cfl_ac_420_w16_wpad2)
        mov             v0.16b,  v2.16b      // v0/v1 = v2/v3 = last output row
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_420_w16_wpad3):
        // Copy and subsample input, padding 12
        ld1             {v0.8h}, [x1],  x2
        ld1             {v2.8h}, [x10], x2
        ld1             {v4.8h}, [x1],  x2
        ld1             {v6.8h}, [x10], x2
        addp            v0.8h,   v0.8h,   v4.8h
        addp            v2.8h,   v2.8h,   v6.8h
        add             v0.8h,   v0.8h,   v2.8h
        shl             v0.8h,   v0.8h,   #1
        dup             v1.8h,   v0.h[3]     // padding for the first row
        dup             v3.8h,   v0.h[7]     // padding for the second row
        trn2            v2.2d,   v0.2d,   v3.2d
        trn1            v0.2d,   v0.2d,   v1.2d
        subs            w8,  w8,  #2
        ipred_cfl_ac_420_16bpc_store4_acc
        b.gt            L(ipred_cfl_ac_420_w16_wpad3)
        mov             v0.16b,  v2.16b      // v0/v1 = v2/v3 = last output row
        mov             v1.16b,  v3.16b

L(ipred_cfl_ac_420_w16_hpad):
        cbz             w4,  3f
2:      // Vertical padding (h_pad > 0), 4 rows of 16 per iteration
        subs            w4,  w4,  #4
        ipred_cfl_ac_420_16bpc_store4_acc
        ipred_cfl_ac_420_16bpc_store4_acc
        b.gt            2b
3:

        // Quadruple the height and reuse the w4 summing/subtracting
        lsl             w6,  w6,  #2
        b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)

        // Indexed by clz(cw) - 27; offsets are stored as
        // (table address - target address). Index 3 has no handler.
L(ipred_cfl_ac_420_tbl):
        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16)
        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8)
        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4)
        .hword 0

        // Indexed by w_pad.
L(ipred_cfl_ac_420_w16_tbl):
        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0)
        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1)
        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2)
        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3)
endfunc

.purgem ipred_cfl_ac_420_16bpc_store4_acc
.purgem ipred_cfl_ac_420_16bpc_acc
2591
2592// void cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx,
2593//                            const ptrdiff_t stride, const int w_pad,
2594//                            const int h_pad, const int cw, const int ch);
2595function ipred_cfl_ac_422_16bpc_neon, export=1
2596        clz             w8,  w5
2597        lsl             w4,  w4,  #2
2598        adr             x7,  L(ipred_cfl_ac_422_tbl)
2599        sub             w8,  w8,  #27
2600        ldrh            w8,  [x7, w8, uxtw #1]
2601        movi            v24.4s,  #0
2602        movi            v25.4s,  #0
2603        movi            v26.4s,  #0
2604        movi            v27.4s,  #0
2605        sub             x7,  x7,  w8, uxtw
2606        sub             w8,  w6,  w4         // height - h_pad
2607        rbit            w9,  w5              // rbit(width)
2608        rbit            w10, w6              // rbit(height)
2609        clz             w9,  w9              // ctz(width)
2610        clz             w10, w10             // ctz(height)
2611        add             w9,  w9,  w10        // log2sz
2612        add             x10, x1,  x2
2613        dup             v31.4s,  w9
2614        lsl             x2,  x2,  #1
2615        neg             v31.4s,  v31.4s      // -log2sz
2616        br              x7
2617
2618L(ipred_cfl_ac_422_w4):
26191:      // Copy and subsample input
2620        ld1             {v0.8h}, [x1],  x2
2621        ld1             {v1.8h}, [x10], x2
2622        ld1             {v2.8h}, [x1],  x2
2623        ld1             {v3.8h}, [x10], x2
2624        addp            v0.8h,   v0.8h,   v1.8h
2625        addp            v2.8h,   v2.8h,   v3.8h
2626        shl             v0.8h,   v0.8h,   #2
2627        shl             v1.8h,   v2.8h,   #2
2628        subs            w8,  w8,  #4
2629        st1             {v0.8h, v1.8h}, [x0], #32
2630        uaddw           v24.4s,  v24.4s,  v0.4h
2631        uaddw2          v25.4s,  v25.4s,  v0.8h
2632        uaddw           v26.4s,  v26.4s,  v1.4h
2633        uaddw2          v27.4s,  v27.4s,  v1.8h
2634        b.gt            1b
2635        trn2            v0.2d,   v1.2d,   v1.2d
2636        trn2            v1.2d,   v1.2d,   v1.2d
2637        b               L(ipred_cfl_ac_420_w4_hpad)
2638
2639L(ipred_cfl_ac_422_w8):
2640        cbnz            w3,  L(ipred_cfl_ac_422_w8_wpad)
26411:      // Copy and subsample input, without padding
2642        ld1             {v0.8h, v1.8h}, [x1],  x2
2643        ld1             {v2.8h, v3.8h}, [x10], x2
2644        ld1             {v4.8h, v5.8h}, [x1],  x2
2645        addp            v0.8h,   v0.8h,   v1.8h
2646        ld1             {v6.8h, v7.8h}, [x10], x2
2647        addp            v2.8h,   v2.8h,   v3.8h
2648        addp            v4.8h,   v4.8h,   v5.8h
2649        addp            v6.8h,   v6.8h,   v7.8h
2650        shl             v0.8h,   v0.8h,   #2
2651        shl             v1.8h,   v2.8h,   #2
2652        shl             v2.8h,   v4.8h,   #2
2653        shl             v3.8h,   v6.8h,   #2
2654        subs            w8,  w8,  #4
2655        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2656        uaddw           v24.4s,  v24.4s,  v0.4h
2657        uaddw2          v25.4s,  v25.4s,  v0.8h
2658        uaddw           v26.4s,  v26.4s,  v1.4h
2659        uaddw2          v27.4s,  v27.4s,  v1.8h
2660        uaddw           v24.4s,  v24.4s,  v2.4h
2661        uaddw2          v25.4s,  v25.4s,  v2.8h
2662        uaddw           v26.4s,  v26.4s,  v3.4h
2663        uaddw2          v27.4s,  v27.4s,  v3.8h
2664        b.gt            1b
2665        mov             v0.16b,  v3.16b
2666        mov             v1.16b,  v3.16b
2667        b               L(ipred_cfl_ac_420_w8_hpad)
2668
2669L(ipred_cfl_ac_422_w8_wpad):
26701:      // Copy and subsample input, padding 4
2671        ld1             {v0.8h}, [x1],  x2
2672        ld1             {v1.8h}, [x10], x2
2673        ld1             {v2.8h}, [x1],  x2
2674        ld1             {v3.8h}, [x10], x2
2675        addp            v0.8h,   v0.8h,   v1.8h
2676        addp            v2.8h,   v2.8h,   v3.8h
2677        shl             v0.8h,   v0.8h,   #2
2678        shl             v2.8h,   v2.8h,   #2
2679        dup             v4.4h,   v0.h[3]
2680        dup             v5.8h,   v0.h[7]
2681        dup             v6.4h,   v2.h[3]
2682        dup             v7.8h,   v2.h[7]
2683        trn2            v1.2d,   v0.2d,   v5.2d
2684        trn1            v0.2d,   v0.2d,   v4.2d
2685        trn2            v3.2d,   v2.2d,   v7.2d
2686        trn1            v2.2d,   v2.2d,   v6.2d
2687        subs            w8,  w8,  #4
2688        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2689        uaddw           v24.4s,  v24.4s,  v0.4h
2690        uaddw2          v25.4s,  v25.4s,  v0.8h
2691        uaddw           v26.4s,  v26.4s,  v1.4h
2692        uaddw2          v27.4s,  v27.4s,  v1.8h
2693        uaddw           v24.4s,  v24.4s,  v2.4h
2694        uaddw2          v25.4s,  v25.4s,  v2.8h
2695        uaddw           v26.4s,  v26.4s,  v3.4h
2696        uaddw2          v27.4s,  v27.4s,  v3.8h
2697        b.gt            1b
2698        mov             v0.16b,  v3.16b
2699        mov             v1.16b,  v3.16b
2700        b               L(ipred_cfl_ac_420_w8_hpad)
2701
2702L(ipred_cfl_ac_422_w16):
2703        adr             x7,  L(ipred_cfl_ac_422_w16_tbl)
2704        ldrh            w3,  [x7, w3, uxtw #1]
2705        sub             x7,  x7,  w3, uxtw
2706        br              x7
2707
2708L(ipred_cfl_ac_422_w16_wpad0):
27091:      // Copy and subsample input, without padding
2710        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x1],  x2
2711        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2
2712        addp            v0.8h,   v0.8h,   v1.8h
2713        addp            v2.8h,   v2.8h,   v3.8h
2714        addp            v4.8h,   v4.8h,   v5.8h
2715        addp            v6.8h,   v6.8h,   v7.8h
2716        shl             v0.8h,   v0.8h,   #2
2717        shl             v1.8h,   v2.8h,   #2
2718        shl             v2.8h,   v4.8h,   #2
2719        shl             v3.8h,   v6.8h,   #2
2720        subs            w8,  w8,  #2
2721        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2722        uaddw           v24.4s,  v24.4s,  v0.4h
2723        uaddw2          v25.4s,  v25.4s,  v0.8h
2724        uaddw           v26.4s,  v26.4s,  v1.4h
2725        uaddw2          v27.4s,  v27.4s,  v1.8h
2726        uaddw           v24.4s,  v24.4s,  v2.4h
2727        uaddw2          v25.4s,  v25.4s,  v2.8h
2728        uaddw           v26.4s,  v26.4s,  v3.4h
2729        uaddw2          v27.4s,  v27.4s,  v3.8h
2730        b.gt            1b
2731        mov             v0.16b,  v2.16b
2732        mov             v1.16b,  v3.16b
2733        b               L(ipred_cfl_ac_420_w16_hpad)
2734
2735L(ipred_cfl_ac_422_w16_wpad1):
27361:      // Copy and subsample input, padding 4
2737        ldr             q2,  [x1,  #32]
2738        ld1             {v0.8h, v1.8h}, [x1],  x2
2739        ldr             q6,  [x10, #32]
2740        ld1             {v4.8h, v5.8h}, [x10], x2
2741        addp            v2.8h,   v2.8h,   v2.8h
2742        addp            v0.8h,   v0.8h,   v1.8h
2743        addp            v6.8h,   v6.8h,   v6.8h
2744        addp            v4.8h,   v4.8h,   v5.8h
2745        shl             v1.4h,   v2.4h,   #2
2746        shl             v0.8h,   v0.8h,   #2
2747        shl             v3.4h,   v6.4h,   #2
2748        shl             v2.8h,   v4.8h,   #2
2749        dup             v4.4h,   v1.h[3]
2750        dup             v5.4h,   v3.h[3]
2751        trn1            v1.2d,   v1.2d,   v4.2d
2752        trn1            v3.2d,   v3.2d,   v5.2d
2753        subs            w8,  w8,  #2
2754        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2755        uaddw           v24.4s,  v24.4s,  v0.4h
2756        uaddw2          v25.4s,  v25.4s,  v0.8h
2757        uaddw           v26.4s,  v26.4s,  v1.4h
2758        uaddw2          v27.4s,  v27.4s,  v1.8h
2759        uaddw           v24.4s,  v24.4s,  v2.4h
2760        uaddw2          v25.4s,  v25.4s,  v2.8h
2761        uaddw           v26.4s,  v26.4s,  v3.4h
2762        uaddw2          v27.4s,  v27.4s,  v3.8h
2763        b.gt            1b
2764        mov             v0.16b,  v2.16b
2765        mov             v1.16b,  v3.16b
2766        b               L(ipred_cfl_ac_420_w16_hpad)
2767
2768L(ipred_cfl_ac_422_w16_wpad2):
27691:      // Copy and subsample input, padding 8
2770        ld1             {v0.8h, v1.8h}, [x1],  x2
2771        ld1             {v2.8h, v3.8h}, [x10], x2
2772        addp            v0.8h,   v0.8h,   v1.8h
2773        addp            v2.8h,   v2.8h,   v3.8h
2774        shl             v0.8h,   v0.8h,   #2
2775        shl             v2.8h,   v2.8h,   #2
2776        dup             v1.8h,   v0.h[7]
2777        dup             v3.8h,   v2.h[7]
2778        subs            w8,  w8,  #2
2779        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2780        uaddw           v24.4s,  v24.4s,  v0.4h
2781        uaddw2          v25.4s,  v25.4s,  v0.8h
2782        uaddw           v26.4s,  v26.4s,  v1.4h
2783        uaddw2          v27.4s,  v27.4s,  v1.8h
2784        uaddw           v24.4s,  v24.4s,  v2.4h
2785        uaddw2          v25.4s,  v25.4s,  v2.8h
2786        uaddw           v26.4s,  v26.4s,  v3.4h
2787        uaddw2          v27.4s,  v27.4s,  v3.8h
2788        b.gt            1b
2789        mov             v0.16b,  v2.16b
2790        mov             v1.16b,  v3.16b
2791        b               L(ipred_cfl_ac_420_w16_hpad)
2792
2793L(ipred_cfl_ac_422_w16_wpad3):
27941:      // Copy and subsample input, padding 12
2795        ld1             {v0.8h}, [x1],  x2
2796        ld1             {v2.8h}, [x10], x2
2797        addp            v0.8h,   v0.8h,   v0.8h
2798        addp            v2.8h,   v2.8h,   v2.8h
2799        shl             v0.4h,   v0.4h,   #2
2800        shl             v2.4h,   v2.4h,   #2
2801        dup             v1.8h,   v0.h[3]
2802        dup             v3.8h,   v2.h[3]
2803        trn1            v0.2d,   v0.2d,   v1.2d
2804        trn1            v2.2d,   v2.2d,   v3.2d
2805        subs            w8,  w8,  #2
2806        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2807        uaddw           v24.4s,  v24.4s,  v0.4h
2808        uaddw2          v25.4s,  v25.4s,  v0.8h
2809        uaddw           v26.4s,  v26.4s,  v1.4h
2810        uaddw2          v27.4s,  v27.4s,  v1.8h
2811        uaddw           v24.4s,  v24.4s,  v2.4h
2812        uaddw2          v25.4s,  v25.4s,  v2.8h
2813        uaddw           v26.4s,  v26.4s,  v3.4h
2814        uaddw2          v27.4s,  v27.4s,  v3.8h
2815        b.gt            1b
2816        mov             v0.16b,  v2.16b
2817        mov             v1.16b,  v3.16b
2818        b               L(ipred_cfl_ac_420_w16_hpad)
2819
2820L(ipred_cfl_ac_422_tbl):
2821        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16)
2822        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8)
2823        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4)
2824        .hword 0
2825
2826L(ipred_cfl_ac_422_w16_tbl):
2827        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0)
2828        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1)
2829        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2)
2830        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3)
2831endfunc
2832
2833// void ipred_cfl_ac_444_16bpc_neon(int16_t *const ac, const pixel *const ypx,
2834//                                  const ptrdiff_t stride, const int w_pad,
2835//                                  const int h_pad, const int cw, const int ch);
//
// 4:4:4 variant: the input is not subsampled. Every input pixel is shifted
// left by 3, stored to ac, and widened into the v24-v27 running sums.
// Horizontal padding replicates the last loaded pixel of each row.
// For widths 4, 8 and 16 the vertical padding and the final DC subtraction
// are done by tails at L(ipred_cfl_ac_420_*), which are defined outside this
// function; width 32 has its own vertical padding loop below and then joins
// the shared DC subtraction.
//
// Register use, as established by the code below:
//   x0       ac output pointer, post-incremented by every store
//   x1, x10  input row pointers (x10 = x1 + stride on entry)
//   x2       stride; doubled on entry, halved back again for width 32
//   w3       w_pad (for width 32 also reused for the loaded table offset)
//   w4       h_pad << 2, the number of rows that are not read from the input
//   w5, w6   cw (width) and ch (height)
//   w8       first the loaded jump table offset, then the rows left to read
//   v24-v27  32-bit accumulators summing every value stored to ac
//   v31      -(ctz(cw) + ctz(ch)); only written here, presumably consumed by
//            the shared tail - TODO confirm against the 420 function
2836function ipred_cfl_ac_444_16bpc_neon, export=1
2837        clz             w8,  w5              // clz(width): 26 for 32 ... 29 for 4
2838        lsl             w4,  w4,  #2         // h_pad * 4
2839        adr             x7,  L(ipred_cfl_ac_444_tbl)
2840        sub             w8,  w8,  #26        // table index: 0 = w32 ... 3 = w4
2841        ldrh            w8,  [x7, w8, uxtw #1]
2842        movi            v24.4s,  #0          // clear the four sum accumulators
2843        movi            v25.4s,  #0
2844        movi            v26.4s,  #0
2845        movi            v27.4s,  #0
2846        sub             x7,  x7,  w8, uxtw   // table base - offset = handler address
2847        sub             w8,  w6,  w4         // height - h_pad
2848        rbit            w9,  w5              // rbit(width)
2849        rbit            w10, w6              // rbit(height)
2850        clz             w9,  w9              // ctz(width)
2851        clz             w10, w10             // ctz(height)
2852        add             w9,  w9,  w10        // log2sz
2853        add             x10, x1,  x2         // x10 = second input row
2854        dup             v31.4s,  w9
2855        lsl             x2,  x2,  #1         // x1 and x10 each step two rows per load
2856        neg             v31.4s,  v31.4s      // -log2sz
2857        br              x7
2858
2859L(ipred_cfl_ac_444_w4):
28601:      // Copy and expand input, four rows of 4 pixels per iteration
2861        ld1             {v0.4h},   [x1],  x2
2862        ld1             {v0.d}[1], [x10], x2
2863        ld1             {v1.4h},   [x1],  x2
2864        ld1             {v1.d}[1], [x10], x2
2865        shl             v0.8h,   v0.8h,   #3
2866        shl             v1.8h,   v1.8h,   #3
2867        subs            w8,  w8,  #4
2868        st1             {v0.8h, v1.8h}, [x0], #32
2869        uaddw           v24.4s,  v24.4s,  v0.4h
2870        uaddw2          v25.4s,  v25.4s,  v0.8h
2871        uaddw           v26.4s,  v26.4s,  v1.4h
2872        uaddw2          v27.4s,  v27.4s,  v1.8h
2873        b.gt            1b
        // The upper half of v1 holds the last row read; replicate it into
        // both halves of v0 and v1 before entering the shared hpad tail.
2874        trn2            v0.2d,   v1.2d,   v1.2d
2875        trn2            v1.2d,   v1.2d,   v1.2d
2876        b               L(ipred_cfl_ac_420_w4_hpad)
2877
2878L(ipred_cfl_ac_444_w8):
28791:      // Copy and expand input, four rows of 8 pixels per iteration
2880        ld1             {v0.8h}, [x1],  x2
2881        ld1             {v1.8h}, [x10], x2
2882        ld1             {v2.8h}, [x1],  x2
2883        shl             v0.8h,   v0.8h,   #3
2884        ld1             {v3.8h}, [x10], x2
2885        shl             v1.8h,   v1.8h,   #3
2886        shl             v2.8h,   v2.8h,   #3
2887        shl             v3.8h,   v3.8h,   #3
2888        subs            w8,  w8,  #4
2889        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2890        uaddw           v24.4s,  v24.4s,  v0.4h
2891        uaddw2          v25.4s,  v25.4s,  v0.8h
2892        uaddw           v26.4s,  v26.4s,  v1.4h
2893        uaddw2          v27.4s,  v27.4s,  v1.8h
2894        uaddw           v24.4s,  v24.4s,  v2.4h
2895        uaddw2          v25.4s,  v25.4s,  v2.8h
2896        uaddw           v26.4s,  v26.4s,  v3.4h
2897        uaddw2          v27.4s,  v27.4s,  v3.8h
2898        b.gt            1b
2899        mov             v0.16b,  v3.16b      // v0 = v1 = last row read (v3)
2900        mov             v1.16b,  v3.16b
2901        b               L(ipred_cfl_ac_420_w8_hpad)
2902
2903L(ipred_cfl_ac_444_w16):
2904        cbnz            w3,  L(ipred_cfl_ac_444_w16_wpad)  // any nonzero w_pad pads 8
29051:      // Copy and expand input, without padding
        // Two rows of 16 pixels per iteration: v0/v1 = row 0, v2/v3 = row 1
2906        ld1             {v0.8h, v1.8h}, [x1],  x2
2907        ld1             {v2.8h, v3.8h}, [x10], x2
2908        shl             v0.8h,   v0.8h,   #3
2909        shl             v1.8h,   v1.8h,   #3
2910        shl             v2.8h,   v2.8h,   #3
2911        shl             v3.8h,   v3.8h,   #3
2912        subs            w8,  w8,  #2
2913        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2914        uaddw           v24.4s,  v24.4s,  v0.4h
2915        uaddw2          v25.4s,  v25.4s,  v0.8h
2916        uaddw           v26.4s,  v26.4s,  v1.4h
2917        uaddw2          v27.4s,  v27.4s,  v1.8h
2918        uaddw           v24.4s,  v24.4s,  v2.4h
2919        uaddw2          v25.4s,  v25.4s,  v2.8h
2920        uaddw           v26.4s,  v26.4s,  v3.4h
2921        uaddw2          v27.4s,  v27.4s,  v3.8h
2922        b.gt            1b
2923        mov             v0.16b,  v2.16b      // v0/v1 = last row read (v2/v3)
2924        mov             v1.16b,  v3.16b
2925        b               L(ipred_cfl_ac_420_w16_hpad)
2926
2927L(ipred_cfl_ac_444_w16_wpad):
29281:      // Copy and expand input, padding 8
        // Only 8 pixels per row are loaded; the right half of each row
        // (v1, v3) is filled with the last loaded pixel (lane 7).
2929        ld1             {v0.8h}, [x1],  x2
2930        ld1             {v2.8h}, [x10], x2
2931        shl             v0.8h,   v0.8h,   #3
2932        shl             v2.8h,   v2.8h,   #3
2933        dup             v1.8h,   v0.h[7]
2934        dup             v3.8h,   v2.h[7]
2935        subs            w8,  w8,  #2
2936        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2937        uaddw           v24.4s,  v24.4s,  v0.4h
2938        uaddw2          v25.4s,  v25.4s,  v0.8h
2939        uaddw           v26.4s,  v26.4s,  v1.4h
2940        uaddw2          v27.4s,  v27.4s,  v1.8h
2941        uaddw           v24.4s,  v24.4s,  v2.4h
2942        uaddw2          v25.4s,  v25.4s,  v2.8h
2943        uaddw           v26.4s,  v26.4s,  v3.4h
2944        uaddw2          v27.4s,  v27.4s,  v3.8h
2945        b.gt            1b
2946        mov             v0.16b,  v2.16b      // v0/v1 = last row read (v2/v3)
2947        mov             v1.16b,  v3.16b
2948        b               L(ipred_cfl_ac_420_w16_hpad)
2949
2950L(ipred_cfl_ac_444_w32):
        // Second-level dispatch on w_pad. w3 is used unscaled as the byte
        // offset into the hword table, so the entries correspond to
        // w_pad = 0, 2, 4, 6 (odd values are presumably never passed for
        // this width - TODO confirm against the caller).
2951        adr             x7,  L(ipred_cfl_ac_444_w32_tbl)
2952        ldrh            w3,  [x7, w3, uxtw] // (w3>>1) << 1
2953        lsr             x2,  x2,  #1 // Restore the stride to one line increments
2954        sub             x7,  x7,  w3, uxtw
2955        br              x7
2956
        // All width 32 loops handle one row per iteration, reading through
        // x1 only, and leave that row in v0-v3 for the hpad loop below.
2957L(ipred_cfl_ac_444_w32_wpad0):
29581:      // Copy and expand input, without padding
2959        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x1],  x2
2960        shl             v0.8h,   v0.8h,   #3
2961        shl             v1.8h,   v1.8h,   #3
2962        shl             v2.8h,   v2.8h,   #3
2963        shl             v3.8h,   v3.8h,   #3
2964        subs            w8,  w8,  #1
2965        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2966        uaddw           v24.4s,  v24.4s,  v0.4h
2967        uaddw2          v25.4s,  v25.4s,  v0.8h
2968        uaddw           v26.4s,  v26.4s,  v1.4h
2969        uaddw2          v27.4s,  v27.4s,  v1.8h
2970        uaddw           v24.4s,  v24.4s,  v2.4h
2971        uaddw2          v25.4s,  v25.4s,  v2.8h
2972        uaddw           v26.4s,  v26.4s,  v3.4h
2973        uaddw2          v27.4s,  v27.4s,  v3.8h
2974        b.gt            1b
2975        b               L(ipred_cfl_ac_444_w32_hpad)
2976
2977L(ipred_cfl_ac_444_w32_wpad2):
29781:      // Copy and expand input, padding 8
2979        ld1             {v0.8h, v1.8h, v2.8h}, [x1],  x2
2980        shl             v2.8h,   v2.8h,   #3
2981        shl             v0.8h,   v0.8h,   #3
2982        shl             v1.8h,   v1.8h,   #3
2983        dup             v3.8h,   v2.h[7]     // pad with the last loaded pixel
2984        subs            w8,  w8,  #1
2985        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
2986        uaddw           v24.4s,  v24.4s,  v0.4h
2987        uaddw2          v25.4s,  v25.4s,  v0.8h
2988        uaddw           v26.4s,  v26.4s,  v1.4h
2989        uaddw2          v27.4s,  v27.4s,  v1.8h
2990        uaddw           v24.4s,  v24.4s,  v2.4h
2991        uaddw2          v25.4s,  v25.4s,  v2.8h
2992        uaddw           v26.4s,  v26.4s,  v3.4h
2993        uaddw2          v27.4s,  v27.4s,  v3.8h
2994        b.gt            1b
2995        b               L(ipred_cfl_ac_444_w32_hpad)
2996
2997L(ipred_cfl_ac_444_w32_wpad4):
29981:      // Copy and expand input, padding 16
2999        ld1             {v0.8h, v1.8h}, [x1],  x2
3000        shl             v1.8h,   v1.8h,   #3
3001        shl             v0.8h,   v0.8h,   #3
3002        dup             v2.8h,   v1.h[7]     // pad with the last loaded pixel
3003        dup             v3.8h,   v1.h[7]
3004        subs            w8,  w8,  #1
3005        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
3006        uaddw           v24.4s,  v24.4s,  v0.4h
3007        uaddw2          v25.4s,  v25.4s,  v0.8h
3008        uaddw           v26.4s,  v26.4s,  v1.4h
3009        uaddw2          v27.4s,  v27.4s,  v1.8h
3010        uaddw           v24.4s,  v24.4s,  v2.4h
3011        uaddw2          v25.4s,  v25.4s,  v2.8h
3012        uaddw           v26.4s,  v26.4s,  v3.4h
3013        uaddw2          v27.4s,  v27.4s,  v3.8h
3014        b.gt            1b
3015        b               L(ipred_cfl_ac_444_w32_hpad)
3016
3017L(ipred_cfl_ac_444_w32_wpad6):
30181:      // Copy and expand input, padding 24
3019        ld1             {v0.8h}, [x1],  x2
3020        shl             v0.8h,   v0.8h,   #3
3021        dup             v1.8h,   v0.h[7]     // pad with the last loaded pixel
3022        dup             v2.8h,   v0.h[7]
3023        dup             v3.8h,   v0.h[7]
3024        subs            w8,  w8,  #1
3025        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
3026        uaddw           v24.4s,  v24.4s,  v0.4h
3027        uaddw2          v25.4s,  v25.4s,  v0.8h
3028        uaddw           v26.4s,  v26.4s,  v1.4h
3029        uaddw2          v27.4s,  v27.4s,  v1.8h
3030        uaddw           v24.4s,  v24.4s,  v2.4h
3031        uaddw2          v25.4s,  v25.4s,  v2.8h
3032        uaddw           v26.4s,  v26.4s,  v3.4h
3033        uaddw2          v27.4s,  v27.4s,  v3.8h
3034        b.gt            1b
        // No branch here: this handler falls through into the hpad code.
3035
3036L(ipred_cfl_ac_444_w32_hpad):
3037        cbz             w4,  3f              // no vertical padding requested
30382:      // Vertical padding (h_pad > 0)
        // v0-v3 still hold the last row produced above. Each iteration
        // stores and accumulates that row twice, so w4 (h_pad * 4 rows)
        // is decremented by 2.
3039        subs            w4,  w4,  #2
3040        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
3041        uaddw           v24.4s,  v24.4s,  v0.4h
3042        uaddw2          v25.4s,  v25.4s,  v0.8h
3043        uaddw           v26.4s,  v26.4s,  v1.4h
3044        uaddw2          v27.4s,  v27.4s,  v1.8h
3045        uaddw           v24.4s,  v24.4s,  v2.4h
3046        uaddw2          v25.4s,  v25.4s,  v2.8h
3047        uaddw           v26.4s,  v26.4s,  v3.4h
3048        uaddw2          v27.4s,  v27.4s,  v3.8h
3049        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
3050        uaddw           v24.4s,  v24.4s,  v0.4h
3051        uaddw2          v25.4s,  v25.4s,  v0.8h
3052        uaddw           v26.4s,  v26.4s,  v1.4h
3053        uaddw2          v27.4s,  v27.4s,  v1.8h
3054        uaddw           v24.4s,  v24.4s,  v2.4h
3055        uaddw2          v25.4s,  v25.4s,  v2.8h
3056        uaddw           v26.4s,  v26.4s,  v3.4h
3057        uaddw2          v27.4s,  v27.4s,  v3.8h
3058        b.gt            2b
30593:
3060
3061        // Multiply the height by eight and reuse the w4 DC subtraction code.
        // NOTE(review): presumably because 32 x ch values are the same count
        // as 4 x (8 * ch); the target label is outside this function - confirm.
3062        lsl             w6,  w6,  #3
3063        b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)
3064
        // Jump tables: each entry is the table address minus the handler
        // address, matching the "sub x7, x7, <offset>" at the dispatch sites.
        // Indexed by clz(cw) - 26.
3065L(ipred_cfl_ac_444_tbl):
3066        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32)
3067        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16)
3068        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8)
3069        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4)
3070
        // Indexed by w_pad used directly as a byte offset (see the w32 dispatch).
3071L(ipred_cfl_ac_444_w32_tbl):
3072        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0)
3073        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2)
3074        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4)
3075        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6)
3076endfunc
3077