/*****************************************************************************
 * predict.S: aarch64 intra prediction
 *****************************************************************************
 * Copyright (C) 2009-2021 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *          Mans Rullgard <mans@mansr.com>
 *          Janne Grunau <janne-x264@jannau.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "asm.S"

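// Per-column weight tables used by the plane-mode (predict_*_p) functions
// below when accumulating the H/V gradient sums.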
const p8weight, align=4
    .short      1, 2, 3, 4, 1, 2, 3, 4
endconst
const p16weight, align=4
    .short      1, 2, 3, 4, 5, 6, 7, 8
endconst

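// ldcol.8 / ldcol.16: gather a column of bytes, one per row of stride \xm,
// starting at [\xn] into consecutive lanes of \vd; \n and \hi select which
// half of the destination is written.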
.macro ldcol.8  vd,  xn,  xm,  n=8,  hi=0
.if \n == 8 || \hi == 0
    ld1        {\vd\().b}[0], [\xn], \xm
    ld1        {\vd\().b}[1], [\xn], \xm
    ld1        {\vd\().b}[2], [\xn], \xm
    ld1        {\vd\().b}[3], [\xn], \xm
.endif
.if \n == 8 || \hi == 1
    ld1        {\vd\().b}[4], [\xn], \xm
    ld1        {\vd\().b}[5], [\xn], \xm
    ld1        {\vd\().b}[6], [\xn], \xm
    ld1        {\vd\().b}[7], [\xn], \xm
.endif
.endm

.macro ldcol.16  vd,  xn,  xm
    ldcol.8     \vd, \xn, \xm
    ld1        {\vd\().b}[ 8], [\xn], \xm
    ld1        {\vd\().b}[ 9], [\xn], \xm
    ld1        {\vd\().b}[10], [\xn], \xm
    ld1        {\vd\().b}[11], [\xn], \xm
    ld1        {\vd\().b}[12], [\xn], \xm
    ld1        {\vd\().b}[13], [\xn], \xm
    ld1        {\vd\().b}[14], [\xn], \xm
    ld1        {\vd\().b}[15], [\xn], \xm
.endm


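// 4x4 luma predictors: x0 points at the destination block in the decoded
// picture buffer, and the neighbouring pixels are read relative to x0 at
// FDEC_STRIDE spacing.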
function predict_4x4_h_aarch64, export=1
    ldurb   w1,  [x0, #0*FDEC_STRIDE-1]
    mov     w5,  #0x01010101
    ldrb    w2,  [x0, #1*FDEC_STRIDE-1]
    ldrb    w3,  [x0, #2*FDEC_STRIDE-1]
    mul     w1,  w1,  w5
    ldrb    w4,  [x0, #3*FDEC_STRIDE-1]
    mul     w2,  w2,  w5
    str     w1,  [x0, #0*FDEC_STRIDE]
    mul     w3,  w3,  w5
    str     w2,  [x0, #1*FDEC_STRIDE]
    mul     w4,  w4,  w5
    str     w3,  [x0, #2*FDEC_STRIDE]
    str     w4,  [x0, #3*FDEC_STRIDE]
    ret
endfunc

function predict_4x4_v_aarch64, export=1
    ldur    w1,  [x0, #0 - 1 * FDEC_STRIDE]
    str     w1,  [x0, #0 + 0 * FDEC_STRIDE]
    str     w1,  [x0, #0 + 1 * FDEC_STRIDE]
    str     w1,  [x0, #0 + 2 * FDEC_STRIDE]
    str     w1,  [x0, #0 + 3 * FDEC_STRIDE]
    ret
endfunc

function predict_4x4_dc_neon, export=1
    sub         x1,  x0,  #FDEC_STRIDE
    ldurb       w4,  [x0, #-1 + 0 * FDEC_STRIDE]
    ldrb        w5,  [x0, #-1 + 1 * FDEC_STRIDE]
    ldrb        w6,  [x0, #-1 + 2 * FDEC_STRIDE]
    ldrb        w7,  [x0, #-1 + 3 * FDEC_STRIDE]
    add         w4,  w4,  w5
    ldr         s0, [x1]
    add         w6,  w6,  w7
    uaddlv      h0,  v0.8b
    add         w4,  w4,  w6
    dup         v0.4h,  v0.h[0]
    dup         v1.4h,  w4
    add         v0.4h,  v0.4h,  v1.4h
    rshrn       v0.8b,  v0.8h,  #3
    str         s0,  [x0]
    str         s0,  [x0, #1 * FDEC_STRIDE]
    str         s0,  [x0, #2 * FDEC_STRIDE]
    str         s0,  [x0, #3 * FDEC_STRIDE]
    ret
endfunc

function predict_4x4_dc_top_neon, export=1
    sub         x1,  x0,  #FDEC_STRIDE
    ldr         s0, [x1]
    uaddlv      h0,  v0.8b
    dup         v0.4h,  v0.h[0]
    rshrn       v0.8b,  v0.8h,  #2
    str         s0,  [x0]
    str         s0,  [x0, #1 * FDEC_STRIDE]
    str         s0,  [x0, #2 * FDEC_STRIDE]
    str         s0,  [x0, #3 * FDEC_STRIDE]
    ret
endfunc

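// Diagonal predictors: build the 3-tap smoothed neighbour sequence
// ((l + 2*c + r + 2) >> 2) once, then store byte-shifted views of it,
// one per row.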
function predict_4x4_ddr_neon, export=1
    sub         x1,  x0,  #FDEC_STRIDE+1
    mov         x7,  #FDEC_STRIDE
    ld1        {v0.8b}, [x1], x7            // # -FDEC_STRIDE-1
    ld1r       {v1.8b}, [x1], x7            // #0*FDEC_STRIDE-1
    ld1r       {v2.8b}, [x1], x7            // #1*FDEC_STRIDE-1
    ext         v0.8b,  v1.8b,  v0.8b,  #7
    ld1r       {v3.8b}, [x1], x7            // #2*FDEC_STRIDE-1
    ext         v0.8b,  v2.8b,  v0.8b,  #7  // a
    ld1r       {v4.8b}, [x1], x7            // #3*FDEC_STRIDE-1
    ext         v1.8b,  v3.8b,  v0.8b,  #7  // b
    ext         v2.8b,  v4.8b,  v1.8b,  #7  // c
    uaddl       v0.8h,  v0.8b,  v1.8b
    uaddl       v1.8h,  v1.8b,  v2.8b
    add         v0.8h,  v0.8h,  v1.8h
    rshrn       v0.8b,  v0.8h,  #2

    ext         v3.8b,  v0.8b, v0.8b,  #3
    ext         v2.8b,  v0.8b, v0.8b,  #2
    ext         v1.8b,  v0.8b, v0.8b,  #1

    str         s3,  [x0], #FDEC_STRIDE
    str         s2,  [x0], #FDEC_STRIDE
    str         s1,  [x0], #FDEC_STRIDE
    str         s0,  [x0]
    ret
endfunc

function predict_4x4_ddl_neon, export=1
    sub         x0,  x0,  #FDEC_STRIDE
    mov         x7,  #FDEC_STRIDE
    ld1        {v0.8b}, [x0],  x7
    dup         v3.8b,  v0.b[7]
    ext         v1.8b,  v0.8b,  v0.8b,  #1
    ext         v2.8b,  v0.8b,  v3.8b,  #2
    uhadd       v0.8b,  v0.8b,  v2.8b
    urhadd      v0.8b,  v0.8b,  v1.8b
    str         s0,  [x0], #FDEC_STRIDE
    ext         v1.8b,  v0.8b,  v0.8b,  #1
    ext         v2.8b,  v0.8b,  v0.8b,  #2
    str         s1,  [x0], #FDEC_STRIDE
    ext         v3.8b,  v0.8b,  v0.8b,  #3
    str         s2,  [x0], #FDEC_STRIDE
    str         s3,  [x0]
    ret
endfunc

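// 8x8 luma predictors. These read the packed, pre-filtered neighbour buffer
// passed in x1 (x264's "edge" array, presumably filled by predict_8x8_filter):
// the left samples end at offset 14 and the top/top-right row starts at
// offset 16.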
function predict_8x8_dc_neon, export=1
    mov         x7,  #FDEC_STRIDE
    ld1        {v0.16b}, [x1], #16
    ld1        {v1.8b},  [x1]
    ext         v0.16b, v0.16b, v0.16b, #7
    uaddlv      h1,  v1.8b
    uaddlv      h0,  v0.8b
    add         v0.8h,  v0.8h,  v1.8h
    dup         v0.8h,  v0.h[0]
    rshrn       v0.8b,  v0.8h,  #4
.rept 8
    st1        {v0.8b}, [x0], x7
.endr
    ret
endfunc

function predict_8x8_h_neon, export=1
    mov         x7,  #FDEC_STRIDE
    ld1        {v16.16b}, [x1]
    dup         v0.8b, v16.b[14]
    dup         v1.8b, v16.b[13]
    st1        {v0.8b}, [x0], x7
    dup         v2.8b, v16.b[12]
    st1        {v1.8b}, [x0], x7
    dup         v3.8b, v16.b[11]
    st1        {v2.8b}, [x0], x7
    dup         v4.8b, v16.b[10]
    st1        {v3.8b}, [x0], x7
    dup         v5.8b, v16.b[9]
    st1        {v4.8b}, [x0], x7
    dup         v6.8b, v16.b[8]
    st1        {v5.8b}, [x0], x7
    dup         v7.8b, v16.b[7]
    st1        {v6.8b}, [x0], x7
    st1        {v7.8b}, [x0], x7
    ret
endfunc

function predict_8x8_v_neon, export=1
    add         x1,  x1,  #16
    mov         x7,  #FDEC_STRIDE
    ld1        {v0.8b}, [x1]
.rept 8
    st1        {v0.8b}, [x0], x7
.endr
    ret
endfunc

function predict_8x8_ddl_neon, export=1
    add         x1,  x1,  #16
    mov         x7,  #FDEC_STRIDE
    ld1        {v0.16b}, [x1]
    movi        v3.16b, #0
    dup         v2.16b, v0.b[15]
    ext         v4.16b, v3.16b, v0.16b, #15
    ext         v2.16b, v0.16b, v2.16b, #1
    uhadd       v4.16b, v4.16b, v2.16b
    urhadd      v0.16b, v0.16b, v4.16b
    ext         v1.16b, v0.16b, v0.16b, #1
    ext         v2.16b, v0.16b, v0.16b, #2
    st1        {v1.8b}, [x0], x7
    ext         v3.16b, v0.16b, v0.16b, #3
    st1        {v2.8b}, [x0], x7
    ext         v4.16b, v0.16b, v0.16b, #4
    st1        {v3.8b}, [x0], x7
    ext         v5.16b, v0.16b, v0.16b, #5
    st1        {v4.8b}, [x0], x7
    ext         v6.16b, v0.16b, v0.16b, #6
    st1        {v5.8b}, [x0], x7
    ext         v7.16b, v0.16b, v0.16b, #7
    st1        {v6.8b}, [x0], x7
    ext         v0.16b, v0.16b, v0.16b, #8
    st1        {v7.8b}, [x0], x7
    st1        {v0.8b}, [x0], x7
    ret
endfunc

function predict_8x8_ddr_neon, export=1
    ld1        {v0.16b,v1.16b}, [x1]
    ext         v2.16b, v0.16b, v1.16b, #7
    ext         v4.16b, v0.16b, v1.16b, #9
    ext         v3.16b, v0.16b, v1.16b, #8

    uhadd       v2.16b, v2.16b, v4.16b
    urhadd      v7.16b, v3.16b, v2.16b

    add         x0,  x0,  #7*FDEC_STRIDE
    mov         x7,  #-1*FDEC_STRIDE

    ext         v6.16b, v7.16b, v7.16b, #1
    st1        {v7.8b},  [x0], x7
    ext         v5.16b, v7.16b, v7.16b, #2
    st1        {v6.8b},  [x0], x7
    ext         v4.16b, v7.16b, v7.16b, #3
    st1        {v5.8b},  [x0], x7
    ext         v3.16b, v7.16b, v7.16b, #4
    st1        {v4.8b},  [x0], x7
    ext         v2.16b, v7.16b, v7.16b, #5
    st1        {v3.8b},  [x0], x7
    ext         v1.16b, v7.16b, v7.16b, #6
    st1        {v2.8b},  [x0], x7
    ext         v0.16b, v7.16b, v7.16b, #7
    st1        {v1.8b},  [x0], x7
    st1        {v0.8b},  [x0], x7
    ret
endfunc

function predict_8x8_vl_neon, export=1
    add         x1,  x1,  #16
    mov         x7, #FDEC_STRIDE

    ld1        {v0.16b}, [x1]
    ext         v1.16b, v1.16b, v0.16b, #15
    ext         v2.16b, v0.16b, v2.16b, #1

    uhadd       v1.16b, v1.16b, v2.16b
    urhadd      v3.16b, v0.16b, v2.16b

    urhadd      v0.16b, v0.16b, v1.16b

    ext        v4.16b, v0.16b, v0.16b, #1
    st1        {v3.8b}, [x0], x7
    ext        v5.16b, v3.16b, v3.16b, #1
    st1        {v4.8b}, [x0], x7
    ext        v6.16b, v0.16b, v0.16b, #2
    st1        {v5.8b}, [x0], x7
    ext        v7.16b, v3.16b, v3.16b, #2
    st1        {v6.8b}, [x0], x7
    ext        v4.16b, v0.16b, v0.16b, #3
    st1        {v7.8b}, [x0], x7
    ext        v5.16b, v3.16b, v3.16b, #3
    st1        {v4.8b}, [x0], x7
    ext        v6.16b, v0.16b, v0.16b, #4
    st1        {v5.8b}, [x0], x7
    st1        {v6.8b}, [x0], x7
    ret
endfunc

function predict_8x8_vr_neon, export=1
    add         x1,  x1,  #8
    mov         x7,  #FDEC_STRIDE
    ld1        {v2.16b}, [x1]

    ext         v1.16b, v2.16b, v2.16b, #14
    ext         v0.16b, v2.16b, v2.16b, #15

    uhadd       v3.16b, v2.16b, v1.16b
    urhadd      v2.16b, v2.16b, v0.16b
    urhadd      v0.16b, v0.16b, v3.16b

    ext         v1.16b, v2.16b, v2.16b, #8
    uzp1        v2.8b,  v0.8b,  v0.8b
    uzp2        v3.8b,  v0.8b,  v0.8b
    ext         v0.16b, v0.16b, v0.16b, #8

    st1        {v1.8b}, [x0], x7
    st1        {v0.8b}, [x0], x7
    ext         v4.8b, v3.8b, v1.8b, #7
    ext         v5.8b, v2.8b, v0.8b, #7
    st1        {v4.8b}, [x0], x7
    st1        {v5.8b}, [x0], x7
    ext         v6.8b, v3.8b, v1.8b, #6
    ext         v7.8b, v2.8b, v0.8b, #6
    st1        {v6.8b}, [x0], x7
    st1        {v7.8b}, [x0], x7
    ext         v1.8b, v3.8b, v1.8b, #5
    ext         v0.8b, v2.8b, v0.8b, #5
    st1        {v1.8b}, [x0], x7
    st1        {v0.8b}, [x0], x7
    ret
endfunc

function predict_8x8_hd_neon, export=1
    add         x1,  x1,  #7
    mov         x7, #FDEC_STRIDE

    ld1        {v1.16b}, [x1]
    ext         v3.16b, v1.16b, v1.16b, #1
    ext         v2.16b, v1.16b, v1.16b, #2

    urhadd      v4.16b, v1.16b, v3.16b

    uhadd       v1.16b, v1.16b, v2.16b
    urhadd      v0.16b, v1.16b, v3.16b

    zip1        v16.8b, v4.8b,  v0.8b
    zip2        v17.8b, v4.8b,  v0.8b
    ext         v7.16b, v0.16b, v0.16b, #8

    ext         v0.8b,  v17.8b, v7.8b,  #6
    ext         v1.8b,  v17.8b, v7.8b,  #4
    st1        {v0.8b},  [x0], x7
    ext         v2.8b,  v17.8b, v7.8b,  #2
    st1        {v1.8b},  [x0], x7
    st1        {v2.8b},  [x0], x7
    ext         v3.8b,  v16.8b, v17.8b, #6
    st1        {v17.8b}, [x0], x7
    ext         v4.8b,  v16.8b, v17.8b, #4
    st1        {v3.8b},  [x0], x7
    ext         v5.8b,  v16.8b, v17.8b, #2
    st1        {v4.8b},  [x0], x7
    st1        {v5.8b},  [x0], x7
    st1        {v16.8b}, [x0], x7

    ret
endfunc

function predict_8x8_hu_neon, export=1
    add         x1,  x1,  #7
    mov         x7,  #FDEC_STRIDE
    ld1        {v7.8b}, [x1]
    dup         v6.8b,  v7.b[0]
    rev64       v7.8b,  v7.8b

    ext         v4.8b,  v7.8b,  v6.8b,  #2
    ext         v2.8b,  v7.8b,  v6.8b,  #1

    uhadd       v5.8b,  v7.8b,  v4.8b
    urhadd      v0.8b,  v2.8b,  v7.8b
    urhadd      v1.8b,  v5.8b,  v2.8b

    zip1        v16.8b, v0.8b,  v1.8b
    zip2        v17.8b, v0.8b,  v1.8b

    dup         v18.4h, v17.h[3]

    ext         v0.8b,  v16.8b, v17.8b, #2
    ext         v1.8b,  v16.8b, v17.8b, #4
    ext         v2.8b,  v16.8b, v17.8b, #6
    st1        {v16.8b}, [x0], x7
    st1        {v0.8b},  [x0], x7
    st1        {v1.8b},  [x0], x7
    st1        {v2.8b},  [x0], x7

    ext         v4.8b,  v17.8b, v18.8b, #2
    ext         v5.8b,  v17.8b, v18.8b, #4
    ext         v6.8b,  v17.8b, v18.8b, #6
    st1        {v17.8b}, [x0], x7
    st1        {v4.8b},  [x0], x7
    st1        {v5.8b},  [x0], x7
    st1        {v6.8b},  [x0]
    ret
endfunc


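// 8x8 chroma DC predictors: each variant sums the neighbours it uses
// (s0/s1 from the top row, s2/s3 from the left column), derives the four
// 4x4 DC values and shares the store loop at pred8x8c_dc_end.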
function predict_8x8c_dc_top_neon, export=1
    sub         x2,  x0,  #FDEC_STRIDE
    mov         x1,  #FDEC_STRIDE
    ld1        {v0.8b},  [x2]
    uaddlp      v0.4h,  v0.8b
    addp        v0.4h,  v0.4h,  v0.4h
    rshrn       v0.8b,  v0.8h,  #2
    dup         v3.8b,  v0.b[1]
    dup         v2.8b,  v0.b[0]
    transpose   v0.2s,  v1.2s,  v2.2s,  v3.2s
    b           pred8x8c_dc_end
endfunc

function predict_8x8c_dc_left_neon, export=1
    ldurb       w2,  [x0, #0 * FDEC_STRIDE - 1]
    ldrb        w3,  [x0, #1 * FDEC_STRIDE - 1]
    ldrb        w4,  [x0, #2 * FDEC_STRIDE - 1]
    ldrb        w5,  [x0, #3 * FDEC_STRIDE - 1]
    mov         x1,  #FDEC_STRIDE
    add         w2,  w2,  w3
    add         w3,  w4,  w5
    ldrb        w6,  [x0, #4 * FDEC_STRIDE - 1]
    ldrb        w7,  [x0, #5 * FDEC_STRIDE - 1]
    ldrb        w8,  [x0, #6 * FDEC_STRIDE - 1]
    ldrb        w9,  [x0, #7 * FDEC_STRIDE - 1]
    add         w6,  w6,  w7
    add         w7,  w8,  w9
    add         w2,  w2,  w3
    add         w6,  w6,  w7
    dup         v0.8h,  w2
    dup         v1.8h,  w6
    rshrn       v0.8b,  v0.8h,  #2
    rshrn       v1.8b,  v1.8h,  #2
    b           pred8x8c_dc_end
endfunc

function predict_8x8c_dc_neon, export=1
    mov         x1,  #FDEC_STRIDE
    sub         x2,  x0,  #FDEC_STRIDE
    ldurb       w10, [x0, #0 * FDEC_STRIDE - 1]
    ldrb        w11, [x0, #1 * FDEC_STRIDE - 1]
    ldrb        w12, [x0, #2 * FDEC_STRIDE - 1]
    ldrb        w13, [x0, #3 * FDEC_STRIDE - 1]
    add         w10, w10, w11
    ldrb        w4,  [x0, #4 * FDEC_STRIDE - 1]
    ldrb        w5,  [x0, #5 * FDEC_STRIDE - 1]
    add         w12, w12, w13
    ldrb        w6,  [x0, #6 * FDEC_STRIDE - 1]
    ldrb        w7,  [x0, #7 * FDEC_STRIDE - 1]
    add         w4,  w4,  w5
    add         w6,  w6,  w7
    add         w10, w10, w12, lsl #16
    add         w4,  w4,  w6,  lsl #16
    ld1        {v0.8b},  [x2]
    add         x10, x10, x4,  lsl #32
    uaddlp      v0.4h,  v0.8b  // s0, s1
    mov         v1.d[0],  x10  // s2, s3
    add         v3.4h,  v0.4h,  v1.4h
    addp        v0.4h,  v0.4h,  v1.4h // s0, s1, s2, s3
    addp        v1.4h,  v3.4h,  v3.4h // s0+s2, s1+s3, s0+s2, s1+s3
    uzp2        v0.4h,  v0.4h,  v0.4h // s1,    s3,    s1,    s3
    uzp1        v1.2d,  v1.2d,  v1.2d
    uzp1        v0.2d,  v0.2d,  v0.2d
    rshrn       v3.8b,  v1.8h,  #3
    rshrn       v2.8b,  v0.8h,  #2
    uzp1        v0.8b,  v3.8b,  v2.8b
    uzp2        v1.8b,  v2.8b,  v3.8b
pred8x8c_dc_end:
    add         x2,  x0,  #2 * FDEC_STRIDE
    add         x4,  x0,  #4 * FDEC_STRIDE
    add         x5,  x0,  #6 * FDEC_STRIDE
    st1        {v0.8b}, [x0], x1
    st1        {v0.8b}, [x2], x1
    st1        {v0.8b}, [x0]
    st1        {v0.8b}, [x2]
    st1        {v1.8b}, [x4], x1
    st1        {v1.8b}, [x5], x1
    st1        {v1.8b}, [x4]
    st1        {v1.8b}, [x5]
    ret
endfunc

function predict_8x8c_h_neon, export=1
    sub         x1,  x0,  #1
    mov         x7,  #FDEC_STRIDE
.rept 4
    ld1r       {v0.8b}, [x1], x7
    ld1r       {v1.8b}, [x1], x7
    st1        {v0.8b}, [x0], x7
    st1        {v1.8b}, [x0], x7
.endr
    ret
endfunc

function predict_8x8c_v_aarch64, export=1
    ldur        x1,  [x0, #-FDEC_STRIDE]
.irp c, 0,1,2,3,4,5,6,7
    str         x1,  [x0, #\c * FDEC_STRIDE]
.endr
    ret
endfunc

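// 8x8 chroma plane prediction: the H and V gradients come from weighted
// neighbour differences (p8weight/p16weight), then each row is produced from
// a "pix + x*b" accumulator that is advanced by c per row and narrowed with
// unsigned saturation.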
function predict_8x8c_p_neon, export=1
    sub         x3,  x0,  #FDEC_STRIDE
    mov         x1,  #FDEC_STRIDE
    add         x2,  x3,  #4
    sub         x3,  x3,  #1
    ld1        {v0.s}[0], [x3]
    ld1        {v2.s}[0], [x2], x1
    ldcol.8     v0,  x3,  x1,  4,  hi=1
    add         x3,  x3,  x1
    ldcol.8     v3,  x3,  x1,  4
    movrel      x4,  p8weight
    movrel      x5,  p16weight
    uaddl       v4.8h,  v2.8b,  v3.8b
    rev32       v0.8b,  v0.8b
    trn1        v2.2s,  v2.2s,  v3.2s
    ld1        {v7.8h}, [x4]
    usubl       v2.8h,  v2.8b,  v0.8b
    mul         v2.8h,  v2.8h,  v7.8h
    ld1        {v0.8h}, [x5]
    saddlp      v2.4s,  v2.8h
    addp        v2.4s,  v2.4s,  v2.4s
    shl         v3.2s,  v2.2s,  #4
    add         v2.2s,  v2.2s,  v3.2s
    rshrn       v5.4h,  v2.4s,  #5    // b, c, x, x
    addp        v2.4h,  v5.4h,  v5.4h
    shl         v3.4h,  v2.4h,  #2
    sub         v3.4h,  v3.4h,  v2.4h // 3 * (b + c)
    rev64       v4.4h,  v4.4h
    add         v4.4h,  v4.4h,  v0.4h
    shl         v2.4h,  v4.4h,  #4              // a
    sub         v2.4h,  v2.4h,  v3.4h           // a - 3 * (b + c) + 16
    ext         v0.16b, v0.16b, v0.16b, #14
    sub         v6.4h,  v5.4h,  v3.4h
    mov         v0.h[0],  wzr
    mul         v0.8h,  v0.8h,  v5.h[0]         // 0,1,2,3,4,5,6,7 * b
    dup         v1.8h,  v2.h[0]                 // pix
    dup         v2.8h,  v5.h[1]                 // c
    add         v1.8h,  v1.8h,  v0.8h           // pix + x*b
    mov         x3,  #8
1:
    subs        x3,  x3,  #1
    sqshrun     v0.8b,  v1.8h,  #5
    add         v1.8h,  v1.8h,  v2.8h
    st1        {v0.8b}, [x0], x1
    b.ne        1b
    ret
endfunc


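// loadsum4: sum the four left-neighbour bytes of rows \idx .. \idx+3
// (column -1 relative to \x) into \wd, using \t1-\t3 as scratch.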
.macro loadsum4 wd, t1, t2, t3, x, idx
  .if \idx == 0
    ldurb       \wd,  [\x, #(\idx + 0) * FDEC_STRIDE - 1]
  .else
    ldrb        \wd,  [\x, #(\idx + 0) * FDEC_STRIDE - 1]
  .endif
    ldrb        \t1,  [\x, #(\idx + 1) * FDEC_STRIDE - 1]
    ldrb        \t2,  [\x, #(\idx + 2) * FDEC_STRIDE - 1]
    ldrb        \t3,  [\x, #(\idx + 3) * FDEC_STRIDE - 1]
    add         \wd,  \wd,  \t1
    add         \t1,  \t2,  \t3
    add         \wd,  \wd,  \t1
.endm

function predict_8x16c_h_neon, export=1
    sub         x2,  x0,  #1
    add         x3,  x0,  #FDEC_STRIDE - 1
    mov         x7,  #2 * FDEC_STRIDE
    add         x1,  x0,  #FDEC_STRIDE
.rept 4
    ld1r       {v0.8b}, [x2], x7
    ld1r       {v1.8b}, [x3], x7
    ld1r       {v2.8b}, [x2], x7
    ld1r       {v3.8b}, [x3], x7
    st1        {v0.8b}, [x0], x7
    st1        {v1.8b}, [x1], x7
    st1        {v2.8b}, [x0], x7
    st1        {v3.8b}, [x1], x7
.endr
    ret
endfunc

function predict_8x16c_v_neon, export=1
    sub         x1,  x0,  #FDEC_STRIDE
    mov         x2,  #2 * FDEC_STRIDE
    ld1        {v0.8b}, [x1], x2
.rept 8
    st1        {v0.8b}, [x0], x2
    st1        {v0.8b}, [x1], x2
.endr
    ret
endfunc

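// 8x16 chroma (4:2:2) plane prediction: same accumulator scheme as
// predict_8x8c_p, but with b = (17*H + 16) >> 5 and c = (5*V + 32) >> 6 to
// account for the taller block.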
function predict_8x16c_p_neon, export=1
    movrel      x4,  p16weight
    ld1        {v17.8h}, [x4]
    sub         x3,  x0,  #FDEC_STRIDE
    mov         x1,  #FDEC_STRIDE
    add         x2,  x3,  #4
    sub         x3,  x3,  #1

    ld1        {v0.8b}, [x3]
    ld1        {v2.8b}, [x2], x1
    ldcol.8     v1,  x3,  x1
    add         x3,  x3,  x1
    ldcol.8     v3,  x3,  x1
    ext         v4.8b,  v2.8b,  v2.8b,  #3
    ext         v5.8b,  v3.8b,  v3.8b,  #7
    rev32       v0.8b,  v0.8b
    rev64       v1.8b,  v1.8b

    uaddl       v4.8h,  v5.8b,  v4.8b // a * 1/16

    usubl       v2.8h,  v2.8b,  v0.8b
    mul         v2.8h,  v2.8h,  v17.8h
    saddlp      v2.4s,  v2.8h
    addp        v2.4s,  v2.4s,  v2.4s  // H

    usubl       v3.8h,  v3.8b,  v1.8b
    mul         v3.8h,  v3.8h,  v17.8h
    saddlp      v3.4s,  v3.8h
    addp        v3.4s,  v3.4s,  v3.4s
    addp        v3.4s,  v3.4s,  v3.4s  // V

    ext         v17.16b, v17.16b, v17.16b, #14

    shl         v4.4h,  v4.4h,  #4     // a
    shl         v6.2s,  v2.2s,  #4     // 16 * H
    shl         v7.2s,  v3.2s,  #2     // 4 * V
    add         v2.2s,  v2.2s,  v6.2s  // 17 * H
    add         v3.2s,  v3.2s,  v7.2s  // 5 * V
    rshrn       v2.4h,  v2.4s,  #5     // b
    rshrn       v3.4h,  v3.4s,  #6     // c

    mov         v17.h[0],  wzr

    sub         v4.4h,  v4.4h,  v2.4h  // a - b
    shl         v6.4h,  v2.4h,  #1     // 2 * b
    add         v4.4h,  v4.4h,  v3.4h  // a - b + c
    shl         v7.4h,  v3.4h,  #3     // 8 * c
    sub         v4.4h,  v4.4h,  v6.4h  // a - 3b + c
    sub         v4.4h,  v4.4h,  v7.4h  // a - 3b - 7c

    mul         v0.8h,  v17.8h, v2.h[0]         // 0,1,2,3,4,5,6,7 * b
    dup         v1.8h,  v4.h[0]                 // i00
    dup         v2.8h,  v3.h[0]                 // c
    add         v1.8h,  v1.8h,  v0.8h           // pix + {0..7}*b
    mov         x3,  #16
1:
    subs        x3,  x3,  #2
    sqrshrun    v4.8b,  v1.8h,  #5
    add         v1.8h,  v1.8h,  v2.8h
    sqrshrun    v5.8b,  v1.8h,  #5
    st1        {v4.8b}, [x0], x1
    add         v1.8h,  v1.8h,  v2.8h
    st1        {v5.8b}, [x0], x1
    b.ne        1b
    ret
endfunc

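// 8x16 chroma DC: six partial sums (s0/s1 from the top row, s2-s5 from the
// four groups of four left pixels) yield the eight 4x4 DC values broadcast
// below.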
function predict_8x16c_dc_neon, export=1
    mov         x1,  #FDEC_STRIDE
    sub         x10, x0,  #FDEC_STRIDE
    loadsum4    w2, w3, w4, w5, x0, 0
    ld1        {v6.8b}, [x10]
    loadsum4    w6, w7, w8, w9, x0, 4
    uaddlp      v6.4h,  v6.8b
    dup         v22.8h, w2              // s2
    dup         v23.8h, w6              // s3
    loadsum4    w2, w3, w4, w5, x0, 8
    addp        v6.4h,  v6.4h,  v6.4h   // s0, s1
    loadsum4    w6, w7, w8, w9, x0, 12
    dup         v20.8h, v6.h[0]         // s0
    dup         v21.8h, v6.h[1]         // s1
    dup         v24.8h, w2              // s4
    dup         v25.8h, w6              // s5

    ext         v16.16b, v20.16b, v21.16b, #8
    ext         v17.16b, v22.16b, v21.16b, #8
    ext         v1.16b,  v23.16b, v21.16b, #8
    ext         v2.16b,  v24.16b, v21.16b, #8
    ext         v3.16b,  v25.16b, v21.16b, #8

    add         v0.8h,  v16.8h, v17.8h
    add         v1.8h,  v1.8h,  v23.8h
    add         v2.8h,  v2.8h,  v24.8h
    add         v3.8h,  v3.8h,  v25.8h

    rshrn       v0.8b,  v0.8h,  #3
    rshrn       v1.8b,  v1.8h,  #3
    rshrn       v2.8b,  v2.8h,  #3
    rshrn       v3.8b,  v3.8h,  #3

    add         x11, x0,  #4  * FDEC_STRIDE
    add         x12, x0,  #8  * FDEC_STRIDE
    add         x13, x0,  #12 * FDEC_STRIDE
.rept 4
    st1        {v0.8b}, [x0],  x1
    st1        {v1.8b}, [x11], x1
    st1        {v2.8b}, [x12], x1
    st1        {v3.8b}, [x13], x1
.endr
    ret
endfunc

function predict_8x16c_dc_left_neon, export=1
    mov         x1,  #FDEC_STRIDE
    ldurb       w2,  [x0, # 0 * FDEC_STRIDE - 1]
    ldrb        w3,  [x0, # 1 * FDEC_STRIDE - 1]
    ldrb        w4,  [x0, # 2 * FDEC_STRIDE - 1]
    ldrb        w5,  [x0, # 3 * FDEC_STRIDE - 1]
    add         w2,  w2,  w3

    ldrb        w6,  [x0, # 4 * FDEC_STRIDE - 1]
    add         w4,  w4,  w5
    ldrb        w7,  [x0, # 5 * FDEC_STRIDE - 1]
    add         w2,  w2,  w4
    ldrb        w8,  [x0, # 6 * FDEC_STRIDE - 1]
    ldrb        w9,  [x0, # 7 * FDEC_STRIDE - 1]
    dup         v0.8h,  w2
    add         w6,  w6,  w7
    rshrn       v0.8b,  v0.8h,  #2
    add         w8,  w8,  w9

    ldrb        w10, [x0, # 8 * FDEC_STRIDE - 1]
    ldrb        w11, [x0, # 9 * FDEC_STRIDE - 1]
    add         w6,  w6,  w8
    ldrb        w12, [x0, #10 * FDEC_STRIDE - 1]
    ldrb        w13, [x0, #11 * FDEC_STRIDE - 1]
    dup         v1.8h,  w6
    add         w10,  w10,  w11
    rshrn       v1.8b,  v1.8h,  #2
    add         w12,  w12,  w13

    ldrb        w2,  [x0, #12 * FDEC_STRIDE - 1]
    ldrb        w3,  [x0, #13 * FDEC_STRIDE - 1]
    add         w10,  w10,  w12
    ldrb        w4,  [x0, #14 * FDEC_STRIDE - 1]
    ldrb        w5,  [x0, #15 * FDEC_STRIDE - 1]
    dup         v2.8h,  w10
    add         w2,  w2,  w3
    rshrn       v2.8b,  v2.8h,  #2
    add         w4,  w4,  w5
    st1        {v0.8b}, [x0], x1
    st1        {v0.8b}, [x0], x1
    add         w2,  w2,  w4
    st1        {v0.8b}, [x0], x1
    dup         v3.8h,  w2
    st1        {v0.8b}, [x0], x1
    rshrn       v3.8b,  v3.8h,  #2

.irp  idx, 1, 2, 3
.rept 4
    st1        {v\idx\().8b}, [x0], x1
.endr
.endr
    ret
endfunc

function predict_8x16c_dc_top_neon, export=1
    sub         x2,  x0,  #FDEC_STRIDE
    mov         x1,  #FDEC_STRIDE
    ld1        {v0.8b}, [x2]
    uaddlp      v0.4h,  v0.8b
    addp        v0.4h,  v0.4h,  v0.4h
    rshrn       v4.8b,  v0.8h,  #2
    dup         v0.8b,  v4.b[0]
    dup         v1.8b,  v4.b[1]
    ext         v0.8b,  v0.8b,  v1.8b,  #4
.rept 16
    st1        {v0.8b}, [x0], x1
.endr
    ret
endfunc


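// 16x16 luma DC predictors: sum the available neighbours, round and
// broadcast, then share the 16-row store loop at pred16x16_dc_end.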
function predict_16x16_dc_top_neon, export=1
    sub         x2,  x0,  #FDEC_STRIDE
    mov         x1,  #FDEC_STRIDE
    ld1        {v0.16b}, [x2]
    uaddlv      h0,     v0.16b
    rshrn       v0.8b,  v0.8h,  #4
    dup         v0.16b, v0.b[0]
    b           pred16x16_dc_end
endfunc

function predict_16x16_dc_left_neon, export=1
    sub         x2,  x0,  #1
    mov         x1,  #FDEC_STRIDE
    ldcol.16    v0,  x2,  x1
    uaddlv      h0,     v0.16b
    rshrn       v0.8b,  v0.8h,  #4
    dup         v0.16b, v0.b[0]
    b           pred16x16_dc_end
endfunc

function predict_16x16_dc_neon, export=1
    sub         x3,  x0,  #FDEC_STRIDE
    sub         x2,  x0,  #1
    mov         x1,  #FDEC_STRIDE
    ld1        {v0.16b}, [x3]
    ldcol.16    v1,  x2,  x1
    uaddlv      h0,     v0.16b
    uaddlv      h1,     v1.16b
    add         v0.4h,  v0.4h,  v1.4h
    rshrn       v0.8b,  v0.8h,  #5
    dup         v0.16b, v0.b[0]
pred16x16_dc_end:
.rept 16
    st1        {v0.16b}, [x0], x1
.endr
    ret
endfunc

function predict_16x16_h_neon, export=1
    sub         x1,  x0,  #1
    mov         x7, #FDEC_STRIDE
.rept 8
    ld1r       {v0.16b}, [x1], x7
    ld1r       {v1.16b}, [x1], x7
    st1        {v0.16b}, [x0], x7
    st1        {v1.16b}, [x0], x7
.endr
    ret
endfunc

function predict_16x16_v_neon, export=1
    sub         x0,  x0,  #FDEC_STRIDE
    mov         x7,  #FDEC_STRIDE
    ld1        {v0.16b}, [x0], x7
.rept 16
    st1        {v0.16b}, [x0], x7
.endr
    ret
endfunc

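// 16x16 plane prediction: b = (5*H + 32) >> 6 and c = (5*V + 32) >> 6; two
// 8-wide accumulators (low and high halves of each row) are advanced by c
// per row.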
function predict_16x16_p_neon, export=1
    sub         x3,  x0,  #FDEC_STRIDE
    mov         x1,  #FDEC_STRIDE
    add         x2,  x3,  #8
    sub         x3,  x3,  #1
    ld1        {v0.8b}, [x3]
    ld1        {v2.8b}, [x2], x1
    ldcol.8     v1,  x3,  x1
    add         x3,  x3,  x1
    ldcol.8     v3,  x3,  x1
    rev64       v0.8b,  v0.8b
    rev64       v1.8b,  v1.8b
    movrel      x4,  p16weight
    uaddl       v4.8h,  v2.8b,  v3.8b
    ld1        {v7.8h}, [x4]
    usubl       v2.8h,  v2.8b,  v0.8b
    usubl       v3.8h,  v3.8b,  v1.8b
    mul         v2.8h,  v2.8h,  v7.8h
    mul         v3.8h,  v3.8h,  v7.8h
    saddlp      v2.4s,  v2.8h
    saddlp      v3.4s,  v3.8h
    addp        v2.4s,  v2.4s,  v3.4s
    addp        v2.4s,  v2.4s,  v2.4s
    shl         v3.2s,  v2.2s,  #2
    add         v2.2s,  v2.2s,  v3.2s
    rshrn       v5.4h,  v2.4s,  #6    // b, c, x, x
    addp        v2.4h,  v5.4h,  v5.4h
    shl         v3.4h,  v2.4h,  #3
    sub         v3.4h,  v3.4h,  v2.4h // 7 * (b + c)
    ext         v4.16b, v4.16b, v4.16b, #14
    add         v4.4h,  v4.4h,  v7.4h
    shl         v2.4h,  v4.4h,  #4              // a
    sub         v2.4h,  v2.4h,  v3.4h           // a - 7 * (b + c) + 16
    ext         v7.16b, v7.16b, v7.16b, #14
    mov         v7.h[0],  wzr
    dup         v3.8h,  v5.h[0]
    mul         v0.8h,  v7.8h,  v5.h[0]         // 0,1,2,3,4,5,6,7 * b
    dup         v1.8h,  v2.h[0]                 // pix
    dup         v2.8h,  v5.h[1]                 // c
    shl         v3.8h,  v3.8h,  #3
    add         v1.8h,  v1.8h,  v0.8h           // pix + x*b
    add         v3.8h,  v3.8h,  v1.8h           // pix + x{8-15}*b
    mov         x3,  #16
1:
    subs        x3,  x3,  #1
    sqshrun     v0.8b,  v1.8h,  #5
    add         v1.8h,  v1.8h,  v2.8h
    sqshrun2    v0.16b, v3.8h,  #5
    add         v3.8h,  v3.8h,  v2.8h
    st1        {v0.16b}, [x0], x1
    b.ne        1b
    ret
endfunc