/*****************************************************************************
 * predict.S: arm intra prediction
 *****************************************************************************
 * Copyright (C) 2009-2014 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *          Mans Rullgard <mans@mansr.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "asm.S"

.section .rodata
.align 4

p16weight: .short 1,2,3,4,5,6,7,8

.text

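// load a column of \n pixels (stride \rt) from [\rs] into the lanes of \rd;
// for n=4, hi selects whether the low or the high four lanes are filled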
.macro ldcol.8  rd,  rs,  rt,  n=8,  hi=0
.if \n == 8 || \hi == 0
    vld1.8          {\rd[0]}, [\rs], \rt
    vld1.8          {\rd[1]}, [\rs], \rt
    vld1.8          {\rd[2]}, [\rs], \rt
    vld1.8          {\rd[3]}, [\rs], \rt
.endif
.if \n == 8 || \hi == 1
    vld1.8          {\rd[4]}, [\rs], \rt
    vld1.8          {\rd[5]}, [\rs], \rt
    vld1.8          {\rd[6]}, [\rs], \rt
    vld1.8          {\rd[7]}, [\rs], \rt
.endif
.endm

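// sum all 16 bytes of \rl/\rh: widen to u16, then reduce so that every
// lane of \dl holds the total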
.macro add16x8  dq,  dl,  dh,  rl,  rh
    vaddl.u8        \dq, \rl, \rh
    vadd.u16        \dl, \dl, \dh
    vpadd.u16       \dl, \dl, \dl
    vpadd.u16       \dl, \dl, \dl
.endm


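// 4x4 horizontal: replicate each left-neighbour pixel across its row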
// because gcc doesn't believe in using the free shift in add
function x264_predict_4x4_h_armv6
    ldrb    r1, [r0, #0*FDEC_STRIDE-1]
    ldrb    r2, [r0, #1*FDEC_STRIDE-1]
    ldrb    r3, [r0, #2*FDEC_STRIDE-1]
    ldrb    ip, [r0, #3*FDEC_STRIDE-1]
    add     r1, r1, r1, lsl #8
    add     r2, r2, r2, lsl #8
    add     r3, r3, r3, lsl #8
    add     ip, ip, ip, lsl #8
    add     r1, r1, r1, lsl #16
    str     r1, [r0, #0*FDEC_STRIDE]
    add     r2, r2, r2, lsl #16
    str     r2, [r0, #1*FDEC_STRIDE]
    add     r3, r3, r3, lsl #16
    str     r3, [r0, #2*FDEC_STRIDE]
    add     ip, ip, ip, lsl #16
    str     ip, [r0, #3*FDEC_STRIDE]
    bx      lr
endfunc

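// 4x4 vertical: copy the row above into all four rows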
function x264_predict_4x4_v_armv6
    ldr     r1,  [r0, #0 - 1 * FDEC_STRIDE]
    str     r1,  [r0, #0 + 0 * FDEC_STRIDE]
    str     r1,  [r0, #0 + 1 * FDEC_STRIDE]
    str     r1,  [r0, #0 + 2 * FDEC_STRIDE]
    str     r1,  [r0, #0 + 3 * FDEC_STRIDE]
    bx      lr
endfunc

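// 4x4 dc: (4 top + 4 left neighbours + 4) >> 3, splatted to all 16 pixels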
function x264_predict_4x4_dc_armv6
    mov     ip, #0
    ldr     r1, [r0, #-FDEC_STRIDE]
    ldrb    r2, [r0, #0*FDEC_STRIDE-1]
    ldrb    r3, [r0, #1*FDEC_STRIDE-1]
    usad8   r1, r1, ip
    add     r2, r2, #4
    ldrb    ip, [r0, #2*FDEC_STRIDE-1]
    add     r2, r2, r3
    ldrb    r3, [r0, #3*FDEC_STRIDE-1]
    add     r2, r2, ip
    add     r2, r2, r3
    add     r1, r1, r2
    lsr     r1, r1, #3
    add     r1, r1, r1, lsl #8
    add     r1, r1, r1, lsl #16
    str     r1, [r0, #0*FDEC_STRIDE]
    str     r1, [r0, #1*FDEC_STRIDE]
    str     r1, [r0, #2*FDEC_STRIDE]
    str     r1, [r0, #3*FDEC_STRIDE]
    bx      lr
endfunc

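// 4x4 dc, top neighbours only: (top[0..3] + 2) >> 2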
function x264_predict_4x4_dc_top_neon
    mov         r12, #FDEC_STRIDE
    sub         r1, r0, #FDEC_STRIDE
    vld1.32     d1[], [r1,:32]
    vpaddl.u8   d1, d1
    vpadd.u16   d1, d1, d1
    vrshr.u16   d1, d1, #2
    vdup.8      d1, d1[0]
    vst1.32     d1[0], [r0,:32], r12
    vst1.32     d1[0], [r0,:32], r12
    vst1.32     d1[0], [r0,:32], r12
    vst1.32     d1[0], [r0,:32], r12
    bx          lr
endfunc

// return a1 = (a1+2*b1+c1+2)>>2  a2 = (a2+2*b2+c2+2)>>2
.macro PRED4x4_LOWPASS a1 b1 c1 a2 b2 c2 pb_1
    uhadd8  \a1, \a1, \c1
    uhadd8  \a2, \a2, \c2
    uhadd8  \c1, \a1, \b1
    uhadd8  \c2, \a2, \b2
    eor     \a1, \a1, \b1
    eor     \a2, \a2, \b2
    and     \a1, \a1, \pb_1
    and     \a2, \a2, \pb_1
    uadd8   \a1, \a1, \c1
    uadd8   \a2, \a2, \c2
.endm

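// 4x4 diagonal down-right: pack the left/top-left/top border into
// overlapping words, run the lowpass above over them, then store the
// result shifted by one border pixel per row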
function x264_predict_4x4_ddr_armv6
    ldr     r1, [r0, # -FDEC_STRIDE]
    ldrb    r2, [r0, # -FDEC_STRIDE-1]
    ldrb    r3, [r0, #0*FDEC_STRIDE-1]
    push    {r4-r6,lr}
    add     r2, r2, r1, lsl #8
    ldrb    r4, [r0, #1*FDEC_STRIDE-1]
    add     r3, r3, r2, lsl #8
    ldrb    r5, [r0, #2*FDEC_STRIDE-1]
    ldrb    r6, [r0, #3*FDEC_STRIDE-1]
    add     r4, r4, r3, lsl #8
    add     r5, r5, r4, lsl #8
    add     r6, r6, r5, lsl #8
    ldr     ip, =0x01010101
    PRED4x4_LOWPASS r1, r2, r3, r4, r5, r6, ip
    str     r1, [r0, #0*FDEC_STRIDE]
    lsl     r2, r1, #8
    lsl     r3, r1, #16
    lsl     r4, r4, #8
    lsl     r5, r1, #24
    add     r2, r2, r4, lsr #24
    str     r2, [r0, #1*FDEC_STRIDE]
    add     r3, r3, r4, lsr #16
    str     r3, [r0, #2*FDEC_STRIDE]
    add     r5, r5, r4, lsr #8
    str     r5, [r0, #3*FDEC_STRIDE]
    pop     {r4-r6,pc}
endfunc

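// 4x4 diagonal down-left: lowpass ((a+2b+c+2)>>2) the 8 top/top-right
// pixels, repeating the last one; each row shifts one byte further in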
function x264_predict_4x4_ddl_neon
    sub         r0, #FDEC_STRIDE
    mov         ip, #FDEC_STRIDE
    vld1.64     {d0}, [r0], ip
    vdup.8      d3, d0[7]
    vext.8      d1, d0, d0, #1
    vext.8      d2, d0, d3, #2
    vhadd.u8    d0, d0, d2
    vrhadd.u8   d0, d0, d1
    vst1.32     {d0[0]}, [r0,:32], ip
    vext.8      d1, d0, d0, #1
    vext.8      d2, d0, d0, #2
    vst1.32     {d1[0]}, [r0,:32], ip
    vext.8      d3, d0, d0, #3
    vst1.32     {d2[0]}, [r0,:32], ip
    vst1.32     {d3[0]}, [r0,:32], ip
    bx          lr
endfunc

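// the 8x8 luma predictors take r1 = the edge array built by
// predict_8x8_filter: edge[7..14] = left column (bottom to top),
// edge[15] = top-left corner, edge[16..31] = top row and top-right.
// 8x8 dc: (8 left + 8 top neighbours + 8) >> 4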
function x264_predict_8x8_dc_neon
    mov     ip, #0
    ldrd    r2, r3, [r1, #8]
    push    {r4-r5,lr}
    ldrd    r4, r5, [r1, #16]
    lsl     r3, r3, #8
    ldrb    lr, [r1, #7]
    usad8   r2, r2, ip
    usad8   r3, r3, ip
    usada8  r2, r4, ip, r2
    add     lr, lr, #8
    usada8  r3, r5, ip, r3
    add     r2, r2, lr
    mov     ip, #FDEC_STRIDE
    add     r2, r2, r3
    lsr     r2, r2, #4

    vdup.8   d0, r2
.rept 8
    vst1.64 {d0}, [r0,:64], ip
.endr
    pop    {r4-r5,pc}
endfunc

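// 8x8 horizontal: broadcast each left-column pixel to its row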
function x264_predict_8x8_h_neon
    add         r1, r1, #7
    mov         ip, #FDEC_STRIDE
    vld1.64     {d16}, [r1]
    vdup.8      d0, d16[7]
    vdup.8      d1, d16[6]
    vst1.64     {d0}, [r0,:64], ip
    vdup.8      d2, d16[5]
    vst1.64     {d1}, [r0,:64], ip
    vdup.8      d3, d16[4]
    vst1.64     {d2}, [r0,:64], ip
    vdup.8      d4, d16[3]
    vst1.64     {d3}, [r0,:64], ip
    vdup.8      d5, d16[2]
    vst1.64     {d4}, [r0,:64], ip
    vdup.8      d6, d16[1]
    vst1.64     {d5}, [r0,:64], ip
    vdup.8      d7, d16[0]
    vst1.64     {d6}, [r0,:64], ip
    vst1.64     {d7}, [r0,:64], ip
    bx          lr
endfunc

function x264_predict_8x8_v_neon
    add         r1, r1, #16
    mov         r12, #FDEC_STRIDE
    vld1.8      {d0}, [r1,:64]
.rept 8
    vst1.8      {d0}, [r0,:64], r12
.endr
    bx          lr
endfunc

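// 8x8 diagonal down-left: lowpass the 16 top/top-right pixels; row n
// is the filtered vector shifted left by n+1 bytes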
function x264_predict_8x8_ddl_neon
    add         r1, #16
    vld1.8      {d0, d1}, [r1,:128]
    vmov.i8     q3, #0
    vrev64.8    d2, d1
    vext.8      q8, q3, q0, #15
    vext.8      q2, q0, q1, #1
    vhadd.u8    q8, q2
    mov         r12, #FDEC_STRIDE
    vrhadd.u8   q0, q8
    vext.8      d2, d0, d1, #1
    vext.8      d3, d0, d1, #2
    vst1.8      d2, [r0,:64], r12
    vext.8      d2, d0, d1, #3
    vst1.8      d3, [r0,:64], r12
    vext.8      d3, d0, d1, #4
    vst1.8      d2, [r0,:64], r12
    vext.8      d2, d0, d1, #5
    vst1.8      d3, [r0,:64], r12
    vext.8      d3, d0, d1, #6
    vst1.8      d2, [r0,:64], r12
    vext.8      d2, d0, d1, #7
    vst1.8      d3, [r0,:64], r12
    vst1.8      d2, [r0,:64], r12
    vst1.8      d1, [r0,:64], r12
    bx          lr
endfunc

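// 8x8 diagonal down-right: lowpass the whole 32-byte edge, then store
// 8-pixel windows sliding one byte per row, bottom row first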
function x264_predict_8x8_ddr_neon
    vld1.8      {d0-d3}, [r1,:128]
    vext.8      q2, q0, q1, #7
    vext.8      q3, q0, q1, #9

    vhadd.u8    q2, q2, q3
    vrhadd.u8   d0, d1, d4
    vrhadd.u8   d1, d2, d5

    add         r0, #7*FDEC_STRIDE
    mov         r12, #-1*FDEC_STRIDE

    vext.8      d2, d0, d1, #1
    vst1.8      {d0}, [r0,:64], r12
    vext.8      d4, d0, d1, #2
    vst1.8      {d2}, [r0,:64], r12
    vext.8      d5, d0, d1, #3
    vst1.8      {d4}, [r0,:64], r12
    vext.8      d4, d0, d1, #4
    vst1.8      {d5}, [r0,:64], r12
    vext.8      d5, d0, d1, #5
    vst1.8      {d4}, [r0,:64], r12
    vext.8      d4, d0, d1, #6
    vst1.8      {d5}, [r0,:64], r12
    vext.8      d5, d0, d1, #7
    vst1.8      {d4}, [r0,:64], r12
    vst1.8      {d5}, [r0,:64], r12
    bx          lr
endfunc

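// 8x8 vertical-left: even rows take the 2-tap rounded average of
// adjacent top pixels, odd rows the 3-tap lowpass; each row pair
// advances the window by one pixel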
function x264_predict_8x8_vl_neon
    add         r1, #16
    mov         r12, #FDEC_STRIDE

    vld1.8      {d0, d1}, [r1,:128]
    vext.8      q1, q1, q0, #15
    vext.8      q2, q0, q2, #1

    vrhadd.u8   q3, q0, q2

    vhadd.u8    q1, q1, q2
    vrhadd.u8   q0, q0, q1

    vext.8      d2, d0, d1, #1
    vst1.8      {d6}, [r0,:64], r12
    vext.8      d3, d6, d7, #1
    vst1.8      {d2}, [r0,:64], r12
    vext.8      d2, d0, d1, #2
    vst1.8      {d3}, [r0,:64], r12
    vext.8      d3, d6, d7, #2
    vst1.8      {d2}, [r0,:64], r12
    vext.8      d2, d0, d1, #3
    vst1.8      {d3}, [r0,:64], r12
    vext.8      d3, d6, d7, #3
    vst1.8      {d2}, [r0,:64], r12
    vext.8      d2, d0, d1, #4
    vst1.8      {d3}, [r0,:64], r12
    vst1.8      {d2}, [r0,:64], r12
    bx          lr
endfunc

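// 8x8 vertical-right: rows 0/1 hold the 2-tap and 3-tap filtered top
// row; each later pair shifts right by one pixel, pulling filtered
// left-column pixels in from the left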
function x264_predict_8x8_vr_neon
    add         r1, #8
    mov         r12, #FDEC_STRIDE
    vld1.8      {d4,d5}, [r1,:64]

    vext.8      q1, q2, q2, #14
    vext.8      q0, q2, q2, #15

    vhadd.u8    q3, q2, q1
    vrhadd.u8   q2, q2, q0
    vrhadd.u8   q0, q0, q3

    vmov        d2, d0

    vst1.8      {d5}, [r0,:64], r12
    vuzp.8      d2, d0
    vst1.8      {d1}, [r0,:64], r12
    vext.8      d6, d0, d5, #7
    vext.8      d3, d2, d1, #7
    vst1.8      {d6}, [r0,:64], r12
    vst1.8      {d3}, [r0,:64], r12
    vext.8      d6, d0, d5, #6
    vext.8      d3, d2, d1, #6
    vst1.8      {d6}, [r0,:64], r12
    vst1.8      {d3}, [r0,:64], r12
    vext.8      d6, d0, d5, #5
    vext.8      d3, d2, d1, #5
    vst1.8      {d6}, [r0,:64], r12
    vst1.8      {d3}, [r0,:64], r12
    bx          lr
endfunc

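// 8x8 horizontal-down: zip the 2-tap and 3-tap filtered left/top-left
// border, then slide the 8-pixel window two bytes per row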
function x264_predict_8x8_hd_neon
    mov         r12, #FDEC_STRIDE
    add         r1, #7

    vld1.8      {d2,d3}, [r1]
    vext.8      q3, q1, q1, #1
    vext.8      q2, q1, q1, #2

    vrhadd.u8   q8, q1, q3

    vhadd.u8    q1, q2
    vrhadd.u8   q0, q1, q3

    vzip.8      d16, d0

    vext.8      d2, d0, d1, #6
    vext.8      d3, d0, d1, #4
    vst1.8      {d2}, [r0,:64], r12
    vext.8      d2, d0, d1, #2
    vst1.8      {d3}, [r0,:64], r12
    vst1.8      {d2}, [r0,:64], r12
    vext.8      d2, d16, d0, #6
    vst1.8      {d0}, [r0,:64], r12
    vext.8      d3, d16, d0, #4
    vst1.8      {d2}, [r0,:64], r12
    vext.8      d2, d16, d0, #2
    vst1.8      {d3}, [r0,:64], r12
    vst1.8      {d2}, [r0,:64], r12
    vst1.8      {d16}, [r0,:64], r12

    bx          lr
endfunc

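// 8x8 horizontal-up: filter the reversed left column (bottom pixel
// repeated), zip the 2-tap/3-tap results, then step two bytes per row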
function x264_predict_8x8_hu_neon
    mov         r12, #FDEC_STRIDE
    add         r1, #7
    vld1.8      {d7}, [r1]
    vdup.8      d6, d7[0]
    vrev64.8    d7, d7

    vext.8      d4, d7, d6, #2
    vext.8      d2, d7, d6, #1

    vhadd.u8    d16, d7, d4
    vrhadd.u8   d0, d2, d7
    vrhadd.u8   d1, d16, d2

    vzip.8      d0, d1

    vdup.16     q1, d1[3]

    vext.8      q2, q0, q1, #2
    vext.8      q3, q0, q1, #4
    vext.8      q8, q0, q1, #6
    vst1.8      {d0}, [r0,:64], r12
    vst1.8      {d4}, [r0,:64], r12
    vst1.8      {d6}, [r0,:64], r12
    vst1.8      {d16}, [r0,:64], r12

    vst1.8      {d1}, [r0,:64], r12
    vst1.8      {d5}, [r0,:64], r12
    vst1.8      {d7}, [r0,:64], r12
    vst1.8      {d17}, [r0,:64]
    bx          lr
endfunc

function x264_predict_8x8c_dc_top_neon
    sub         r2,  r0,  #FDEC_STRIDE
    mov         r1,  #FDEC_STRIDE
    vld1.8      {d0}, [r2,:64]
    vpaddl.u8   d0,  d0
    vpadd.u16   d0,  d0,  d0
    vrshrn.u16  d0,  q0,  #2
    vdup.8      d1,  d0[1]
    vdup.8      d0,  d0[0]
    vtrn.32     d0,  d1
    b           pred8x8_dc_end
endfunc

function x264_predict_8x8c_dc_left_neon
    mov         r1,  #FDEC_STRIDE
    sub         r2,  r0,  #1
    ldcol.8     d0,  r2,  r1
    vpaddl.u8   d0,  d0
    vpadd.u16   d0,  d0,  d0
    vrshrn.u16  d0,  q0,  #2
    vdup.8      d1,  d0[1]
    vdup.8      d0,  d0[0]
    b           pred8x8_dc_end
endfunc

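// 8x8 chroma dc: one dc per 4x4 quadrant; the top-right quadrant uses
// only its top neighbours, the bottom-left only its left neighbours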
function x264_predict_8x8c_dc_neon
    sub         r2,  r0,  #FDEC_STRIDE
    mov         r1,  #FDEC_STRIDE
    vld1.8      {d0}, [r2,:64]
    sub         r2,  r0,  #1
    ldcol.8     d1,  r2,  r1
    vtrn.32     d0,  d1
    vpaddl.u8   q0,  q0
    vpadd.u16   d0,  d0,  d1
    vpadd.u16   d1,  d0,  d0
    vrshrn.u16  d2,  q0,  #3
    vrshrn.u16  d3,  q0,  #2
    vdup.8      d0,  d2[4]
    vdup.8      d1,  d3[3]
    vdup.8      d4,  d3[2]
    vdup.8      d5,  d2[5]
    vtrn.32     q0,  q2
pred8x8_dc_end:
    add         r2,  r0,  r1,  lsl #2
.rept 4
    vst1.8      {d0}, [r0,:64], r1
    vst1.8      {d1}, [r2,:64], r1
.endr
    bx          lr
endfunc

function x264_predict_8x8c_h_neon
    sub         r1, r0, #1
    mov         ip, #FDEC_STRIDE
.rept 4
    vld1.8      {d0[]}, [r1], ip
    vld1.8      {d2[]}, [r1], ip
    vst1.64     {d0}, [r0,:64], ip
    vst1.64     {d2}, [r0,:64], ip
.endr
    bx          lr
endfunc

function x264_predict_8x8c_v_neon
    sub         r0, r0, #FDEC_STRIDE
    mov         ip, #FDEC_STRIDE
    vld1.64     {d0}, [r0,:64], ip
.rept 8
    vst1.64     {d0}, [r0,:64], ip
.endr
    bx          lr
endfunc

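// 8x8 chroma plane mode: H/V are the p16weight-weighted sums of border
// differences, b = (17*H+16)>>5, c = (17*V+16)>>5,
// a = 16*(left[7]+top[7]), and each output row is
// pred[y][x] = clip((a + b*(x-3) + c*(y-3) + 16) >> 5)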
function x264_predict_8x8c_p_neon
    sub         r3,  r0,  #FDEC_STRIDE
    mov         r1,  #FDEC_STRIDE
    add         r2,  r3,  #4
    sub         r3,  r3,  #1
    vld1.32     {d0[0]}, [r3]
    vld1.32     {d2[0]}, [r2,:32], r1
    ldcol.8     d0,  r3,  r1,  4,  hi=1
    add         r3,  r3,  r1
    ldcol.8     d3,  r3,  r1,  4
    vaddl.u8    q8,  d2,  d3
    vrev32.8    d0,  d0
    vtrn.32     d2,  d3
    vsubl.u8    q2,  d2,  d0
    movrel      r3,  p16weight
    vld1.16     {q0}, [r3,:128]
    vmul.s16    d4,  d4,  d0
    vmul.s16    d5,  d5,  d0
    vpadd.i16   d4,  d4,  d5
    vpaddl.s16  d4,  d4
    vshl.i32    d5,  d4,  #4
    vadd.s32    d4,  d4,  d5
    vrshrn.s32  d4,  q2,  #5
    mov         r3,  #0
    vtrn.16     d4,  d5
    vadd.i16    d2,  d4,  d5
    vshl.i16    d3,  d2,  #2
    vrev64.16   d16, d16
    vsub.i16    d3,  d3,  d2
    vadd.i16    d16, d16, d0
    vshl.i16    d2,  d16, #4
    vsub.i16    d2,  d2,  d3
    vshl.i16    d3,  d4,  #3
    vext.16     q0,  q0,  q0,  #7
    vsub.i16    d6,  d5,  d3
    vmov.16     d0[0], r3
    vmul.i16    q0,  q0,  d4[0]
    vdup.16     q1,  d2[0]
    vdup.16     q2,  d4[0]
    vdup.16     q3,  d6[0]
    vshl.i16    q2,  q2,  #3
    vadd.i16    q1,  q1,  q0
    vadd.i16    q3,  q3,  q2
    mov         r3,  #8
1:
    vqshrun.s16 d0,  q1,  #5
    vadd.i16    q1,  q1,  q3
    vst1.8      {d0}, [r0,:64], r1
    subs        r3,  r3,  #1
    bne         1b
    bx          lr
endfunc


function x264_predict_16x16_dc_top_neon
    sub         r2,  r0,  #FDEC_STRIDE
    mov         r1,  #FDEC_STRIDE
    vld1.8      {q0}, [r2,:128]
    add16x8     q0,  d0,  d1,  d0,  d1
    vrshrn.u16  d0,  q0,  #4
    vdup.8      q0,  d0[0]
    b           pred16x16_dc_end
endfunc

function x264_predict_16x16_dc_left_neon
    mov         r1,  #FDEC_STRIDE
    sub         r2,  r0,  #1
    ldcol.8     d0,  r2,  r1
    ldcol.8     d1,  r2,  r1
    add16x8     q0,  d0,  d1,  d0,  d1
    vrshrn.u16  d0,  q0,  #4
    vdup.8      q0,  d0[0]
    b           pred16x16_dc_end
endfunc

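// 16x16 dc: (16 top + 16 left neighbours + 16) >> 5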
function x264_predict_16x16_dc_neon
    sub         r3, r0, #FDEC_STRIDE
    sub         r0, r0, #1
    vld1.64     {d0-d1}, [r3,:128]
    ldrb        ip, [r0], #FDEC_STRIDE
    vaddl.u8    q0, d0, d1
    ldrb        r1, [r0], #FDEC_STRIDE
    vadd.u16    d0, d0, d1
    vpadd.u16   d0, d0, d0
    vpadd.u16   d0, d0, d0
.rept 4
    ldrb        r2, [r0], #FDEC_STRIDE
    add         ip, ip, r1
    ldrb        r3, [r0], #FDEC_STRIDE
    add         ip, ip, r2
    ldrb        r1, [r0], #FDEC_STRIDE
    add         ip, ip, r3
.endr
    ldrb        r2, [r0], #FDEC_STRIDE
    add         ip, ip, r1
    ldrb        r3, [r0], #FDEC_STRIDE
    add         ip, ip, r2

    sub         r0, r0, #FDEC_STRIDE*16
    add         ip, ip, r3
    vdup.16     d1, ip
    vadd.u16    d0, d0, d1
    mov         r1, #FDEC_STRIDE
    add         r0, r0, #1
    vrshr.u16   d0, d0, #5
    vdup.8      q0, d0[0]
pred16x16_dc_end:
.rept 16
    vst1.64     {d0-d1}, [r0,:128], r1
.endr
    bx          lr
endfunc

function x264_predict_16x16_h_neon
    sub         r1, r0, #1
    mov         ip, #FDEC_STRIDE
.rept 8
    vld1.8      {d0[]}, [r1], ip
    vmov        d1, d0
    vld1.8      {d2[]}, [r1], ip
    vmov        d3, d2
    vst1.64     {d0-d1}, [r0,:128], ip
    vst1.64     {d2-d3}, [r0,:128], ip
.endr
    bx          lr
endfunc

function x264_predict_16x16_v_neon
    sub         r0, r0, #FDEC_STRIDE
    mov         ip, #FDEC_STRIDE
    vld1.64     {d0-d1}, [r0,:128], ip
.rept 16
    vst1.64     {d0-d1}, [r0,:128], ip
.endr
    bx          lr
endfunc

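// 16x16 plane mode: b = (5*H+32)>>6, c = (5*V+32)>>6,
// a = 16*(left[15]+top[15]), and each output row is
// pred[y][x] = clip((a + b*(x-7) + c*(y-7) + 16) >> 5)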
function x264_predict_16x16_p_neon
    sub         r3,  r0,  #FDEC_STRIDE
    mov         r1,  #FDEC_STRIDE
    add         r2,  r3,  #8
    sub         r3,  r3,  #1
    vld1.8      {d0}, [r3]
    vld1.8      {d2}, [r2,:64], r1
    ldcol.8     d1,  r3,  r1
    add         r3,  r3,  r1
    ldcol.8     d3,  r3,  r1
    vrev64.8    q0,  q0
    vaddl.u8    q8,  d2,  d3
    vsubl.u8    q2,  d2,  d0
    vsubl.u8    q3,  d3,  d1
    movrel      r3,  p16weight
    vld1.8      {q0}, [r3,:128]
    vmul.s16    q2,  q2,  q0
    vmul.s16    q3,  q3,  q0
    vadd.i16    d4,  d4,  d5
    vadd.i16    d5,  d6,  d7
    vpadd.i16   d4,  d4,  d5
    vpadd.i16   d4,  d4,  d4
    vshll.s16   q3,  d4,  #2
    vaddw.s16   q2,  q3,  d4
    vrshrn.s32  d4,  q2,  #6
    mov         r3,  #0
    vtrn.16     d4,  d5
    vadd.i16    d2,  d4,  d5
    vshl.i16    d3,  d2,  #3
    vrev64.16   d16, d17
    vsub.i16    d3,  d3,  d2
    vadd.i16    d16, d16, d0
    vshl.i16    d2,  d16, #4
    vsub.i16    d2,  d2,  d3
    vshl.i16    d3,  d4,  #4
    vext.16     q0,  q0,  q0,  #7
    vsub.i16    d6,  d5,  d3
    vmov.16     d0[0], r3
    vmul.i16    q0,  q0,  d4[0]
    vdup.16     q1,  d2[0]
    vdup.16     q2,  d4[0]
    vdup.16     q3,  d6[0]
    vshl.i16    q2,  q2,  #3
    vadd.i16    q1,  q1,  q0
    vadd.i16    q3,  q3,  q2
    mov         r3,  #16
1:
    vqshrun.s16 d0,  q1,  #5
    vadd.i16    q1,  q1,  q2
    vqshrun.s16 d1,  q1,  #5
    vadd.i16    q1,  q1,  q3
    vst1.8      {q0}, [r0,:128], r1
    subs        r3,  r3,  #1
    bne         1b
    bx          lr
endfunc