/*
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
20
21#include "libavutil/arm/asm.S"
22
/*
 * ldcol.8 rd, rs, rt, n=8, hi=0
 *
 * Gather single bytes from memory into consecutive byte lanes of one
 * D register, stepping the address after every byte.
 *
 *   rd  destination D register
 *   rs  address register; post-incremented by rt after each byte loaded
 *   rt  register holding the address step
 *   n   8 (default): lanes 0-7 are loaded.  Any other value: only one
 *       half is loaded, chosen by hi.
 *   hi  0 (default): lanes 0-3 are loaded; 1: lanes 4-7 are loaded.
 *       Only relevant when n is not 8.
 *
 * Lanes that are not loaded keep whatever they held before.
 */
        .macro ldcol.8  rd,  rs,  rt,  n=8,  hi=0
.if \n == 8 || \hi == 0
        vld1.8          {\rd[0]}, [\rs], \rt
        vld1.8          {\rd[1]}, [\rs], \rt
        vld1.8          {\rd[2]}, [\rs], \rt
        vld1.8          {\rd[3]}, [\rs], \rt
.endif
.if \n == 8 || \hi == 1
        vld1.8          {\rd[4]}, [\rs], \rt
        vld1.8          {\rd[5]}, [\rs], \rt
        vld1.8          {\rd[6]}, [\rs], \rt
        vld1.8          {\rd[7]}, [\rs], \rt
.endif
        .endm
37
/*
 * add16x8 dq, dl, dh, rl, rh
 *
 * Sum sixteen unsigned bytes held in two D registers.
 *
 *   dq      Q register that receives the eight 16-bit sums rl[i] + rh[i]
 *   dl, dh  D registers that are added lane-wise and then reduced with
 *           two pairwise adds; the call sites visible in this file pass
 *           the low and high halves of dq
 *   rl, rh  D registers holding the sixteen source bytes
 *
 * On exit every 16-bit lane of dl holds the total.
 */
        .macro add16x8  dq,  dl,  dh,  rl,  rh
        vaddl.u8        \dq, \rl, \rh
        vadd.u16        \dl, \dl, \dh
        vpadd.u16       \dl, \dl, \dl
        vpadd.u16       \dl, \dl, \dl
        .endm
44
/*
 * ff_pred16x16_128_dc_neon
 *
 * Fill a 16x16 block of bytes with the constant 128.
 *
 *   r0  address of row 0 of the block; every row address must be
 *       16-byte aligned (the shared store loop uses the :128 hint)
 *   r1  distance in bytes from one row to the next
 *
 * The stores are done by the shared tail at .L_pred16x16_dc_end inside
 * ff_pred16x16_dc_neon.  Clobbers r0, r3, q0 and the flags.
 */
function ff_pred16x16_128_dc_neon, export=1
        vmov.i8         q0,  #128                       @ all 16 bytes = 128
        b               .L_pred16x16_dc_end             @ shared 16-row store loop
endfunc
49
/*
 * ff_pred16x16_top_dc_neon
 *
 * Fill a 16x16 block with the rounded mean of the 16 bytes in the row
 * directly above it: (sum + 8) >> 4.
 *
 *   r0  address of row 0 of the block; r0 and r0 - r1 must be 16-byte
 *       aligned (loads and stores use the :128 hint)
 *   r1  distance in bytes from one row to the next
 *
 * Finishes in the shared tail at .L_pred16x16_dc_end.
 * Clobbers r0, r2, r3, q0 and the flags.
 */
function ff_pred16x16_top_dc_neon, export=1
        sub             r2,  r0,  r1                    @ r2 = row above the block
        vld1.8          {q0},     [r2,:128]             @ q0 = its 16 bytes
        @ every 16-bit lane of d0 = sum of those 16 bytes
        add16x8         q0,  d0,  d1,  d0,  d1
        vrshrn.u16      d0,  q0,  #4                    @ byte 0 = (sum + 8) >> 4
        vdup.8          q0,  d0[0]                      @ replicate to all 16 bytes
        b               .L_pred16x16_dc_end             @ shared 16-row store loop
endfunc
58
/*
 * ff_pred16x16_left_dc_neon
 *
 * Fill a 16x16 block with the rounded mean of the 16 bytes in the
 * column directly to its left: (sum + 8) >> 4.
 *
 *   r0  address of row 0 of the block; every row address must be
 *       16-byte aligned (the shared store loop uses the :128 hint)
 *   r1  distance in bytes from one row to the next
 *
 * Finishes in the shared tail at .L_pred16x16_dc_end.
 * Clobbers r0, r2, r3, q0 and the flags.
 */
function ff_pred16x16_left_dc_neon, export=1
        sub             r2,  r0,  #1                    @ r2 = byte left of row 0
        @ d0 = left neighbours of rows 0-7, d1 = left neighbours of rows 8-15
        ldcol.8         d0,  r2,  r1
        ldcol.8         d1,  r2,  r1
        @ every 16-bit lane of d0 = sum of those 16 bytes
        add16x8         q0,  d0,  d1,  d0,  d1
        vrshrn.u16      d0,  q0,  #4                    @ byte 0 = (sum + 8) >> 4
        vdup.8          q0,  d0[0]                      @ replicate to all 16 bytes
        b               .L_pred16x16_dc_end             @ shared 16-row store loop
endfunc
68
/*
 * ff_pred16x16_dc_neon
 *
 * Fill a 16x16 block with the rounded mean of the 16 bytes above it
 * and the 16 bytes to its left: (sum + 16) >> 5.
 *
 *   r0  address of row 0 of the block; r0 - r1 and every row address
 *       must be 16-byte aligned (loads and stores use the :128 hint)
 *   r1  distance in bytes from one row to the next
 *
 * The label .L_pred16x16_dc_end marks the store loop that the other
 * 16x16 dc variants branch into with the fill value already in q0.
 * Clobbers r0, r2, r3, q0, q1 and the flags.
 */
function ff_pred16x16_dc_neon, export=1
        sub             r2,  r0,  r1                    @ r2 = row above the block
        vld1.8          {q0},     [r2,:128]             @ q0 = its 16 bytes
        sub             r2,  r0,  #1                    @ r2 = byte left of row 0
        @ d2 = left neighbours of rows 0-7, d3 = left neighbours of rows 8-15
        ldcol.8         d2,  r2,  r1
        ldcol.8         d3,  r2,  r1
        vaddl.u8        q0,  d0,  d1                    @ eight 16-bit sums of top bytes
        vaddl.u8        q1,  d2,  d3                    @ eight 16-bit sums of left bytes
        vadd.u16        q0,  q0,  q1
        vadd.u16        d0,  d0,  d1
        vpadd.u16       d0,  d0,  d0
        vpadd.u16       d0,  d0,  d0                    @ each lane = sum of all 32 bytes
        vrshrn.u16      d0,  q0,  #5                    @ byte 0 = (sum + 16) >> 5
        vdup.8          q0,  d0[0]                      @ replicate to all 16 bytes
.L_pred16x16_dc_end:
        @ Shared tail: write q0 to 16 rows, two rows per iteration.
        mov             r3,  #8
6:      vst1.8          {q0},     [r0,:128], r1
        vst1.8          {q0},     [r0,:128], r1
        subs            r3,  r3,  #1
        bne             6b
        bx              lr
endfunc
91
/*
 * ff_pred16x16_hor_neon
 *
 * For each of 16 rows, replicate the byte immediately left of the row
 * across the 16 bytes of that row.
 *
 *   r0  address of row 0 of the block; every row address must be
 *       16-byte aligned (stores use the :128 hint)
 *   r1  distance in bytes from one row to the next
 *
 * Clobbers r0, r2, r3, q0 and the flags.
 */
function ff_pred16x16_hor_neon, export=1
        sub             r2,  r0,  #1                    @ r2 = byte left of row 0
        mov             r3,  #16                        @ row counter
1:      vld1.8          {d0[],d1[]},[r2],      r1       @ left byte -> all 16 lanes of q0
        vst1.8          {q0},       [r0,:128], r1       @ write the row, step down
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc
101
/*
 * ff_pred16x16_vert_neon
 *
 * Copy the 16 bytes of the row directly above the block into each of
 * the 16 rows of the block.
 *
 *   r0  address of row 0 of the block; r0 - r1 and every row address
 *       must be 16-byte aligned (loads and stores use the :128 hint)
 *   r1  distance in bytes from one row to the next
 *
 * Clobbers r0, r3, q0 and the flags.
 */
function ff_pred16x16_vert_neon, export=1
        sub             r0,  r0,  r1                    @ step up to the row above
        vld1.8          {q0},     [r0,:128], r1         @ q0 = that row, r0 back at row 0
        mov             r3,  #8                         @ 8 iterations, 2 rows each
1:      vst1.8          {q0},     [r0,:128], r1
        vst1.8          {q0},     [r0,:128], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc
112
/*
 * ff_pred16x16_plane_neon
 *
 * 16x16 plane prediction.  With top[] the row above the block and
 * left[] the column to its left (index -1 is the shared corner byte):
 *
 *   h = sum over i = 0..7 of (i + 1) * (top[8 + i]  - top[6 - i])
 *   v = sum over i = 0..7 of (i + 1) * (left[8 + i] - left[6 - i])
 *   b = (5 * h + 32) >> 6
 *   c = (5 * v + 32) >> 6
 *   s = 16 * (top[15] + left[15] + 1) - 7 * (b + c)
 *   pixel(x, y) = clip to 0..255 of ((s + b * x + c * y) >> 5)
 *
 * Everything after b and c is computed in wrapping 16-bit lanes.
 *
 *   r0  address of row 0 of the block; every row address must be
 *       16-byte aligned (stores use the :128 hint), and byte 8 of the
 *       row above must be 8-byte aligned (:64 hint)
 *   r1  distance in bytes from one row to the next
 *
 * Clobbers r0, r2, r3, q0-q3, q8 and the flags.
 */
function ff_pred16x16_plane_neon, export=1
        sub             r3,  r0,  r1                    @ r3 = row above the block
        add             r2,  r3,  #8                    @ r2 = byte 8 of that row
        sub             r3,  r3,  #1                    @ r3 = corner byte
        vld1.8          {d0},     [r3]                  @ d0 = top[-1..6]
        vld1.8          {d2},     [r2,:64], r1          @ d2 = top[8..15]
        @ d1 = left[-1..6], read downwards starting at the corner
        ldcol.8         d1,  r3,  r1
        add             r3,  r3,  r1                    @ skip row 7
        @ d3 = left[8..15]
        ldcol.8         d3,  r3,  r1
        vrev64.8        q0,  q0                         @ d0 = top[6..-1], d1 = left[6..-1]
        vaddl.u8        q8,  d2,  d3                    @ q8[i] = top[8+i] + left[8+i]
        vsubl.u8        q2,  d2,  d0                    @ q2[i] = top[8+i] - top[6-i]
        vsubl.u8        q3,  d3,  d1                    @ q3[i] = left[8+i] - left[6-i]
        movrel          r3,  p16weight
        @ q0 = weights 1..8.  The table holds 16-bit values, so it is
        @ loaded with a 16-bit element size, as the 8x8 plane routine does.
        vld1.16         {q0},     [r3,:128]
        vmul.s16        q2,  q2,  q0
        vmul.s16        q3,  q3,  q0
        vadd.i16        d4,  d4,  d5
        vadd.i16        d5,  d6,  d7
        vpadd.i16       d4,  d4,  d5
        vpadd.i16       d4,  d4,  d4                    @ d4 = h, v, h, v
        vshll.s16       q3,  d4,  #2                    @ 32-bit: 4 * sum
        vaddw.s16       q2,  q3,  d4                    @ 32-bit: 5 * sum
        vrshrn.s32      d4,  q2,  #6                    @ d4 = b, c, b, c
        mov             r3,  #0
        vtrn.16         d4,  d5                         @ lane 0: d4 = b, d5 = c (other lanes unused)
        vadd.i16        d2,  d4,  d5                    @ b + c
        vshl.i16        d3,  d2,  #3                    @ 8 * (b + c)
        vrev64.16       d16, d17                        @ d16 lane 0 = top[15] + left[15]
        vsub.i16        d3,  d3,  d2                    @ 7 * (b + c)
        vadd.i16        d16, d16, d0                    @ lane 0: add weight 1 (rounding term)
        vshl.i16        d2,  d16, #4                    @ 16 * (top[15] + left[15] + 1)
        vsub.i16        d2,  d2,  d3                    @ lane 0 = s
        vshl.i16        d3,  d4,  #4                    @ 16 * b
        vext.16         q0,  q0,  q0,  #7               @ rotate weights: lanes 1-7 = 1..7
        vsub.i16        d6,  d5,  d3                    @ c - 16 * b
        vmov.16         d0[0], r3                       @ lane 0 = 0, so q0 = 0..7
        vmul.i16        q0,  q0,  d4[0]                 @ q0[x] = b * x
        vdup.16         q1,  d2[0]                      @ q1 = s
        vdup.16         q2,  d4[0]                      @ q2 = b
        vdup.16         q3,  d6[0]                      @ q3 = c - 16 * b
        vshl.i16        q2,  q2,  #3                    @ q2 = 8 * b
        vadd.i16        q1,  q1,  q0                    @ q1[x] = s + b * x, x = 0..7
        vadd.i16        q3,  q3,  q2                    @ q3 = c - 8 * b
        mov             r3,  #16                        @ row counter
1:
        vqshrun.s16     d0,  q1,  #5                    @ pixels 0-7 of this row
        vadd.i16        q1,  q1,  q2                    @ move to x = 8..15
        vqshrun.s16     d1,  q1,  #5                    @ pixels 8-15 of this row
        vadd.i16        q1,  q1,  q3                    @ next row, back to x = 0..7
        vst1.8          {q0},     [r0,:128], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc
168
/*
 * Weights 1..8 as 16-bit values, loaded as one Q register by the plane
 * predictors in this file.  Those loads use the :128 alignment hint, so
 * the table has to start on a 16-byte boundary.
 * NOTE(review): align=4 is presumably a power-of-two exponent (16 bytes),
 * as with the ARM .align directive - confirm in libavutil/arm/asm.S.
 */
const   p16weight, align=4
        .short          1,2,3,4,5,6,7,8
endconst
172
/*
 * ff_pred8x8_hor_neon
 *
 * For each of 8 rows, replicate the byte immediately left of the row
 * across the 8 bytes of that row.
 *
 *   r0  address of row 0 of the block; every row address must be
 *       8-byte aligned (stores use the :64 hint)
 *   r1  distance in bytes from one row to the next
 *
 * Clobbers r0, r2, r3, d0 and the flags.
 */
function ff_pred8x8_hor_neon, export=1
        sub             r2,  r0,  #1                    @ r2 = byte left of row 0
        mov             r3,  #8                         @ row counter
1:      vld1.8          {d0[]},   [r2],     r1          @ left byte -> all 8 lanes of d0
        vst1.8          {d0},     [r0,:64], r1          @ write the row, step down
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc
182
/*
 * ff_pred8x8_vert_neon
 *
 * Copy the 8 bytes of the row directly above the block into each of
 * the 8 rows of the block.
 *
 *   r0  address of row 0 of the block; r0 - r1 and every row address
 *       must be 8-byte aligned (loads and stores use the :64 hint)
 *   r1  distance in bytes from one row to the next
 *
 * Clobbers r0, r3, d0 and the flags.
 */
function ff_pred8x8_vert_neon, export=1
        sub             r0,  r0,  r1                    @ step up to the row above
        vld1.8          {d0},     [r0,:64], r1          @ d0 = that row, r0 back at row 0
        mov             r3,  #4                         @ 4 iterations, 2 rows each
1:      vst1.8          {d0},     [r0,:64], r1
        vst1.8          {d0},     [r0,:64], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc
193
/*
 * ff_pred8x8_plane_neon
 *
 * 8x8 plane prediction.  With top[] the row above the block and left[]
 * the column to its left (index -1 is the shared corner byte):
 *
 *   h = sum over i = 0..3 of (i + 1) * (top[4 + i]  - top[2 - i])
 *   v = sum over i = 0..3 of (i + 1) * (left[4 + i] - left[2 - i])
 *   b = (17 * h + 16) >> 5
 *   c = (17 * v + 16) >> 5
 *   s = 16 * (top[7] + left[7] + 1) - 3 * (b + c)
 *   pixel(x, y) = clip to 0..255 of ((s + b * x + c * y) >> 5)
 *
 * Everything after b and c is computed in wrapping 16-bit lanes.
 *
 *   r0  address of row 0 of the block; every row address must be
 *       8-byte aligned (stores use the :64 hint), and byte 4 of the
 *       row above must be 4-byte aligned (:32 hint)
 *   r1  distance in bytes from one row to the next
 *
 * Clobbers r0, r2, r3, q0-q3, q8 and the flags.
 */
function ff_pred8x8_plane_neon, export=1
        sub             r3,  r0,  r1                    @ r3 = row above the block
        add             r2,  r3,  #4                    @ r2 = byte 4 of that row
        sub             r3,  r3,  #1                    @ r3 = corner byte
        vld1.32         {d0[0]},  [r3]                  @ d0 low half = top[-1..2]
        vld1.32         {d2[0]},  [r2,:32], r1          @ d2 low half = top[4..7]
        @ d0 high half = left[-1..2], read downwards starting at the corner
        ldcol.8         d0,  r3,  r1,  4,  hi=1
        add             r3,  r3,  r1                    @ skip row 3
        @ d3 low half = left[4..7]
        ldcol.8         d3,  r3,  r1,  4
        vaddl.u8        q8,  d2,  d3                    @ d16[i] = top[4+i] + left[4+i]
        vrev32.8        d0,  d0                         @ d0 = top[2..-1], left[2..-1]
        vtrn.32         d2,  d3                         @ d2 = top[4..7], left[4..7]
        vsubl.u8        q2,  d2,  d0                    @ d4 = top diffs, d5 = left diffs
        movrel          r3,  p16weight
        vld1.16         {q0},     [r3,:128]             @ q0 = weights 1..8, d0 = 1..4
        vmul.s16        d4,  d4,  d0
        vmul.s16        d5,  d5,  d0
        vpadd.i16       d4,  d4,  d5
        vpaddl.s16      d4,  d4                         @ d4 = h, v as 32-bit lanes
        vshl.i32        d5,  d4,  #4                    @ 16 * sum
        vadd.s32        d4,  d4,  d5                    @ 17 * sum
        vrshrn.s32      d4,  q2,  #5                    @ 16-bit lane 0 = b, lane 1 = c
        mov             r3,  #0
        vtrn.16         d4,  d5                         @ lane 0: d4 = b, d5 = c (other lanes unused)
        vadd.i16        d2,  d4,  d5                    @ b + c
        vshl.i16        d3,  d2,  #2                    @ 4 * (b + c)
        vrev64.16       d16, d16                        @ d16 lane 0 = top[7] + left[7]
        vsub.i16        d3,  d3,  d2                    @ 3 * (b + c)
        vadd.i16        d16, d16, d0                    @ lane 0: add weight 1 (rounding term)
        vshl.i16        d2,  d16, #4                    @ 16 * (top[7] + left[7] + 1)
        vsub.i16        d2,  d2,  d3                    @ lane 0 = s
        vshl.i16        d3,  d4,  #3                    @ 8 * b
        vext.16         q0,  q0,  q0,  #7               @ rotate weights: lanes 1-7 = 1..7
        vsub.i16        d6,  d5,  d3                    @ c - 8 * b
        vmov.16         d0[0], r3                       @ lane 0 = 0, so q0 = 0..7
        vmul.i16        q0,  q0,  d4[0]                 @ q0[x] = b * x
        vdup.16         q1,  d2[0]                      @ q1 = s
        vdup.16         q2,  d4[0]                      @ q2 = b
        vdup.16         q3,  d6[0]                      @ q3 = c - 8 * b
        vshl.i16        q2,  q2,  #3                    @ q2 = 8 * b
        vadd.i16        q1,  q1,  q0                    @ q1[x] = s + b * x, x = 0..7
        vadd.i16        q3,  q3,  q2                    @ q3 = c
        mov             r3,  #8                         @ row counter
1:
        vqshrun.s16     d0,  q1,  #5                    @ the 8 pixels of this row
        vadd.i16        q1,  q1,  q3                    @ next row
        vst1.8          {d0},     [r0,:64], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc
245
/*
 * ff_pred8x8_128_dc_neon
 *
 * Fill an 8x8 block of bytes with the constant 128.
 *
 *   r0  address of row 0 of the block; every row address must be
 *       8-byte aligned (the shared store loop uses the :64 hint)
 *   r1  distance in bytes from one row to the next
 *
 * The stores are done by the shared tail at .L_pred8x8_dc_end inside
 * ff_pred8x8_dc_neon, which writes d0 to rows 0-3 and d1 to rows 4-7.
 * Clobbers r0, r2, r3, q0 and the flags.
 */
function ff_pred8x8_128_dc_neon, export=1
        vmov.i8         q0,  #128                       @ d0 and d1 = 128 in every byte
        b               .L_pred8x8_dc_end               @ shared 8-row store loop
endfunc
250
/*
 * ff_pred8x8_top_dc_neon
 *
 * Fill an 8x8 block from the row above it: columns 0-3 of every row
 * get the rounded mean of top[0..3], columns 4-7 the rounded mean of
 * top[4..7], each computed as (sum + 2) >> 2.
 *
 *   r0  address of row 0 of the block; r0 - r1 and every row address
 *       must be 8-byte aligned (loads and stores use the :64 hint)
 *   r1  distance in bytes from one row to the next
 *
 * Finishes in the shared tail at .L_pred8x8_dc_end (d0 -> rows 0-3,
 * d1 -> rows 4-7).  Clobbers r0, r2, r3, q0 and the flags.
 */
function ff_pred8x8_top_dc_neon, export=1
        sub             r2,  r0,  r1                    @ r2 = row above the block
        vld1.8          {d0},     [r2,:64]              @ d0 = top[0..7]
        vpaddl.u8       d0,  d0                         @ four 16-bit pair sums
        vpadd.u16       d0,  d0,  d0                    @ lane 0 = sum 0..3, lane 1 = sum 4..7
        vrshrn.u16      d0,  q0,  #2                    @ bytes 0, 1 = the two means (d1 input unused)
        vdup.8          d1,  d0[1]                      @ d1 = right mean in every byte
        vdup.8          d0,  d0[0]                      @ d0 = left mean in every byte
        vtrn.32         d0,  d1                         @ d0 = d1 = left x4, right x4
        b               .L_pred8x8_dc_end               @ shared 8-row store loop
endfunc
262
/*
 * ff_pred8x8_left_dc_neon
 *
 * Fill an 8x8 block from the column to its left: rows 0-3 get the
 * rounded mean of left[0..3], rows 4-7 the rounded mean of left[4..7],
 * each computed as (sum + 2) >> 2.
 *
 *   r0  address of row 0 of the block; every row address must be
 *       8-byte aligned (the shared store loop uses the :64 hint)
 *   r1  distance in bytes from one row to the next
 *
 * Finishes in the shared tail at .L_pred8x8_dc_end (d0 -> rows 0-3,
 * d1 -> rows 4-7).  Clobbers r0, r2, r3, q0 and the flags.
 */
function ff_pred8x8_left_dc_neon, export=1
        sub             r2,  r0,  #1                    @ r2 = byte left of row 0
        @ d0 = left[0..7]
        ldcol.8         d0,  r2,  r1
        vpaddl.u8       d0,  d0                         @ four 16-bit pair sums
        vpadd.u16       d0,  d0,  d0                    @ lane 0 = sum 0..3, lane 1 = sum 4..7
        vrshrn.u16      d0,  q0,  #2                    @ bytes 0, 1 = the two means (d1 input unused)
        vdup.8          d1,  d0[1]                      @ rows 4-7 value
        vdup.8          d0,  d0[0]                      @ rows 0-3 value
        b               .L_pred8x8_dc_end               @ shared 8-row store loop
endfunc
273
/*
 * ff_pred8x8_dc_neon
 *
 * Fill the four 4x4 quadrants of an 8x8 block from the row above and
 * the column to the left.  With t0, t1 the sums of top[0..3] and
 * top[4..7], and l0, l1 the sums of left[0..3] and left[4..7]:
 *
 *   rows 0-3, columns 0-3:  (t0 + l0 + 4) >> 3
 *   rows 0-3, columns 4-7:  (t1 + 2) >> 2
 *   rows 4-7, columns 0-3:  (l1 + 2) >> 2
 *   rows 4-7, columns 4-7:  (t1 + l1 + 4) >> 3
 *
 *   r0  address of row 0 of the block; r0 - r1 and every row address
 *       must be 8-byte aligned (loads and stores use the :64 hint)
 *   r1  distance in bytes from one row to the next
 *
 * The label .L_pred8x8_dc_end marks the store loop that the other 8x8
 * dc variants branch into with rows 0-3 in d0 and rows 4-7 in d1.
 * Clobbers r0, r2, r3, q0, q1, q2 and the flags.
 */
function ff_pred8x8_dc_neon, export=1
        sub             r2,  r0,  r1                    @ r2 = row above the block
        vld1.8          {d0},     [r2,:64]              @ d0 = top[0..7]
        sub             r2,  r0,  #1                    @ r2 = byte left of row 0
        @ d1 = left[0..7]
        ldcol.8         d1,  r2,  r1
        vtrn.32         d0,  d1                         @ d0 = top 0-3, left 0-3 / d1 = top 4-7, left 4-7
        vpaddl.u8       q0,  q0
        vpadd.u16       d0,  d0,  d1                    @ d0 = t0, l0, t1, l1
        vpadd.u16       d1,  d0,  d0                    @ d1 = t0+l0, t1+l1, t0+l0, t1+l1
        vrshrn.u16      d2,  q0,  #3                    @ bytes 4, 5 = the two 8-sample means
        vrshrn.u16      d3,  q0,  #2                    @ bytes 0-3 = the four 4-sample means
        vdup.8          d0,  d2[4]                      @ (t0 + l0 + 4) >> 3
        vdup.8          d1,  d3[3]                      @ (l1 + 2) >> 2
        vdup.8          d4,  d3[2]                      @ (t1 + 2) >> 2
        vdup.8          d5,  d2[5]                      @ (t1 + l1 + 4) >> 3
        vtrn.32         q0,  q2                         @ d0 = upper row pattern, d1 = lower row pattern
.L_pred8x8_dc_end:
        @ Shared tail: write d0 to rows 0-3 and d1 to rows 4-7.
        mov             r3,  #4
        add             r2,  r0,  r1,  lsl #2
6:      vst1.8          {d0},     [r0,:64], r1
        vst1.8          {d1},     [r2,:64], r1
        subs            r3,  r3,  #1
        bne             6b
        bx              lr
endfunc
299
/*
 * ff_pred8x8_l0t_dc_neon
 *
 * 8x8 dc fill that reads the row above and only left[0..3].  With
 * t0, t1 the sums of top[0..3] and top[4..7] and l0 the sum of
 * left[0..3]:
 *
 *   rows 0-3, columns 0-3:  (t0 + l0 + 4) >> 3
 *   rows 0-3, columns 4-7:  (t1 + 2) >> 2
 *   rows 4-7, columns 0-3:  (t0 + 2) >> 2
 *   rows 4-7, columns 4-7:  (t1 + 2) >> 2
 *
 *   r0  address of row 0 of the block; r0 - r1 and every row address
 *       must be 8-byte aligned (loads and stores use the :64 hint)
 *   r1  distance in bytes from one row to the next
 *
 * Finishes in the shared tail at .L_pred8x8_dc_end (d0 -> rows 0-3,
 * d1 -> rows 4-7).  Clobbers r0, r2, r3, q0, q1, q2 and the flags.
 */
function ff_pred8x8_l0t_dc_neon, export=1
        sub             r2,  r0,  r1                    @ r2 = row above the block
        vld1.8          {d0},     [r2,:64]              @ d0 = top[0..7]
        sub             r2,  r0,  #1                    @ r2 = byte left of row 0
        @ d1 lanes 0-3 = left[0..3]; lanes 4-7 keep stale data that never
        @ reaches a selected result lane
        ldcol.8         d1,  r2,  r1,  4
        vtrn.32         d0,  d1                         @ d0 = top 0-3, left 0-3 / d1 = top 4-7, stale
        vpaddl.u8       q0,  q0
        vpadd.u16       d0,  d0,  d1                    @ d0 = t0, l0, t1, stale
        vpadd.u16       d1,  d0,  d0                    @ d1 lane 0 = t0 + l0
        vrshrn.u16      d2,  q0,  #3                    @ byte 4 = (t0 + l0 + 4) >> 3
        vrshrn.u16      d3,  q0,  #2                    @ byte 0 = t0 mean, byte 2 = t1 mean
        vdup.8          d0,  d2[4]                      @ (t0 + l0 + 4) >> 3
        vdup.8          d1,  d3[0]                      @ (t0 + 2) >> 2
        vdup.8          q2,  d3[2]                      @ (t1 + 2) >> 2
        vtrn.32         q0,  q2                         @ d0 = upper row pattern, d1 = lower row pattern
        b               .L_pred8x8_dc_end               @ shared 8-row store loop
endfunc
317
/*
 * ff_pred8x8_l00_dc_neon
 *
 * 8x8 dc fill that reads only left[0..3]: rows 0-3 get the rounded
 * mean of those four bytes, (sum + 2) >> 2, and rows 4-7 get 128.
 *
 *   r0  address of row 0 of the block; every row address must be
 *       8-byte aligned (the shared store loop uses the :64 hint)
 *   r1  distance in bytes from one row to the next
 *
 * Finishes in the shared tail at .L_pred8x8_dc_end (d0 -> rows 0-3,
 * d1 -> rows 4-7).  Clobbers r0, r2, r3, q0 and the flags.
 */
function ff_pred8x8_l00_dc_neon, export=1
        sub             r2,  r0,  #1                    @ r2 = byte left of row 0
        @ d0 lanes 0-3 = left[0..3]; lanes 4-7 keep stale data that only
        @ affects result lanes which are never read
        ldcol.8         d0,  r2,  r1,  4
        vpaddl.u8       d0,  d0
        vpadd.u16       d0,  d0,  d0                    @ lane 0 = sum of left[0..3]
        vrshrn.u16      d0,  q0,  #2                    @ byte 0 = (sum + 2) >> 2
        vmov.i8         d1,  #128                       @ rows 4-7 value
        vdup.8          d0,  d0[0]                      @ rows 0-3 value
        b               .L_pred8x8_dc_end               @ shared 8-row store loop
endfunc
328
/*
 * ff_pred8x8_0lt_dc_neon
 *
 * 8x8 dc fill that reads the row above and only left[4..7].  With
 * t0, t1 the sums of top[0..3] and top[4..7] and l1 the sum of
 * left[4..7]:
 *
 *   rows 0-3, columns 0-3:  (t0 + 2) >> 2
 *   rows 0-3, columns 4-7:  (t1 + 2) >> 2
 *   rows 4-7, columns 0-3:  (l1 + 2) >> 2
 *   rows 4-7, columns 4-7:  (t1 + l1 + 4) >> 3
 *
 *   r0  address of row 0 of the block; r0 - r1 and every row address
 *       must be 8-byte aligned (loads and stores use the :64 hint)
 *   r1  distance in bytes from one row to the next
 *
 * Finishes in the shared tail at .L_pred8x8_dc_end (d0 -> rows 0-3,
 * d1 -> rows 4-7).  Clobbers r0, r2, r3, q0, q1, q2 and the flags.
 */
function ff_pred8x8_0lt_dc_neon, export=1
        sub             r2,  r0,  r1                    @ r2 = row above the block
        vld1.8          {d0},     [r2,:64]              @ d0 = top[0..7]
        add             r2,  r0,  r1,  lsl #2           @ r2 = row 4
        sub             r2,  r2,  #1                    @ r2 = byte left of row 4
        @ d1 lanes 4-7 = left[4..7]; lanes 0-3 keep stale data that never
        @ reaches a selected result lane
        ldcol.8         d1,  r2,  r1,  4,  hi=1
        vtrn.32         d0,  d1                         @ d0 = top 0-3, stale / d1 = top 4-7, left 4-7
        vpaddl.u8       q0,  q0
        vpadd.u16       d0,  d0,  d1                    @ d0 = t0, stale, t1, l1
        vpadd.u16       d1,  d0,  d0                    @ d1 lane 1 = t1 + l1
        vrshrn.u16      d3,  q0,  #2                    @ bytes 0, 2, 3 = t0, t1, l1 means
        vrshrn.u16      d2,  q0,  #3                    @ byte 5 = (t1 + l1 + 4) >> 3
        vdup.8          d0,  d3[0]                      @ (t0 + 2) >> 2
        vdup.8          d1,  d3[3]                      @ (l1 + 2) >> 2
        vdup.8          d4,  d3[2]                      @ (t1 + 2) >> 2
        vdup.8          d5,  d2[5]                      @ (t1 + l1 + 4) >> 3
        vtrn.32         q0,  q2                         @ d0 = upper row pattern, d1 = lower row pattern
        b               .L_pred8x8_dc_end               @ shared 8-row store loop
endfunc
348
/*
 * ff_pred8x8_0l0_dc_neon
 *
 * 8x8 dc fill that reads only left[4..7]: rows 0-3 get 128 and rows
 * 4-7 get the rounded mean of those four bytes, (sum + 2) >> 2.
 *
 *   r0  address of row 0 of the block; every row address must be
 *       8-byte aligned (the shared store loop uses the :64 hint)
 *   r1  distance in bytes from one row to the next
 *
 * Finishes in the shared tail at .L_pred8x8_dc_end (d0 -> rows 0-3,
 * d1 -> rows 4-7).  Clobbers r0, r2, r3, q0, d2 and the flags.
 */
function ff_pred8x8_0l0_dc_neon, export=1
        add             r2,  r0,  r1,  lsl #2           @ r2 = row 4
        sub             r2,  r2,  #1                    @ r2 = byte left of row 4
        @ d1 lanes 0-3 = left[4..7]; lanes 4-7 keep stale data that only
        @ affects result lanes which are never read
        ldcol.8         d1,  r2,  r1,  4
        vpaddl.u8       d2,  d1
        vpadd.u16       d2,  d2,  d2                    @ lane 0 = sum of left[4..7]
        vrshrn.u16      d1,  q1,  #2                    @ byte 0 = (sum + 2) >> 2 (d3 input unused)
        vmov.i8         d0,  #128                       @ rows 0-3 value
        vdup.8          d1,  d1[0]                      @ rows 4-7 value
        b               .L_pred8x8_dc_end               @ shared 8-row store loop
endfunc
360