1/*
2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21#include "libavutil/arm/asm.S"
22#include "neon.S"
23
24        /* H.264 qpel MC */
25
26.macro  lowpass_const   r
27        movw            \r,  #5
28        movt            \r,  #20
29        vmov.32         d6[0], \r
30.endm
31
32.macro  lowpass_8       r0,  r1,  r2,  r3,  d0,  d1,  narrow=1
33  .if \narrow
34        t0 .req q0
35        t1 .req q8
36  .else
37        t0 .req \d0
38        t1 .req \d1
39  .endif
40        vext.8          d2,  \r0, \r1, #2
41        vext.8          d3,  \r0, \r1, #3
42        vaddl.u8        q1,  d2,  d3
43        vext.8          d4,  \r0, \r1, #1
44        vext.8          d5,  \r0, \r1, #4
45        vaddl.u8        q2,  d4,  d5
46        vext.8          d30, \r0, \r1, #5
47        vaddl.u8        t0,  \r0, d30
48        vext.8          d18, \r2, \r3, #2
49        vmla.i16        t0,  q1,  d6[1]
50        vext.8          d19, \r2, \r3, #3
51        vaddl.u8        q9,  d18, d19
52        vext.8          d20, \r2, \r3, #1
53        vmls.i16        t0,  q2,  d6[0]
54        vext.8          d21, \r2, \r3, #4
55        vaddl.u8        q10, d20, d21
56        vext.8          d31, \r2, \r3, #5
57        vaddl.u8        t1,  \r2, d31
58        vmla.i16        t1,  q9,  d6[1]
59        vmls.i16        t1,  q10, d6[0]
60  .if \narrow
61        vqrshrun.s16    \d0, t0,  #5
62        vqrshrun.s16    \d1, t1,  #5
63  .endif
64        .unreq  t0
65        .unreq  t1
66.endm
67
68.macro  lowpass_8_1     r0,  r1,  d0,  narrow=1
69  .if \narrow
70        t0 .req q0
71  .else
72        t0 .req \d0
73  .endif
74        vext.8          d2,  \r0, \r1, #2
75        vext.8          d3,  \r0, \r1, #3
76        vaddl.u8        q1,  d2,  d3
77        vext.8          d4,  \r0, \r1, #1
78        vext.8          d5,  \r0, \r1, #4
79        vaddl.u8        q2,  d4,  d5
80        vext.8          d30, \r0, \r1, #5
81        vaddl.u8        t0,  \r0, d30
82        vmla.i16        t0,  q1,  d6[1]
83        vmls.i16        t0,  q2,  d6[0]
84  .if \narrow
85        vqrshrun.s16    \d0, t0,  #5
86  .endif
87        .unreq  t0
88.endm
89
90.macro  lowpass_8.16    r0,  r1,  l0,  h0,  l1,  h1,  d
91        vext.16         q1,  \r0, \r1, #2
92        vext.16         q0,  \r0, \r1, #3
93        vaddl.s16       q9,  d2,  d0
94        vext.16         q2,  \r0, \r1, #1
95        vaddl.s16       q1,  d3,  d1
96        vext.16         q3,  \r0, \r1, #4
97        vaddl.s16       q10, d4,  d6
98        vext.16         \r1, \r0, \r1, #5
99        vaddl.s16       q2,  d5,  d7
100        vaddl.s16       q0,  \h0, \h1
101        vaddl.s16       q8,  \l0, \l1
102
103        vshl.i32        q3,  q9,  #4
104        vshl.i32        q9,  q9,  #2
105        vshl.i32        q15, q10, #2
106        vadd.i32        q9,  q9,  q3
107        vadd.i32        q10, q10, q15
108
109        vshl.i32        q3,  q1,  #4
110        vshl.i32        q1,  q1,  #2
111        vshl.i32        q15, q2,  #2
112        vadd.i32        q1,  q1,  q3
113        vadd.i32        q2,  q2,  q15
114
115        vadd.i32        q9,  q9,  q8
116        vsub.i32        q9,  q9,  q10
117
118        vadd.i32        q1,  q1,  q0
119        vsub.i32        q1,  q1,  q2
120
121        vrshrn.s32      d18, q9,  #10
122        vrshrn.s32      d19, q1,  #10
123
124        vqmovun.s16     \d,  q9
125.endm
126
127function put_h264_qpel16_h_lowpass_neon_packed
128        mov             r4,  lr
129        mov             r12, #16
130        mov             r3,  #8
131        bl              put_h264_qpel8_h_lowpass_neon
132        sub             r1,  r1,  r2, lsl #4
133        add             r1,  r1,  #8
134        mov             r12, #16
135        mov             lr,  r4
136        b               put_h264_qpel8_h_lowpass_neon
137endfunc
138
139.macro  h264_qpel_h_lowpass type
140function \type\()_h264_qpel16_h_lowpass_neon
141        push            {lr}
142        mov             r12, #16
143        bl              \type\()_h264_qpel8_h_lowpass_neon
144        sub             r0,  r0,  r3, lsl #4
145        sub             r1,  r1,  r2, lsl #4
146        add             r0,  r0,  #8
147        add             r1,  r1,  #8
148        mov             r12, #16
149        pop             {lr}
150endfunc
151
152function \type\()_h264_qpel8_h_lowpass_neon
1531:      vld1.8          {d0, d1},  [r1], r2
154        vld1.8          {d16,d17}, [r1], r2
155        subs            r12, r12, #2
156        lowpass_8       d0,  d1,  d16, d17, d0,  d16
157  .ifc \type,avg
158        vld1.8          {d2},     [r0,:64], r3
159        vrhadd.u8       d0,  d0,  d2
160        vld1.8          {d3},     [r0,:64]
161        vrhadd.u8       d16, d16, d3
162        sub             r0,  r0,  r3
163  .endif
164        vst1.8          {d0},     [r0,:64], r3
165        vst1.8          {d16},    [r0,:64], r3
166        bne             1b
167        bx              lr
168endfunc
169.endm
170
171        h264_qpel_h_lowpass put
172        h264_qpel_h_lowpass avg
173
174.macro  h264_qpel_h_lowpass_l2 type
175function \type\()_h264_qpel16_h_lowpass_l2_neon
176        push            {lr}
177        mov             r12, #16
178        bl              \type\()_h264_qpel8_h_lowpass_l2_neon
179        sub             r0,  r0,  r2, lsl #4
180        sub             r1,  r1,  r2, lsl #4
181        sub             r3,  r3,  r2, lsl #4
182        add             r0,  r0,  #8
183        add             r1,  r1,  #8
184        add             r3,  r3,  #8
185        mov             r12, #16
186        pop             {lr}
187endfunc
188
189function \type\()_h264_qpel8_h_lowpass_l2_neon
1901:      vld1.8          {d0, d1},  [r1], r2
191        vld1.8          {d16,d17}, [r1], r2
192        vld1.8          {d28},     [r3], r2
193        vld1.8          {d29},     [r3], r2
194        subs            r12, r12, #2
195        lowpass_8       d0,  d1,  d16, d17, d0,  d1
196        vrhadd.u8       q0,  q0,  q14
197  .ifc \type,avg
198        vld1.8          {d2},      [r0,:64], r2
199        vrhadd.u8       d0,  d0,  d2
200        vld1.8          {d3},      [r0,:64]
201        vrhadd.u8       d1,  d1,  d3
202        sub             r0,  r0,  r2
203  .endif
204        vst1.8          {d0},      [r0,:64], r2
205        vst1.8          {d1},      [r0,:64], r2
206        bne             1b
207        bx              lr
208endfunc
209.endm
210
211        h264_qpel_h_lowpass_l2 put
212        h264_qpel_h_lowpass_l2 avg
213
214function put_h264_qpel16_v_lowpass_neon_packed
215        mov             r4,  lr
216        mov             r2,  #8
217        bl              put_h264_qpel8_v_lowpass_neon
218        sub             r1,  r1,  r3, lsl #2
219        bl              put_h264_qpel8_v_lowpass_neon
220        sub             r1,  r1,  r3, lsl #4
221        sub             r1,  r1,  r3, lsl #2
222        add             r1,  r1,  #8
223        bl              put_h264_qpel8_v_lowpass_neon
224        sub             r1,  r1,  r3, lsl #2
225        mov             lr,  r4
226        b               put_h264_qpel8_v_lowpass_neon
227endfunc
228
229.macro  h264_qpel_v_lowpass type
230function \type\()_h264_qpel16_v_lowpass_neon
231        mov             r4,  lr
232        bl              \type\()_h264_qpel8_v_lowpass_neon
233        sub             r1,  r1,  r3, lsl #2
234        bl              \type\()_h264_qpel8_v_lowpass_neon
235        sub             r0,  r0,  r2, lsl #4
236        add             r0,  r0,  #8
237        sub             r1,  r1,  r3, lsl #4
238        sub             r1,  r1,  r3, lsl #2
239        add             r1,  r1,  #8
240        bl              \type\()_h264_qpel8_v_lowpass_neon
241        sub             r1,  r1,  r3, lsl #2
242        mov             lr,  r4
243endfunc
244
245function \type\()_h264_qpel8_v_lowpass_neon
246        vld1.8          {d8},  [r1], r3
247        vld1.8          {d10}, [r1], r3
248        vld1.8          {d12}, [r1], r3
249        vld1.8          {d14}, [r1], r3
250        vld1.8          {d22}, [r1], r3
251        vld1.8          {d24}, [r1], r3
252        vld1.8          {d26}, [r1], r3
253        vld1.8          {d28}, [r1], r3
254        vld1.8          {d9},  [r1], r3
255        vld1.8          {d11}, [r1], r3
256        vld1.8          {d13}, [r1], r3
257        vld1.8          {d15}, [r1], r3
258        vld1.8          {d23}, [r1]
259
260        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
261        lowpass_8       d8,  d9,  d10, d11, d8,  d10
262        lowpass_8       d12, d13, d14, d15, d12, d14
263        lowpass_8       d22, d23, d24, d25, d22, d24
264        lowpass_8       d26, d27, d28, d29, d26, d28
265        transpose_8x8   d8,  d10, d12, d14, d22, d24, d26, d28
266
267  .ifc \type,avg
268        vld1.8          {d9},  [r0,:64], r2
269        vrhadd.u8       d8,  d8,  d9
270        vld1.8          {d11}, [r0,:64], r2
271        vrhadd.u8       d10, d10, d11
272        vld1.8          {d13}, [r0,:64], r2
273        vrhadd.u8       d12, d12, d13
274        vld1.8          {d15}, [r0,:64], r2
275        vrhadd.u8       d14, d14, d15
276        vld1.8          {d23}, [r0,:64], r2
277        vrhadd.u8       d22, d22, d23
278        vld1.8          {d25}, [r0,:64], r2
279        vrhadd.u8       d24, d24, d25
280        vld1.8          {d27}, [r0,:64], r2
281        vrhadd.u8       d26, d26, d27
282        vld1.8          {d29}, [r0,:64], r2
283        vrhadd.u8       d28, d28, d29
284        sub             r0,  r0,  r2,  lsl #3
285  .endif
286
287        vst1.8          {d8},  [r0,:64], r2
288        vst1.8          {d10}, [r0,:64], r2
289        vst1.8          {d12}, [r0,:64], r2
290        vst1.8          {d14}, [r0,:64], r2
291        vst1.8          {d22}, [r0,:64], r2
292        vst1.8          {d24}, [r0,:64], r2
293        vst1.8          {d26}, [r0,:64], r2
294        vst1.8          {d28}, [r0,:64], r2
295
296        bx              lr
297endfunc
298.endm
299
300        h264_qpel_v_lowpass put
301        h264_qpel_v_lowpass avg
302
303.macro  h264_qpel_v_lowpass_l2 type
304function \type\()_h264_qpel16_v_lowpass_l2_neon
305        mov             r4,  lr
306        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
307        sub             r1,  r1,  r3, lsl #2
308        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
309        sub             r0,  r0,  r3, lsl #4
310        sub             r12, r12, r2, lsl #4
311        add             r0,  r0,  #8
312        add             r12, r12, #8
313        sub             r1,  r1,  r3, lsl #4
314        sub             r1,  r1,  r3, lsl #2
315        add             r1,  r1,  #8
316        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
317        sub             r1,  r1,  r3, lsl #2
318        mov             lr,  r4
319endfunc
320
321function \type\()_h264_qpel8_v_lowpass_l2_neon
322        vld1.8          {d8},  [r1], r3
323        vld1.8          {d10}, [r1], r3
324        vld1.8          {d12}, [r1], r3
325        vld1.8          {d14}, [r1], r3
326        vld1.8          {d22}, [r1], r3
327        vld1.8          {d24}, [r1], r3
328        vld1.8          {d26}, [r1], r3
329        vld1.8          {d28}, [r1], r3
330        vld1.8          {d9},  [r1], r3
331        vld1.8          {d11}, [r1], r3
332        vld1.8          {d13}, [r1], r3
333        vld1.8          {d15}, [r1], r3
334        vld1.8          {d23}, [r1]
335
336        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
337        lowpass_8       d8,  d9,  d10, d11, d8,  d9
338        lowpass_8       d12, d13, d14, d15, d12, d13
339        lowpass_8       d22, d23, d24, d25, d22, d23
340        lowpass_8       d26, d27, d28, d29, d26, d27
341        transpose_8x8   d8,  d9,  d12, d13, d22, d23, d26, d27
342
343        vld1.8          {d0},  [r12], r2
344        vld1.8          {d1},  [r12], r2
345        vld1.8          {d2},  [r12], r2
346        vld1.8          {d3},  [r12], r2
347        vld1.8          {d4},  [r12], r2
348        vrhadd.u8       q0,  q0,  q4
349        vld1.8          {d5},  [r12], r2
350        vrhadd.u8       q1,  q1,  q6
351        vld1.8          {d10}, [r12], r2
352        vrhadd.u8       q2,  q2,  q11
353        vld1.8          {d11}, [r12], r2
354        vrhadd.u8       q5,  q5,  q13
355
356  .ifc \type,avg
357        vld1.8          {d16}, [r0,:64], r3
358        vrhadd.u8       d0,  d0,  d16
359        vld1.8          {d17}, [r0,:64], r3
360        vrhadd.u8       d1,  d1,  d17
361        vld1.8          {d16}, [r0,:64], r3
362        vrhadd.u8       d2,  d2,  d16
363        vld1.8          {d17}, [r0,:64], r3
364        vrhadd.u8       d3,  d3,  d17
365        vld1.8          {d16}, [r0,:64], r3
366        vrhadd.u8       d4,  d4,  d16
367        vld1.8          {d17}, [r0,:64], r3
368        vrhadd.u8       d5,  d5,  d17
369        vld1.8          {d16}, [r0,:64], r3
370        vrhadd.u8       d10, d10, d16
371        vld1.8          {d17}, [r0,:64], r3
372        vrhadd.u8       d11, d11, d17
373        sub             r0,  r0,  r3,  lsl #3
374  .endif
375
376        vst1.8          {d0},  [r0,:64], r3
377        vst1.8          {d1},  [r0,:64], r3
378        vst1.8          {d2},  [r0,:64], r3
379        vst1.8          {d3},  [r0,:64], r3
380        vst1.8          {d4},  [r0,:64], r3
381        vst1.8          {d5},  [r0,:64], r3
382        vst1.8          {d10}, [r0,:64], r3
383        vst1.8          {d11}, [r0,:64], r3
384
385        bx              lr
386endfunc
387.endm
388
389        h264_qpel_v_lowpass_l2 put
390        h264_qpel_v_lowpass_l2 avg
391
392function put_h264_qpel8_hv_lowpass_neon_top
393        lowpass_const   r12
394        mov             r12, #12
3951:      vld1.8          {d0, d1},  [r1], r3
396        vld1.8          {d16,d17}, [r1], r3
397        subs            r12, r12, #2
398        lowpass_8       d0,  d1,  d16, d17, q11, q12, narrow=0
399        vst1.8          {d22-d25}, [r4,:128]!
400        bne             1b
401
402        vld1.8          {d0, d1},  [r1]
403        lowpass_8_1     d0,  d1,  q12, narrow=0
404
405        mov             r12, #-16
406        add             r4,  r4,  r12
407        vld1.8          {d30,d31}, [r4,:128], r12
408        vld1.8          {d20,d21}, [r4,:128], r12
409        vld1.8          {d18,d19}, [r4,:128], r12
410        vld1.8          {d16,d17}, [r4,:128], r12
411        vld1.8          {d14,d15}, [r4,:128], r12
412        vld1.8          {d12,d13}, [r4,:128], r12
413        vld1.8          {d10,d11}, [r4,:128], r12
414        vld1.8          {d8, d9},  [r4,:128], r12
415        vld1.8          {d6, d7},  [r4,:128], r12
416        vld1.8          {d4, d5},  [r4,:128], r12
417        vld1.8          {d2, d3},  [r4,:128], r12
418        vld1.8          {d0, d1},  [r4,:128]
419
420        swap4           d1,  d3,  d5,  d7,  d8,  d10, d12, d14
421        transpose16_4x4 q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7
422
423        swap4           d17, d19, d21, d31, d24, d26, d28, d22
424        transpose16_4x4 q8,  q9,  q10, q15, q12, q13, q14, q11
425
426        vst1.8          {d30,d31}, [r4,:128]!
427        vst1.8          {d6, d7},  [r4,:128]!
428        vst1.8          {d20,d21}, [r4,:128]!
429        vst1.8          {d4, d5},  [r4,:128]!
430        vst1.8          {d18,d19}, [r4,:128]!
431        vst1.8          {d2, d3},  [r4,:128]!
432        vst1.8          {d16,d17}, [r4,:128]!
433        vst1.8          {d0, d1},  [r4,:128]
434
435        lowpass_8.16    q4,  q12, d8,  d9,  d24, d25, d8
436        lowpass_8.16    q5,  q13, d10, d11, d26, d27, d9
437        lowpass_8.16    q6,  q14, d12, d13, d28, d29, d10
438        lowpass_8.16    q7,  q11, d14, d15, d22, d23, d11
439
440        vld1.8          {d16,d17}, [r4,:128], r12
441        vld1.8          {d30,d31}, [r4,:128], r12
442        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d12
443        vld1.8          {d16,d17}, [r4,:128], r12
444        vld1.8          {d30,d31}, [r4,:128], r12
445        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d13
446        vld1.8          {d16,d17}, [r4,:128], r12
447        vld1.8          {d30,d31}, [r4,:128], r12
448        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d14
449        vld1.8          {d16,d17}, [r4,:128], r12
450        vld1.8          {d30,d31}, [r4,:128]
451        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d15
452
453        transpose_8x8   d12, d13, d14, d15, d8,  d9,  d10, d11
454
455        bx              lr
456endfunc
457
458.macro  h264_qpel8_hv_lowpass type
459function \type\()_h264_qpel8_hv_lowpass_neon
460        mov             r10, lr
461        bl              put_h264_qpel8_hv_lowpass_neon_top
462  .ifc \type,avg
463        vld1.8          {d0},      [r0,:64], r2
464        vrhadd.u8       d12, d12, d0
465        vld1.8          {d1},      [r0,:64], r2
466        vrhadd.u8       d13, d13, d1
467        vld1.8          {d2},      [r0,:64], r2
468        vrhadd.u8       d14, d14, d2
469        vld1.8          {d3},      [r0,:64], r2
470        vrhadd.u8       d15, d15, d3
471        vld1.8          {d4},      [r0,:64], r2
472        vrhadd.u8       d8,  d8,  d4
473        vld1.8          {d5},      [r0,:64], r2
474        vrhadd.u8       d9,  d9,  d5
475        vld1.8          {d6},      [r0,:64], r2
476        vrhadd.u8       d10, d10, d6
477        vld1.8          {d7},      [r0,:64], r2
478        vrhadd.u8       d11, d11, d7
479        sub             r0,  r0,  r2,  lsl #3
480  .endif
481
482        vst1.8          {d12},     [r0,:64], r2
483        vst1.8          {d13},     [r0,:64], r2
484        vst1.8          {d14},     [r0,:64], r2
485        vst1.8          {d15},     [r0,:64], r2
486        vst1.8          {d8},      [r0,:64], r2
487        vst1.8          {d9},      [r0,:64], r2
488        vst1.8          {d10},     [r0,:64], r2
489        vst1.8          {d11},     [r0,:64], r2
490
491        mov             lr,  r10
492        bx              lr
493endfunc
494.endm
495
496        h264_qpel8_hv_lowpass put
497        h264_qpel8_hv_lowpass avg
498
499.macro  h264_qpel8_hv_lowpass_l2 type
500function \type\()_h264_qpel8_hv_lowpass_l2_neon
501        mov             r10, lr
502        bl              put_h264_qpel8_hv_lowpass_neon_top
503
504        vld1.8          {d0, d1},  [r2,:128]!
505        vld1.8          {d2, d3},  [r2,:128]!
506        vrhadd.u8       q0,  q0,  q6
507        vld1.8          {d4, d5},  [r2,:128]!
508        vrhadd.u8       q1,  q1,  q7
509        vld1.8          {d6, d7},  [r2,:128]!
510        vrhadd.u8       q2,  q2,  q4
511        vrhadd.u8       q3,  q3,  q5
512  .ifc \type,avg
513        vld1.8          {d16},     [r0,:64], r3
514        vrhadd.u8       d0,  d0,  d16
515        vld1.8          {d17},     [r0,:64], r3
516        vrhadd.u8       d1,  d1,  d17
517        vld1.8          {d18},     [r0,:64], r3
518        vrhadd.u8       d2,  d2,  d18
519        vld1.8          {d19},     [r0,:64], r3
520        vrhadd.u8       d3,  d3,  d19
521        vld1.8          {d20},     [r0,:64], r3
522        vrhadd.u8       d4,  d4,  d20
523        vld1.8          {d21},     [r0,:64], r3
524        vrhadd.u8       d5,  d5,  d21
525        vld1.8          {d22},     [r0,:64], r3
526        vrhadd.u8       d6,  d6,  d22
527        vld1.8          {d23},     [r0,:64], r3
528        vrhadd.u8       d7,  d7,  d23
529        sub             r0,  r0,  r3,  lsl #3
530  .endif
531        vst1.8          {d0},      [r0,:64], r3
532        vst1.8          {d1},      [r0,:64], r3
533        vst1.8          {d2},      [r0,:64], r3
534        vst1.8          {d3},      [r0,:64], r3
535        vst1.8          {d4},      [r0,:64], r3
536        vst1.8          {d5},      [r0,:64], r3
537        vst1.8          {d6},      [r0,:64], r3
538        vst1.8          {d7},      [r0,:64], r3
539
540        mov             lr,  r10
541        bx              lr
542endfunc
543.endm
544
545        h264_qpel8_hv_lowpass_l2 put
546        h264_qpel8_hv_lowpass_l2 avg
547
548.macro  h264_qpel16_hv  type
549function \type\()_h264_qpel16_hv_lowpass_neon
550        mov             r9,  lr
551        bl              \type\()_h264_qpel8_hv_lowpass_neon
552        sub             r1,  r1,  r3, lsl #2
553        bl              \type\()_h264_qpel8_hv_lowpass_neon
554        sub             r1,  r1,  r3, lsl #4
555        sub             r1,  r1,  r3, lsl #2
556        add             r1,  r1,  #8
557        sub             r0,  r0,  r2, lsl #4
558        add             r0,  r0,  #8
559        bl              \type\()_h264_qpel8_hv_lowpass_neon
560        sub             r1,  r1,  r3, lsl #2
561        mov             lr,  r9
562        b               \type\()_h264_qpel8_hv_lowpass_neon
563endfunc
564
565function \type\()_h264_qpel16_hv_lowpass_l2_neon
566        mov             r9,  lr
567        sub             r2,  r4,  #256
568        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
569        sub             r1,  r1,  r3, lsl #2
570        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
571        sub             r1,  r1,  r3, lsl #4
572        sub             r1,  r1,  r3, lsl #2
573        add             r1,  r1,  #8
574        sub             r0,  r0,  r3, lsl #4
575        add             r0,  r0,  #8
576        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
577        sub             r1,  r1,  r3, lsl #2
578        mov             lr,  r9
579        b               \type\()_h264_qpel8_hv_lowpass_l2_neon
580endfunc
581.endm
582
583        h264_qpel16_hv put
584        h264_qpel16_hv avg
585
586.macro  h264_qpel8      type
587function ff_\type\()_h264_qpel8_mc10_neon, export=1
588        lowpass_const   r3
589        mov             r3,  r1
590        sub             r1,  r1,  #2
591        mov             r12, #8
592        b               \type\()_h264_qpel8_h_lowpass_l2_neon
593endfunc
594
595function ff_\type\()_h264_qpel8_mc20_neon, export=1
596        lowpass_const   r3
597        sub             r1,  r1,  #2
598        mov             r3,  r2
599        mov             r12, #8
600        b               \type\()_h264_qpel8_h_lowpass_neon
601endfunc
602
603function ff_\type\()_h264_qpel8_mc30_neon, export=1
604        lowpass_const   r3
605        add             r3,  r1,  #1
606        sub             r1,  r1,  #2
607        mov             r12, #8
608        b               \type\()_h264_qpel8_h_lowpass_l2_neon
609endfunc
610
611function ff_\type\()_h264_qpel8_mc01_neon, export=1
612        push            {lr}
613        mov             r12, r1
614\type\()_h264_qpel8_mc01:
615        lowpass_const   r3
616        mov             r3,  r2
617        sub             r1,  r1,  r2, lsl #1
618        vpush           {d8-d15}
619        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
620        vpop            {d8-d15}
621        pop             {pc}
622endfunc
623
624function ff_\type\()_h264_qpel8_mc11_neon, export=1
625        push            {r0, r1, r11, lr}
626\type\()_h264_qpel8_mc11:
627        lowpass_const   r3
628        mov             r11, sp
629A       bic             sp,  sp,  #15
630T       bic             r0,  r11, #15
631T       mov             sp,  r0
632        sub             sp,  sp,  #64
633        mov             r0,  sp
634        sub             r1,  r1,  #2
635        mov             r3,  #8
636        mov             r12, #8
637        vpush           {d8-d15}
638        bl              put_h264_qpel8_h_lowpass_neon
639        ldrd            r0,  r1,  [r11], #8
640        mov             r3,  r2
641        add             r12, sp,  #64
642        sub             r1,  r1,  r2, lsl #1
643        mov             r2,  #8
644        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
645        vpop            {d8-d15}
646        mov             sp,  r11
647        pop             {r11, pc}
648endfunc
649
650function ff_\type\()_h264_qpel8_mc21_neon, export=1
651        push            {r0, r1, r4, r10, r11, lr}
652\type\()_h264_qpel8_mc21:
653        lowpass_const   r3
654        mov             r11, sp
655A       bic             sp,  sp,  #15
656T       bic             r0,  r11, #15
657T       mov             sp,  r0
658        sub             sp,  sp,  #(8*8+16*12)
659        sub             r1,  r1,  #2
660        mov             r3,  #8
661        mov             r0,  sp
662        mov             r12, #8
663        vpush           {d8-d15}
664        bl              put_h264_qpel8_h_lowpass_neon
665        mov             r4,  r0
666        ldrd            r0,  r1,  [r11], #8
667        sub             r1,  r1,  r2, lsl #1
668        sub             r1,  r1,  #2
669        mov             r3,  r2
670        sub             r2,  r4,  #64
671        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
672        vpop            {d8-d15}
673        mov             sp,  r11
674        pop             {r4, r10, r11, pc}
675endfunc
676
677function ff_\type\()_h264_qpel8_mc31_neon, export=1
678        add             r1,  r1,  #1
679        push            {r0, r1, r11, lr}
680        sub             r1,  r1,  #1
681        b               \type\()_h264_qpel8_mc11
682endfunc
683
684function ff_\type\()_h264_qpel8_mc02_neon, export=1
685        push            {lr}
686        lowpass_const   r3
687        sub             r1,  r1,  r2, lsl #1
688        mov             r3,  r2
689        vpush           {d8-d15}
690        bl              \type\()_h264_qpel8_v_lowpass_neon
691        vpop            {d8-d15}
692        pop             {pc}
693endfunc
694
695function ff_\type\()_h264_qpel8_mc12_neon, export=1
696        push            {r0, r1, r4, r10, r11, lr}
697\type\()_h264_qpel8_mc12:
698        lowpass_const   r3
699        mov             r11, sp
700A       bic             sp,  sp,  #15
701T       bic             r0,  r11, #15
702T       mov             sp,  r0
703        sub             sp,  sp,  #(8*8+16*12)
704        sub             r1,  r1,  r2, lsl #1
705        mov             r3,  r2
706        mov             r2,  #8
707        mov             r0,  sp
708        vpush           {d8-d15}
709        bl              put_h264_qpel8_v_lowpass_neon
710        mov             r4,  r0
711        ldrd            r0,  r1,  [r11], #8
712        sub             r1,  r1,  r3, lsl #1
713        sub             r1,  r1,  #2
714        sub             r2,  r4,  #64
715        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
716        vpop            {d8-d15}
717        mov             sp,  r11
718        pop             {r4, r10, r11, pc}
719endfunc
720
721function ff_\type\()_h264_qpel8_mc22_neon, export=1
722        push            {r4, r10, r11, lr}
723        mov             r11, sp
724A       bic             sp,  sp,  #15
725T       bic             r4,  r11, #15
726T       mov             sp,  r4
727        sub             r1,  r1,  r2, lsl #1
728        sub             r1,  r1,  #2
729        mov             r3,  r2
730        sub             sp,  sp,  #(16*12)
731        mov             r4,  sp
732        vpush           {d8-d15}
733        bl              \type\()_h264_qpel8_hv_lowpass_neon
734        vpop            {d8-d15}
735        mov             sp,  r11
736        pop             {r4, r10, r11, pc}
737endfunc
738
739function ff_\type\()_h264_qpel8_mc32_neon, export=1
740        push            {r0, r1, r4, r10, r11, lr}
741        add             r1,  r1,  #1
742        b               \type\()_h264_qpel8_mc12
743endfunc
744
745function ff_\type\()_h264_qpel8_mc03_neon, export=1
746        push            {lr}
747        add             r12, r1,  r2
748        b               \type\()_h264_qpel8_mc01
749endfunc
750
751function ff_\type\()_h264_qpel8_mc13_neon, export=1
752        push            {r0, r1, r11, lr}
753        add             r1,  r1,  r2
754        b               \type\()_h264_qpel8_mc11
755endfunc
756
757function ff_\type\()_h264_qpel8_mc23_neon, export=1
758        push            {r0, r1, r4, r10, r11, lr}
759        add             r1,  r1,  r2
760        b               \type\()_h264_qpel8_mc21
761endfunc
762
763function ff_\type\()_h264_qpel8_mc33_neon, export=1
764        add             r1,  r1,  #1
765        push            {r0, r1, r11, lr}
766        add             r1,  r1,  r2
767        sub             r1,  r1,  #1
768        b               \type\()_h264_qpel8_mc11
769endfunc
770.endm
771
772        h264_qpel8 put
773        h264_qpel8 avg
774
775.macro  h264_qpel16     type
776function ff_\type\()_h264_qpel16_mc10_neon, export=1
777        lowpass_const   r3
778        mov             r3,  r1
779        sub             r1,  r1,  #2
780        b               \type\()_h264_qpel16_h_lowpass_l2_neon
781endfunc
782
783function ff_\type\()_h264_qpel16_mc20_neon, export=1
784        lowpass_const   r3
785        sub             r1,  r1,  #2
786        mov             r3,  r2
787        b               \type\()_h264_qpel16_h_lowpass_neon
788endfunc
789
790function ff_\type\()_h264_qpel16_mc30_neon, export=1
791        lowpass_const   r3
792        add             r3,  r1,  #1
793        sub             r1,  r1,  #2
794        b               \type\()_h264_qpel16_h_lowpass_l2_neon
795endfunc
796
797function ff_\type\()_h264_qpel16_mc01_neon, export=1
798        push            {r4, lr}
799        mov             r12, r1
800\type\()_h264_qpel16_mc01:
801        lowpass_const   r3
802        mov             r3,  r2
803        sub             r1,  r1,  r2, lsl #1
804        vpush           {d8-d15}
805        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
806        vpop            {d8-d15}
807        pop             {r4, pc}
808endfunc
809
810function ff_\type\()_h264_qpel16_mc11_neon, export=1
811        push            {r0, r1, r4, r11, lr}
812\type\()_h264_qpel16_mc11:
813        lowpass_const   r3
814        mov             r11, sp
815A       bic             sp,  sp,  #15
816T       bic             r0,  r11, #15
817T       mov             sp,  r0
818        sub             sp,  sp,  #256
819        mov             r0,  sp
820        sub             r1,  r1,  #2
821        mov             r3,  #16
822        vpush           {d8-d15}
823        bl              put_h264_qpel16_h_lowpass_neon
824        ldrd            r0,  r1,  [r11], #8
825        mov             r3,  r2
826        add             r12, sp,  #64
827        sub             r1,  r1,  r2, lsl #1
828        mov             r2,  #16
829        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
830        vpop            {d8-d15}
831        mov             sp,  r11
832        pop             {r4, r11, pc}
833endfunc
834
835function ff_\type\()_h264_qpel16_mc21_neon, export=1
836        push            {r0, r1, r4-r5, r9-r11, lr}
837\type\()_h264_qpel16_mc21:
838        lowpass_const   r3
839        mov             r11, sp
840A       bic             sp,  sp,  #15
841T       bic             r0,  r11, #15
842T       mov             sp,  r0
843        sub             sp,  sp,  #(16*16+16*12)
844        sub             r1,  r1,  #2
845        mov             r0,  sp
846        vpush           {d8-d15}
847        bl              put_h264_qpel16_h_lowpass_neon_packed
848        mov             r4,  r0
849        ldrd            r0,  r1,  [r11], #8
850        sub             r1,  r1,  r2, lsl #1
851        sub             r1,  r1,  #2
852        mov             r3,  r2
853        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
854        vpop            {d8-d15}
855        mov             sp,  r11
856        pop             {r4-r5, r9-r11, pc}
857endfunc
858
859function ff_\type\()_h264_qpel16_mc31_neon, export=1
860        add             r1,  r1,  #1
861        push            {r0, r1, r4, r11, lr}
862        sub             r1,  r1,  #1
863        b               \type\()_h264_qpel16_mc11
864endfunc
865
866function ff_\type\()_h264_qpel16_mc02_neon, export=1
867        push            {r4, lr}
868        lowpass_const   r3
869        sub             r1,  r1,  r2, lsl #1
870        mov             r3,  r2
871        vpush           {d8-d15}
872        bl              \type\()_h264_qpel16_v_lowpass_neon
873        vpop            {d8-d15}
874        pop             {r4, pc}
875endfunc
876
877function ff_\type\()_h264_qpel16_mc12_neon, export=1
878        push            {r0, r1, r4-r5, r9-r11, lr}
879\type\()_h264_qpel16_mc12:
880        lowpass_const   r3
881        mov             r11, sp
882A       bic             sp,  sp,  #15
883T       bic             r0,  r11, #15
884T       mov             sp,  r0
885        sub             sp,  sp,  #(16*16+16*12)
886        sub             r1,  r1,  r2, lsl #1
887        mov             r0,  sp
888        mov             r3,  r2
889        vpush           {d8-d15}
890        bl              put_h264_qpel16_v_lowpass_neon_packed
891        mov             r4,  r0
892        ldrd            r0,  r1,  [r11], #8
893        sub             r1,  r1,  r3, lsl #1
894        sub             r1,  r1,  #2
895        mov             r2,  r3
896        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
897        vpop            {d8-d15}
898        mov             sp,  r11
899        pop             {r4-r5, r9-r11, pc}
900endfunc
901
902function ff_\type\()_h264_qpel16_mc22_neon, export=1
903        push            {r4, r9-r11, lr}
904        lowpass_const   r3
905        mov             r11, sp
906A       bic             sp,  sp,  #15
907T       bic             r4,  r11, #15
908T       mov             sp,  r4
909        sub             r1,  r1,  r2, lsl #1
910        sub             r1,  r1,  #2
911        mov             r3,  r2
912        sub             sp,  sp,  #(16*12)
913        mov             r4,  sp
914        vpush           {d8-d15}
915        bl              \type\()_h264_qpel16_hv_lowpass_neon
916        vpop            {d8-d15}
917        mov             sp,  r11
918        pop             {r4, r9-r11, pc}
919endfunc
920
921function ff_\type\()_h264_qpel16_mc32_neon, export=1
922        push            {r0, r1, r4-r5, r9-r11, lr}
923        add             r1,  r1,  #1
924        b               \type\()_h264_qpel16_mc12
925endfunc
926
927function ff_\type\()_h264_qpel16_mc03_neon, export=1
928        push            {r4, lr}
929        add             r12, r1,  r2
930        b               \type\()_h264_qpel16_mc01
931endfunc
932
933function ff_\type\()_h264_qpel16_mc13_neon, export=1
934        push            {r0, r1, r4, r11, lr}
935        add             r1,  r1,  r2
936        b               \type\()_h264_qpel16_mc11
937endfunc
938
939function ff_\type\()_h264_qpel16_mc23_neon, export=1
940        push            {r0, r1, r4-r5, r9-r11, lr}
941        add             r1,  r1,  r2
942        b               \type\()_h264_qpel16_mc21
943endfunc
944
945function ff_\type\()_h264_qpel16_mc33_neon, export=1
946        add             r1,  r1,  #1
947        push            {r0, r1, r4, r11, lr}
948        add             r1,  r1,  r2
949        sub             r1,  r1,  #1
950        b               \type\()_h264_qpel16_mc11
951endfunc
952.endm
953
954        h264_qpel16 put
955        h264_qpel16 avg
956