/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

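@ Each function generated from the macros below takes:
@   r0 = dst    destination block, 64-bit (8-pixel) or 128-bit (16-pixel) aligned
@   r1 = src    source pixels
@   r2 = stride line size in bytes, shared by src and dst
@   r3 = h      block height in rows
@
@ pixels16: put (copy) or, when avg=1, average a 16xh block into dst,
@ four rows per iteration; h must be a multiple of 4.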
.macro  pixels16        rnd=1, avg=0
  .if \avg
        mov             r12, r0
  .endif
1:      vld1.8          {q0},     [r1], r2
        vld1.8          {q1},     [r1], r2
        vld1.8          {q2},     [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.8          {q3},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
  .if \avg
        vld1.8          {q8},     [r12,:128], r2
        vrhadd.u8       q0,  q0,  q8
        vld1.8          {q9},     [r12,:128], r2
        vrhadd.u8       q1,  q1,  q9
        vld1.8          {q10},    [r12,:128], r2
        vrhadd.u8       q2,  q2,  q10
        vld1.8          {q11},    [r12,:128], r2
        vrhadd.u8       q3,  q3,  q11
  .endif
        subs            r3,  r3,  #4
        vst1.8          {q0},     [r0,:128], r2
        vst1.8          {q1},     [r0,:128], r2
        vst1.8          {q2},     [r0,:128], r2
        vst1.8          {q3},     [r0,:128], r2
        bne             1b
        bx              lr
.endm

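@ pixels16_x2: half-pel horizontal interpolation of a 16xh block,
@ averaging each source byte with its right neighbour; two rows per
@ iteration.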
.macro  pixels16_x2     rnd=1, avg=0
1:      vld1.8          {d0-d2},  [r1], r2
        vld1.8          {d4-d6},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vext.8          q1,  q0,  q1,  #1
        avg             q0,  q0,  q1
        vext.8          q3,  q2,  q3,  #1
        avg             q2,  q2,  q3
  .if \avg
        vld1.8          {q1},     [r0,:128], r2
        vld1.8          {q3},     [r0,:128]
        vrhadd.u8       q0,  q0,  q1
        vrhadd.u8       q2,  q2,  q3
        sub             r0,  r0,  r2
  .endif
        vst1.8          {q0},     [r0,:128], r2
        vst1.8          {q2},     [r0,:128], r2
        bne             1b
        bx              lr
.endm

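@ pixels16_y2: half-pel vertical interpolation of a 16xh block,
@ averaging each row with the row below it; two rows per iteration,
@ with the final two rows handled after the loop.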
.macro  pixels16_y2     rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {q0},     [r1], r2
        vld1.8          {q1},     [r1], r2
1:      subs            r3,  r3,  #2
        avg             q2,  q0,  q1
        vld1.8          {q0},     [r1], r2
        avg             q3,  q0,  q1
        vld1.8          {q1},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
  .if \avg
        vld1.8          {q8},     [r0,:128], r2
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q2,  q2,  q8
        vrhadd.u8       q3,  q3,  q9
        sub             r0,  r0,  r2
  .endif
        vst1.8          {q2},     [r0,:128], r2
        vst1.8          {q3},     [r0,:128], r2
        bne             1b

        avg             q2,  q0,  q1
        vld1.8          {q0},     [r1], r2
        avg             q3,  q0,  q1
  .if \avg
        vld1.8          {q8},     [r0,:128], r2
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q2,  q2,  q8
        vrhadd.u8       q3,  q3,  q9
        sub             r0,  r0,  r2
  .endif
        vst1.8          {q2},     [r0,:128], r2
        vst1.8          {q3},     [r0,:128], r2

        bx              lr
.endm

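@ pixels16_xy2: half-pel interpolation in both directions; each output
@ byte is (A + B + C + D + 2) >> 2 over the 2x2 source neighbourhood,
@ or (A + B + C + D + 1) >> 2 in the no-rounding variant (see the
@ NRND and shrn helpers defined by the pixfunc macro below).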
.macro  pixels16_xy2    rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {d0-d2},  [r1], r2
        vld1.8          {d4-d6},  [r1], r2
NRND    vmov.i16        q13, #1
        pld             [r1]
        pld             [r1, r2]
        vext.8          q1,  q0,  q1,  #1
        vext.8          q3,  q2,  q3,  #1
        vaddl.u8        q8,  d0,  d2
        vaddl.u8        q10, d1,  d3
        vaddl.u8        q9,  d4,  d6
        vaddl.u8        q11, d5,  d7
1:      subs            r3,  r3,  #2
        vld1.8          {d0-d2},  [r1], r2
        vadd.u16        q12, q8,  q9
        pld             [r1]
NRND    vadd.u16        q12, q12, q13
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1,  q10, q11
        shrn            d28, q12, #2
NRND    vadd.u16        q1,  q1,  q13
        shrn            d29, q1,  #2
  .if \avg
        vld1.8          {q8},     [r0,:128]
        vrhadd.u8       q14, q14, q8
  .endif
        vaddl.u8        q8,  d0,  d30
        vld1.8          {d2-d4},  [r1], r2
        vaddl.u8        q10, d1,  d31
        vst1.8          {q14},    [r0,:128], r2
        vadd.u16        q12, q8,  q9
        pld             [r1, r2]
NRND    vadd.u16        q12, q12, q13
        vext.8          q2,  q1,  q2,  #1
        vadd.u16        q0,  q10, q11
        shrn            d30, q12, #2
NRND    vadd.u16        q0,  q0,  q13
        shrn            d31, q0,  #2
  .if \avg
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q15, q15, q9
  .endif
        vaddl.u8        q9,  d2,  d4
        vaddl.u8        q11, d3,  d5
        vst1.8          {q15},    [r0,:128], r2
        bgt             1b

        vld1.8          {d0-d2},  [r1], r2
        vadd.u16        q12, q8,  q9
NRND    vadd.u16        q12, q12, q13
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1,  q10, q11
        shrn            d28, q12, #2
NRND    vadd.u16        q1,  q1,  q13
        shrn            d29, q1,  #2
  .if \avg
        vld1.8          {q8},     [r0,:128]
        vrhadd.u8       q14, q14, q8
  .endif
        vaddl.u8        q8,  d0,  d30
        vaddl.u8        q10, d1,  d31
        vst1.8          {q14},    [r0,:128], r2
        vadd.u16        q12, q8,  q9
NRND    vadd.u16        q12, q12, q13
        vadd.u16        q0,  q10, q11
        shrn            d30, q12, #2
NRND    vadd.u16        q0,  q0,  q13
        shrn            d31, q0,  #2
  .if \avg
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q15, q15, q9
  .endif
        vst1.8          {q15},    [r0,:128], r2

        bx              lr
.endm

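@ pixels8: put (copy) or, when avg=1, average an 8xh block into dst,
@ four rows per iteration; h must be a multiple of 4.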
.macro  pixels8         rnd=1, avg=0
1:      vld1.8          {d0},     [r1], r2
        vld1.8          {d1},     [r1], r2
        vld1.8          {d2},     [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.8          {d3},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
  .if \avg
        vld1.8          {d4},     [r0,:64], r2
        vrhadd.u8       d0,  d0,  d4
        vld1.8          {d5},     [r0,:64], r2
        vrhadd.u8       d1,  d1,  d5
        vld1.8          {d6},     [r0,:64], r2
        vrhadd.u8       d2,  d2,  d6
        vld1.8          {d7},     [r0,:64], r2
        vrhadd.u8       d3,  d3,  d7
        sub             r0,  r0,  r2,  lsl #2
  .endif
        subs            r3,  r3,  #4
        vst1.8          {d0},     [r0,:64], r2
        vst1.8          {d1},     [r0,:64], r2
        vst1.8          {d2},     [r0,:64], r2
        vst1.8          {d3},     [r0,:64], r2
        bne             1b
        bx              lr
.endm

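@ pixels8_x2: half-pel horizontal interpolation of an 8xh block,
@ averaging each source byte with its right neighbour; two rows per
@ iteration.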
.macro  pixels8_x2      rnd=1, avg=0
1:      vld1.8          {q0},     [r1], r2
        vext.8          d1,  d0,  d1,  #1
        vld1.8          {q1},     [r1], r2
        vext.8          d3,  d2,  d3,  #1
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vswp            d1,  d2
        avg             q0,  q0,  q1
  .if \avg
        vld1.8          {d4},     [r0,:64], r2
        vld1.8          {d5},     [r0,:64]
        vrhadd.u8       q0,  q0,  q2
        sub             r0,  r0,  r2
  .endif
        vst1.8          {d0},     [r0,:64], r2
        vst1.8          {d1},     [r0,:64], r2
        bne             1b
        bx              lr
.endm

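@ pixels8_y2: half-pel vertical interpolation of an 8xh block,
@ averaging each row with the row below it; two rows per iteration,
@ with the final two rows handled after the loop.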
.macro  pixels8_y2      rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {d0},     [r1], r2
        vld1.8          {d1},     [r1], r2
1:      subs            r3,  r3,  #2
        avg             d4,  d0,  d1
        vld1.8          {d0},     [r1], r2
        avg             d5,  d0,  d1
        vld1.8          {d1},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
  .if \avg
        vld1.8          {d2},     [r0,:64], r2
        vld1.8          {d3},     [r0,:64]
        vrhadd.u8       q2,  q2,  q1
        sub             r0,  r0,  r2
  .endif
        vst1.8          {d4},     [r0,:64], r2
        vst1.8          {d5},     [r0,:64], r2
        bne             1b

        avg             d4,  d0,  d1
        vld1.8          {d0},     [r1], r2
        avg             d5,  d0,  d1
  .if \avg
        vld1.8          {d2},     [r0,:64], r2
        vld1.8          {d3},     [r0,:64]
        vrhadd.u8       q2,  q2,  q1
        sub             r0,  r0,  r2
  .endif
        vst1.8          {d4},     [r0,:64], r2
        vst1.8          {d5},     [r0,:64], r2

        bx              lr
.endm

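@ pixels8_xy2: half-pel interpolation in both directions for an 8xh
@ block; same rounding rules as pixels16_xy2.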
.macro  pixels8_xy2     rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {q0},     [r1], r2
        vld1.8          {q1},     [r1], r2
NRND    vmov.i16        q11, #1
        pld             [r1]
        pld             [r1, r2]
        vext.8          d4,  d0,  d1,  #1
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q8,  d0,  d4
        vaddl.u8        q9,  d2,  d6
1:      subs            r3,  r3,  #2
        vld1.8          {q0},     [r1], r2
        pld             [r1]
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
NRND    vadd.u16        q10, q10, q11
        vaddl.u8        q8,  d0,  d4
        shrn            d5,  q10, #2
        vld1.8          {q1},     [r1], r2
        vadd.u16        q10, q8,  q9
        pld             [r1, r2]
  .if \avg
        vld1.8          {d7},     [r0,:64]
        vrhadd.u8       d5,  d5,  d7
  .endif
NRND    vadd.u16        q10, q10, q11
        vst1.8          {d5},     [r0,:64], r2
        shrn            d7,  q10, #2
  .if \avg
        vld1.8          {d5},     [r0,:64]
        vrhadd.u8       d7,  d7,  d5
  .endif
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q9,  d2,  d6
        vst1.8          {d7},     [r0,:64], r2
        bgt             1b

        vld1.8          {q0},     [r1], r2
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
NRND    vadd.u16        q10, q10, q11
        vaddl.u8        q8,  d0,  d4
        shrn            d5,  q10, #2
        vadd.u16        q10, q8,  q9
  .if \avg
        vld1.8          {d7},     [r0,:64]
        vrhadd.u8       d5,  d5,  d7
  .endif
NRND    vadd.u16        q10, q10, q11
        vst1.8          {d5},     [r0,:64], r2
        shrn            d7,  q10, #2
  .if \avg
        vld1.8          {d5},     [r0,:64]
        vrhadd.u8       d7,  d7,  d5
  .endif
        vst1.8          {d7},     [r0,:64], r2

        bx              lr
.endm

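@ pixfunc: emit ff_<pfx><name><suf>_neon from one of the macros above.
@ The helper macros select the rounding behaviour:
@   avg  - vrhadd.u8 (round up) when rnd=1, vhadd.u8 (truncate) otherwise
@   shrn - vrshrn.u16 (rounding narrow shift) when rnd=1, vshrn.u16 otherwise
@   NRND - expands its argument only in the no-rounding (rnd=0) case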
.macro  pixfunc         pfx, name, suf, rnd=1, avg=0
  .if \rnd
    .macro avg  rd, rn, rm
        vrhadd.u8       \rd, \rn, \rm
    .endm
    .macro shrn rd, rn, rm
        vrshrn.u16      \rd, \rn, \rm
    .endm
    .macro NRND insn:vararg
    .endm
  .else
    .macro avg  rd, rn, rm
        vhadd.u8        \rd, \rn, \rm
    .endm
    .macro shrn rd, rn, rm
        vshrn.u16       \rd, \rn, \rm
    .endm
    .macro NRND insn:vararg
        \insn
    .endm
  .endif
function ff_\pfx\name\suf\()_neon, export=1
        \name           \rnd, \avg
endfunc
        .purgem         avg
        .purgem         shrn
        .purgem         NRND
.endm

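@ pixfunc2: emit both the rounding and the _no_rnd variant of a function.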
.macro  pixfunc2        pfx, name, avg=0
        pixfunc         \pfx, \name,          rnd=1, avg=\avg
        pixfunc         \pfx, \name, _no_rnd, rnd=0, avg=\avg
.endm

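@ The h264 qpel16 mc00 case is a plain copy: load the block height into
@ r3 and fall through into the ff_put_pixels16_neon body emitted just
@ below (function/endfunc emit no code of their own).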
function ff_put_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
endfunc

        pixfunc         put_, pixels16,     avg=0
        pixfunc2        put_, pixels16_x2,  avg=0
        pixfunc2        put_, pixels16_y2,  avg=0
        pixfunc2        put_, pixels16_xy2, avg=0

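@ As above: set the height and fall through into ff_avg_pixels16_neon.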
function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
endfunc

        pixfunc         avg_, pixels16,     avg=1
        pixfunc2        avg_, pixels16_x2,  avg=1
        pixfunc2        avg_, pixels16_y2,  avg=1
        pixfunc2        avg_, pixels16_xy2, avg=1

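@ qpel8 mc00: set the height and fall through into ff_put_pixels8_neon.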
function ff_put_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
endfunc

        pixfunc         put_, pixels8,     avg=0
        pixfunc2        put_, pixels8_x2,  avg=0
        pixfunc2        put_, pixels8_y2,  avg=0
        pixfunc2        put_, pixels8_xy2, avg=0

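@ As above: set the height and fall through into ff_avg_pixels8_neon.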
function ff_avg_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
endfunc

        pixfunc         avg_, pixels8,     avg=1
        pixfunc         avg_, pixels8_x2,  avg=1
        pixfunc         avg_, pixels8_y2,  avg=1
        pixfunc         avg_, pixels8_xy2, avg=1
