/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

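// Helper macros for half-pel "put"/"avg" pixel operations.
// Register usage throughout (presumably the usual op_pixels_func
// arguments): x0 = destination, x1 = source, x2 = line stride,
// w3 = block height in rows.

// pixels16: copy a 16-pixel-wide block; with avg=1, average it into
// the existing destination instead. Processes 4 rows per iteration.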
.macro  pixels16        rnd=1, avg=0
  .if \avg
        mov             x12, x0
  .endif
1:      ld1             {v0.16B},  [x1], x2
        ld1             {v1.16B},  [x1], x2
        ld1             {v2.16B},  [x1], x2
        ld1             {v3.16B},  [x1], x2
  .if \avg
        ld1             {v4.16B},  [x12], x2
        urhadd          v0.16B,  v0.16B,  v4.16B
        ld1             {v5.16B},  [x12], x2
        urhadd          v1.16B,  v1.16B,  v5.16B
        ld1             {v6.16B},  [x12], x2
        urhadd          v2.16B,  v2.16B,  v6.16B
        ld1             {v7.16B},  [x12], x2
        urhadd          v3.16B,  v3.16B,  v7.16B
  .endif
        subs            w3,  w3,  #4
        st1             {v0.16B},  [x0], x2
        st1             {v1.16B},  [x0], x2
        st1             {v2.16B},  [x0], x2
        st1             {v3.16B},  [x0], x2
        b.ne            1b
        ret
.endm

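// pixels16_x2: horizontal half-pel interpolation of a 16-wide block,
// averaging each pixel with its right-hand neighbour via the
// rounding-dependent `avg` macro defined by pixfunc below.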
.macro  pixels16_x2     rnd=1, avg=0
1:      ld1             {v0.16B, v1.16B}, [x1], x2
        ld1             {v2.16B, v3.16B}, [x1], x2
        subs            w3,  w3,  #2
        ext             v1.16B,  v0.16B,  v1.16B,  #1
        avg             v0.16B,  v0.16B,  v1.16B
        ext             v3.16B,  v2.16B,  v3.16B,  #1
        avg             v2.16B,  v2.16B,  v3.16B
  .if \avg
        ld1             {v1.16B}, [x0], x2
        ld1             {v3.16B}, [x0]
        urhadd          v0.16B,  v0.16B,  v1.16B
        urhadd          v2.16B,  v2.16B,  v3.16B
        sub             x0,  x0,  x2
  .endif
        st1             {v0.16B}, [x0], x2
        st1             {v2.16B}, [x0], x2
        b.ne            1b
        ret
.endm

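// pixels16_y2: vertical half-pel interpolation, averaging each row
// with the following row; the last two output rows are produced
// after the loop.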
.macro  pixels16_y2     rnd=1, avg=0
        sub             w3,  w3,  #2
        ld1             {v0.16B}, [x1], x2
        ld1             {v1.16B}, [x1], x2
1:      subs            w3,  w3,  #2
        avg             v2.16B,  v0.16B,  v1.16B
        ld1             {v0.16B}, [x1], x2
        avg             v3.16B,  v0.16B,  v1.16B
        ld1             {v1.16B}, [x1], x2
  .if \avg
        ld1             {v4.16B}, [x0], x2
        ld1             {v5.16B}, [x0]
        urhadd          v2.16B,  v2.16B,  v4.16B
        urhadd          v3.16B,  v3.16B,  v5.16B
        sub             x0,  x0,  x2
  .endif
        st1             {v2.16B}, [x0], x2
        st1             {v3.16B}, [x0], x2
        b.ne            1b

        avg             v2.16B,  v0.16B,  v1.16B
        ld1             {v0.16B}, [x1], x2
        avg             v3.16B,  v0.16B,  v1.16B
  .if \avg
        ld1             {v4.16B}, [x0], x2
        ld1             {v5.16B}, [x0]
        urhadd          v2.16B,  v2.16B,  v4.16B
        urhadd          v3.16B,  v3.16B,  v5.16B
        sub             x0,  x0,  x2
  .endif
        st1             {v2.16B},     [x0], x2
        st1             {v3.16B},     [x0], x2

        ret
.endm

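// pixels16_xy2: 2D half-pel interpolation. Each output pixel is the
// average of a 2x2 source neighbourhood, computed with widening adds
// and a shift by 2; the NRND/mshrn helpers (see pixfunc) select the
// rounded or truncating behaviour depending on the rnd parameter.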
.macro  pixels16_xy2    rnd=1, avg=0
        sub             w3,  w3,  #2
        ld1             {v0.16B, v1.16B}, [x1], x2
        ld1             {v4.16B, v5.16B}, [x1], x2
NRND    movi            v26.8H, #1
        ext             v1.16B,  v0.16B,  v1.16B,  #1
        ext             v5.16B,  v4.16B,  v5.16B,  #1
        uaddl           v16.8H,  v0.8B,   v1.8B
        uaddl2          v20.8H,  v0.16B,  v1.16B
        uaddl           v18.8H,  v4.8B,   v5.8B
        uaddl2          v22.8H,  v4.16B,  v5.16B
1:      subs            w3,  w3,  #2
        ld1             {v0.16B, v1.16B}, [x1], x2
        add             v24.8H,  v16.8H,  v18.8H
NRND    add             v24.8H,  v24.8H,  v26.8H
        ext             v30.16B, v0.16B,  v1.16B,  #1
        add             v1.8H,   v20.8H,  v22.8H
        mshrn           v28.8B,  v24.8H,  #2
NRND    add             v1.8H,   v1.8H,   v26.8H
        mshrn2          v28.16B, v1.8H,   #2
  .if \avg
        ld1             {v16.16B},        [x0]
        urhadd          v28.16B, v28.16B, v16.16B
  .endif
        uaddl           v16.8H,  v0.8B,   v30.8B
        ld1             {v2.16B, v3.16B}, [x1], x2
        uaddl2          v20.8H,  v0.16B,  v30.16B
        st1             {v28.16B},        [x0], x2
        add             v24.8H,  v16.8H,  v18.8H
NRND    add             v24.8H,  v24.8H,  v26.8H
        ext             v3.16B,  v2.16B,  v3.16B,  #1
        add             v0.8H,   v20.8H,  v22.8H
        mshrn           v30.8B,  v24.8H,  #2
NRND    add             v0.8H,   v0.8H,   v26.8H
        mshrn2          v30.16B, v0.8H,   #2
  .if \avg
        ld1             {v18.16B},        [x0]
        urhadd          v30.16B, v30.16B, v18.16B
  .endif
        uaddl           v18.8H,   v2.8B,  v3.8B
        uaddl2          v22.8H,   v2.16B, v3.16B
        st1             {v30.16B},        [x0], x2
        b.gt            1b

        ld1             {v0.16B, v1.16B}, [x1], x2
        add             v24.8H,  v16.8H,  v18.8H
NRND    add             v24.8H,  v24.8H,  v26.8H
        ext             v30.16B, v0.16B,  v1.16B,  #1
        add             v1.8H,   v20.8H,  v22.8H
        mshrn           v28.8B,  v24.8H,  #2
NRND    add             v1.8H,   v1.8H,   v26.8H
        mshrn2          v28.16B, v1.8H,   #2
  .if \avg
        ld1             {v16.16B},        [x0]
        urhadd          v28.16B, v28.16B, v16.16B
  .endif
        uaddl           v16.8H,  v0.8B,   v30.8B
        uaddl2          v20.8H,  v0.16B,  v30.16B
        st1             {v28.16B},        [x0], x2
        add             v24.8H,  v16.8H,  v18.8H
NRND    add             v24.8H,  v24.8H,  v26.8H
        add             v0.8H,   v20.8H,  v22.8H
        mshrn           v30.8B,  v24.8H,  #2
NRND    add             v0.8H,   v0.8H,   v26.8H
        mshrn2          v30.16B, v0.8H,   #2
  .if \avg
        ld1             {v18.16B},        [x0]
        urhadd          v30.16B, v30.16B, v18.16B
  .endif
        st1             {v30.16B},        [x0], x2

        ret
.endm

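// pixels8: 8-pixel-wide variant of pixels16.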
.macro  pixels8         rnd=1, avg=0
1:      ld1             {v0.8B}, [x1], x2
        ld1             {v1.8B}, [x1], x2
        ld1             {v2.8B}, [x1], x2
        ld1             {v3.8B}, [x1], x2
  .if \avg
        ld1             {v4.8B}, [x0], x2
        urhadd          v0.8B,  v0.8B,  v4.8B
        ld1             {v5.8B}, [x0], x2
        urhadd          v1.8B,  v1.8B,  v5.8B
        ld1             {v6.8B}, [x0], x2
        urhadd          v2.8B,  v2.8B,  v6.8B
        ld1             {v7.8B}, [x0], x2
        urhadd          v3.8B,  v3.8B,  v7.8B
        sub             x0,  x0,  x2,  lsl #2
  .endif
        subs            w3,  w3,  #4
        st1             {v0.8B}, [x0], x2
        st1             {v1.8B}, [x0], x2
        st1             {v2.8B}, [x0], x2
        st1             {v3.8B}, [x0], x2
        b.ne            1b
        ret
.endm

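// pixels8_x2: 8-pixel-wide variant of pixels16_x2.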
.macro  pixels8_x2      rnd=1, avg=0
1:      ld1             {v0.8B, v1.8B}, [x1], x2
        ext             v1.8B,  v0.8B,  v1.8B,  #1
        ld1             {v2.8B, v3.8B}, [x1], x2
        ext             v3.8B,  v2.8B,  v3.8B,  #1
        subs            w3,  w3,  #2
        avg             v0.8B,   v0.8B,   v1.8B
        avg             v2.8B,   v2.8B,   v3.8B
  .if \avg
        ld1             {v4.8B},     [x0], x2
        ld1             {v5.8B},     [x0]
        urhadd          v0.8B,   v0.8B,   v4.8B
        urhadd          v2.8B,   v2.8B,   v5.8B
        sub             x0,  x0,  x2
  .endif
        st1             {v0.8B}, [x0], x2
        st1             {v2.8B}, [x0], x2
        b.ne            1b
        ret
.endm

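// pixels8_y2: 8-pixel-wide variant of pixels16_y2.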
.macro  pixels8_y2      rnd=1, avg=0
        sub             w3,  w3,  #2
        ld1             {v0.8B},  [x1], x2
        ld1             {v1.8B},  [x1], x2
1:      subs            w3,  w3,  #2
        avg             v4.8B,  v0.8B,  v1.8B
        ld1             {v0.8B},  [x1], x2
        avg             v5.8B,  v0.8B,  v1.8B
        ld1             {v1.8B},  [x1], x2
  .if \avg
        ld1             {v2.8B},     [x0], x2
        ld1             {v3.8B},     [x0]
        urhadd          v4.8B,  v4.8B,  v2.8B
        urhadd          v5.8B,  v5.8B,  v3.8B
        sub             x0,  x0,  x2
  .endif
        st1             {v4.8B},     [x0], x2
        st1             {v5.8B},     [x0], x2
        b.ne            1b

        avg             v4.8B,  v0.8B,  v1.8B
        ld1             {v0.8B},  [x1], x2
        avg             v5.8B,  v0.8B,  v1.8B
  .if \avg
        ld1             {v2.8B},     [x0], x2
        ld1             {v3.8B},     [x0]
        urhadd          v4.8B,  v4.8B,  v2.8B
        urhadd          v5.8B,  v5.8B,  v3.8B
        sub             x0,  x0,  x2
  .endif
        st1             {v4.8B},     [x0], x2
        st1             {v5.8B},     [x0], x2

        ret
.endm

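// pixels8_xy2: 8-pixel-wide variant of pixels16_xy2.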
.macro  pixels8_xy2     rnd=1, avg=0
        sub             w3,  w3,  #2
        ld1             {v0.16B},     [x1], x2
        ld1             {v1.16B},     [x1], x2
NRND    movi            v19.8H, #1
        ext             v4.16B,  v0.16B,  v4.16B,  #1
        ext             v6.16B,  v1.16B,  v6.16B,  #1
        uaddl           v16.8H,  v0.8B,  v4.8B
        uaddl           v17.8H,  v1.8B,  v6.8B
1:      subs            w3,  w3,  #2
        ld1             {v0.16B},     [x1], x2
        add             v18.8H, v16.8H,  v17.8H
        ext             v4.16B,  v0.16B,  v4.16B,  #1
NRND    add             v18.8H, v18.8H, v19.8H
        uaddl           v16.8H,  v0.8B,  v4.8B
        mshrn           v5.8B,  v18.8H, #2
        ld1             {v1.16B},     [x1], x2
        add             v18.8H, v16.8H,  v17.8H
  .if \avg
        ld1             {v7.8B},     [x0]
        urhadd          v5.8B,  v5.8B,  v7.8B
  .endif
NRND    add             v18.8H, v18.8H, v19.8H
        st1             {v5.8B},     [x0], x2
        mshrn           v7.8B,  v18.8H, #2
  .if \avg
        ld1             {v5.8B},     [x0]
        urhadd          v7.8B,  v7.8B,  v5.8B
  .endif
        ext             v6.16B,  v1.16B,  v6.16B,  #1
        uaddl           v17.8H,  v1.8B,   v6.8B
        st1             {v7.8B},     [x0], x2
        b.gt            1b

        ld1             {v0.16B},     [x1], x2
        add             v18.8H, v16.8H, v17.8H
        ext             v4.16B, v0.16B, v4.16B,  #1
NRND    add             v18.8H, v18.8H, v19.8H
        uaddl           v16.8H,  v0.8B, v4.8B
        mshrn           v5.8B,  v18.8H, #2
        add             v18.8H, v16.8H, v17.8H
  .if \avg
        ld1             {v7.8B},     [x0]
        urhadd          v5.8B,  v5.8B,  v7.8B
  .endif
NRND    add             v18.8H, v18.8H, v19.8H
        st1             {v5.8B},     [x0], x2
        mshrn           v7.8B,  v18.8H, #2
  .if \avg
        ld1             {v5.8B},     [x0]
        urhadd          v7.8B,  v7.8B,  v5.8B
  .endif
        st1             {v7.8B},     [x0], x2

        ret
.endm

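// pixfunc: instantiate one exported function from a macro above.
// For rnd=1 the avg/mshrn/mshrn2 helpers use the rounding forms
// (urhadd, rshrn) and NRND expands to nothing; for rnd=0 they use the
// truncating forms (uhadd, shrn) and NRND emits its argument, adding
// the bias required by the no-rounding variants.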
.macro  pixfunc         pfx, name, suf, rnd=1, avg=0
  .if \rnd
    .macro avg  rd, rn, rm
        urhadd          \rd, \rn, \rm
    .endm
    .macro mshrn rd, rn, rm
        rshrn           \rd, \rn, \rm
    .endm
    .macro mshrn2 rd, rn, rm
        rshrn2          \rd, \rn, \rm
    .endm
    .macro NRND insn:vararg
    .endm
  .else
    .macro avg  rd, rn, rm
        uhadd           \rd, \rn, \rm
    .endm
    .macro mshrn rd, rn, rm
        shrn            \rd, \rn, \rm
    .endm
    .macro mshrn2 rd, rn, rm
        shrn2           \rd, \rn, \rm
    .endm
    .macro NRND insn:vararg
        \insn
    .endm
  .endif
function ff_\pfx\name\suf\()_neon, export=1
        \name           \rnd, \avg
endfunc
        .purgem         avg
        .purgem         mshrn
        .purgem         mshrn2
        .purgem         NRND
.endm

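// pixfunc2: generate both the rounding and the _no_rnd variant.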
.macro  pixfunc2        pfx, name, avg=0
        pixfunc         \pfx, \name,          rnd=1, avg=\avg
        pixfunc         \pfx, \name, _no_rnd, rnd=0, avg=\avg
.endm

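// The h264_qpel*_mc00 functions only load the block height into w3
// and fall through into the ff_{put,avg}_pixels* function generated
// immediately after them, since the (0,0) quarter-pel position is a
// plain copy or average.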
function ff_put_h264_qpel16_mc00_neon, export=1
        mov             w3,  #16
endfunc

        pixfunc         put_, pixels16,     avg=0
        pixfunc2        put_, pixels16_x2,  avg=0
        pixfunc2        put_, pixels16_y2,  avg=0
        pixfunc2        put_, pixels16_xy2, avg=0

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             w3,  #16
endfunc

        pixfunc         avg_, pixels16,     avg=1
        pixfunc2        avg_, pixels16_x2,  avg=1
        pixfunc2        avg_, pixels16_y2,  avg=1
        pixfunc2        avg_, pixels16_xy2, avg=1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov             w3,  #8
endfunc

        pixfunc         put_, pixels8,     avg=0
        pixfunc2        put_, pixels8_x2,  avg=0
        pixfunc2        put_, pixels8_y2,  avg=0
        pixfunc2        put_, pixels8_xy2, avg=0

function ff_avg_h264_qpel8_mc00_neon, export=1
        mov             w3,  #8
endfunc

        pixfunc         avg_, pixels8,     avg=1
        pixfunc         avg_, pixels8_x2,  avg=1
        pixfunc         avg_, pixels8_y2,  avg=1
        pixfunc         avg_, pixels8_xy2, avg=1