1/*
2 * ARM NEON IDCT
3 *
4 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
5 * Copyright (c) 2017 Matthieu Bouron <matthieu.bouron@gmail.com>
6 *
7 * Based on Simple IDCT
8 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
9 *
10 * This file is part of FFmpeg.
11 *
12 * FFmpeg is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License as published by the Free Software Foundation; either
15 * version 2.1 of the License, or (at your option) any later version.
16 *
17 * FFmpeg is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20 * Lesser General Public License for more details.
21 *
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with FFmpeg; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 */
26
27#include "libavutil/aarch64/asm.S"
28
/*
 * Shifts used by the two passes: idct_row4_neon narrows with ROW_SHIFT, the
 * column pass keeps the high 16 bits of each 32-bit result and the exported
 * functions shift by the remaining COL_SHIFT-16 bits.
 */
#define ROW_SHIFT 11
#define COL_SHIFT 20

/*
 * Fixed-point coefficients, Zi = cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 * NOTE(review): for i = 4 the formula evaluates to 16384 while Z4 is 16383;
 * presumably intentional, confirm against the C simple IDCT before touching.
 */
#define Z1  22725  // i = 1
#define Z2  21407  // i = 2
#define Z3  19266  // i = 3
#define Z4  16383  // i = 4
#define Z5  12873  // i = 5
#define Z6  8867   // i = 6
#define Z7  4520   // i = 7

/*
 * Added to the first input of the column pass before that input is multiplied
 * by Z4, so the product carries a bias of roughly 1 << (COL_SHIFT-1).
 */
#define Z4c ((1<<(COL_SHIFT-1))/Z4)
39
/*
 * Lane aliases for the coefficient table.  idct_start loads idct_coeff_neon
 * into v0, so every coefficient can be used as a by-element multiplier.
 */
#define z1  v0.h[0]
#define z2  v0.h[1]
#define z3  v0.h[2]
#define z4  v0.h[3]
#define z5  v0.h[4]
#define z6  v0.h[5]
#define z7  v0.h[6]
#define z4c v0.h[7]
48
/*
 * Eight 16-bit coefficients, stored in the lane order assumed by the
 * z1..z7/z4c aliases (lane 0 = Z1 ... lane 7 = Z4c).
 */
const   idct_coeff_neon, align=4
        .short          Z1, Z2, Z3, Z4
        .short          Z5, Z6, Z7, Z4c
endconst
52
/*
 * Common prologue of the exported IDCT entry points.
 *
 * \data  register holding the address to prefetch
 *
 * Copies the link register to x10 (the entry points issue bl afterwards and
 * leave through idct_end, which branches to x10) and loads the eight 16-bit
 * coefficients of idct_coeff_neon into v0.
 * Clobbers x3, x10 and v0.
 */
.macro idct_start data
        prfm            pldl1keep, [\data]              // prefetch for load
        mov             x10, x30                        // keep the return address
        movrel          x3, idct_coeff_neon
        ld1             {v0.2d}, [x3]                   // v0 = Z1..Z7, Z4c
.endm
59
/*
 * Common epilogue of the exported IDCT entry points.
 *
 * idct_start copies the return address of the caller to x10, because the
 * entry points overwrite x30 with their own bl calls.  Leave with "ret x10"
 * instead of "br x10": both branch to the same address, but ret is the
 * function-return form of the branch, so return-address prediction stays
 * paired with the bl of the caller, and no BTI landing pad is needed at the
 * return site when branch target identification is enabled.
 */
.macro idct_end
        ret             x10
.endm
63
/*
 * Lets "smull\i" with \i = 1 resolve to the plain (lower-half) smull, while
 * \i = 2 names the real smull2 instruction.
 */
.macro smull1 dst, lhs, rhs
        smull           \dst, \lhs, \rhs
.endm
67
/*
 * Lets "smlal\i" with \i = 1 resolve to the plain (lower-half) smlal, while
 * \i = 2 names the real smlal2 instruction.
 */
.macro smlal1 dst, lhs, rhs
        smlal           \dst, \lhs, \rhs
.endm
71
/*
 * Lets "smlsl\i" with \i = 1 resolve to the plain (lower-half) smlsl, while
 * \i = 2 names the real smlsl2 instruction.
 */
.macro smlsl1 dst, lhs, rhs
        smlsl           \dst, \lhs, \rhs
.endm
75
/*
 * Shared first stage of a four-lane transform step.
 *
 * In:   v23        32-bit base term prepared by the caller
 *       \y2..\y4   source vector registers
 *       \i         1 or 2: selects smull/smlal/smlsl (via the *1 wrapper
 *                  macros) or the smull2/smlal2/smlsl2 instructions
 *       \l         arrangement suffix appended to \y2..\y4 (callers pass
 *                  the 4-halfword form with \i = 1, the 8-halfword form
 *                  with \i = 2)
 *       \y1        not referenced in the body
 *
 * Out:  v19 = v23 + y3*z2        v17 = y2*z1 + y4*z3
 *       v20 = v23 + y3*z6        v18 = y2*z3 - y4*z7
 *       v21 = v23 - y3*z6        v5  = y2*z5 - y4*z1
 *       v22 = v23 - y3*z2        v6  = y2*z7 - y4*z5
 *
 * Clobbers v7 and v16, which are left holding y3*z2 and y3*z6.
 * The multiplies are interleaved with the additions that consume earlier
 * products; the instruction order is kept as is.
 */
.macro idct_col4_top y1, y2, y3, y4, i, l
        smull\i         v7.4s,  \y3\l, z2
        smull\i         v16.4s, \y3\l, z6
        smull\i         v17.4s, \y2\l, z1
        add             v19.4s, v23.4s, v7.4s
        smull\i         v18.4s, \y2\l, z3
        add             v20.4s, v23.4s, v16.4s
        smull\i         v5.4s,  \y2\l, z5
        sub             v21.4s, v23.4s, v16.4s
        smull\i         v6.4s,  \y2\l, z7
        sub             v22.4s, v23.4s, v7.4s

        smlal\i         v17.4s, \y4\l, z3
        smlsl\i         v18.4s, \y4\l, z7
        smlsl\i         v5.4s,  \y4\l, z1
        smlsl\i         v6.4s,  \y4\l, z5
.endm
93
/*
 * First pass over 64 bytes of coefficients read from [x2].
 *
 * \y1..\y4  vector registers (named without arrangement) that receive the
 *           four 8 x 16-bit vectors loaded from [x2]; x2 advances by 64
 * \pass     number used as the numeric local label of the shortcut target
 *
 * Each of the four 32-bit lanes runs one 8-point transform.  Its inputs
 * 0..3 are that lane of the low halves of \y1..\y4 and its inputs 4..7 are
 * the same lane of the high halves.  When the high halves of all four
 * vectors are zero, the terms for inputs 4..7 are skipped.
 *
 * Results are shifted right by ROW_SHIFT (the rounding constant is folded
 * into v23 up front) and narrowed to 16 bits: outputs 0..3 go to the low
 * halves of \y1..\y4 and outputs 4..7 to the high halves.  The final
 * trn1/trn2 sequence transposes each 4x4 half, so on exit \y1..\y4 each
 * hold the eight outputs of one lane.
 *
 * Clobbers x3, v5-v7, v16-v23 and the condition flags.
 */
.macro idct_row4_neon y1, y2, y3, y4, pass
        ld1             {\y1\().2d,\y2\().2d}, [x2], #32
        movi            v23.4s, #1<<2, lsl #8           // 1 << 10 = 1 << (ROW_SHIFT-1)
        orr             v5.16b, \y1\().16b, \y2\().16b
        ld1             {\y3\().2d,\y4\().2d}, [x2], #32
        orr             v6.16b, \y3\().16b, \y4\().16b
        orr             v5.16b, v5.16b, v6.16b
        mov             x3, v5.d[1]                     // OR of all four high halves
        smlal           v23.4s, \y1\().4h, z4           // base = rounding + in0*z4

        idct_col4_top   \y1, \y2, \y3, \y4, 1, .4h

        cmp             x3, #0
        b.eq            \pass\()f                       // inputs 4..7 are all zero

        // contributions of the high halves (inputs 4..7)
        smull2          v7.4s, \y1\().8h, z4
        smlal2          v17.4s, \y2\().8h, z5
        smlsl2          v18.4s, \y2\().8h, z1
        smull2          v16.4s, \y3\().8h, z2
        smlal2          v5.4s, \y2\().8h, z7
        add             v19.4s, v19.4s, v7.4s
        sub             v20.4s, v20.4s, v7.4s
        sub             v21.4s, v21.4s, v7.4s
        add             v22.4s, v22.4s, v7.4s
        smlal2          v6.4s, \y2\().8h, z3
        smull2          v7.4s, \y3\().8h, z6
        smlal2          v17.4s, \y4\().8h, z7
        smlsl2          v18.4s, \y4\().8h, z5
        smlal2          v5.4s, \y4\().8h, z3
        smlsl2          v6.4s, \y4\().8h, z1
        add             v19.4s, v19.4s, v7.4s
        sub             v20.4s, v20.4s, v16.4s
        add             v21.4s, v21.4s, v16.4s
        sub             v22.4s, v22.4s, v7.4s

        // butterflies: sums feed outputs 0..3, differences feed outputs 7..4
\pass:  add             \y3\().4s, v19.4s, v17.4s
        add             \y4\().4s, v20.4s, v18.4s
        shrn            \y1\().4h, \y3\().4s, #ROW_SHIFT
        shrn            \y2\().4h, \y4\().4s, #ROW_SHIFT
        add             v7.4s, v21.4s, v5.4s
        add             v16.4s, v22.4s, v6.4s
        shrn            \y3\().4h, v7.4s, #ROW_SHIFT
        shrn            \y4\().4h, v16.4s, #ROW_SHIFT
        sub             v22.4s, v22.4s, v6.4s
        sub             v19.4s, v19.4s, v17.4s
        sub             v21.4s, v21.4s, v5.4s
        shrn2           \y1\().8h, v22.4s, #ROW_SHIFT
        sub             v20.4s, v20.4s, v18.4s
        shrn2           \y2\().8h, v21.4s, #ROW_SHIFT
        shrn2           \y3\().8h, v20.4s, #ROW_SHIFT
        shrn2           \y4\().8h, v19.4s, #ROW_SHIFT

        // transpose the two 4x4 blocks of halfwords
        trn1            v16.8h, \y1\().8h, \y2\().8h
        trn2            v17.8h, \y1\().8h, \y2\().8h
        trn1            v18.8h, \y3\().8h, \y4\().8h
        trn2            v19.8h, \y3\().8h, \y4\().8h
        trn1            \y1\().4s, v16.4s, v18.4s
        trn1            \y2\().4s, v17.4s, v19.4s
        trn2            \y3\().4s, v16.4s, v18.4s
        trn2            \y4\().4s, v17.4s, v19.4s
.endm
155
/*
 * Emits the helper function idct_col4_neon\i, the second pass over one half
 * of the registers v24-v31.
 *
 * \i  1 or 2: work on the low (1) or high (2) 64 bits of v24-v31
 * \l  arrangement suffix handed to idct_col4_top for v25-v27
 *
 * In:   v24..v31  inputs 0..7 of the transform, four 16-bit lanes each in
 *                 the selected half
 *       v0        coefficient table (z1..z7, z4c)
 * Out:  high 16 bits of the 32-bit results, four lanes each:
 *       v7  = output 0 | output 1      v17 = output 4 | output 5
 *       v16 = output 2 | output 3      v18 = output 6 | output 7
 *       (low half | high half of the register)
 * Clobbers x4, x5, v5, v6, v19-v23 and the condition flags.
 *
 * The selected half of each of v28..v31 is copied to a general register
 * and tested; a half that is entirely zero has its terms skipped.
 */
.macro declare_idct_col4_neon i, l
function idct_col4_neon\i
        dup             v23.4h, z4c
.if \i == 1
        add             v23.4h, v23.4h, v24.4h
.else
        mov             v5.d[0], v24.d[1]               // bring the high half down
        add             v23.4h, v23.4h, v5.4h
.endif
        smull           v23.4s, v23.4h, z4              // (in0 + z4c) * z4

        idct_col4_top   v24, v25, v26, v27, \i, \l

        mov             x4, v28.d[\i - 1]
        mov             x5, v29.d[\i - 1]
        cmp             x4, #0
        b.eq            1f

        // input 4
        smull\i         v7.4s,  v28\l,  z4
        add             v19.4s, v19.4s, v7.4s
        sub             v20.4s, v20.4s, v7.4s
        sub             v21.4s, v21.4s, v7.4s
        add             v22.4s, v22.4s, v7.4s

1:      mov             x4, v30.d[\i - 1]
        cmp             x5, #0
        b.eq            2f

        // input 5
        smlal\i         v17.4s, v29\l, z5
        smlsl\i         v18.4s, v29\l, z1
        smlal\i         v5.4s,  v29\l, z7
        smlal\i         v6.4s,  v29\l, z3

2:      mov             x5, v31.d[\i - 1]
        cmp             x4, #0
        b.eq            3f

        // input 6
        smull\i         v7.4s,  v30\l, z6
        smull\i         v16.4s, v30\l, z2
        add             v19.4s, v19.4s, v7.4s
        sub             v22.4s, v22.4s, v7.4s
        sub             v20.4s, v20.4s, v16.4s
        add             v21.4s, v21.4s, v16.4s

3:      cmp             x5, #0
        b.eq            4f

        // input 7
        smlal\i         v17.4s, v31\l, z7
        smlsl\i         v18.4s, v31\l, z5
        smlal\i         v5.4s,  v31\l, z3
        smlsl\i         v6.4s,  v31\l, z1

        // final butterflies; v17 and v18 are read before they are reused
4:      addhn           v7.4h, v19.4s, v17.4s
        addhn2          v7.8h, v20.4s, v18.4s
        subhn           v18.4h, v20.4s, v18.4s
        subhn2          v18.8h, v19.4s, v17.4s

        addhn           v16.4h, v21.4s, v5.4s
        addhn2          v16.8h, v22.4s, v6.4s
        subhn           v17.4h, v22.4s, v6.4s
        subhn2          v17.8h, v21.4s, v5.4s

        ret
endfunc
.endm

declare_idct_col4_neon 1, .4h
declare_idct_col4_neon 2, .8h
224
/*
 * ff_simple_idct_put_neon: transform 128 bytes of 16-bit coefficients and
 * store the result as eight rows of eight unsigned bytes.
 *
 * In:  x0  destination; one 8-byte row is stored, then x0 advances by x1
 *      x1  distance between destination rows
 *      x2  coefficients, read (and advanced) by the two row passes
 * NOTE(review): presumably declared in C as
 *      (uint8_t *dest, ptrdiff_t stride, int16_t *block) - confirm.
 *
 * Clobbers x3-x5, x10, v0-v7 and v16-v31.
 */
function ff_simple_idct_put_neon, export=1
        idct_start      x2

        // first pass fills v24-v27, then v28-v31
        idct_row4_neon  v24, v25, v26, v27, 1
        idct_row4_neon  v28, v29, v30, v31, 2

        // second pass on the low halves; narrow to bytes with unsigned
        // saturation, applying the rest of COL_SHIFT
        bl              idct_col4_neon1

        sqshrun         v1.8b,  v7.8h, #COL_SHIFT-16
        sqshrun2        v1.16b, v16.8h, #COL_SHIFT-16
        sqshrun         v3.8b,  v17.8h, #COL_SHIFT-16
        sqshrun2        v3.16b, v18.8h, #COL_SHIFT-16

        // second pass on the high halves
        bl              idct_col4_neon2

        sqshrun         v2.8b,  v7.8h, #COL_SHIFT-16
        sqshrun2        v2.16b, v16.8h, #COL_SHIFT-16
        sqshrun         v4.8b,  v17.8h, #COL_SHIFT-16
        sqshrun2        v4.16b, v18.8h, #COL_SHIFT-16

        // pair the 4-byte pieces of both halves into full 8-byte rows,
        // two rows per register, and write them out
        zip1            v16.4s, v1.4s, v2.4s
        zip2            v17.4s, v1.4s, v2.4s

        st1             {v16.d}[0], [x0], x1
        st1             {v16.d}[1], [x0], x1

        zip1            v18.4s, v3.4s, v4.4s
        zip2            v19.4s, v3.4s, v4.4s

        st1             {v17.d}[0], [x0], x1
        st1             {v17.d}[1], [x0], x1
        st1             {v18.d}[0], [x0], x1
        st1             {v18.d}[1], [x0], x1
        st1             {v19.d}[0], [x0], x1
        st1             {v19.d}[1], [x0], x1

        idct_end
endfunc
262
/*
 * ff_simple_idct_add_neon: transform 128 bytes of 16-bit coefficients and
 * add the result to eight rows of eight unsigned bytes, saturating each sum
 * to the unsigned 8-bit range.
 *
 * In:  x0  destination; all eight rows are loaded through x0 before the
 *          first store, the stores go through the copy in x9
 *      x1  distance between destination rows
 *      x2  coefficients, read (and advanced) by the two row passes
 * NOTE(review): presumably declared in C as
 *      (uint8_t *dest, ptrdiff_t stride, int16_t *block) - confirm.
 *
 * Clobbers x3-x5, x9, x10, v0-v7 and v16-v31.
 * Loads, arithmetic and stores are interleaved on purpose; the instruction
 * order is kept as is.
 */
function ff_simple_idct_add_neon, export=1
        idct_start      x2

        // first pass fills v24-v27, then v28-v31
        idct_row4_neon  v24, v25, v26, v27, 1
        idct_row4_neon  v28, v29, v30, v31, 2

        // second pass on the low halves, results parked in v1-v4
        bl              idct_col4_neon1

        sshr            v1.8h, v7.8h, #COL_SHIFT-16
        sshr            v2.8h, v16.8h, #COL_SHIFT-16
        sshr            v3.8h, v17.8h, #COL_SHIFT-16
        sshr            v4.8h, v18.8h, #COL_SHIFT-16

        // second pass on the high halves, shifted in place
        bl              idct_col4_neon2

        sshr            v7.8h, v7.8h, #COL_SHIFT-16
        sshr            v16.8h, v16.8h, #COL_SHIFT-16
        sshr            v17.8h, v17.8h, #COL_SHIFT-16
        sshr            v18.8h, v18.8h, #COL_SHIFT-16

        // zip both halves into full rows (v23-v30 = rows 0-7) while the
        // destination rows arrive in v19-v22, two rows per register
        mov             x9,  x0
        ld1             {v19.d}[0], [x0], x1
        zip1            v23.2d, v1.2d, v7.2d
        zip2            v24.2d, v1.2d, v7.2d
        ld1             {v19.d}[1], [x0], x1
        zip1            v25.2d, v2.2d, v16.2d
        zip2            v26.2d, v2.2d, v16.2d
        ld1             {v20.d}[0], [x0], x1
        zip1            v27.2d, v3.2d, v17.2d
        zip2            v28.2d, v3.2d, v17.2d
        ld1             {v20.d}[1], [x0], x1
        zip1            v29.2d, v4.2d, v18.2d
        zip2            v30.2d, v4.2d, v18.2d
        ld1             {v21.d}[0], [x0], x1

        // widen-add the destination bytes, then narrow with saturation;
        // v23-v26 end up holding two finished rows each
        uaddw           v23.8h, v23.8h, v19.8b
        uaddw2          v24.8h, v24.8h, v19.16b
        ld1             {v21.d}[1], [x0], x1
        sqxtun          v23.8b, v23.8h
        sqxtun2         v23.16b, v24.8h
        ld1             {v22.d}[0], [x0], x1
        uaddw           v24.8h, v25.8h, v20.8b
        uaddw2          v25.8h, v26.8h, v20.16b
        ld1             {v22.d}[1], [x0], x1
        sqxtun          v24.8b, v24.8h
        sqxtun2         v24.16b, v25.8h
        st1             {v23.d}[0], [x9], x1
        uaddw           v25.8h, v27.8h, v21.8b
        uaddw2          v26.8h, v28.8h, v21.16b
        st1             {v23.d}[1], [x9], x1
        sqxtun          v25.8b, v25.8h
        sqxtun2         v25.16b, v26.8h
        st1             {v24.d}[0], [x9], x1
        uaddw           v26.8h, v29.8h, v22.8b
        uaddw2          v27.8h, v30.8h, v22.16b
        st1             {v24.d}[1], [x9], x1
        sqxtun          v26.8b, v26.8h
        sqxtun2         v26.16b, v27.8h
        st1             {v25.d}[0], [x9], x1
        st1             {v25.d}[1], [x9], x1
        st1             {v26.d}[0], [x9], x1
        st1             {v26.d}[1], [x9], x1

        idct_end
endfunc
326
/*
 * ff_simple_idct_neon: transform 128 bytes of 16-bit coefficients in place.
 *
 * In:  x0  coefficients; the same 128 bytes are overwritten with eight
 *          rows of eight 16-bit results
 * NOTE(review): presumably declared in C as (int16_t *block) - confirm.
 *
 * Clobbers x2-x5, x10, v0-v7 and v16-v31.
 */
function ff_simple_idct_neon, export=1
        idct_start      x0

        // the row passes read through x2 and advance it by 128 in total
        mov             x2,  x0
        idct_row4_neon  v24, v25, v26, v27, 1
        idct_row4_neon  v28, v29, v30, v31, 2
        sub             x2, x2, #128                    // rewind for the stores

        // second pass on the low halves, results parked in v1-v4
        bl              idct_col4_neon1

        sshr            v1.8h, v7.8h, #COL_SHIFT-16
        sshr            v2.8h, v16.8h, #COL_SHIFT-16
        sshr            v3.8h, v17.8h, #COL_SHIFT-16
        sshr            v4.8h, v18.8h, #COL_SHIFT-16

        // second pass on the high halves, shifted in place
        bl              idct_col4_neon2

        sshr            v7.8h, v7.8h, #COL_SHIFT-16
        sshr            v16.8h, v16.8h, #COL_SHIFT-16
        sshr            v17.8h, v17.8h, #COL_SHIFT-16
        sshr            v18.8h, v18.8h, #COL_SHIFT-16

        // zip both halves into full rows and store two rows at a time
        zip1            v23.2d, v1.2d, v7.2d
        zip2            v24.2d, v1.2d, v7.2d
        st1             {v23.2d,v24.2d}, [x2], #32
        zip1            v25.2d, v2.2d, v16.2d
        zip2            v26.2d, v2.2d, v16.2d
        st1             {v25.2d,v26.2d}, [x2], #32
        zip1            v27.2d, v3.2d, v17.2d
        zip2            v28.2d, v3.2d, v17.2d
        st1             {v27.2d,v28.2d}, [x2], #32
        zip1            v29.2d, v4.2d, v18.2d
        zip2            v30.2d, v4.2d, v18.2d
        st1             {v29.2d,v30.2d}, [x2], #32

        idct_end
endfunc
363