/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"

// ff_h264_idct_add_neon: 4x4 inverse transform, result added to pixels.
//   x0  pointer to the first of four 4-byte pixel rows
//   x1  pointer to 16 signed 16-bit coefficients (four vectors of four)
//   w2  distance in bytes between pixel rows (sign-extended to x2 here)
// The coefficients go through two butterfly passes with a transpose in
// between, are rounded with a 6-bit shift, added to the pixels and stored
// back with unsigned saturation.  The 32 coefficient bytes are zeroed.
// On return x1 holds its entry value and x0 has advanced by four rows.
// Clobbers v0-v7, v16-v19, v30 and whatever transpose_4x4H touches.
function ff_h264_idct_add_neon, export=1
.L_ff_h264_idct_add_neon:
        ld1             {v0.4H, v1.4H, v2.4H, v3.4H},  [x1]    // c0..c3
        sxtw            x2,     w2
        movi            v30.8H, #0                      // zero used to clear the coefficients

        // first pass
        add             v4.4H,  v0.4H,  v2.4H           // e0 = c0 + c2
        sshr            v16.4H, v1.4H,  #1              // c1 >> 1
        st1             {v30.8H},    [x1], #16          // clear coefficient bytes 0-15
        sshr            v17.4H, v3.4H,  #1              // c3 >> 1
        st1             {v30.8H},    [x1], #16          // clear coefficient bytes 16-31
        sub             v5.4H,  v0.4H,  v2.4H           // e1 = c0 - c2
        sub             v6.4H,  v16.4H, v3.4H           // e2 = (c1 >> 1) - c3
        add             v7.4H,  v1.4H,  v17.4H          // e3 = c1 + (c3 >> 1)
        add             v0.4H,  v4.4H,  v7.4H           // e0 + e3
        add             v1.4H,  v5.4H,  v6.4H           // e1 + e2
        sub             v2.4H,  v5.4H,  v6.4H           // e1 - e2
        sub             v3.4H,  v4.4H,  v7.4H           // e0 - e3

        transpose_4x4H  v0, v1, v2, v3, v4, v5, v6, v7

        // second pass, interleaved with the pixel loads; t0..t3 = v0..v3
        add             v4.4H,  v0.4H,  v2.4H           // t0 + t2
        ld1             {v18.S}[0], [x0], x2            // pixel row 0
        sshr            v16.4H,  v3.4H,  #1             // t3 >> 1
        sshr            v17.4H,  v1.4H,  #1             // t1 >> 1
        ld1             {v18.S}[1], [x0], x2            // pixel row 1
        sub             v5.4H,  v0.4H,  v2.4H           // t0 - t2
        ld1             {v19.S}[1], [x0], x2            // pixel row 2 goes to the high lane
        add             v6.4H,  v16.4H, v1.4H           // (t3 >> 1) + t1
        ins             v4.D[1],  v5.D[0]               // v4 = { t0 + t2, t0 - t2 }
        sub             v7.4H,  v17.4H, v3.4H           // (t1 >> 1) - t3
        ld1             {v19.S}[0], [x0], x2            // pixel row 3 goes to the low lane
        ins             v6.D[1],  v7.D[0]               // v6 = { t1 + (t3 >> 1), (t1 >> 1) - t3 }
        sub             x0,  x0,  x2, lsl #2            // rewind x0 to row 0
        add             v0.8H,  v4.8H,  v6.8H           // rows 0 (low half) and 1 (high half)
        sub             v1.8H,  v4.8H,  v6.8H           // rows 3 (low half) and 2 (high half)

        srshr           v0.8H,  v0.8H,  #6              // rounding shift right by 6
        srshr           v1.8H,  v1.8H,  #6

        uaddw           v0.8H,  v0.8H,  v18.8B          // add pixels of rows 0 and 1
        uaddw           v1.8H,  v1.8H,  v19.8B          // add pixels of rows 3 and 2

        sqxtun          v0.8B, v0.8H                    // narrow to bytes with unsigned saturation
        sqxtun          v1.8B, v1.8H

        st1             {v0.S}[0],  [x0], x2            // row 0
        st1             {v0.S}[1],  [x0], x2            // row 1
        st1             {v1.S}[1],  [x0], x2            // row 2
        st1             {v1.S}[0],  [x0], x2            // row 3

        sub             x1,  x1,  #32                   // undo the two post-increments above
        ret
endfunc

// ff_h264_idct_dc_add_neon: add one rounded coefficient to a 4x4 pixel area.
//   x0  pointer to the first of four 4-byte pixel rows
//   x1  pointer to the 16-bit coefficient; it is read and then zeroed
//   w2  distance in bytes between pixel rows (sign-extended to x2 here)
// On return x1 is unchanged and x0 has advanced by four rows.
// Clobbers w3 and v0-v4.
function ff_h264_idct_dc_add_neon, export=1
.L_ff_h264_idct_dc_add_neon:
        sxtw            x2,  w2
        mov             w3,       #0
        ld1r            {v2.8H},  [x1]                  // replicate the coefficient into all 8 lanes
        strh            w3,       [x1]                  // clear it in memory
        srshr           v2.8H,  v2.8H,  #6              // rounding shift right by 6
        ld1             {v0.S}[0],  [x0], x2            // pixel row 0
        ld1             {v0.S}[1],  [x0], x2            // pixel row 1
        uaddw           v3.8H,  v2.8H,  v0.8B           // rows 0 and 1 plus the rounded value
        ld1             {v1.S}[0],  [x0], x2            // pixel row 2
        ld1             {v1.S}[1],  [x0], x2            // pixel row 3
        uaddw           v4.8H,  v2.8H,  v1.8B           // rows 2 and 3 plus the rounded value
        sqxtun          v0.8B,  v3.8H                   // narrow to bytes with unsigned saturation
        sqxtun          v1.8B,  v4.8H
        sub             x0,  x0,  x2, lsl #2            // rewind x0 to row 0
        st1             {v0.S}[0],  [x0], x2
        st1             {v0.S}[1],  [x0], x2
        st1             {v1.S}[0],  [x0], x2
        st1             {v1.S}[1],  [x0], x2
        ret
endfunc

// ff_h264_idct_add16_neon: walk 16 consecutive 4x4 blocks and add the
// transform of each block whose table entry is non-zero.
//   x0  destination base pointer
//   x1  table of signed 32-bit offsets from x0, one per block
//   x2  coefficients, 32 bytes per block
//   w3  stride, handed to the per-block routines in w2
//   x4  byte table indexed through scan8
// Per block: entry == 0 -> skip; entry == 1 and first coefficient non-zero
// -> DC-only routine; every other case -> full 4x4 routine.
// x30 is parked in x12 because blr overwrites it; the per-block routines
// in this file do not touch x4-x15.
function ff_h264_idct_add16_neon, export=1
        mov             x12, x30
        mov             x6,  x0         // dest
        mov             x5,  x1         // block_offset
        mov             x1,  x2         // block
        mov             w9,  w3         // stride
        movrel          x7,  scan8
        mov             x10, #16        // blocks left
        movrel          x13, .L_ff_h264_idct_dc_add_neon
        movrel          x14, .L_ff_h264_idct_add_neon
1:      mov             w2,  w9
        ldrb            w3,  [x7], #1           // next scan8 entry
        ldrsw           x0,  [x5], #4           // next block offset
        ldrb            w3,  [x4,  w3,  uxtw]   // table byte selected by the scan8 entry
        subs            w3,  w3,  #1
        b.lt            2f                      // byte was 0: skip this block
        ldrsh           w3,  [x1]               // first coefficient of the block
        add             x0,  x0,  x6
        ccmp            w3,  #0,  #4,  eq       // byte was 1: test the coefficient; otherwise force Z
        csel            x15, x13, x14, ne       // ne only when byte == 1 and coefficient != 0
        blr             x15
2:      subs            x10, x10, #1
        add             x1,  x1,  #32           // both routines return with x1 unchanged
        b.ne            1b
        ret             x12
endfunc

// ff_h264_idct_add16intra_neon: walk 16 consecutive 4x4 blocks.
//   x0  destination base pointer
//   x1  table of signed 32-bit offsets from x0, one per block
//   x2  coefficients, 32 bytes per block
//   w3  stride, handed to the per-block routines in w2
//   x4  byte table indexed through scan8
// Per block: entry != 0 -> full 4x4 routine; entry == 0 and first
// coefficient non-zero -> DC-only routine; both zero -> skip.
// x30 is parked in x12 because blr overwrites it.
function ff_h264_idct_add16intra_neon, export=1
        mov             x12, x30
        mov             x6,  x0         // dest
        mov             x5,  x1         // block_offset
        mov             x1,  x2         // block
        mov             w9,  w3         // stride
        movrel          x7,  scan8
        mov             x10, #16        // blocks left
        movrel          x13, .L_ff_h264_idct_dc_add_neon
        movrel          x14, .L_ff_h264_idct_add_neon
1:      mov             w2,  w9
        ldrb            w3,  [x7], #1           // next scan8 entry
        ldrsw           x0,  [x5], #4           // next block offset
        ldrb            w3,  [x4,  w3,  uxtw]   // table byte selected by the scan8 entry
        add             x0,  x0,  x6
        cmp             w3,  #0
        ldrsh           w3,  [x1]               // first coefficient of the block
        csel            x15, x13, x14, eq       // byte == 0 selects the DC-only routine
        ccmp            w3,  #0,  #0,  eq       // byte == 0: test the coefficient; otherwise clear Z
        b.eq            2f                      // byte and coefficient both zero: skip
        blr             x15
2:      subs            x10, x10, #1
        add             x1,  x1,  #32           // both routines return with x1 unchanged
        b.ne            1b
        ret             x12
endfunc

// ff_h264_idct_add8_neon: process two groups of four 4x4 blocks, one group
// per destination pointer.
//   x0  pointer to two destination pointers (loaded as a pair)
//   x1  table of signed 32-bit offsets; entries from index 16 on are used
//   x2  coefficients; blocks from index 16 on are used (32 bytes each)
//   w3  stride, handed to the per-block routines in w2
//   x4  byte table indexed through scan8
// The loop index runs 0..3 with the first destination, then jumps to
// 16..19 with the second one.  Per block: table byte != 0 -> full 4x4
// routine; byte == 0 and first coefficient non-zero -> DC-only routine;
// both zero -> skip.
// x19/x20 are callee-saved, so they are spilled to a 0x40-byte frame that
// is opened and closed by the pre/post-indexed stp/ldp.  x30 is parked in
// x12 because blr overwrites it.
function ff_h264_idct_add8_neon, export=1
        stp             x19, x20, [sp, #-0x40]!
        mov             x12, x30
        ldp             x6,  x15, [x0]          // dest[0], dest[1]
        add             x5,  x1,  #16*4         // block_offset
        add             x9,  x2,  #16*32        // block
        mov             w19, w3                 // stride
        movrel          x13, .L_ff_h264_idct_dc_add_neon
        movrel          x14, .L_ff_h264_idct_add_neon
        movrel          x7,  scan8, 16
        mov             x10, #0
        mov             x11, #16
1:      mov             w2,  w19
        ldrb            w3,  [x7, x10]          // scan8[i]
        ldrsw           x0,  [x5, x10, lsl #2]  // block_offset[i]
        ldrb            w3,  [x4, w3,  uxtw]    // nnzc[ scan8[i] ]
        add             x0,  x0,  x6            // block_offset[i] + dst[j-1]
        add             x1,  x9,  x10, lsl #5   // block + i * 16
        cmp             w3,  #0
        ldrsh           w3,  [x1]               // block[i*16]
        csel            x20, x13, x14, eq       // byte == 0 selects the DC-only routine
        ccmp            w3,  #0,  #0,  eq       // byte == 0: test the coefficient; otherwise clear Z
        b.eq            2f                      // byte and coefficient both zero: skip
        blr             x20
2:      add             x10, x10, #1
        cmp             x10, #4
        csel            x10, x11, x10, eq     // mov x10, #16
        csel            x6,  x15, x6,  eq     // switch to the second destination
        cmp             x10, #20
        b.lt            1b
        ldp             x19, x20, [sp], #0x40
        ret             x12
endfunc

// idct8x8_cols pass
// One 1-D pass of the 8-point butterfly over eight vectors of 8 halfwords.
//   pass 0: inputs 0-5 are in v24-v29; inputs 6 and 7 are loaded from [x1]
//           into v30/v31 and those 32 bytes are overwritten with v19, which
//           must hold zero on entry if the memory is to be cleared.
//           x1 advances by 32.
//           Outputs 0-5 land in v24-v29, output 6 in v18, output 7 in v31.
//   pass 1: inputs 0-5 in v24-v29, input 6 in v18, input 7 in v31.
//           Outputs 0-7 land in v24-v31.
// v16, v17 and v19 are scratch in both passes; v18 and v30 swap roles
// through the va/vb aliases.  In the comments below in0..in7 are the
// inputs, a0..a7 and b0..b7 the intermediate terms.
.macro  idct8x8_cols    pass
  .if \pass == 0
        va      .req    v18
        vb      .req    v30
        sshr            v18.8H, v26.8H, #1              // in2 >> 1
        add             v16.8H, v24.8H, v28.8H          // a0 = in0 + in4
        ld1             {v30.8H, v31.8H}, [x1]          // in6, in7
        st1             {v19.8H}, [x1],  #16            // overwrite them in memory
        st1             {v19.8H}, [x1],  #16
        sub             v17.8H,  v24.8H, v28.8H         // a2 = in0 - in4
        sshr            v19.8H,  v30.8H, #1             // in6 >> 1
        sub             v18.8H,  v18.8H,  v30.8H        // a4 = (in2 >> 1) - in6
        add             v19.8H,  v19.8H,  v26.8H        // a6 = (in6 >> 1) + in2
  .else
        va      .req    v30
        vb      .req    v18
        sshr            v30.8H, v26.8H, #1              // in2 >> 1
        sshr            v19.8H, v18.8H, #1              // in6 >> 1
        add             v16.8H, v24.8H, v28.8H          // a0 = in0 + in4
        sub             v17.8H, v24.8H, v28.8H          // a2 = in0 - in4
        sub             v30.8H, v30.8H, v18.8H          // a4 = (in2 >> 1) - in6
        add             v19.8H, v19.8H, v26.8H          // a6 = (in6 >> 1) + in2
  .endif
        add             v26.8H, v17.8H, va.8H           // b2 = a2 + a4
        sub             v28.8H, v17.8H, va.8H           // b4 = a2 - a4
        add             v24.8H, v16.8H, v19.8H          // b0 = a0 + a6
        sub             vb.8H,  v16.8H, v19.8H          // b6 = a0 - a6
        sub             v16.8H, v29.8H, v27.8H          // in5 - in3
        add             v17.8H, v31.8H, v25.8H          // in7 + in1
        sub             va.8H,  v31.8H, v25.8H          // in7 - in1
        add             v19.8H, v29.8H, v27.8H          // in5 + in3
        sub             v16.8H, v16.8H, v31.8H          //   - in7
        sub             v17.8H, v17.8H, v27.8H          //   - in3
        add             va.8H,  va.8H,  v29.8H          //   + in5
        add             v19.8H, v19.8H, v25.8H          //   + in1
        sshr            v25.8H, v25.8H, #1              // in1 >> 1
        sshr            v27.8H, v27.8H, #1              // in3 >> 1
        sshr            v29.8H, v29.8H, #1              // in5 >> 1
        sshr            v31.8H, v31.8H, #1              // in7 >> 1
        sub             v16.8H, v16.8H, v31.8H          // a1 = in5 - in3 - in7 - (in7 >> 1)
        sub             v17.8H, v17.8H, v27.8H          // a3 = in7 + in1 - in3 - (in3 >> 1)
        add             va.8H,  va.8H,  v29.8H          // a5 = in7 - in1 + in5 + (in5 >> 1)
        add             v19.8H, v19.8H, v25.8H          // a7 = in5 + in3 + in1 + (in1 >> 1)
        sshr            v25.8H, v16.8H, #2              // a1 >> 2
        sshr            v27.8H, v17.8H, #2              // a3 >> 2
        sshr            v29.8H, va.8H,  #2              // a5 >> 2
        sshr            v31.8H, v19.8H, #2              // a7 >> 2
        sub             v19.8H, v19.8H, v25.8H          // b7 = a7 - (a1 >> 2)
        sub             va.8H,  v27.8H, va.8H           // b5 = (a3 >> 2) - a5
        add             v17.8H, v17.8H, v29.8H          // b3 = a3 + (a5 >> 2)
        add             v16.8H, v16.8H, v31.8H          // b1 = a1 + (a7 >> 2)
  .if \pass == 0
        sub             v31.8H, v24.8H, v19.8H          // out7 = b0 - b7
        add             v24.8H, v24.8H, v19.8H          // out0 = b0 + b7
        add             v25.8H, v26.8H, v18.8H          // out1 = b2 + b5
        sub             v18.8H, v26.8H, v18.8H          // out6 = b2 - b5
        add             v26.8H, v28.8H, v17.8H          // out2 = b4 + b3
        add             v27.8H, v30.8H, v16.8H          // out3 = b6 + b1
        sub             v29.8H, v28.8H, v17.8H          // out5 = b4 - b3
        sub             v28.8H, v30.8H, v16.8H          // out4 = b6 - b1
  .else
        sub             v31.8H, v24.8H, v19.8H          // out7 = b0 - b7
        add             v24.8H, v24.8H, v19.8H          // out0 = b0 + b7
        add             v25.8H, v26.8H, v30.8H          // out1 = b2 + b5
        sub             v30.8H, v26.8H, v30.8H          // out6 = b2 - b5
        add             v26.8H, v28.8H, v17.8H          // out2 = b4 + b3
        sub             v29.8H, v28.8H, v17.8H          // out5 = b4 - b3
        add             v27.8H, v18.8H, v16.8H          // out3 = b6 + b1
        sub             v28.8H, v18.8H, v16.8H          // out4 = b6 - b1
  .endif
        .unreq          va
        .unreq          vb
.endm

// ff_h264_idct8_add_neon: 8x8 inverse transform, result added to pixels.
//   x0  pointer to the first of eight 8-byte pixel rows
//   x1  pointer to 64 signed 16-bit coefficients (128 bytes)
//   w2  distance in bytes between pixel rows (sign-extended to x2 here)
// The coefficients go through idct8x8_cols twice with a transpose in
// between, are rounded with a 6-bit shift, added to the pixels and stored
// back with unsigned saturation.  All 128 coefficient bytes are zeroed:
// 96 here and the last 32 inside the first idct8x8_cols expansion.
// On return x1 holds its entry value, x0 has advanced by eight rows and x3
// has been used as the store pointer.
// Clobbers x3, v0-v7 and v16-v19, v24-v31.
function ff_h264_idct8_add_neon, export=1
.L_ff_h264_idct8_add_neon:
        movi            v19.8H,   #0                    // zero used to clear the coefficients
        sxtw            x2,       w2
        ld1             {v24.8H, v25.8H}, [x1]          // coefficient vectors 0, 1
        st1             {v19.8H},  [x1],   #16
        st1             {v19.8H},  [x1],   #16
        ld1             {v26.8H, v27.8H}, [x1]          // coefficient vectors 2, 3
        st1             {v19.8H},  [x1],   #16
        st1             {v19.8H},  [x1],   #16
        ld1             {v28.8H, v29.8H}, [x1]          // coefficient vectors 4, 5
        st1             {v19.8H},  [x1],   #16
        st1             {v19.8H},  [x1],   #16

        idct8x8_cols    0                               // also loads and clears vectors 6, 7
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v18, v31, v6, v7
        idct8x8_cols    1

        mov             x3,  x0                         // x3 = store pointer, x0 = load pointer
        srshr           v24.8H, v24.8H, #6              // rounding shift right by 6, per row
        ld1             {v0.8B},     [x0], x2           // pixel rows 0..7 go to v0..v7
        srshr           v25.8H, v25.8H, #6
        ld1             {v1.8B},     [x0], x2
        srshr           v26.8H, v26.8H, #6
        ld1             {v2.8B},     [x0], x2
        srshr           v27.8H, v27.8H, #6
        ld1             {v3.8B},     [x0], x2
        srshr           v28.8H, v28.8H, #6
        ld1             {v4.8B},     [x0], x2
        srshr           v29.8H, v29.8H, #6
        ld1             {v5.8B},     [x0], x2
        srshr           v30.8H, v30.8H, #6
        ld1             {v6.8B},     [x0], x2
        srshr           v31.8H, v31.8H, #6
        ld1             {v7.8B},     [x0], x2
        uaddw           v24.8H, v24.8H, v0.8B           // add pixels, then narrow with
        uaddw           v25.8H, v25.8H, v1.8B           // unsigned saturation and store
        uaddw           v26.8H, v26.8H, v2.8B
        sqxtun          v0.8B,  v24.8H
        uaddw           v27.8H, v27.8H, v3.8B
        sqxtun          v1.8B,  v25.8H
        uaddw           v28.8H, v28.8H, v4.8B
        sqxtun          v2.8B,  v26.8H
        st1             {v0.8B},     [x3], x2
        uaddw           v29.8H, v29.8H, v5.8B
        sqxtun          v3.8B,  v27.8H
        st1             {v1.8B},     [x3], x2
        uaddw           v30.8H, v30.8H, v6.8B
        sqxtun          v4.8B,  v28.8H
        st1             {v2.8B},     [x3], x2
        uaddw           v31.8H, v31.8H, v7.8B
        sqxtun          v5.8B,  v29.8H
        st1             {v3.8B},     [x3], x2
        sqxtun          v6.8B,  v30.8H
        sqxtun          v7.8B,  v31.8H
        st1             {v4.8B},     [x3], x2
        st1             {v5.8B},     [x3], x2
        st1             {v6.8B},     [x3], x2
        st1             {v7.8B},     [x3], x2

        sub             x1,  x1,  #128                  // undo the eight 16-byte post-increments
        ret
endfunc

// ff_h264_idct8_dc_add_neon: add one rounded coefficient to an 8x8 pixel area.
//   x0  pointer to the first of eight 8-byte pixel rows
//   x1  pointer to the 16-bit coefficient; it is read and then zeroed
//   w2  distance in bytes between pixel rows (sign-extended to x2 here)
// On return x1 is unchanged and x0 has advanced by eight rows.
// Clobbers w3, v0-v7 and v24-v31.
function ff_h264_idct8_dc_add_neon, export=1
.L_ff_h264_idct8_dc_add_neon:
        mov             w3,       #0
        sxtw            x2,       w2
        ld1r            {v31.8H}, [x1]                  // replicate the coefficient into all 8 lanes
        strh            w3,       [x1]                  // clear it in memory
        ld1             {v0.8B},  [x0], x2              // pixel rows 0..7 go to v0..v7
        srshr           v31.8H, v31.8H, #6              // rounding shift right by 6
        ld1             {v1.8B},     [x0], x2
        ld1             {v2.8B},     [x0], x2
        uaddw           v24.8H, v31.8H, v0.8B           // row + rounded value, widened to 16 bits
        ld1             {v3.8B},     [x0], x2
        uaddw           v25.8H, v31.8H, v1.8B
        ld1             {v4.8B},     [x0], x2
        uaddw           v26.8H, v31.8H, v2.8B
        ld1             {v5.8B},     [x0], x2
        uaddw           v27.8H, v31.8H, v3.8B
        ld1             {v6.8B},     [x0], x2
        uaddw           v28.8H, v31.8H, v4.8B
        ld1             {v7.8B},     [x0], x2
        uaddw           v29.8H, v31.8H, v5.8B
        uaddw           v30.8H, v31.8H, v6.8B
        uaddw           v31.8H, v31.8H, v7.8B           // last use of the rounded value; v31 becomes row 7
        sqxtun          v0.8B,  v24.8H                  // narrow to bytes with unsigned saturation
        sqxtun          v1.8B,  v25.8H
        sqxtun          v2.8B,  v26.8H
        sqxtun          v3.8B,  v27.8H
        sub             x0,  x0,  x2, lsl #3            // rewind x0 to row 0
        st1             {v0.8B},     [x0], x2
        sqxtun          v4.8B,  v28.8H
        st1             {v1.8B},     [x0], x2
        sqxtun          v5.8B,  v29.8H
        st1             {v2.8B},     [x0], x2
        sqxtun          v6.8B,  v30.8H
        st1             {v3.8B},     [x0], x2
        sqxtun          v7.8B,  v31.8H
        st1             {v4.8B},     [x0], x2
        st1             {v5.8B},     [x0], x2
        st1             {v6.8B},     [x0], x2
        st1             {v7.8B},     [x0], x2
        ret
endfunc

// ff_h264_idct8_add4_neon: walk four 8x8 blocks and add the transform of
// each block whose table entry is non-zero.
//   x0  destination base pointer
//   x1  table of signed 32-bit offsets from x0; every fourth entry is used
//   x2  coefficients, 128 bytes per block
//   w3  stride, moved to w2 once (the per-block routines only sign-extend it)
//   x4  byte table indexed through every fourth scan8 entry
// Per block: entry == 0 -> skip; entry == 1 and first coefficient non-zero
// -> DC-only routine; every other case -> full 8x8 routine.
// x30 is parked in x12 because blr overwrites it.
function ff_h264_idct8_add4_neon, export=1
        mov             x12, x30
        mov             x6,  x0                 // destination base
        mov             x5,  x1                 // offset table cursor
        mov             x1,  x2                 // coefficient cursor
        mov             w2,  w3                 // stride
        movrel          x7,  scan8
        mov             w10, #16                // counts down in steps of 4
        movrel          x13, .L_ff_h264_idct8_dc_add_neon
        movrel          x14, .L_ff_h264_idct8_add_neon
1:      ldrb            w9,  [x7], #4           // scan8 entry, stepping four entries
        ldrsw           x0,  [x5], #16          // block offset, stepping four entries
        ldrb            w9,  [x4, w9, UXTW]     // table byte selected by the scan8 entry
        subs            w9,  w9,  #1
        b.lt            2f                      // byte was 0: skip this block
        ldrsh           w11,  [x1]              // first coefficient of the block
        add             x0,  x6,  x0
        ccmp            w11, #0,  #4,  eq       // byte was 1: test the coefficient; otherwise force Z
        csel            x15, x13, x14, ne       // ne only when byte == 1 and coefficient != 0
        blr             x15
2:      subs            w10, w10, #4
        add             x1,  x1,  #128          // both routines return with x1 unchanged
        b.ne            1b
        ret             x12
endfunc

// scan8: 48 byte-sized lookup indices, each written as column + row*8.
// The wrapper functions in this file read an entry and use it as an index
// into the byte table they receive in x4.
// Three groups of 16 entries: rows 1-4, rows 6-9 and rows 11-14.
const   scan8
        .byte           4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
        .byte           6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
        .byte           4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
        .byte           6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
        .byte           4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
        .byte           6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
        .byte           4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
        .byte           6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
        .byte           4+11*8, 5+11*8, 4+12*8, 5+12*8
        .byte           6+11*8, 7+11*8, 6+12*8, 7+12*8
        .byte           4+13*8, 5+13*8, 4+14*8, 5+14*8
        .byte           6+13*8, 7+13*8, 6+14*8, 7+14*8
endconst
