1/*
2 * Simple IDCT
3 *
4 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
5 * Copyright (c) 2007 Mans Rullgard <mans@mansr.com>
6 *
7 * This file is part of FFmpeg.
8 *
9 * FFmpeg is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * FFmpeg is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with FFmpeg; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24#include "libavutil/arm/asm.S"
25
26#define W1  22725   /* i = 1 in cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
27#define W2  21407   /* i = 2 in cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
28#define W3  19266   /* i = 3 in cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
29#define W4  16383   /* i = 4; NOTE(review): the formula gives 16384 for i = 4, 16383 looks deliberate -- confirm against the C reference */
30#define W5  12873   /* i = 5 in cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
31#define W6  8867    /* i = 6 in cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
32#define W7  4520    /* i = 7 in cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
33#define ROW_SHIFT 11   /* shift argument used for the row pass (idct_row_armv6) */
34#define COL_SHIFT 20   /* shift argument used for the column pass (idct_col_armv6 and its put/add variants) */
35
36#define W13 (W1 | (W3 << 16))   /* packed pair: W1 in bits 15:0, W3 in bits 31:16 */
37#define W26 (W2 | (W6 << 16))   /* packed pair: W2 low, W6 high (not referenced by the code shown here) */
38#define W42 (W4 | (W2 << 16))   /* packed pair: W4 low, W2 high */
39#define W42n (-W4&0xffff | (-W2 << 16))   /* negated pair: 16-bit two's complement of W4 low, -W2 high */
40#define W46 (W4 | (W6 << 16))   /* packed pair: W4 low, W6 high */
41#define W57 (W5 | (W7 << 16))   /* packed pair: W5 low, W7 high */
42
43/*
44  Compute partial IDCT of single row.
45  shift = amount of the later right shift; a rounding bias of 1 << (shift-1) is pre-added to A0..A3 here
46  r0 = source address
47  r2 = row[2,0] <= 2 cycles
48  r3 = row[3,1]
49  ip = w42      <= 2 cycles
50
51  Output in registers r4--r11
52*/
53        .macro idct_row shift
        /*
         * Full row pass over all eight inputs.  row[7,5] and row[6,4] are
         * loaded from [r0, #12] and [r0, #4]; row[2,0], row[3,1] and W42
         * must already be in r2, r3 and ip.
         * On exit: r4..r7 = A0..A3 (each including the rounding bias
         * 1 << (shift-1)), r8 = B0, r9 = -B1, r10 = B2, r11 = B3.
         * Clobbers r1, r2, r3, ip and lr.  No shift is applied here; that is
         * left to the idct_finish macros.
         */
54        ldr    lr, =W46              /* lr  = W4 | (W6 << 16) */
55        mov    r1, #(1<<(\shift-1))  /* r1  = rounding bias */
56        smlad  r4, r2, ip, r1        /* r4  =  A0 = W4*row[0] + W2*row[2] + bias */
57        smlsd  r7, r2, ip, r1        /* r7  =  A3 = W4*row[0] - W2*row[2] + bias */
58        ldr    ip, =W13              /* ip  = W1 | (W3 << 16) */
59        ldr    r10,=W57              /* r10 = W5 | (W7 << 16) */
60        smlad  r5, r2, lr, r1        /* r5  =  A1 = W4*row[0] + W6*row[2] + bias */
61        smlsd  r6, r2, lr, r1        /* r6  =  A2 = W4*row[0] - W6*row[2] + bias */
62
63        smuad  r8, r3, ip            /* r8  =  B0 = W1*row[1] + W3*row[3] */
64        smusdx r11,r3, r10           /* r11 =  B3 = W7*row[1] - W5*row[3] */
65        ldr    lr, [r0, #12]         /* lr  =  row[7,5] */
66        pkhtb  r2, ip, r10,asr #16   /* r2  =  W7 | (W3 << 16) */
67        pkhbt  r1, ip, r10,lsl #16   /* r1  =  W1 | (W5 << 16) */
68        smusdx r9, r2, r3            /* r9  = -B1 = W7*row[3] - W3*row[1] */
69        smlad  r8, lr, r10,r8        /* B0  +=      W5*row[5] + W7*row[7] */
70        smusdx r10,r3, r1            /* r10 =  B2 = W5*row[1] - W1*row[3] */
71
72        ldr    r3, =W42n             /* r3 =  -W4 | (-W2 << 16) */
73        smlad  r10,lr, r2, r10       /* B2 +=  W7*row[5] + W3*row[7] */
74        ldr    r2, [r0, #4]          /* r2 =   row[6,4] */
75        smlsdx r11,lr, ip, r11       /* B3 +=  W3*row[5] - W1*row[7] */
76        ldr    ip, =W46              /* ip =   W4 | (W6 << 16) */
77        smlad  r9, lr, r1, r9        /* B1 -=  W1*row[5] + W5*row[7] (r9 holds -B1, so this adds) */
78
79        smlad  r5, r2, r3, r5        /* A1 += -W4*row[4] - W2*row[6] */
80        smlsd  r6, r2, r3, r6        /* A2 += -W4*row[4] + W2*row[6] */
81        smlad  r4, r2, ip, r4        /* A0 +=  W4*row[4] + W6*row[6] */
82        smlsd  r7, r2, ip, r7        /* A3 +=  W4*row[4] - W6*row[6] */
83        .endm
84
85/*
86  Compute partial IDCT of half row.
87  shift = left-shift amount
88  r2 = row[2,0]
89  r3 = row[3,1]
90  ip = w42
91
92  Output in registers r4--r11
93*/
94        .macro idct_row4 shift
        /*
         * Reduced row pass using only row[0..3]: the same first-stage sums as
         * idct_row, without the row[4..7] terms and without reading memory.
         * On exit: r4..r7 = A0..A3 (each including the rounding bias
         * 1 << (shift-1)), r8 = B0, r9 = -B1, r10 = B2, r11 = B3.
         * Clobbers r1, r2, ip and lr; r3 is only read.
         */
95        ldr    lr, =W46              /* lr =  W4 | (W6 << 16) */
96        ldr    r10,=W57              /* r10 = W5 | (W7 << 16) */
97        mov    r1, #(1<<(\shift-1))  /* r1 =  rounding bias */
98        smlad  r4, r2, ip, r1        /* r4  =  A0 = W4*row[0] + W2*row[2] + bias */
99        smlsd  r7, r2, ip, r1        /* r7  =  A3 = W4*row[0] - W2*row[2] + bias */
100        ldr    ip, =W13              /* ip =  W1 | (W3 << 16) */
101        smlad  r5, r2, lr, r1        /* r5  =  A1 = W4*row[0] + W6*row[2] + bias */
102        smlsd  r6, r2, lr, r1        /* r6  =  A2 = W4*row[0] - W6*row[2] + bias */
103        smusdx r11,r3, r10           /* r11 =  B3 = W7*row[1] - W5*row[3] */
104        smuad  r8, r3, ip            /* r8  =  B0 = W1*row[1] + W3*row[3] */
105        pkhtb  r2, ip, r10,asr #16   /* r2  =  W7 | (W3 << 16) */
106        pkhbt  r1, ip, r10,lsl #16   /* r1  =  W1 | (W5 << 16) */
107        smusdx r9, r2, r3            /* r9  = -B1 = W7*row[3] - W3*row[1] */
108        smusdx r10,r3, r1            /* r10 =  B2 = W5*row[1] - W1*row[3] */
109        .endm
110
111/*
112  Compute final part of IDCT single row without shift.
113  Input in registers r4--r11
114  Output in registers ip, r4--r6, lr, r8--r10
115*/
116        .macro idct_finish
        /*
         * Butterfly of A0..A3 (r4..r7) with B0, -B1, B2, B3 (r8..r11), no shift.
         * The instruction order matters: each register is overwritten only
         * after its last read.  r7 and r11 are left unchanged.
         */
117        add    ip, r4, r8            /* ip  = A0 + B0 */
118        sub    lr, r4, r8            /* lr  = A0 - B0 */
119        sub    r4, r5, r9            /* r4  = A1 + B1 (r9 holds -B1) */
120        add    r8, r5, r9            /* r8  = A1 - B1 */
121        add    r5, r6, r10           /* r5  = A2 + B2 */
122        sub    r9, r6, r10           /* r9  = A2 - B2 */
123        add    r6, r7, r11           /* r6  = A3 + B3 */
124        sub    r10,r7, r11           /* r10 = A3 - B3 */
125        .endm
126
127/*
128  Compute final part of IDCT single row.
129  shift = right-shift amount
130  Input/output in registers r4--r11
131*/
132        .macro idct_finish_shift shift
        /*
         * Butterfly of A0..A3 (r4..r7) with B0, -B1, B2, B3 (r8..r11), each
         * result arithmetically shifted right by shift.
         * On exit: r4..r7 = (An + Bn) >> shift, r8..r11 = (An - Bn) >> shift
         * for n = 0..3.  Clobbers r2 and r3.
         */
133        add    r3, r4, r8            /* r3 = A0 + B0 */
134        sub    r2, r4, r8            /* r2 = A0 - B0 */
135        mov    r4, r3, asr #\shift   /* r4 = (A0 + B0) >> shift */
136        mov    r8, r2, asr #\shift   /* r8 = (A0 - B0) >> shift */
137
138        sub    r3, r5, r9            /* r3 = A1 + B1 (r9 holds -B1) */
139        add    r2, r5, r9            /* r2 = A1 - B1 */
140        mov    r5, r3, asr #\shift   /* r5 = (A1 + B1) >> shift */
141        mov    r9, r2, asr #\shift   /* r9 = (A1 - B1) >> shift */
142
143        add    r3, r6, r10           /* r3 = A2 + B2 */
144        sub    r2, r6, r10           /* r2 = A2 - B2 */
145        mov    r6, r3, asr #\shift   /* r6  = (A2 + B2) >> shift */
146        mov    r10,r2, asr #\shift   /* r10 = (A2 - B2) >> shift */
147
148        add    r3, r7, r11           /* r3 = A3 + B3 */
149        sub    r2, r7, r11           /* r2 = A3 - B3 */
150        mov    r7, r3, asr #\shift   /* r7  = (A3 + B3) >> shift */
151        mov    r11,r2, asr #\shift   /* r11 = (A3 - B3) >> shift */
152        .endm
153
154/*
155  Compute final part of IDCT single row, saturating results at 8 bits.
156  shift = right-shift amount
157  Input/output in registers r4--r11
158*/
159        .macro idct_finish_shift_sat shift
        /*
         * Same butterfly as idct_finish_shift, but every result is shifted
         * right by shift and saturated to the unsigned 8-bit range by usat.
         * Temporaries are r3 and ip, so r2 is left untouched.
         */
160        add    r3, r4, r8            /* r3 = A0 + B0 */
161        sub    ip, r4, r8            /* ip = A0 - B0 */
162        usat   r4, #8, r3, asr #\shift   /* r4 = sat8((A0 + B0) >> shift) */
163        usat   r8, #8, ip, asr #\shift   /* r8 = sat8((A0 - B0) >> shift) */
164
165        sub    r3, r5, r9            /* r3 = A1 + B1 (r9 holds -B1) */
166        add    ip, r5, r9            /* ip = A1 - B1 */
167        usat   r5, #8, r3, asr #\shift   /* r5 = sat8((A1 + B1) >> shift) */
168        usat   r9, #8, ip, asr #\shift   /* r9 = sat8((A1 - B1) >> shift) */
169
170        add    r3, r6, r10           /* r3 = A2 + B2 */
171        sub    ip, r6, r10           /* ip = A2 - B2 */
172        usat   r6, #8, r3, asr #\shift   /* r6  = sat8((A2 + B2) >> shift) */
173        usat   r10,#8, ip, asr #\shift   /* r10 = sat8((A2 - B2) >> shift) */
174
175        add    r3, r7, r11           /* r3 = A3 + B3 */
176        sub    ip, r7, r11           /* ip = A3 - B3 */
177        usat   r7, #8, r3, asr #\shift   /* r7  = sat8((A3 + B3) >> shift) */
178        usat   r11,#8, ip, asr #\shift   /* r11 = sat8((A3 - B3) >> shift) */
179        .endm
180
181/*
182  Compute IDCT of single row, storing as column.
183  r0 = source
184  r1 = dest
185*/
186function idct_row_armv6
        /*
         * Transform the eight 16-bit values at r0 and store the eight results
         * as halfwords at r1 + 16*k bytes.  r0 and r1 are unchanged on return.
         * r2-r11 and ip are clobbered and r4-r11 are not saved here, so the
         * caller has to preserve them.
         */
187        push   {lr}                  /* lr doubles as a scratch register below */
188
189        ldr    lr, [r0, #12]         /* lr = row[7,5] */
190        ldr    ip, [r0, #4]          /* ip = row[6,4] */
191        ldr    r3, [r0, #8]          /* r3 = row[3,1] */
192        ldr    r2, [r0]              /* r2 = row[2,0] */
193        orrs   lr, lr, ip            /* lr = row[7,5] | row[6,4]; Z set when row[4..7] are all zero */
194        itt    eq                    /* the next two compares only run while everything so far was zero */
195        cmpeq  lr, r3                /* lr is 0 here: Z stays set only if row[1] and row[3] are zero */
196        cmpeq  lr, r2, lsr #16       /* ... and only if row[2] is zero as well */
197        beq    1f                    /* row[1..7] all zero: DC-only shortcut */
198        push   {r1}                  /* the row macros clobber r1 */
199        ldr    ip, =W42              /* ip = W4 | (W2 << 16) */
200        cmp    lr, #0                /* lr still holds row[7,5] | row[6,4] */
201        beq    2f                    /* row[4..7] all zero: use the reduced variant */
202
203        idct_row   ROW_SHIFT
204        b      3f
205
2062:      idct_row4  ROW_SHIFT
207
2083:      pop    {r1}                  /* r1 = dest again */
209        idct_finish_shift ROW_SHIFT
210
        /*
         * r4..r7 hold outputs 0..3 and r11, r10, r9, r8 hold outputs 4..7.
         * They are written to the 16-byte slots 0,2,4,6,1,3,5,7, which is the
         * same order in which idct_rows walks the source lines.
         */
211        strh   r4, [r1]
212        strh   r5, [r1, #(16*2)]
213        strh   r6, [r1, #(16*4)]
214        strh   r7, [r1, #(16*6)]
215        strh   r11,[r1, #(16*1)]
216        strh   r10,[r1, #(16*3)]
217        strh   r9, [r1, #(16*5)]
218        strh   r8, [r1, #(16*7)]
219
220        pop    {pc}
221
2221:      mov    r2, r2, lsl #3        /* DC only: r2 = row[0] << 3 (upper halfword of r2 is zero here) */
223        strh   r2, [r1]              /* strh keeps the low 16 bits; the same value fills all eight slots */
224        strh   r2, [r1, #(16*2)]
225        strh   r2, [r1, #(16*4)]
226        strh   r2, [r1, #(16*6)]
227        strh   r2, [r1, #(16*1)]
228        strh   r2, [r1, #(16*3)]
229        strh   r2, [r1, #(16*5)]
230        strh   r2, [r1, #(16*7)]
231        pop    {pc}
232endfunc
233
234/*
235  Compute IDCT of single column, read as row.
236  r0 = source
237  r1 = dest
238*/
239function idct_col_armv6
        /*
         * Transform the eight 16-bit values at r0 with COL_SHIFT and store the
         * results, in natural order 0..7, as halfwords at r1 + 16*k bytes.
         * Always runs the full idct_row path (no zero-input shortcut).
         * r0 and r1 are unchanged on return; r2-r11 and ip are clobbered.
         */
240        push   {r1, lr}              /* idct_row clobbers both r1 and lr */
241
242        ldr    r2, [r0]              /* r2 = row[2,0] */
243        ldr    ip, =W42              /* ip = W4 | (W2 << 16) */
244        ldr    r3, [r0, #8]          /* r3 = row[3,1] */
245        idct_row COL_SHIFT
246        pop    {r1}                  /* r1 = dest again; the saved lr stays on the stack for pop {pc} */
247        idct_finish_shift COL_SHIFT
248
249        strh   r4, [r1]              /* outputs 0..3 are in r4..r7 */
250        strh   r5, [r1, #(16*1)]
251        strh   r6, [r1, #(16*2)]
252        strh   r7, [r1, #(16*3)]
253        strh   r11,[r1, #(16*4)]     /* outputs 4..7 are in r11, r10, r9, r8 */
254        strh   r10,[r1, #(16*5)]
255        strh   r9, [r1, #(16*6)]
256        strh   r8, [r1, #(16*7)]
257
258        pop    {pc}
259endfunc
260
261/*
262  Compute IDCT of single column, read as row, store saturated 8-bit.
263  r0 = source
264  r1 = dest
265  r2 = line size
266*/
267function idct_col_put_armv6
        /*
         * Transform the eight 16-bit values at r0 with COL_SHIFT, saturate each
         * result to 8 bits unsigned and store the eight bytes through r1,
         * stepping by the line size in r2.
         * r3-r11 and ip are clobbered.
         */
268        push   {r1, r2, lr}          /* idct_row clobbers r1, r2 and lr */
269
270        ldr    r2, [r0]              /* r2 = row[2,0] */
271        ldr    ip, =W42              /* ip = W4 | (W2 << 16) */
272        ldr    r3, [r0, #8]          /* r3 = row[3,1] */
273        idct_row COL_SHIFT
274        pop    {r1, r2}              /* r1 = dest, r2 = line size again */
275        idct_finish_shift_sat COL_SHIFT   /* uses r3 and ip as temporaries, so r2 survives */
276
        /* strb_post comes from asm.S; presumably it stores the byte at [r1] and then adds r2 to r1 -- confirm there */
277        strb_post r4, r1, r2         /* outputs 0..3 are in r4..r7 */
278        strb_post r5, r1, r2
279        strb_post r6, r1, r2
280        strb_post r7, r1, r2
281        strb_post r11,r1, r2         /* outputs 4..7 are in r11, r10, r9, r8 */
282        strb_post r10,r1, r2
283        strb_post r9, r1, r2
284        strb_post r8, r1, r2
285
286        sub    r1, r1, r2, lsl #3    /* r1 -= 8 * line size, undoing the eight store steps */
287
288        pop    {pc}
289endfunc
290
291/*
292  Compute IDCT of single column, read as row, add/store saturated 8-bit.
293  r0 = source
294  r1 = dest
295  r2 = line size
296*/
297function idct_col_add_armv6
        /*
         * Transform the eight 16-bit values at r0, shift each result right by
         * COL_SHIFT, add it to the byte already stored at the matching
         * destination line, saturate the sum to 8 bits unsigned and store it.
         * Destination lines are r1 + k * r2 for k = 0..7.
         * r3-r11, ip and lr are clobbered.
         *
         * Loads of the destination bytes are interleaved with the arithmetic
         * and stores; every load address is relative to the current
         * (post-incremented) r1.  strb_post comes from asm.S; presumably it
         * stores the byte at [r1] and then adds r2 to r1 -- confirm there.
         */
298        push   {r1, r2, lr}          /* idct_row clobbers r1, r2 and lr */
299
300        ldr    r2, [r0]              /* r2 = row[2,0] */
301        ldr    ip, =W42              /* ip = W4 | (W2 << 16) */
302        ldr    r3, [r0, #8]          /* r3 = row[3,1] */
303        idct_row COL_SHIFT
304        pop    {r1, r2}              /* r1 = dest, r2 = line size again */
305        idct_finish                  /* unshifted outputs 0..7 in ip, r4, r5, r6, r10, r9, r8, lr */
306
307        ldrb   r3, [r1]              /* r3 = dest line 0 */
308        ldrb   r7, [r1, r2]          /* r7 = dest line 1 */
        /* A load of dest line 4 into r11 used to sit here; r11 was overwritten
           before any use and line 4 is loaded into r7 further down. */
310        add    ip, r3, ip, asr #COL_SHIFT   /* line 0 sum */
311        usat   ip, #8, ip
312        add    r4, r7, r4, asr #COL_SHIFT   /* line 1 sum */
313        strb_post ip, r1, r2         /* store line 0; r1 -> line 1 */
314        ldrb   ip, [r1, r2]          /* ip = dest line 2 */
315        usat   r4, #8, r4
316        ldrb   r11,[r1, r2, lsl #2]  /* r11 = dest line 5 */
317        add    r5, ip, r5, asr #COL_SHIFT   /* line 2 sum */
318        usat   r5, #8, r5
319        strb_post r4, r1, r2         /* store line 1; r1 -> line 2 */
320        ldrb   r3, [r1, r2]          /* r3 = dest line 3 */
321        ldrb   ip, [r1, r2, lsl #2]  /* ip = dest line 6 */
322        strb_post r5, r1, r2         /* store line 2; r1 -> line 3 */
323        ldrb   r7, [r1, r2]          /* r7 = dest line 4 */
324        ldrb   r4, [r1, r2, lsl #2]  /* r4 = dest line 7 */
325        add    r6, r3, r6, asr #COL_SHIFT   /* line 3 sum */
326        usat   r6, #8, r6
327        add    r10,r7, r10,asr #COL_SHIFT   /* line 4 sum */
328        usat   r10,#8, r10
329        add    r9, r11,r9, asr #COL_SHIFT   /* line 5 sum */
330        usat   r9, #8, r9
331        add    r8, ip, r8, asr #COL_SHIFT   /* line 6 sum */
332        usat   r8, #8, r8
333        add    lr, r4, lr, asr #COL_SHIFT   /* line 7 sum */
334        usat   lr, #8, lr
335        strb_post r6, r1, r2         /* store lines 3..7 */
336        strb_post r10,r1, r2
337        strb_post r9, r1, r2
338        strb_post r8, r1, r2
339        strb_post lr, r1, r2
340
341        sub    r1, r1, r2, lsl #3    /* r1 -= 8 * line size, undoing the eight store steps */
342
343        pop    {pc}
344endfunc
345
346/*
347  Compute 8 IDCT row transforms.
348  func = IDCT row->col function
349  width = width of columns in bytes
350*/
351        .macro idct_rows func width
        /*
         * Calls func eight times.  r0 steps through the 16-byte source lines
         * in the order 0,2,4,6,1,3,5,7 while r1 advances by width bytes after
         * each call.  func must leave r0 and r1 unchanged.
         * On exit r0 is back at its entry value; r1 is left at entry + 7*width.
         */
352        bl     \func                 /* source line 0 */
353        add    r0, r0, #(16*2)
354        add    r1, r1, #\width
355        bl     \func                 /* source line 2 */
356        add    r0, r0, #(16*2)
357        add    r1, r1, #\width
358        bl     \func                 /* source line 4 */
359        add    r0, r0, #(16*2)
360        add    r1, r1, #\width
361        bl     \func                 /* source line 6 */
362        sub    r0, r0, #(16*5)       /* from line 6 back to line 1 */
363        add    r1, r1, #\width
364        bl     \func                 /* source line 1 */
365        add    r0, r0, #(16*2)
366        add    r1, r1, #\width
367        bl     \func                 /* source line 3 */
368        add    r0, r0, #(16*2)
369        add    r1, r1, #\width
370        bl     \func                 /* source line 5 */
371        add    r0, r0, #(16*2)
372        add    r1, r1, #\width
373        bl     \func                 /* source line 7 */
374
375        sub    r0, r0, #(16*7)       /* from line 7 back to line 0 */
376        .endm
377
378/* void ff_simple_idct_armv6(int16_t *data); */
379function ff_simple_idct_armv6, export=1
        /*
         * In-place transform of the 64 coefficients at r0: a row pass into a
         * 128-byte stack buffer, then a column pass from that buffer back
         * into the original block.
         */
380        push   {r4-r11, lr}          /* the row/column helpers clobber r4-r11 */
381        sub    sp, sp, #128          /* 128-byte temporary (64 halfwords) */
382
383        mov    r1, sp                /* row pass: r0 = data, r1 = temporary */
384        idct_rows idct_row_armv6, 2
385        mov    r1, r0                /* idct_rows restored r0, so r1 = data */
386        mov    r0, sp                /* column pass: r0 = temporary, r1 = data */
387        idct_rows idct_col_armv6, 2
388
389        add    sp, sp, #128
390        pop    {r4-r11, pc}
391endfunc
392
393/* ff_simple_idct_add_armv6(uint8_t *dest, ptrdiff_t line_size, int16_t *data); */
394function ff_simple_idct_add_armv6, export=1
        /*
         * r0 = dest, r1 = line_size, r2 = data.
         * Row pass from data into a 128-byte stack buffer, then a column pass
         * that adds the results to the bytes at dest (idct_col_add_armv6).
         */
395        push   {r0, r1, r4-r11, lr}  /* r0/r1 land at the lowest addresses, directly above the buffer */
396        sub    sp, sp, #128          /* 128-byte temporary (64 halfwords) */
397
398        mov    r0, r2                /* row pass: r0 = data, r1 = temporary */
399        mov    r1, sp
400        idct_rows idct_row_armv6, 2
401        mov    r0, sp                /* column pass reads the temporary */
402        ldr    r1, [sp, #128]        /* r1 = saved dest */
403        ldr    r2, [sp, #(128+4)]    /* r2 = saved line_size */
404        idct_rows idct_col_add_armv6, 1
405
406        add    sp, sp, #(128+8)      /* drop the buffer and the saved r0/r1 */
407        pop    {r4-r11, pc}
408endfunc
409
410/* ff_simple_idct_put_armv6(uint8_t *dest, ptrdiff_t line_size, int16_t *data); */
411function ff_simple_idct_put_armv6, export=1
        /*
         * r0 = dest, r1 = line_size, r2 = data.
         * Row pass from data into a 128-byte stack buffer, then a column pass
         * that stores the saturated results at dest (idct_col_put_armv6).
         */
412        push   {r0, r1, r4-r11, lr}  /* r0/r1 land at the lowest addresses, directly above the buffer */
413        sub    sp, sp, #128          /* 128-byte temporary (64 halfwords) */
414
415        mov    r0, r2                /* row pass: r0 = data, r1 = temporary */
416        mov    r1, sp
417        idct_rows idct_row_armv6, 2
418        mov    r0, sp                /* column pass reads the temporary */
419        ldr    r1, [sp, #128]        /* r1 = saved dest */
420        ldr    r2, [sp, #(128+4)]    /* r2 = saved line_size */
421        idct_rows idct_col_put_armv6, 1
422
423        add    sp, sp, #(128+8)      /* drop the buffer and the saved r0/r1 */
424        pop    {r4-r11, pc}
425endfunc
426