/*
 * Simple IDCT
 *
 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
 * Copyright (c) 2006 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
23
#include "libavutil/arm/asm.S"

/*
 * Fixed-point IDCT multipliers, scaled by 1<<14:
 *     Wi = cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated to an integer.
 *
 * W4 is the exception: the formula yields 16384, but the value used is
 * 16383 = (1<<14) - 1.  The code in this file depends on that exact value:
 * it builds W4 as "mov #16384; sub #1" and multiplies by it with
 * "rsb x, x, x, lsl #14" (x*16384 - x).  The idct_col macro notes that this
 * matches the C version.
 */
#define W1  22725   /* cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W2  21407   /* cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W3  19266   /* cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W4  16383   /* (1<<14) - 1; the cos() formula would give 16384 */
#define W5  12873   /* cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W6  8867    /* cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W7  4520    /* cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define ROW_SHIFT 11    /* right shift applied to the row-pass results */
#define COL_SHIFT 20    /* right shift applied to the column-pass results */

/*
 * Two multipliers packed into one word: first index in the bottom halfword,
 * second index in the top halfword.  The b/t suffixes of the smulxy/smlaxy
 * instructions pick the half that is used.
 */
#define W13 (W1 | (W3 << 16))
#define W26 (W2 | (W6 << 16))
#define W57 (W5 | (W7 << 16))
39
/*
 * idct_row_armv5te - one 8-point row pass of the simple IDCT, done in place.
 *
 * In:       a1 = address of the row: eight 16-bit coefficients, accessed as
 *                the words row[1:0], row[3:2], row[5:4], row[7:6] with
 *                ldrd/strd (NOTE(review): presumably needs 8-byte alignment
 *                on this architecture - confirm against the callers).
 * Out:      the row at [a1] is overwritten with the result; a1 is unchanged.
 * Clobbers: a2-a4, v1-v7, fp, ip, lr and the flags.  v1-v7 and fp are not
 *           saved here; the ff_simple_idct_* entry points in this file save
 *           them around the calls.
 *
 * Register roles while computing (r0..r7 = input coefficients):
 *   v1 = e0 = W4*r0 + rnd + W2*r2 + W4*r4 + W6*r6
 *   v2 = e1 = W4*r0 + rnd + W6*r2 - W4*r4 - W2*r6
 *   v3 = e2 = W4*r0 + rnd - W6*r2 - W4*r4 + W2*r6
 *   v4 = e3 = W4*r0 + rnd - W2*r2 + W4*r4 - W6*r6
 *   v5 =  o0 = W1*r1 + W3*r3 + W5*r5 + W7*r7
 *   v6 = -o1 = W7*r3 - W3*r1 + W1*r5 + W5*r7      (kept negated)
 *   v7 =  o2 = W5*r1 - W1*r3 + W7*r5 + W3*r7
 *   fp =  o3 = W7*r1 - W5*r3 + W3*r5 - W1*r7
 * with rnd = 1 << (ROW_SHIFT-1).  Output k is (e_k + o_k) >> ROW_SHIFT and
 * output 7-k is (e_k - o_k) >> ROW_SHIFT, for k = 0..3.
 *
 * The bit masks used when packing results (0x1f0000, and 0xe000 / lsl #3 in
 * the DC-only path) are literal values that go with ROW_SHIFT == 11; they
 * have to be revisited if ROW_SHIFT is ever changed.
 */
function idct_row_armv5te
        str    lr, [sp, #-4]!

        ldrd   v1, v2, [a1, #8]
        ldrd   a3, a4, [a1]          /* a3 = row[1:0], a4 = row[3:2] */
        orrs   v1, v1, v2            /* Z set iff row[4..7] are all zero */
        itt    eq
        cmpeq  v1, a4                /* v1 is 0 here: is row[3:2] zero too? */
        cmpeq  v1, a3, lsr #16       /* ... and row[1]? */
        beq    row_dc_only           /* only row[0] can be non-zero */

        /* Even part from row[0] and row[2]. */
        mov    v1, #(1<<(ROW_SHIFT-1))
        mov    ip, #16384
        sub    ip, ip, #1            /* ip = W4 */
        smlabb v1, ip, a3, v1        /* v1 = W4*row[0]+(1<<(RS-1)) */
        ldr    ip, =W26              /* ip = W2 | (W6 << 16) */
        smultb a2, ip, a4            /* a2 = W6*row[2] */
        smulbb lr, ip, a4            /* lr = W2*row[2] */
        add    v2, v1, a2
        sub    v3, v1, a2
        sub    v4, v1, lr
        add    v1, v1, lr

        /* Odd part from row[1] and row[3]. */
        ldr    ip, =W13              /* ip = W1 | (W3 << 16) */
        ldr    lr, =W57              /* lr = W5 | (W7 << 16) */
        smulbt v5, ip, a3            /* v5 = W1*row[1] */
        smultt v6, lr, a4            /* v6 = W7*row[3] */
        smlatt v5, ip, a4, v5        /* v5 += W3*row[3] */
        smultt a2, ip, a3            /* a2 = W3*row[1] */
        smulbt v7, lr, a3            /* v7 = W5*row[1] */
        sub    v6, v6, a2            /* v6 -= W3*row[1] */
        smulbt a2, ip, a4            /* a2 = W1*row[3] */
        smultt fp, lr, a3            /* fp = W7*row[1] */
        sub    v7, v7, a2            /* v7 -= W1*row[3] */
        smulbt a2, lr, a4            /* a2 = W5*row[3] */
        ldrd   a3, a4, [a1, #8]     /* a3=row[5:4] a4=row[7:6] */
        sub    fp, fp, a2            /* fp -= W5*row[3] */

        orrs   a2, a3, a4
        beq    1f                    /* row[4..7] all zero: skip their terms */

        /* Odd-part contributions of row[5] and row[7]. */
        smlabt v5, lr, a3, v5        /* v5 += W5*row[5] */
        smlabt v6, ip, a3, v6        /* v6 += W1*row[5] */
        smlatt v5, lr, a4, v5        /* v5 += W7*row[7] */
        smlabt v6, lr, a4, v6        /* v6 += W5*row[7] */
        smlatt v7, lr, a3, v7        /* v7 += W7*row[5] */
        smlatt fp, ip, a3, fp        /* fp += W3*row[5] */
        smulbt a2, ip, a4            /* a2 =  W1*row[7] */
        smlatt v7, ip, a4, v7        /* v7 += W3*row[7] */
        sub    fp, fp, a2            /* fp -= W1*row[7] */

        /* Even-part contributions of row[4] and row[6]. */
        ldr    ip, =W26              /* ip = W2 | (W6 << 16) */
        mov    a2, #16384
        sub    a2, a2, #1            /* a2 =  W4 */
        smulbb a2, a2, a3            /* a2 =  W4*row[4] */
        smultb lr, ip, a4            /* lr =  W6*row[6] */
        add    v1, v1, a2            /* v1 += W4*row[4] */
        add    v1, v1, lr            /* v1 += W6*row[6] */
        add    v4, v4, a2            /* v4 += W4*row[4] */
        sub    v4, v4, lr            /* v4 -= W6*row[6] */
        smulbb lr, ip, a4            /* lr =  W2*row[6] */
        sub    v2, v2, a2            /* v2 -= W4*row[4] */
        sub    v2, v2, lr            /* v2 -= W2*row[6] */
        sub    v3, v3, a2            /* v3 -= W4*row[4] */
        add    v3, v3, lr            /* v3 += W2*row[6] */

        /*
         * Combine, shift and pack two 16-bit results per word.  The shift is
         * logical, so for the low halfword the bits above bit 15 that survive
         * it (bits 16..20) are cleared before the high halfword is added in.
         */
1:      add    a2, v1, v5            /* out[0] = e0 + o0 */
        mov    a3, a2, lsr #ROW_SHIFT
        bic    a3, a3, #0x1f0000
        sub    a2, v2, v6            /* out[1] = e1 + o1 (v6 holds -o1) */
        mov    a2, a2, lsr #ROW_SHIFT
        add    a3, a3, a2, lsl #16
        add    a2, v3, v7            /* out[2] = e2 + o2 */
        mov    a4, a2, lsr #ROW_SHIFT
        bic    a4, a4, #0x1f0000
        add    a2, v4, fp            /* out[3] = e3 + o3 */
        mov    a2, a2, lsr #ROW_SHIFT
        add    a4, a4, a2, lsl #16
        strd   a3, a4, [a1]

        sub    a2, v4, fp            /* out[4] = e3 - o3 */
        mov    a3, a2, lsr #ROW_SHIFT
        bic    a3, a3, #0x1f0000
        sub    a2, v3, v7            /* out[5] = e2 - o2 */
        mov    a2, a2, lsr #ROW_SHIFT
        add    a3, a3, a2, lsl #16
        add    a2, v2, v6            /* out[6] = e1 - o1 */
        mov    a4, a2, lsr #ROW_SHIFT
        bic    a4, a4, #0x1f0000
        sub    a2, v1, v5            /* out[7] = e0 - o0 */
        mov    a2, a2, lsr #ROW_SHIFT
        add    a4, a4, a2, lsl #16
        strd   a3, a4, [a1, #8]

        ldr    pc, [sp], #4

        /*
         * Only row[0] is non-zero (a3 = row[0] in the low halfword, zero in
         * the high one): every output is row[0] << 3, truncated to 16 bits.
         */
row_dc_only:
        orr    a3, a3, a3, lsl #16   /* row[0] in both halfwords */
        bic    a3, a3, #0xe000       /* drop the bits lsl #3 would push into the high half */
        mov    a3, a3, lsl #3
        mov    a4, a3
        strd   a3, a4, [a1]
        strd   a3, a4, [a1, #8]

        ldr    pc, [sp], #4
endfunc
146
/*
 * idct_col - shared arithmetic of the column pass, for two adjacent columns
 * at once.
 *
 * In:       a1 = address of the word holding the row-0 coefficients of the
 *                two columns (first column in the low halfword, second in
 *                the high halfword); the word for row n is at a1 + 16*n.
 * Out:      Stack: eight words pushed by the macro and left for the user to
 *           pop, lowest address first:
 *               e0(col0) e0(col1) e1(col0) e1(col1)
 *               e2(col0) e2(col1) e3(col0) e3(col1)
 *           Registers (col0 / col1):
 *               v1 / v2 =  o0      v3 / v4 = -o1 (kept negated)
 *               v5 / v6 =  o2      v7 / fp =  o3
 *           where, with c0..c7 the column coefficients,
 *               e0 = W4*(c0+r) + W2*c2 + W4*c4 + W6*c6
 *               e1 = W4*(c0+r) + W6*c2 - W4*c4 - W2*c6
 *               e2 = W4*(c0+r) - W6*c2 - W4*c4 + W2*c6
 *               e3 = W4*(c0+r) - W2*c2 + W4*c4 - W6*c6
 *               o0 = W1*c1 + W3*c3 + W5*c5 + W7*c7
 *               o1 = W3*c1 - W7*c3 - W1*c5 - W5*c7
 *               o2 = W5*c1 - W1*c3 + W7*c5 + W3*c7
 *               o3 = W7*c1 - W5*c3 + W3*c5 - W1*c7
 *           and r = (1 << (COL_SHIFT-1)) / W4 (integer division).
 * Clobbers: a2-a4, v1-v7, fp, ip, lr.  a1 is preserved.
 */
        .macro idct_col
        ldr    a4, [a1]              /* a4 = col[1:0] */
        mov    ip, #16384
        sub    ip, ip, #1            /* ip = W4 */
        mov    v1, #((1<<(COL_SHIFT-1))/W4) /* this matches the C version */
        add    v2, v1, a4, asr #16   /* v2 = r + c0 of the second column */
        rsb    v2, v2, v2, lsl #14   /* v2 *= W4, as (v2 << 14) - v2 */
        mov    a4, a4, lsl #16
        add    v1, v1, a4, asr #16   /* v1 = r + c0 of the first column */
        ldr    a4, [a1, #(16*4)]
        rsb    v1, v1, v1, lsl #14   /* v1 *= W4 */

        /* Row 4: +/- W4*c4 seeds all four even sums of both columns. */
        smulbb lr, ip, a4
        smulbt a3, ip, a4
        sub    v3, v1, lr
        sub    v5, v1, lr
        add    v7, v1, lr
        add    v1, v1, lr
        sub    v4, v2, a3
        sub    v6, v2, a3
        add    fp, v2, a3
        ldr    ip, =W26
        ldr    a4, [a1, #(16*2)]
        add    v2, v2, a3

        /* Row 2: lr = W2*c2, a3 = W6*c2 (first column, then second). */
        smulbb lr, ip, a4
        smultb a3, ip, a4
        add    v1, v1, lr
        sub    v7, v7, lr
        add    v3, v3, a3
        sub    v5, v5, a3
        smulbt lr, ip, a4
        smultt a3, ip, a4
        add    v2, v2, lr
        sub    fp, fp, lr
        add    v4, v4, a3
        ldr    a4, [a1, #(16*6)]
        sub    v6, v6, a3

        /* Row 6: lr = W6*c6, a3 = W2*c6 (first column, then second). */
        smultb lr, ip, a4
        smulbb a3, ip, a4
        add    v1, v1, lr
        sub    v7, v7, lr
        sub    v3, v3, a3
        add    v5, v5, a3
        smultt lr, ip, a4
        smulbt a3, ip, a4
        add    v2, v2, lr
        sub    fp, fp, lr
        sub    v4, v4, a3
        add    v6, v6, a3

        /* Park the even sums; the registers are reused for the odd part. */
        stmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp}

        /* Row 1. */
        ldr    ip, =W13
        ldr    a4, [a1, #(16*1)]
        ldr    lr, =W57
        smulbb v1, ip, a4
        smultb v3, ip, a4
        smulbb v5, lr, a4
        smultb v7, lr, a4
        smulbt v2, ip, a4
        smultt v4, ip, a4
        smulbt v6, lr, a4
        smultt fp, lr, a4
        rsb    v4, v4, #0            /* o1 is accumulated negated */
        ldr    a4, [a1, #(16*3)]
        rsb    v3, v3, #0

        /* Row 3. */
        smlatb v1, ip, a4, v1
        smlatb v3, lr, a4, v3
        smulbb a3, ip, a4
        smulbb a2, lr, a4
        sub    v5, v5, a3
        sub    v7, v7, a2
        smlatt v2, ip, a4, v2
        smlatt v4, lr, a4, v4
        smulbt a3, ip, a4
        smulbt a2, lr, a4
        sub    v6, v6, a3
        ldr    a4, [a1, #(16*5)]
        sub    fp, fp, a2

        /* Row 5. */
        smlabb v1, lr, a4, v1
        smlabb v3, ip, a4, v3
        smlatb v5, lr, a4, v5
        smlatb v7, ip, a4, v7
        smlabt v2, lr, a4, v2
        smlabt v4, ip, a4, v4
        smlatt v6, lr, a4, v6
        ldr    a3, [a1, #(16*7)]
        smlatt fp, ip, a4, fp

        /* Row 7. */
        smlatb v1, lr, a3, v1
        smlabb v3, lr, a3, v3
        smlatb v5, ip, a3, v5
        smulbb a4, ip, a3
        smlatt v2, lr, a3, v2
        sub    v7, v7, a4
        smlabt v4, lr, a3, v4
        smulbt a4, ip, a3
        smlatt v6, ip, a3, v6
        sub    fp, fp, a4
        .endm
251
/*
 * idct_col_armv5te - column pass for two adjacent columns, written back in
 * place as 16-bit values.
 *
 * In:       a1 = address of the row-0 word of the column pair (rows are
 *                16 bytes apart).
 * Out:      the eight words at a1 + 16*n are overwritten; a1 is unchanged.
 * Clobbers: a2-a4, v1-v7, fp, ip, lr and the flags; the caller must have
 *           saved v1-v7 and fp.
 *
 * idct_col leaves the even sums on the stack and the odd sums in v1-v7/fp.
 * Each ldmfd below pops one even pair into a3 (first column) and a4 (second
 * column).  Row k gets even + odd, row 7-k gets even - odd; the second odd
 * term is held negated, which is why rows 1 and 6 use the opposite operators.
 *
 * Packing: the first column uses a logical shift, which leaves 12 result
 * bits, and then sets bits 12..15 (0xf000) when the sum was negative so the
 * low halfword is sign-filled.  The second column is shifted arithmetically
 * and moved to the high halfword.  The 0xf000 mask is a literal that goes
 * with COL_SHIFT == 20.
 */
function idct_col_armv5te
        str    lr, [sp, #-4]!

        idct_col

        ldmfd  sp!, {a3, a4}         /* e0 of both columns */
        adds   a2, a3, v1            /* row 0 = e0 + o0 */
        mov    a2, a2, lsr #COL_SHIFT
        it     mi
        orrmi  a2, a2, #0xf000
        add    ip, a4, v2
        mov    ip, ip, asr #COL_SHIFT
        orr    a2, a2, ip, lsl #16
        str    a2, [a1]
        subs   a3, a3, v1            /* row 7 = e0 - o0 */
        mov    a2, a3, lsr #COL_SHIFT
        it     mi
        orrmi  a2, a2, #0xf000
        sub    a4, a4, v2
        mov    a4, a4, asr #COL_SHIFT
        orr    a2, a2, a4, lsl #16
        ldmfd  sp!, {a3, a4}         /* e1 of both columns */
        str    a2, [a1, #(16*7)]

        subs   a2, a3, v3            /* row 1 = e1 + o1 (v3/v4 hold -o1) */
        mov    a2, a2, lsr #COL_SHIFT
        it     mi
        orrmi  a2, a2, #0xf000
        sub    ip, a4, v4
        mov    ip, ip, asr #COL_SHIFT
        orr    a2, a2, ip, lsl #16
        str    a2, [a1, #(16*1)]
        adds   a3, a3, v3            /* row 6 = e1 - o1 */
        mov    a2, a3, lsr #COL_SHIFT
        it     mi
        orrmi  a2, a2, #0xf000
        add    a4, a4, v4
        mov    a4, a4, asr #COL_SHIFT
        orr    a2, a2, a4, lsl #16
        ldmfd  sp!, {a3, a4}         /* e2 of both columns */
        str    a2, [a1, #(16*6)]

        adds   a2, a3, v5            /* row 2 = e2 + o2 */
        mov    a2, a2, lsr #COL_SHIFT
        it     mi
        orrmi  a2, a2, #0xf000
        add    ip, a4, v6
        mov    ip, ip, asr #COL_SHIFT
        orr    a2, a2, ip, lsl #16
        str    a2, [a1, #(16*2)]
        subs   a3, a3, v5            /* row 5 = e2 - o2 */
        mov    a2, a3, lsr #COL_SHIFT
        it     mi
        orrmi  a2, a2, #0xf000
        sub    a4, a4, v6
        mov    a4, a4, asr #COL_SHIFT
        orr    a2, a2, a4, lsl #16
        ldmfd  sp!, {a3, a4}         /* e3 of both columns */
        str    a2, [a1, #(16*5)]

        adds   a2, a3, v7            /* row 3 = e3 + o3 */
        mov    a2, a2, lsr #COL_SHIFT
        it     mi
        orrmi  a2, a2, #0xf000
        add    ip, a4, fp
        mov    ip, ip, asr #COL_SHIFT
        orr    a2, a2, ip, lsl #16
        str    a2, [a1, #(16*3)]
        subs   a3, a3, v7            /* row 4 = e3 - o3 */
        mov    a2, a3, lsr #COL_SHIFT
        it     mi
        orrmi  a2, a2, #0xf000
        sub    a4, a4, fp
        mov    a4, a4, asr #COL_SHIFT
        orr    a2, a2, a4, lsl #16
        str    a2, [a1, #(16*4)]

        ldr    pc, [sp], #4
endfunc
331
/*
 * clip dst, src - dst = src clamped to the range 0..255.
 *
 * src is everything after the first comma and is handed to movs unchanged,
 * so it may be a register with a shift, e.g. "clip a2, a2, asr #20".
 * Negative values become 0, values above 255 become 255.  Clobbers the flags.
 */
.macro  clip   dst, src:vararg
        movs   \dst, \src
        it     mi
        movmi  \dst, #0
        cmp    \dst, #255
        it     gt
        movgt  \dst, #255
.endm
340
/*
 * aclip dst, src - add, then clamp: dst = (sum of the src operands) clamped
 * to the range 0..255.
 *
 * src is everything after the first comma and is handed to adds unchanged,
 * so it is two operands, the second optionally shifted, e.g.
 * "aclip a2, v1, a2, asr #20".  A negative sum becomes 0, a sum above 255
 * becomes 255.  Clobbers the flags.
 */
.macro  aclip  dst, src:vararg
        adds   \dst, \src
        it     mi
        movmi  \dst, #0
        cmp    \dst, #255
        it     gt
        movgt  \dst, #255
.endm
349
/*
 * idct_col_put_armv5te - column pass for two adjacent columns; the results
 * are shifted by COL_SHIFT, clamped to 0..255 and stored as bytes, two per
 * row, to a separate destination.
 *
 * In:       a1 = address of the row-0 word of the column pair.
 *           On the stack at entry, as pushed by ff_simple_idct_put_armv5te:
 *               [sp]     = destination address for this column pair
 *               [sp, #4] = value added to the address to reach the next row
 * Out:      eight halfwords written, one per row; the destination slot on
 *           the stack is advanced by 2 bytes for the next call.
 * Clobbers: a2-a4, v1-v7, fp, ip, lr and the flags; the caller must have
 *           saved v1-v7 and fp.
 *
 * Stack offsets: the code reads the two slots after the first pop of an even
 * pair.  At that point six words of idct_col data and the saved lr are on
 * top, so the destination is at [sp, #28] and the row step at [sp, #32].
 *
 * strh_pre / strh_dpre are not defined in this file.  NOTE(review): they are
 * presumably the asm.S helpers that add (pre) or subtract (dpre) the last
 * operand to/from the address register with writeback and then store -
 * confirm in libavutil/arm/asm.S.  Under that reading v1 walks down from
 * row 0 and v2 walks up from row 7.
 */
function idct_col_put_armv5te
        str    lr, [sp, #-4]!

        idct_col

        ldmfd  sp!, {a3, a4}         /* e0 of both columns */
        ldr    lr, [sp, #32]         /* lr = row step */
        add    a2, a3, v1            /* row 0 = e0 + o0 */
        clip   a2, a2, asr #COL_SHIFT
        add    ip, a4, v2
        clip   ip, ip, asr #COL_SHIFT
        orr    a2, a2, ip, lsl #8    /* two pixels in one halfword */
        sub    a3, a3, v1            /* row 7 = e0 - o0 */
        clip   a3, a3, asr #COL_SHIFT
        sub    a4, a4, v2
        clip   a4, a4, asr #COL_SHIFT
        ldr    v1, [sp, #28]         /* v1 = destination */
        strh   a2, [v1]              /* row 0 */
        add    a2, v1, #2
        str    a2, [sp, #28]         /* next call starts two pixels further on */
        orr    a2, a3, a4, lsl #8
        rsb    v2, lr, lr, lsl #3    /* v2 = 7 * row step */
        ldmfd  sp!, {a3, a4}         /* e1 of both columns */
        strh_pre a2, v2, v1          /* row 7 */

        sub    a2, a3, v3            /* row 1 = e1 + o1 (v3/v4 hold -o1) */
        clip   a2, a2, asr #COL_SHIFT
        sub    ip, a4, v4
        clip   ip, ip, asr #COL_SHIFT
        orr    a2, a2, ip, lsl #8
        strh_pre a2, v1, lr          /* row 1 */
        add    a3, a3, v3            /* row 6 = e1 - o1 */
        clip   a2, a3, asr #COL_SHIFT
        add    a4, a4, v4
        clip   a4, a4, asr #COL_SHIFT
        orr    a2, a2, a4, lsl #8
        ldmfd  sp!, {a3, a4}         /* e2 of both columns */
        strh_dpre a2, v2, lr         /* row 6 */

        add    a2, a3, v5            /* row 2 = e2 + o2 */
        clip   a2, a2, asr #COL_SHIFT
        add    ip, a4, v6
        clip   ip, ip, asr #COL_SHIFT
        orr    a2, a2, ip, lsl #8
        strh_pre a2, v1, lr          /* row 2 */
        sub    a3, a3, v5            /* row 5 = e2 - o2 */
        clip   a2, a3, asr #COL_SHIFT
        sub    a4, a4, v6
        clip   a4, a4, asr #COL_SHIFT
        orr    a2, a2, a4, lsl #8
        ldmfd  sp!, {a3, a4}         /* e3 of both columns */
        strh_dpre a2, v2, lr         /* row 5 */

        add    a2, a3, v7            /* row 3 = e3 + o3 */
        clip   a2, a2, asr #COL_SHIFT
        add    ip, a4, fp
        clip   ip, ip, asr #COL_SHIFT
        orr    a2, a2, ip, lsl #8
        strh   a2, [v1, lr]          /* row 3, no writeback needed any more */
        sub    a3, a3, v7            /* row 4 = e3 - o3 */
        clip   a2, a3, asr #COL_SHIFT
        sub    a4, a4, fp
        clip   a4, a4, asr #COL_SHIFT
        orr    a2, a2, a4, lsl #8
        strh_dpre a2, v2, lr         /* row 4 */

        ldr    pc, [sp], #4
endfunc
418
/*
 * idct_col_add_armv5te - column pass for two adjacent columns; the results
 * are shifted by COL_SHIFT, added to the bytes already present at the
 * destination, clamped to 0..255 and stored back, two bytes per row.
 *
 * In:       a1 = address of the row-0 word of the column pair.
 *           On the stack at entry, as pushed by ff_simple_idct_add_armv5te:
 *               [sp]     = destination address for this column pair
 *               [sp, #4] = value added to the address to reach the next row
 * Out:      eight halfwords read, updated and written, one per row; the
 *           destination slot on the stack is advanced by 2 bytes for the
 *           next call.
 * Clobbers: a2-a4, v1-v7, fp, ip, lr and the flags; the caller must have
 *           saved v1-v7 and fp.
 *
 * Stack offsets: the destination is read before anything is popped (eight
 * idct_col words plus the saved lr on top, hence [sp, #36]).  After the
 * first pop the row step is at [sp, #32] and the destination slot at
 * [sp, #28].
 *
 * ldrh_pre / ldrh_dpre are not defined in this file.  NOTE(review): they are
 * presumably the asm.S helpers that add (pre) or subtract (dpre) the last
 * operand to/from the address register with writeback and then load -
 * confirm in libavutil/arm/asm.S.  Under that reading lr walks down from
 * row 0 and v2 walks up from row 7.
 *
 * In every group ip holds the two existing pixels of the row being updated:
 * "and ..., ip, #255" extracts the first one, "ip, lsr #8" the second.
 */
function idct_col_add_armv5te
        str    lr, [sp, #-4]!

        idct_col

        ldr    lr, [sp, #36]         /* lr = destination */

        ldmfd  sp!, {a3, a4}         /* e0 of both columns */
        ldrh   ip, [lr]              /* existing pixels of row 0 */
        add    a2, a3, v1            /* row 0 = e0 + o0 */
        sub    a3, a3, v1            /* row 7 = e0 - o0 */
        and    v1, ip, #255
        aclip  a2, v1, a2, asr #COL_SHIFT
        add    v1, a4, v2
        mov    v1, v1, asr #COL_SHIFT
        aclip  v1, v1, ip, lsr #8
        orr    a2, a2, v1, lsl #8
        ldr    v1, [sp, #32]         /* v1 = row step */
        sub    a4, a4, v2
        rsb    v2, v1, v1, lsl #3    /* v2 = 7 * row step */
        ldrh_pre ip, v2, lr          /* existing pixels of row 7 */
        strh   a2, [lr]              /* row 0 */
        and    a2, ip, #255
        aclip  a3, a2, a3, asr #COL_SHIFT
        mov    a4, a4, asr #COL_SHIFT
        aclip  a4, a4, ip, lsr #8
        add    a2, lr, #2
        str    a2, [sp, #28]         /* next call starts two pixels further on */
        orr    a2, a3, a4, lsl #8
        strh   a2, [v2]              /* row 7 */

        ldmfd  sp!, {a3, a4}         /* e1 of both columns */
        ldrh_pre ip, lr, v1          /* existing pixels of row 1 */
        sub    a2, a3, v3            /* row 1 = e1 + o1 (v3/v4 hold -o1) */
        add    a3, a3, v3            /* row 6 = e1 - o1 */
        and    v3, ip, #255
        aclip  a2, v3, a2, asr #COL_SHIFT
        sub    v3, a4, v4
        mov    v3, v3, asr #COL_SHIFT
        aclip  v3, v3, ip, lsr #8
        orr    a2, a2, v3, lsl #8
        add    a4, a4, v4
        ldrh_dpre ip, v2, v1         /* existing pixels of row 6 */
        strh   a2, [lr]              /* row 1 */
        and    a2, ip, #255
        aclip  a3, a2, a3, asr #COL_SHIFT
        mov    a4, a4, asr #COL_SHIFT
        aclip  a4, a4, ip, lsr #8
        orr    a2, a3, a4, lsl #8
        strh   a2, [v2]              /* row 6 */

        ldmfd  sp!, {a3, a4}         /* e2 of both columns */
        ldrh_pre ip, lr, v1          /* existing pixels of row 2 */
        add    a2, a3, v5            /* row 2 = e2 + o2 */
        sub    a3, a3, v5            /* row 5 = e2 - o2 */
        and    v3, ip, #255
        aclip  a2, v3, a2, asr #COL_SHIFT
        add    v3, a4, v6
        mov    v3, v3, asr #COL_SHIFT
        aclip  v3, v3, ip, lsr #8
        orr    a2, a2, v3, lsl #8
        sub    a4, a4, v6
        ldrh_dpre ip, v2, v1         /* existing pixels of row 5 */
        strh   a2, [lr]              /* row 2 */
        and    a2, ip, #255
        aclip  a3, a2, a3, asr #COL_SHIFT
        mov    a4, a4, asr #COL_SHIFT
        aclip  a4, a4, ip, lsr #8
        orr    a2, a3, a4, lsl #8
        strh   a2, [v2]              /* row 5 */

        ldmfd  sp!, {a3, a4}         /* e3 of both columns */
        ldrh_pre ip, lr, v1          /* existing pixels of row 3 */
        add    a2, a3, v7            /* row 3 = e3 + o3 */
        sub    a3, a3, v7            /* row 4 = e3 - o3 */
        and    v3, ip, #255
        aclip  a2, v3, a2, asr #COL_SHIFT
        add    v3, a4, fp
        mov    v3, v3, asr #COL_SHIFT
        aclip  v3, v3, ip, lsr #8
        orr    a2, a2, v3, lsl #8
        sub    a4, a4, fp
        ldrh_dpre ip, v2, v1         /* existing pixels of row 4 */
        strh   a2, [lr]              /* row 3 */
        and    a2, ip, #255
        aclip  a3, a2, a3, asr #COL_SHIFT
        mov    a4, a4, asr #COL_SHIFT
        aclip  a4, a4, ip, lsr #8
        orr    a2, a3, a4, lsl #8
        strh   a2, [v2]              /* row 4 */

        ldr    pc, [sp], #4
endfunc
512
/*
 * ff_simple_idct_armv5te - in-place 8x8 inverse DCT.
 *
 * In:       a1 = address of the block of 16-bit coefficients, eight rows of
 *                16 bytes each.
 * Out:      the block is overwritten with the transformed values.
 * Saves and restores v1-v7 and fp; a1-a4 and ip are clobbered (a1 is left
 * advanced by 12 bytes).
 *
 * The .rept blocks expand to the same straight-line call sequence as
 * writing every call out by hand; the last call of each pass is kept
 * outside the loop because no pointer step follows it.
 */
function ff_simple_idct_armv5te, export=1
        stmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, lr}

        /* Row pass: eight rows, 16 bytes apart. */
        .rept  7
        bl     idct_row_armv5te
        add    a1, a1, #16
        .endr
        bl     idct_row_armv5te

        sub    a1, a1, #(16*7)       /* back to the first row */

        /* Column pass: four calls, two columns (4 bytes) per call. */
        .rept  3
        bl     idct_col_armv5te
        add    a1, a1, #4
        .endr
        bl     idct_col_armv5te

        ldmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
endfunc
544
/*
 * ff_simple_idct_add_armv5te - 8x8 inverse DCT whose result is added to the
 * bytes already present at a destination, with clamping to 0..255.
 *
 * In:       a1 = destination address, a2 = value added to the address to
 *                reach the next destination row, a3 = address of the block
 *                of 16-bit coefficients (eight rows of 16 bytes).
 *           NOTE(review): presumably (uint8_t *dest, stride, int16_t *block)
 *           on the C side - confirm against the prototype.
 * Out:      destination updated; the coefficient block is overwritten by the
 *           row pass.
 * Saves and restores v1-v7 and fp; a1-a4 and ip are clobbered.
 *
 * a1 and a2 are pushed below the saved registers so that
 * idct_col_add_armv5te can read them from the stack (it also advances the
 * stored destination after each call).  They are discarded with
 * "add sp, sp, #8" before the registers are restored.
 *
 * The .rept blocks expand to the same straight-line call sequence as
 * writing every call out by hand.
 */
function ff_simple_idct_add_armv5te, export=1
        stmfd  sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}

        mov    a1, a3                /* a1 = coefficient block */

        /* Row pass: eight rows, 16 bytes apart. */
        .rept  7
        bl     idct_row_armv5te
        add    a1, a1, #16
        .endr
        bl     idct_row_armv5te

        sub    a1, a1, #(16*7)       /* back to the first row */

        /* Column pass: four calls, two columns (4 bytes) per call. */
        .rept  3
        bl     idct_col_add_armv5te
        add    a1, a1, #4
        .endr
        bl     idct_col_add_armv5te

        add    sp, sp, #8            /* drop the saved a1/a2 slots */
        ldmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
endfunc
579
/*
 * ff_simple_idct_put_armv5te - 8x8 inverse DCT whose result is clamped to
 * 0..255 and stored as bytes to a destination.
 *
 * In:       a1 = destination address, a2 = value added to the address to
 *                reach the next destination row, a3 = address of the block
 *                of 16-bit coefficients (eight rows of 16 bytes).
 *           NOTE(review): presumably (uint8_t *dest, stride, int16_t *block)
 *           on the C side - confirm against the prototype.
 * Out:      destination written; the coefficient block is overwritten by the
 *           row pass.
 * Saves and restores v1-v7 and fp; a1-a4 and ip are clobbered.
 *
 * a1 and a2 are pushed below the saved registers so that
 * idct_col_put_armv5te can read them from the stack (it also advances the
 * stored destination after each call).  They are discarded with
 * "add sp, sp, #8" before the registers are restored.
 *
 * The .rept blocks expand to the same straight-line call sequence as
 * writing every call out by hand.
 */
function ff_simple_idct_put_armv5te, export=1
        stmfd  sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}

        mov    a1, a3                /* a1 = coefficient block */

        /* Row pass: eight rows, 16 bytes apart. */
        .rept  7
        bl     idct_row_armv5te
        add    a1, a1, #16
        .endr
        bl     idct_row_armv5te

        sub    a1, a1, #(16*7)       /* back to the first row */

        /* Column pass: four calls, two columns (4 bytes) per call. */
        .rept  3
        bl     idct_col_put_armv5te
        add    a1, a1, #4
        .endr
        bl     idct_col_put_armv5te

        add    sp, sp, #8            /* drop the saved a1/a2 slots */
        ldmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
endfunc
614