/*
 * Fixed Point IMDCT
 * Copyright (c) 2002 The FFmpeg Project.
 * Copyright (c) 2010 Dave Hooper, Mohamed Tarek, Michael Giacomelli
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include "mdct.h"
#include "codeclib_misc.h"
#include "mdct_lookup.h"
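
/* A reading aid (not part of the original build): the loops below lean on the
 * fixed-point complex-multiply macros from codeclib_misc.h. As a sketch,
 * assuming the Tremor semantics where MULT31() is a 31-bit fractional
 * multiply, XNPROD31() behaves like:
 *
 *     XNPROD31(a, b, t, v, x, y):
 *         *x = MULT31(a, t) - MULT31(b, v);
 *         *y = MULT31(b, t) + MULT31(a, v);
 *
 * i.e. (*x) + i*(*y) = (a + i*b) * (t + i*v), a complex multiply by the
 * twiddle factor (t + i*v); XNPROD31_R() is the same computation on plain
 * register operands rather than through pointers.
 */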

/* Used to give GCC hints on which branch is most likely taken */
#if defined(__GNUC__) && __GNUC__ >= 3
#define LIKELY(x)   __builtin_expect(!!(x), 1)
#define UNLIKELY(x) __builtin_expect(!!(x), 0)
#else
#define LIKELY(x)   (x)
#define UNLIKELY(x) (x)
#endif

#ifndef ICODE_ATTR_TREMOR_MDCT
#define ICODE_ATTR_TREMOR_MDCT ICODE_ATTR
#endif

/**
 * Compute the middle half of the inverse MDCT of size N = 2^nbits,
 * thus excluding the parts that can be derived by symmetry.
 * @param output N/2 samples
 * @param input N/2 samples
 *
 * NOTE - CANNOT CURRENTLY OPERATE IN PLACE (input and output must
 *                                           not overlap or intersect at all)
 */
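/* Illustrative sizes: for nbits = 9 the transform size is N = 512, so input
   and output each hold 256 fixed32 samples, and output is reinterpreted
   internally as 128 FFTComplex values for the size-N/4 FFT. */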
#define ICODE_ATTR
void ff_imdct_half(unsigned int nbits, fixed32 *output, const fixed32 *input) ICODE_ATTR_TREMOR_MDCT;
void ff_imdct_half(unsigned int nbits, fixed32 *output, const fixed32 *input)
{
    int n8, n4, n2, n, j;
    const fixed32 *in1, *in2;
    (void)j; /* j is only used by the C fallback paths below */
    n = 1 << nbits;

    n2 = n >> 1;
    n4 = n >> 2;
    n8 = n >> 3;

    FFTComplex *z = (FFTComplex *)output;

    /* pre rotation */
    in1 = input;
    in2 = input + n2 - 1;

    /* revtab comes from the FFT; the revtab table is sized for an N=4096 FFT
       (2^12 entries). Our FFT is size N/4, i.e. 2^(nbits-2), so the shift
       needs to be 12-(nbits-2) = 14-nbits. */
    const int revtab_shift = (14 - nbits);
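    /* e.g. for nbits = 11 (N = 2048) the FFT is 2^(11-2) = 512 points, so
       entries of the 2^12-entry revtab are shifted down by 14-11 = 3. */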

    /* bit-reverse reorder the input and rotate; the result lands in OUTPUT ... */
    /* (note that with the current split radix, the bit-reverse ordering is
        complex, meaning that this reordering cannot easily be done in place) */
    /* The following paper shows that the 'classic' pre/post rotation can be
       rearranged into an alternative one that lets us use fewer distinct
       twiddle factors:
       http://www.eurasip.org/Proceedings/Eusipco/Eusipco2006/papers/1568980508.pdf

       For prerotation, the factors are just sin,cos(2PI*i/N)
       For postrotation, the factors are sin,cos(2PI*(i+1/4)/N)

       Therefore, prerotation can immediately reuse the same twiddles as the
       fft (postrotation is still a bit more involved: we reuse the fft trig
       tables where we can, use a special table for N=2048, or interpolate
       between trig tables for N>2048).
       */
    const int32_t *T = sincos_lookup0;
    const int step = 2<<(12-nbits);
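    /* sincos_lookup0 is laid out densely for the nbits = 12 case; smaller
       transforms stride across it: nbits = 12 gives step = 2 (consecutive
       twiddle pairs), nbits = 11 gives step = 4, doubling each time N halves. */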
    const uint16_t *p_revtab = revtab;
    {
        const uint16_t * const p_revtab_end = p_revtab + n8;
#ifdef CPU_COLDFIRE
        asm volatile ("move.l (%[in2]), %%d0\n\t"
                      "move.l (%[in1]), %%d1\n\t"
                      "bra.s 1f\n\t"
                      "0:\n\t"
                      "movem.l (%[T]), %%d2-%%d3\n\t"

                      "addq.l #8, %[in1]\n\t"
                      "subq.l #8, %[in2]\n\t"

                      "lea (%[step]*4, %[T]), %[T]\n\t"

                      "mac.l %%d0, %%d3, (%[T]), %%d4, %%acc0;"
                      "msac.l %%d1, %%d2, (4, %[T]), %%d5, %%acc0;"
                      "mac.l %%d1, %%d3, (%[in1]), %%d1, %%acc1;"
                      "mac.l %%d0, %%d2, (%[in2]), %%d0, %%acc1;"

                      "addq.l #8, %[in1]\n\t"
                      "subq.l #8, %[in2]\n\t"

                      "mac.l %%d0, %%d5, %%acc2;"
                      "msac.l %%d1, %%d4, (%[p_revtab])+, %%d2, %%acc2;"
                      "mac.l %%d1, %%d5, (%[in1]), %%d1, %%acc3;"
                      "mac.l %%d0, %%d4, (%[in2]), %%d0, %%acc3;"

                      "clr.l %%d3\n\t"
                      "move.w %%d2, %%d3\n\t"
                      "eor.l %%d3, %%d2\n\t"
                      "swap %%d2\n\t"
                      "lsr.l %[revtab_shift], %%d2\n\t"

                      "movclr.l %%acc0, %%d4;"
                      "movclr.l %%acc1, %%d5;"
                      "lsl.l #3, %%d2\n\t"
                      "lea (%%d2, %[z]), %%a1\n\t"
                      "movem.l %%d4-%%d5, (%%a1)\n\t"

                      "lsr.l %[revtab_shift], %%d3\n\t"

                      "movclr.l %%acc2, %%d4;"
                      "movclr.l %%acc3, %%d5;"
                      "lsl.l #3, %%d3\n\t"
                      "lea (%%d3, %[z]), %%a1\n\t"
                      "movem.l %%d4-%%d5, (%%a1)\n\t"

                      "lea (%[step]*4, %[T]), %[T]\n\t"

                      "1:\n\t"
                      "cmp.l %[p_revtab_end], %[p_revtab]\n\t"
                      "bcs.s 0b\n\t"
                      : [in1] "+a" (in1), [in2] "+a" (in2), [T] "+a" (T),
                        [p_revtab] "+a" (p_revtab)
                      : [z] "a" (z), [step] "d" (step), [revtab_shift] "d" (revtab_shift),
                        [p_revtab_end] "r" (p_revtab_end)
                      : "d0", "d1", "d2", "d3", "d4", "d5", "a1", "cc", "memory");
#else
        while(LIKELY(p_revtab < p_revtab_end))
        {
            j = (*p_revtab)>>revtab_shift;
            XNPROD31(*in2, *in1, T[1], T[0], &z[j].re, &z[j].im);
            T += step;
            in1 += 2;
            in2 -= 2;
            p_revtab++;
            j = (*p_revtab)>>revtab_shift;
            XNPROD31(*in2, *in1, T[1], T[0], &z[j].re, &z[j].im);
            T += step;
            in1 += 2;
            in2 -= 2;
            p_revtab++;
        }
#endif
    }
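    /* Second n/8 chunk of the pre-rotation: same structure, but T now walks
       back down the quarter-wave table (note the negated [step] constraint
       in the asm and "T -= step" in the C loop) with the roles of T[0] and
       T[1] exchanged in XNPROD31, so one table serves the whole rotation. */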
    {
        const uint16_t * const p_revtab_end = p_revtab + n8;
#ifdef CPU_COLDFIRE
        asm volatile ("move.l (%[in2]), %%d0\n\t"
                      "move.l (%[in1]), %%d1\n\t"
                      "bra.s 1f\n\t"
                      "0:\n\t"
                      "movem.l (%[T]), %%d2-%%d3\n\t"

                      "addq.l #8, %[in1]\n\t"
                      "subq.l #8, %[in2]\n\t"

                      "lea (%[step]*4, %[T]), %[T]\n\t"

                      "mac.l %%d0, %%d2, (%[T]), %%d4, %%acc0;"
                      "msac.l %%d1, %%d3, (4, %[T]), %%d5, %%acc0;"
                      "mac.l %%d1, %%d2, (%[in1]), %%d1, %%acc1;"
                      "mac.l %%d0, %%d3, (%[in2]), %%d0, %%acc1;"

                      "addq.l #8, %[in1]\n\t"
                      "subq.l #8, %[in2]\n\t"

                      "mac.l %%d0, %%d4, %%acc2;"
                      "msac.l %%d1, %%d5, (%[p_revtab])+, %%d2, %%acc2;"
                      "mac.l %%d1, %%d4, (%[in1]), %%d1, %%acc3;"
                      "mac.l %%d0, %%d5, (%[in2]), %%d0, %%acc3;"

                      "clr.l %%d3\n\t"
                      "move.w %%d2, %%d3\n\t"
                      "eor.l %%d3, %%d2\n\t"
                      "swap %%d2\n\t"
                      "lsr.l %[revtab_shift], %%d2\n\t"

                      "movclr.l %%acc0, %%d4;"
                      "movclr.l %%acc1, %%d5;"
                      "lsl.l #3, %%d2\n\t"
                      "lea (%%d2, %[z]), %%a1\n\t"
                      "movem.l %%d4-%%d5, (%%a1)\n\t"

                      "lsr.l %[revtab_shift], %%d3\n\t"

                      "movclr.l %%acc2, %%d4;"
                      "movclr.l %%acc3, %%d5;"
                      "lsl.l #3, %%d3\n\t"
                      "lea (%%d3, %[z]), %%a1\n\t"
                      "movem.l %%d4-%%d5, (%%a1)\n\t"

                      "lea (%[step]*4, %[T]), %[T]\n\t"

                      "1:\n\t"
                      "cmp.l %[p_revtab_end], %[p_revtab]\n\t"
                      "bcs.s 0b\n\t"
                      : [in1] "+a" (in1), [in2] "+a" (in2), [T] "+a" (T),
                        [p_revtab] "+a" (p_revtab)
                      : [z] "a" (z), [step] "d" (-step), [revtab_shift] "d" (revtab_shift),
                        [p_revtab_end] "r" (p_revtab_end)
                      : "d0", "d1", "d2", "d3", "d4", "d5", "a1", "cc", "memory");
#else
        while(LIKELY(p_revtab < p_revtab_end))
        {
            j = (*p_revtab)>>revtab_shift;
            XNPROD31(*in2, *in1, T[0], T[1], &z[j].re, &z[j].im);
            T -= step;
            in1 += 2;
            in2 -= 2;
            p_revtab++;
            j = (*p_revtab)>>revtab_shift;
            XNPROD31(*in2, *in1, T[0], T[1], &z[j].re, &z[j].im);
            T -= step;
            in1 += 2;
            in2 -= 2;
            p_revtab++;
        }
#endif
    }


    /* ... and so the fft runs in the OUTPUT buffer */
    ff_fft_calc_c(nbits-2, z);

    /* Post rotation + reordering: now keeps the result within the OUTPUT buffer */
    switch( nbits )
    {
        default:
        {
            fixed32 * z1 = (fixed32 *)(&z[0]);
            int magic_step = step>>2;
            int newstep;
            if(n<=1024)
            {
                T = sincos_lookup0 + magic_step;
                newstep = step>>1;
            }
            else
            {
                T = sincos_lookup1;
                newstep = 2;
            }
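            /* Illustrative numbers: for n = 512 (nbits = 9), step = 16, so T
               starts at sincos_lookup0 + 4 and advances by newstep = 8 per
               twiddle; for n = 2048 the special sincos_lookup1 table
               (mentioned in the comment above) is walked directly instead. */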

#ifdef CPU_COLDFIRE
            fixed32 * z2 = (fixed32 *)(&z[n4]);
            int c = n4;
            if (newstep == 2)
            {
                asm volatile ("movem.l (%[z1]), %%d0-%%d1\n\t"
                              "addq.l #8, %[z1]\n\t"
                              "movem.l (%[T]), %%d2-%%d3\n\t"
                              "addq.l #8, %[T]\n\t"
                              "bra.s 1f\n\t"
                              "0:\n\t"
                              "msac.l %%d1, %%d2, (%[T])+, %%a3, %%acc0\n\t"
                              "mac.l  %%d0, %%d3, (%[T])+, %%a4, %%acc0\n\t"

                              "msac.l %%d1, %%d3, -(%[z2]), %%d1, %%acc1\n\t"
                              "msac.l %%d0, %%d2, -(%[z2]), %%d0, %%acc1\n\t"

                              "msac.l %%d1, %%a4, (%[T])+, %%d2, %%acc2\n\t"
                              "mac.l  %%d0, %%a3, (%[T])+, %%d3, %%acc2\n\t"
                              "msac.l %%d0, %%a4, (%[z1])+, %%d0, %%acc3\n\t"
                              "msac.l %%d1, %%a3, (%[z1])+, %%d1, %%acc3\n\t"

                              "movclr.l %%acc0, %%a3\n\t"
                              "movclr.l %%acc3, %%a4\n\t"
                              "movem.l %%a3-%%a4, (-16, %[z1])\n\t"

                              "movclr.l %%acc1, %%a4\n\t"
                              "movclr.l %%acc2, %%a3\n\t"
                              "movem.l %%a3-%%a4, (%[z2])\n\t"

                              "subq.l #2, %[n]\n\t"
                              "1:\n\t"
                              "bhi.s 0b\n\t"
                              : [z1] "+a" (z1), [z2] "+a" (z2), [T] "+a" (T), [n] "+d" (c)
                              :
                              : "d0", "d1", "d2", "d3", "a3", "a4", "cc", "memory");
            }
            else
            {
                asm volatile ("movem.l (%[z1]), %%d0-%%d1\n\t"
                              "addq.l #8, %[z1]\n\t"
                              "movem.l (%[T]), %%d2-%%d3\n\t"
                              "lea (%[newstep]*4, %[T]), %[T]\n\t"
                              "bra.s 1f\n\t"
                              "0:\n\t"
                              "msac.l %%d1, %%d2, (%[T]), %%a3, %%acc0\n\t"
                              "mac.l  %%d0, %%d3, (4, %[T]), %%a4, %%acc0\n\t"
                              "msac.l %%d1, %%d3, -(%[z2]), %%d1, %%acc1\n\t"
                              "msac.l %%d0, %%d2, -(%[z2]), %%d0, %%acc1\n\t"

                              "lea (%[newstep]*4, %[T]), %[T]\n\t"
                              "msac.l %%d1, %%a4, (%[T]), %%d2, %%acc2\n\t"
                              "mac.l  %%d0, %%a3, (4, %[T]), %%d3, %%acc2\n\t"
                              "msac.l %%d0, %%a4, (%[z1])+, %%d0, %%acc3\n\t"
                              "msac.l %%d1, %%a3, (%[z1])+, %%d1, %%acc3\n\t"

                              "lea (%[newstep]*4, %[T]), %[T]\n\t"

                              "movclr.l %%acc0, %%a3\n\t"
                              "movclr.l %%acc3, %%a4\n\t"
                              "movem.l %%a3-%%a4, (-16, %[z1])\n\t"

                              "movclr.l %%acc1, %%a4\n\t"
                              "movclr.l %%acc2, %%a3\n\t"
                              "movem.l %%a3-%%a4, (%[z2])\n\t"

                              "subq.l #2, %[n]\n\t"
                              "1:\n\t"
                              "bhi.s 0b\n\t"
                              : [z1] "+a" (z1), [z2] "+a" (z2), [T] "+a" (T), [n] "+d" (c)
                              : [newstep] "d" (newstep)
                              : "d0", "d1", "d2", "d3", "a3", "a4", "cc", "memory");
            }
#else
            fixed32 * z2 = (fixed32 *)(&z[n4-1]);
            while(z1<z2)
            {
                fixed32 r0,i0,r1,i1;
                XNPROD31_R(z1[1], z1[0], T[0], T[1], r0, i1); T+=newstep;
                XNPROD31_R(z2[1], z2[0], T[1], T[0], r1, i0); T+=newstep;
                z1[0] = -r0;
                z1[1] = -i0;
                z2[0] = -r1;
                z2[1] = -i1;
                z1+=2;
                z2-=2;
            }
#endif
            break;
        }

        case 12: /* n=4096 */
        {
            /* linear interpolation (50:50) between sincos_lookup0 and sincos_lookup1 */
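            /* The halved loads below sum to (T[i] + V[i]) / 2; assuming (as
               the interpolation implies) that sincos_lookup1 samples the
               angles midway between sincos_lookup0 entries, this average
               approximates the intermediate twiddles that a dedicated
               N = 4096 table would hold. */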
            const int32_t * V = sincos_lookup1;
            T = sincos_lookup0;
            int32_t t0,t1,v0,v1;
            fixed32 * z1 = (fixed32 *)(&z[0]);
            fixed32 * z2 = (fixed32 *)(&z[n4-1]);

            t0 = T[0]>>1; t1 = T[1]>>1;

            while(z1<z2)
            {
                fixed32 r0,i0,r1,i1;
                t0 += (v0 = (V[0]>>1));
                t1 += (v1 = (V[1]>>1));
                XNPROD31_R(z1[1], z1[0], t0, t1, r0, i1);
                T+=2;
                v0 += (t0 = (T[0]>>1));
                v1 += (t1 = (T[1]>>1));
                XNPROD31_R(z2[1], z2[0], v1, v0, r1, i0);
                z1[0] = -r0;
                z1[1] = -i0;
                z2[0] = -r1;
                z2[1] = -i1;
                z1+=2;
                z2-=2;
                V+=2;
            }

            break;
        }

        case 13: /* n = 8192 */
        {
            /* weighted linear interpolation between sincos_lookup0 and sincos_lookup1,
               specifically: 25:75 for the first twiddle and 75:25 for the second */
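            /* Same table-offset assumption as the N = 4096 case: t + (v-t)/2
               and its companion v - q average neighbouring entries from the
               two tables, landing a quarter of a sincos_lookup0 step to
               either side, hence the 25:75 and 75:25 weights named above. */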
            const int32_t * V = sincos_lookup1;
            T = sincos_lookup0;
            int32_t t0,t1,v0,v1,q0,q1;
            fixed32 * z1 = (fixed32 *)(&z[0]);
            fixed32 * z2 = (fixed32 *)(&z[n4-1]);

            t0 = T[0]; t1 = T[1];

            while(z1<z2)
            {
                fixed32 r0,i0,r1,i1;
                v0 = V[0]; v1 = V[1];
                t0 += (q0 = (v0-t0)>>1);
                t1 += (q1 = (v1-t1)>>1);
                XNPROD31_R(z1[1], z1[0], t0, t1, r0, i1);
                t0 = v0-q0;
                t1 = v1-q1;
                XNPROD31_R(z2[1], z2[0], t1, t0, r1, i0);
                z1[0] = -r0;
                z1[1] = -i0;
                z2[0] = -r1;
                z2[1] = -i1;
                z1+=2;
                z2-=2;
                T+=2;

                t0 = T[0]; t1 = T[1];
                v0 += (q0 = (t0-v0)>>1);
                v1 += (q1 = (t1-v1)>>1);
                XNPROD31_R(z1[1], z1[0], v0, v1, r0, i1);
                v0 = t0-q0;
                v1 = t1-q1;
                XNPROD31_R(z2[1], z2[0], v1, v0, r1, i0);
                z1[0] = -r0;
                z1[1] = -i0;
                z2[0] = -r1;
                z2[1] = -i1;
                z1+=2;
                z2-=2;
                V+=2;
            }

            break;
        }
    }
}

/**
 * Compute the inverse MDCT of size N = 2^nbits
 * @param output N samples
 * @param input N/2 samples
 * "In-place" processing can be achieved provided that:
 *            [0  ..  N/2-1 | N/2  ..  N-1 ]
 *            <----input---->
 *            <-----------output----------->
 *
 * The result of ff_imdct_half is to put the 'half' imdct here
 *
 *                          N/2          N-1
 *                          <--half imdct-->
 *
 * We want it here for the full imdct:
 *                   N/4      3N/4-1
 *                   <-------------->
 *
 * In addition we need to apply two symmetries to get the full imdct:
 *
 *           <AAAAAA>                <DDDDDD>
 *                   <BBBBBB><CCCCCC>
 *
 *           D is a reflection of C
 *           A is a reflection of B (but with the sign flipped)
 *
 * We process the symmetries at the same time as we 'move' the half imdct
 * from [N/2,N-1] to [N/4,3N/4-1]
 *
 * TODO: find a way to make ff_imdct_half put the result in [N/4..3N/4-1]
 * This would require being able to use revtab 'in place' (since the input
 * and output of imdct_half would then overlap somewhat)
 */
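/* In-place usage sketch (illustrative values, not from the original source):
 * with nbits = 9 the caller supplies 256 coefficients and receives 512
 * samples; per the layout above, the input may occupy the first half of the
 * output buffer:
 *
 *     fixed32 buf[512];
 *     // buf[0..255] holds the MDCT coefficients on entry
 *     ff_imdct_calc(9, buf, buf);   // buf[0..511] holds the full IMDCT
 */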
void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input) ICODE_ATTR_TREMOR_MDCT;
#ifndef CPU_ARM
void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input)
{
    const int n = (1<<nbits);
    const int n2 = (n>>1);
    const int n4 = (n>>2);

    /* tell imdct_half to put its output in [N/2..3N/4-1], i.e. at output+n2 */
    ff_imdct_half(nbits,output+n2,input);

    fixed32 * in_r, * in_r2, * out_r, * out_r2;

    /* Copy BBBB to AAAA, reflected and sign-flipped.
       Also copy BBBB to its correct destination (from [N/2..3N/4-1] to [N/4..N/2-1]) */
    out_r = output;
    out_r2 = output+n2-8;
    in_r  = output+n2+n4-8;
    while(out_r<out_r2)
    {
#if defined CPU_COLDFIRE
        asm volatile(
            "movem.l (%[in_r]), %%d0-%%d7\n\t"
            "movem.l %%d0-%%d7, (%[out_r2])\n\t"
            "neg.l %%d7\n\t"
            "move.l %%d7, (%[out_r])+\n\t"
            "neg.l %%d6\n\t"
            "move.l %%d6, (%[out_r])+\n\t"
            "neg.l %%d5\n\t"
            "move.l %%d5, (%[out_r])+\n\t"
            "neg.l %%d4\n\t"
            "move.l %%d4, (%[out_r])+\n\t"
            "neg.l %%d3\n\t"
            "move.l %%d3, (%[out_r])+\n\t"
            "neg.l %%d2\n\t"
            "move.l %%d2, (%[out_r])+\n\t"
            "lea.l (-8*4, %[in_r]), %[in_r]\n\t"
            "neg.l %%d1\n\t"
            "move.l %%d1, (%[out_r])+\n\t"
            "lea.l (-8*4, %[out_r2]), %[out_r2]\n\t"
            "neg.l %%d0\n\t"
            "move.l %%d0, (%[out_r])+\n\t"
            : [in_r] "+a" (in_r), [out_r] "+a" (out_r), [out_r2] "+a" (out_r2)
            :
            : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "cc", "memory" );
#else
        out_r[0]     = -(out_r2[7] = in_r[7]);
        out_r[1]     = -(out_r2[6] = in_r[6]);
        out_r[2]     = -(out_r2[5] = in_r[5]);
        out_r[3]     = -(out_r2[4] = in_r[4]);
        out_r[4]     = -(out_r2[3] = in_r[3]);
        out_r[5]     = -(out_r2[2] = in_r[2]);
        out_r[6]     = -(out_r2[1] = in_r[1]);
        out_r[7]     = -(out_r2[0] = in_r[0]);
        in_r -= 8;
        out_r += 8;
        out_r2 -= 8;
#endif
    }
    in_r = output + n2+n4;
    in_r2 = output + n-4;
    out_r = output + n2;
    out_r2 = output + n2 + n4 - 4;
    while(in_r<in_r2)
    {
#if defined CPU_COLDFIRE
        asm volatile(
            "movem.l (%[in_r]), %%d0-%%d3\n\t"
            "movem.l %%d0-%%d3, (%[out_r])\n\t"
            "movem.l (%[in_r2]), %%d4-%%d7\n\t"
            "movem.l %%d4-%%d7, (%[out_r2])\n\t"
            "move.l %%d0, %%a3\n\t"
            "move.l %%d3, %%d0\n\t"
            "move.l %%d1, %%d3\n\t"
            "movem.l %%d0/%%d2-%%d3/%%a3, (%[in_r2])\n\t"
            "move.l %%d7, %%d1\n\t"
            "move.l %%d6, %%d2\n\t"
            "move.l %%d5, %%d3\n\t"
            "movem.l %%d1-%%d4, (%[in_r])\n\t"
            "lea.l (4*4, %[in_r]), %[in_r]\n\t"
            "lea.l (-4*4, %[in_r2]), %[in_r2]\n\t"
            "lea.l (4*4, %[out_r]), %[out_r]\n\t"
            "lea.l (-4*4, %[out_r2]), %[out_r2]\n\t"
            : [in_r] "+a" (in_r), [in_r2] "+a" (in_r2),
              [out_r] "+a" (out_r), [out_r2] "+a" (out_r2)
            :
            : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "a3", "memory", "cc" );
#else
        register fixed32 t0,t1,t2,t3;
        register fixed32 s0,s1,s2,s3;

        /* Copy and reflect CCCC to DDDD. Because CCCC is already where we
           actually want to put DDDD, this is a bit complicated.
         * So simultaneously do the following things:
         * 1. copy range from [n2+n4 .. n-1] to range [n2 .. n2+n4-1]
         * 2. reflect range from [n2+n4 .. n-1] in place
         *
         *  [                      |                        ]
         *   ^a ->            <- ^b ^c ->               <- ^d
         *
         *  #1: copy from ^c to ^a
         *  #2: copy from ^d to ^b
         *  #3: swap ^c and ^d in place
         */
        /* #1 pt1 : load 4 words from ^c. */
        t0=in_r[0]; t1=in_r[1]; t2=in_r[2]; t3=in_r[3];
        /* #1 pt2 : write to ^a */
        out_r[0]=t0;out_r[1]=t1;out_r[2]=t2;out_r[3]=t3;
        /* #2 pt1 : load 4 words from ^d */
        s0=in_r2[0];s1=in_r2[1];s2=in_r2[2];s3=in_r2[3];
        /* #2 pt2 : write to ^b */
        out_r2[0]=s0;out_r2[1]=s1;out_r2[2]=s2;out_r2[3]=s3;
        /* #3 pt1 : write words from #2 to ^c */
        in_r[0]=s3;in_r[1]=s2;in_r[2]=s1;in_r[3]=s0;
        /* #3 pt2 : write words from #1 to ^d */
        in_r2[0]=t3;in_r2[1]=t2;in_r2[2]=t1;in_r2[3]=t0;

        in_r += 4;
        in_r2 -= 4;
        out_r += 4;
        out_r2 -= 4;
#endif
    }
}
#else
/* Follows the same structure as the canonical version above */
void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input)
{
    const int n = (1<<nbits);
    const int n2 = (n>>1);
    const int n4 = (n>>2);

    ff_imdct_half(nbits,output+n2,input);

    fixed32 * in_r, * in_r2, * out_r, * out_r2;

    out_r = output;
    out_r2 = output+n2;
    in_r  = output+n2+n4;
    while(out_r<out_r2)
    {
        asm volatile(
            "ldmdb %[in_r]!, {r0-r7}\n\t"
            "stmdb %[out_r2]!, {r0-r7}\n\t"
            "rsb r8,r0,#0\n\t"
            "rsb r0,r7,#0\n\t"
            "rsb r7,r1,#0\n\t"
            "rsb r1,r6,#0\n\t"
            "rsb r6,r2,#0\n\t"
            "rsb r2,r5,#0\n\t"
            "rsb r5,r3,#0\n\t"
            "rsb r3,r4,#0\n\t"
            "stmia %[out_r]!, {r0-r3,r5-r8}\n\t"
            : [in_r] "+r" (in_r), [out_r] "+r" (out_r), [out_r2] "+r" (out_r2)
            :
            : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "memory" );
    }
    in_r = output + n2+n4;
    in_r2 = output + n;
    out_r = output + n2;
    out_r2 = output + n2 + n4;
    while(in_r<in_r2)
    {
        asm volatile(
            "ldmia %[in_r], {r0-r3}\n\t"
            "stmia %[out_r]!, {r0-r3}\n\t"
            "ldmdb %[in_r2], {r5-r8}\n\t"
            "stmdb %[out_r2]!, {r5-r8}\n\t"
            "mov r4,r0\n\t"
            "mov r0,r3\n\t"
            "mov r3,r1\n\t"
            "stmdb %[in_r2]!, {r0,r2,r3,r4}\n\t"
            "mov r4,r8\n\t"
            "mov r8,r5\n\t"
            "mov r5,r7\n\t"
            "stmia %[in_r]!, {r4,r5,r6,r8}\n\t"
            :
            [in_r] "+r" (in_r), [in_r2] "+r" (in_r2), [out_r] "+r" (out_r), [out_r2] "+r" (out_r2)
            :
            : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "memory" );
    }
}
#endif