/*Copyright (c) 2003-2004, Mark Borgerding
  Lots of modifications by Jean-Marc Valin
  Copyright (c) 2005-2007, Xiph.Org Foundation
  Copyright (c) 2008,      Xiph.Org Foundation, CSIRO

  All rights reserved.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright notice,
      this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  POSSIBILITY OF SUCH DAMAGE.*/

/* This code is originally from Mark Borgerding's KISS-FFT but has been
   heavily modified to better suit Opus */

#ifndef SKIP_CONFIG_H
#  ifdef HAVE_CONFIG_H
#    include "config.h"
#  endif
#endif

#include "_kiss_fft_guts.h"
#define CUSTOM_MODES
/* The guts header contains all the multiplication and addition macros that are defined for
   complex numbers.  It also declares the kf_ internal functions.
*/

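/* Radix-2 butterfly, specialised for the two cases that actually occur in
   Opus: m==1 (plain 2-point butterflies, only compiled for custom modes) and
   m==4 (a radix-2 stage that immediately follows a radix-4 stage, so its three
   non-trivial twiddles e^{-i*pi/4}, e^{-i*pi/2} and e^{-i*3*pi/4} can be
   hard-coded instead of being read from the twiddle table). */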
static void kf_bfly2(
                     kiss_fft_cpx * Fout,
                     int m,
                     int N
                    )
{
   kiss_fft_cpx * Fout2;
   int i;
   (void)m;
#ifdef CUSTOM_MODES
   if (m==1)
   {
      celt_assert(m==1);
      for (i=0;i<N;i++)
      {
         kiss_fft_cpx t;
         Fout2 = Fout + 1;
         t = *Fout2;
         C_SUB( *Fout2 ,  *Fout , t );
         C_ADDTO( *Fout ,  t );
         Fout += 2;
      }
   } else
#endif
   {
      opus_val16 tw;
      tw = QCONST16(0.7071067812f, 15);
      /* We know that m==4 here because the radix-2 is just after a radix-4 */
      celt_assert(m==4);
      for (i=0;i<N;i++)
      {
         kiss_fft_cpx t;
         Fout2 = Fout + 4;
         t = Fout2[0];
         C_SUB( Fout2[0] ,  Fout[0] , t );
         C_ADDTO( Fout[0] ,  t );

         t.r = S_MUL(ADD32_ovflw(Fout2[1].r, Fout2[1].i), tw);
         t.i = S_MUL(SUB32_ovflw(Fout2[1].i, Fout2[1].r), tw);
         C_SUB( Fout2[1] ,  Fout[1] , t );
         C_ADDTO( Fout[1] ,  t );

         t.r = Fout2[2].i;
         t.i = -Fout2[2].r;
         C_SUB( Fout2[2] ,  Fout[2] , t );
         C_ADDTO( Fout[2] ,  t );

         t.r = S_MUL(SUB32_ovflw(Fout2[3].i, Fout2[3].r), tw);
         t.i = S_MUL(NEG32_ovflw(ADD32_ovflw(Fout2[3].i, Fout2[3].r)), tw);
         C_SUB( Fout2[3] ,  Fout[3] , t );
         C_ADDTO( Fout[3] ,  t );
         Fout += 8;
      }
   }
}

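/* Radix-4 butterfly. The m==1 path needs no twiddle multiplies at all; the
   general path walks three twiddle pointers (tw1, tw2, tw3) through the shared
   table at strides fstride, 2*fstride and 3*fstride. */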
static void kf_bfly4(
                     kiss_fft_cpx * Fout,
                     const size_t fstride,
                     const kiss_fft_state *st,
                     int m,
                     int N,
                     int mm
                    )
{
   int i;

   if (m==1)
   {
      /* Degenerate case where all the twiddles are 1. */
      for (i=0;i<N;i++)
      {
         kiss_fft_cpx scratch0, scratch1;

         C_SUB( scratch0 , *Fout, Fout[2] );
         C_ADDTO(*Fout, Fout[2]);
         C_ADD( scratch1 , Fout[1] , Fout[3] );
         C_SUB( Fout[2], *Fout, scratch1 );
         C_ADDTO( *Fout , scratch1 );
         C_SUB( scratch1 , Fout[1] , Fout[3] );

         Fout[1].r = ADD32_ovflw(scratch0.r, scratch1.i);
         Fout[1].i = SUB32_ovflw(scratch0.i, scratch1.r);
         Fout[3].r = SUB32_ovflw(scratch0.r, scratch1.i);
         Fout[3].i = ADD32_ovflw(scratch0.i, scratch1.r);
         Fout+=4;
      }
   } else {
      int j;
      kiss_fft_cpx scratch[6];
      const kiss_twiddle_cpx *tw1,*tw2,*tw3;
      const int m2=2*m;
      const int m3=3*m;
      kiss_fft_cpx * Fout_beg = Fout;
      for (i=0;i<N;i++)
      {
         Fout = Fout_beg + i*mm;
         tw3 = tw2 = tw1 = st->twiddles;
         /* m is guaranteed to be a multiple of 4. */
         for (j=0;j<m;j++)
         {
            C_MUL(scratch[0],Fout[m] , *tw1 );
            C_MUL(scratch[1],Fout[m2] , *tw2 );
            C_MUL(scratch[2],Fout[m3] , *tw3 );

            C_SUB( scratch[5] , *Fout, scratch[1] );
            C_ADDTO(*Fout, scratch[1]);
            C_ADD( scratch[3] , scratch[0] , scratch[2] );
            C_SUB( scratch[4] , scratch[0] , scratch[2] );
            C_SUB( Fout[m2], *Fout, scratch[3] );
            tw1 += fstride;
            tw2 += fstride*2;
            tw3 += fstride*3;
            C_ADDTO( *Fout , scratch[3] );

            Fout[m].r = ADD32_ovflw(scratch[5].r, scratch[4].i);
            Fout[m].i = SUB32_ovflw(scratch[5].i, scratch[4].r);
            Fout[m3].r = SUB32_ovflw(scratch[5].r, scratch[4].i);
            Fout[m3].i = ADD32_ovflw(scratch[5].i, scratch[4].r);
            ++Fout;
         }
      }
   }
}


#ifndef RADIX_TWO_ONLY

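/* Radix-3 butterfly. Only the imaginary part of the twiddle e^{-i*2*pi/3} is
   needed; the fixed-point build hard-codes it as -28378 ~= -sin(2*pi/3) in
   Q15, while the float build reads it from the twiddle table. */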
static void kf_bfly3(
                     kiss_fft_cpx * Fout,
                     const size_t fstride,
                     const kiss_fft_state *st,
                     int m,
                     int N,
                     int mm
                    )
{
   int i;
   size_t k;
   const size_t m2 = 2*m;
   const kiss_twiddle_cpx *tw1,*tw2;
   kiss_fft_cpx scratch[5];
   kiss_twiddle_cpx epi3;

   kiss_fft_cpx * Fout_beg = Fout;
#ifdef FIXED_POINT
   /*epi3.r = -16384;*/ /* Unused */
   epi3.i = -28378;
#else
   epi3 = st->twiddles[fstride*m];
#endif
   for (i=0;i<N;i++)
   {
      Fout = Fout_beg + i*mm;
      tw1=tw2=st->twiddles;
      /* For non-custom modes, m is guaranteed to be a multiple of 4. */
      k=m;
      do {

         C_MUL(scratch[1],Fout[m] , *tw1);
         C_MUL(scratch[2],Fout[m2] , *tw2);

         C_ADD(scratch[3],scratch[1],scratch[2]);
         C_SUB(scratch[0],scratch[1],scratch[2]);
         tw1 += fstride;
         tw2 += fstride*2;

         Fout[m].r = SUB32_ovflw(Fout->r, HALF_OF(scratch[3].r));
         Fout[m].i = SUB32_ovflw(Fout->i, HALF_OF(scratch[3].i));

         C_MULBYSCALAR( scratch[0] , epi3.i );

         C_ADDTO(*Fout,scratch[3]);

         Fout[m2].r = ADD32_ovflw(Fout[m].r, scratch[0].i);
         Fout[m2].i = SUB32_ovflw(Fout[m].i, scratch[0].r);

         Fout[m].r = SUB32_ovflw(Fout[m].r, scratch[0].i);
         Fout[m].i = ADD32_ovflw(Fout[m].i, scratch[0].r);

         ++Fout;
      } while(--k);
   }
}


#ifndef OVERRIDE_kf_bfly5
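/* Radix-5 butterfly. ya and yb are the twiddles e^{-i*2*pi/5} and
   e^{-i*4*pi/5}; the fixed-point build hard-codes their Q15 values, the float
   build reads them from the twiddle table. */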
static void kf_bfly5(
                     kiss_fft_cpx * Fout,
                     const size_t fstride,
                     const kiss_fft_state *st,
                     int m,
                     int N,
                     int mm
                    )
{
   kiss_fft_cpx *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
   int i, u;
   kiss_fft_cpx scratch[13];
   const kiss_twiddle_cpx *tw;
   kiss_twiddle_cpx ya,yb;
   kiss_fft_cpx * Fout_beg = Fout;

#ifdef FIXED_POINT
   ya.r = 10126;
   ya.i = -31164;
   yb.r = -26510;
   yb.i = -19261;
#else
   ya = st->twiddles[fstride*m];
   yb = st->twiddles[fstride*2*m];
#endif
   tw=st->twiddles;

   for (i=0;i<N;i++)
   {
      Fout = Fout_beg + i*mm;
      Fout0=Fout;
      Fout1=Fout0+m;
      Fout2=Fout0+2*m;
      Fout3=Fout0+3*m;
      Fout4=Fout0+4*m;

      /* For non-custom modes, m is guaranteed to be a multiple of 4. */
      for ( u=0; u<m; ++u ) {
         scratch[0] = *Fout0;

         C_MUL(scratch[1] ,*Fout1, tw[u*fstride]);
         C_MUL(scratch[2] ,*Fout2, tw[2*u*fstride]);
         C_MUL(scratch[3] ,*Fout3, tw[3*u*fstride]);
         C_MUL(scratch[4] ,*Fout4, tw[4*u*fstride]);

         C_ADD( scratch[7],scratch[1],scratch[4]);
         C_SUB( scratch[10],scratch[1],scratch[4]);
         C_ADD( scratch[8],scratch[2],scratch[3]);
         C_SUB( scratch[9],scratch[2],scratch[3]);

         Fout0->r = ADD32_ovflw(Fout0->r, ADD32_ovflw(scratch[7].r, scratch[8].r));
         Fout0->i = ADD32_ovflw(Fout0->i, ADD32_ovflw(scratch[7].i, scratch[8].i));

         scratch[5].r = ADD32_ovflw(scratch[0].r, ADD32_ovflw(S_MUL(scratch[7].r,ya.r), S_MUL(scratch[8].r,yb.r)));
         scratch[5].i = ADD32_ovflw(scratch[0].i, ADD32_ovflw(S_MUL(scratch[7].i,ya.r), S_MUL(scratch[8].i,yb.r)));

         scratch[6].r =  ADD32_ovflw(S_MUL(scratch[10].i,ya.i), S_MUL(scratch[9].i,yb.i));
         scratch[6].i = NEG32_ovflw(ADD32_ovflw(S_MUL(scratch[10].r,ya.i), S_MUL(scratch[9].r,yb.i)));

         C_SUB(*Fout1,scratch[5],scratch[6]);
         C_ADD(*Fout4,scratch[5],scratch[6]);

         scratch[11].r = ADD32_ovflw(scratch[0].r, ADD32_ovflw(S_MUL(scratch[7].r,yb.r), S_MUL(scratch[8].r,ya.r)));
         scratch[11].i = ADD32_ovflw(scratch[0].i, ADD32_ovflw(S_MUL(scratch[7].i,yb.r), S_MUL(scratch[8].i,ya.r)));
         scratch[12].r = SUB32_ovflw(S_MUL(scratch[9].i,ya.i), S_MUL(scratch[10].i,yb.i));
         scratch[12].i = SUB32_ovflw(S_MUL(scratch[10].r,yb.i), S_MUL(scratch[9].r,ya.i));

         C_ADD(*Fout2,scratch[11],scratch[12]);
         C_SUB(*Fout3,scratch[11],scratch[12]);

         ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
      }
   }
}
#endif /* OVERRIDE_kf_bfly5 */


#endif


#ifdef CUSTOM_MODES

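/* Recursively builds the digit-reversal permutation for the given factor
   list: each recursion level splits the output range into p interleaved
   sub-ranges, so the main FFT can run in place on the reordered input. */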
static
void compute_bitrev_table(
         int Fout,
         opus_int16 *f,
         const size_t fstride,
         int in_stride,
         opus_int16 * factors,
         const kiss_fft_state *st
            )
{
   const int p=*factors++; /* the radix  */
   const int m=*factors++; /* stage's fft length/p */

    /*printf ("fft %d %d %d %d %d %d\n", p*m, m, p, s2, fstride*in_stride, N);*/
   if (m==1)
   {
      int j;
      for (j=0;j<p;j++)
      {
         *f = Fout+j;
         f += fstride*in_stride;
      }
   } else {
      int j;
      for (j=0;j<p;j++)
      {
         compute_bitrev_table( Fout , f, fstride*p, in_stride, factors,st);
         f += fstride*in_stride;
         Fout += m;
      }
   }
}

/*  facbuf is populated by p1,m1,p2,m2, ...
    where
    p[i] * m[i] = m[i-1]
    m0 = n                  */
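/* Returns 1 on success, or 0 if n contains a factor larger than 5 (or, with
   RADIX_TWO_ONLY, anything other than 2 or 4), i.e. a length this FFT cannot
   handle. As an illustration, tracing the code for n==480 gives the radix
   sequence 5,3,4,2,4 after the reordering below, i.e.
   facbuf = {5,96, 3,32, 4,8, 2,4, 4,1}. */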
static
int kf_factor(int n,opus_int16 * facbuf)
{
    int p=4;
    int i;
    int stages=0;
    int nbak = n;

    /*factor out powers of 4, powers of 2, then any remaining primes */
    do {
        while (n % p) {
            switch (p) {
                case 4: p = 2; break;
                case 2: p = 3; break;
                default: p += 2; break;
            }
            if (p>32000 || (opus_int32)p*(opus_int32)p > n)
                p = n;          /* no more factors, skip to end */
        }
        n /= p;
#ifdef RADIX_TWO_ONLY
        if (p!=2 && p != 4)
#else
        if (p>5)
#endif
        {
           return 0;
        }
        facbuf[2*stages] = p;
        if (p==2 && stages > 1)
        {
           facbuf[2*stages] = 4;
           facbuf[2] = 2;
        }
        stages++;
    } while (n > 1);
    n = nbak;
    /* Reverse the order to get the radix 4 at the end, so we can use the
       fast degenerate case. It turns out that reversing the order also
       improves the noise behaviour. */
    for (i=0;i<stages/2;i++)
    {
       int tmp;
       tmp = facbuf[2*i];
       facbuf[2*i] = facbuf[2*(stages-i-1)];
       facbuf[2*(stages-i-1)] = tmp;
    }
    for (i=0;i<stages;i++)
    {
        n /= facbuf[2*i];
        facbuf[2*i+1] = n;
    }
    return 1;
}

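/* Precomputes the full-length twiddle table, twiddles[i] ~= e^{-j*2*pi*i/nfft};
   the fixed-point branch expresses the phase in the format expected by
   kf_cexp2(). */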
static void compute_twiddles(kiss_twiddle_cpx *twiddles, int nfft)
{
   int i;
#ifdef FIXED_POINT
   for (i=0;i<nfft;++i) {
      opus_val32 phase = -i;
      kf_cexp2(twiddles+i, DIV32(SHL32(phase,17),nfft));
   }
#else
   for (i=0;i<nfft;++i) {
      const double pi=3.14159265358979323846264338327;
      double phase = ( -2*pi /nfft ) * i;
      kf_cexp(twiddles+i, phase );
   }
#endif
}

int opus_fft_alloc_arch_c(kiss_fft_state *st) {
   (void)st;
   return 0;
}

/*
 *
 * Allocates all necessary storage space for the fft and ifft.
 * The bit-reversal table and (when no base state is shared) the twiddle
 * table are allocated separately, so the state should be released with
 * opus_fft_free() rather than a plain free().
 * */
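/* A minimal usage sketch (assumes CUSTOM_MODES so that the allocation code
 * below is compiled; error handling omitted and arch 0 used for simplicity):
 *
 *   kiss_fft_state *st = opus_fft_alloc(480, NULL, NULL, 0);
 *   kiss_fft_cpx in[480], out[480];
 *   ... fill in[] ...
 *   opus_fft_c(st, in, out);   // forward transform, scaled by st->scale
 *   opus_ifft_c(st, out, in);  // inverse transform, no 1/nfft scaling
 *   opus_fft_free(st, 0);
 */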
kiss_fft_state *opus_fft_alloc_twiddles(int nfft,void * mem,size_t * lenmem,
                                        const kiss_fft_state *base, int arch)
{
    kiss_fft_state *st=NULL;
    size_t memneeded = sizeof(struct kiss_fft_state); /* twiddle factors*/

    if ( lenmem==NULL ) {
        st = ( kiss_fft_state*)KISS_FFT_MALLOC( memneeded );
    }else{
        if (mem != NULL && *lenmem >= memneeded)
            st = (kiss_fft_state*)mem;
        *lenmem = memneeded;
    }
    if (st) {
        opus_int16 *bitrev;
        kiss_twiddle_cpx *twiddles;

        st->nfft=nfft;
#ifdef FIXED_POINT
        st->scale_shift = celt_ilog2(st->nfft);
        if (st->nfft == 1<<st->scale_shift)
           st->scale = Q15ONE;
        else
           st->scale = (1073741824+st->nfft/2)/st->nfft>>(15-st->scale_shift);
#else
        st->scale = 1.f/nfft;
#endif
        if (base != NULL)
        {
           st->twiddles = base->twiddles;
           st->shift = 0;
           while (st->shift < 32 && nfft<<st->shift != base->nfft)
              st->shift++;
           if (st->shift>=32)
              goto fail;
        } else {
           st->twiddles = twiddles = (kiss_twiddle_cpx*)KISS_FFT_MALLOC(sizeof(kiss_twiddle_cpx)*nfft);
           compute_twiddles(twiddles, nfft);
           st->shift = -1;
        }
        if (!kf_factor(nfft,st->factors))
        {
           goto fail;
        }

        /* bitrev */
        st->bitrev = bitrev = (opus_int16*)KISS_FFT_MALLOC(sizeof(opus_int16)*nfft);
        if (st->bitrev==NULL)
            goto fail;
        compute_bitrev_table(0, bitrev, 1,1, st->factors,st);

        /* Initialize architecture specific fft parameters */
        if (opus_fft_alloc_arch(st, arch))
            goto fail;
    }
    return st;
fail:
    opus_fft_free(st, arch);
    return NULL;
}

kiss_fft_state *opus_fft_alloc(int nfft,void * mem,size_t * lenmem, int arch)
{
   return opus_fft_alloc_twiddles(nfft, mem, lenmem, NULL, arch);
}

void opus_fft_free_arch_c(kiss_fft_state *st) {
   (void)st;
}

void opus_fft_free(const kiss_fft_state *cfg, int arch)
{
   if (cfg)
   {
      opus_fft_free_arch((kiss_fft_state *)cfg, arch);
      opus_free((opus_int16*)cfg->bitrev);
      if (cfg->shift < 0)
         opus_free((kiss_twiddle_cpx*)cfg->twiddles);
      opus_free((kiss_fft_state*)cfg);
   }
}

#endif /* CUSTOM_MODES */

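/* In-place FFT on data that has already been scaled and put in bit-reversed
   order (see opus_fft_c()). Runs the butterfly stages from the innermost
   factor outwards, dispatching on the radix of each stage; st->shift accounts
   for states that reuse the twiddle table of a larger base state. */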
void opus_fft_impl(const kiss_fft_state *st,kiss_fft_cpx *fout)
{
    int m2, m;
    int p;
    int L;
    int fstride[MAXFACTORS];
    int i;
    int shift;

    /* st->shift can be -1 */
    shift = st->shift>0 ? st->shift : 0;

    fstride[0] = 1;
    L=0;
    do {
       p = st->factors[2*L];
       m = st->factors[2*L+1];
       fstride[L+1] = fstride[L]*p;
       L++;
    } while(m!=1);
    m = st->factors[2*L-1];
    for (i=L-1;i>=0;i--)
    {
       if (i!=0)
          m2 = st->factors[2*i-1];
       else
          m2 = 1;
       switch (st->factors[2*i])
       {
       case 2:
          kf_bfly2(fout, m, fstride[i]);
          break;
       case 4:
          kf_bfly4(fout,fstride[i]<<shift,st,m, fstride[i], m2);
          break;
 #ifndef RADIX_TWO_ONLY
       case 3:
          kf_bfly3(fout,fstride[i]<<shift,st,m, fstride[i], m2);
          break;
       case 5:
          kf_bfly5(fout,fstride[i]<<shift,st,m, fstride[i], m2);
          break;
 #endif
       }
       m = m2;
    }
}

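/* Forward FFT: applies st->scale (1/nfft in the float build) while copying the
   input into bit-reversed order, then runs the in-place butterfly stages. The
   transform cannot be done in place. */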
void opus_fft_c(const kiss_fft_state *st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout)
{
   int i;
   opus_val16 scale;
#ifdef FIXED_POINT
   /* Allows us to scale with MULT16_32_Q16(), which is faster than
      MULT16_32_Q15() on ARM. */
   int scale_shift = st->scale_shift-1;
#endif
   scale = st->scale;

   celt_assert2 (fin != fout, "In-place FFT not supported");
   /* Bit-reverse the input */
   for (i=0;i<st->nfft;i++)
   {
      kiss_fft_cpx x = fin[i];
      fout[st->bitrev[i]].r = SHR32(MULT16_32_Q16(scale, x.r), scale_shift);
      fout[st->bitrev[i]].i = SHR32(MULT16_32_Q16(scale, x.i), scale_shift);
   }
   opus_fft_impl(st, fout);
}


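/* Inverse FFT via the conjugation trick: bit-reverse the input, conjugate it,
   run the forward stages, then conjugate the result again. Note that no 1/nfft
   normalisation is applied here. */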
void opus_ifft_c(const kiss_fft_state *st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout)
{
   int i;
   celt_assert2 (fin != fout, "In-place FFT not supported");
   /* Bit-reverse the input */
   for (i=0;i<st->nfft;i++)
      fout[st->bitrev[i]] = fin[i];
   for (i=0;i<st->nfft;i++)
      fout[i].i = -fout[i].i;
   opus_fft_impl(st, fout);
   for (i=0;i<st->nfft;i++)
      fout[i].i = -fout[i].i;
}