1 /*Copyright (c) 2003-2004, Mark Borgerding
2   Lots of modifications by Jean-Marc Valin
3   Copyright (c) 2005-2007, Xiph.Org Foundation
4   Copyright (c) 2008,      Xiph.Org Foundation, CSIRO
5 
6   All rights reserved.
7 
8   Redistribution and use in source and binary forms, with or without
9    modification, are permitted provided that the following conditions are met:
10 
11     * Redistributions of source code must retain the above copyright notice,
12        this list of conditions and the following disclaimer.
13     * Redistributions in binary form must reproduce the above copyright notice,
14        this list of conditions and the following disclaimer in the
15        documentation and/or other materials provided with the distribution.
16 
17   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27   POSSIBILITY OF SUCH DAMAGE.*/
28 
29 /* This code is originally from Mark Borgerding's KISS-FFT but has been
30    heavily modified to better suit Opus */
31 
32 #ifndef SKIP_CONFIG_H
33 #  ifdef HAVE_CONFIG_H
34 #    include "config.h"
35 #  endif
36 #endif
37 
38 #include "_kiss_fft_guts.h"
39 #include "arch.h"
40 #include "os_support.h"
41 #include "mathops.h"
42 #include "stack_alloc.h"
43 
44 /* The guts header contains all the multiplication and addition macros that are defined for
45    complex numbers.  It also delares the kf_ internal functions.
46 */
47 
kf_bfly2(kiss_fft_cpx * Fout,int m,int N)48 static void kf_bfly2(
49                      kiss_fft_cpx * Fout,
50                      int m,
51                      int N
52                     )
53 {
54    kiss_fft_cpx * Fout2;
55    int i;
56    (void)m;
57 #ifdef CUSTOM_MODES
58    if (m==1)
59    {
60       celt_assert(m==1);
61       for (i=0;i<N;i++)
62       {
63          kiss_fft_cpx t;
64          Fout2 = Fout + 1;
65          t = *Fout2;
66          C_SUB( *Fout2 ,  *Fout , t );
67          C_ADDTO( *Fout ,  t );
68          Fout += 2;
69       }
70    } else
71 #endif
72    {
73       opus_val16 tw;
74       tw = QCONST16(0.7071067812f, 15);
75       /* We know that m==4 here because the radix-2 is just after a radix-4 */
76       celt_assert(m==4);
77       for (i=0;i<N;i++)
78       {
79          kiss_fft_cpx t;
80          Fout2 = Fout + 4;
81          t = Fout2[0];
82          C_SUB( Fout2[0] ,  Fout[0] , t );
83          C_ADDTO( Fout[0] ,  t );
84 
85          t.r = S_MUL(Fout2[1].r+Fout2[1].i, tw);
86          t.i = S_MUL(Fout2[1].i-Fout2[1].r, tw);
87          C_SUB( Fout2[1] ,  Fout[1] , t );
88          C_ADDTO( Fout[1] ,  t );
89 
90          t.r = Fout2[2].i;
91          t.i = -Fout2[2].r;
92          C_SUB( Fout2[2] ,  Fout[2] , t );
93          C_ADDTO( Fout[2] ,  t );
94 
95          t.r = S_MUL(Fout2[3].i-Fout2[3].r, tw);
96          t.i = S_MUL(-Fout2[3].i-Fout2[3].r, tw);
97          C_SUB( Fout2[3] ,  Fout[3] , t );
98          C_ADDTO( Fout[3] ,  t );
99          Fout += 8;
100       }
101    }
102 }
103 
kf_bfly4(kiss_fft_cpx * Fout,const size_t fstride,const kiss_fft_state * st,int m,int N,int mm)104 static void kf_bfly4(
105                      kiss_fft_cpx * Fout,
106                      const size_t fstride,
107                      const kiss_fft_state *st,
108                      int m,
109                      int N,
110                      int mm
111                     )
112 {
113    int i;
114 
115    if (m==1)
116    {
117       /* Degenerate case where all the twiddles are 1. */
118       for (i=0;i<N;i++)
119       {
120          kiss_fft_cpx scratch0, scratch1;
121 
122          C_SUB( scratch0 , *Fout, Fout[2] );
123          C_ADDTO(*Fout, Fout[2]);
124          C_ADD( scratch1 , Fout[1] , Fout[3] );
125          C_SUB( Fout[2], *Fout, scratch1 );
126          C_ADDTO( *Fout , scratch1 );
127          C_SUB( scratch1 , Fout[1] , Fout[3] );
128 
129          Fout[1].r = scratch0.r + scratch1.i;
130          Fout[1].i = scratch0.i - scratch1.r;
131          Fout[3].r = scratch0.r - scratch1.i;
132          Fout[3].i = scratch0.i + scratch1.r;
133          Fout+=4;
134       }
135    } else {
136       int j;
137       kiss_fft_cpx scratch[6];
138       const kiss_twiddle_cpx *tw1,*tw2,*tw3;
139       const int m2=2*m;
140       const int m3=3*m;
141       kiss_fft_cpx * Fout_beg = Fout;
142       for (i=0;i<N;i++)
143       {
144          Fout = Fout_beg + i*mm;
145          tw3 = tw2 = tw1 = st->twiddles;
146          /* m is guaranteed to be a multiple of 4. */
147          for (j=0;j<m;j++)
148          {
149             C_MUL(scratch[0],Fout[m] , *tw1 );
150             C_MUL(scratch[1],Fout[m2] , *tw2 );
151             C_MUL(scratch[2],Fout[m3] , *tw3 );
152 
153             C_SUB( scratch[5] , *Fout, scratch[1] );
154             C_ADDTO(*Fout, scratch[1]);
155             C_ADD( scratch[3] , scratch[0] , scratch[2] );
156             C_SUB( scratch[4] , scratch[0] , scratch[2] );
157             C_SUB( Fout[m2], *Fout, scratch[3] );
158             tw1 += fstride;
159             tw2 += fstride*2;
160             tw3 += fstride*3;
161             C_ADDTO( *Fout , scratch[3] );
162 
163             Fout[m].r = scratch[5].r + scratch[4].i;
164             Fout[m].i = scratch[5].i - scratch[4].r;
165             Fout[m3].r = scratch[5].r - scratch[4].i;
166             Fout[m3].i = scratch[5].i + scratch[4].r;
167             ++Fout;
168          }
169       }
170    }
171 }
172 
173 
174 #ifndef RADIX_TWO_ONLY
175 
kf_bfly3(kiss_fft_cpx * Fout,const size_t fstride,const kiss_fft_state * st,int m,int N,int mm)176 static void kf_bfly3(
177                      kiss_fft_cpx * Fout,
178                      const size_t fstride,
179                      const kiss_fft_state *st,
180                      int m,
181                      int N,
182                      int mm
183                     )
184 {
185    int i;
186    size_t k;
187    const size_t m2 = 2*m;
188    const kiss_twiddle_cpx *tw1,*tw2;
189    kiss_fft_cpx scratch[5];
190    kiss_twiddle_cpx epi3;
191 
192    kiss_fft_cpx * Fout_beg = Fout;
193 #ifdef FIXED_POINT
194    /*epi3.r = -16384;*/ /* Unused */
195    epi3.i = -28378;
196 #else
197    epi3 = st->twiddles[fstride*m];
198 #endif
199    for (i=0;i<N;i++)
200    {
201       Fout = Fout_beg + i*mm;
202       tw1=tw2=st->twiddles;
203       /* For non-custom modes, m is guaranteed to be a multiple of 4. */
204       k=m;
205       do {
206 
207          C_MUL(scratch[1],Fout[m] , *tw1);
208          C_MUL(scratch[2],Fout[m2] , *tw2);
209 
210          C_ADD(scratch[3],scratch[1],scratch[2]);
211          C_SUB(scratch[0],scratch[1],scratch[2]);
212          tw1 += fstride;
213          tw2 += fstride*2;
214 
215          Fout[m].r = Fout->r - HALF_OF(scratch[3].r);
216          Fout[m].i = Fout->i - HALF_OF(scratch[3].i);
217 
218          C_MULBYSCALAR( scratch[0] , epi3.i );
219 
220          C_ADDTO(*Fout,scratch[3]);
221 
222          Fout[m2].r = Fout[m].r + scratch[0].i;
223          Fout[m2].i = Fout[m].i - scratch[0].r;
224 
225          Fout[m].r -= scratch[0].i;
226          Fout[m].i += scratch[0].r;
227 
228          ++Fout;
229       } while(--k);
230    }
231 }
232 
233 
234 #ifndef OVERRIDE_kf_bfly5
kf_bfly5(kiss_fft_cpx * Fout,const size_t fstride,const kiss_fft_state * st,int m,int N,int mm)235 static void kf_bfly5(
236                      kiss_fft_cpx * Fout,
237                      const size_t fstride,
238                      const kiss_fft_state *st,
239                      int m,
240                      int N,
241                      int mm
242                     )
243 {
244    kiss_fft_cpx *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
245    int i, u;
246    kiss_fft_cpx scratch[13];
247    const kiss_twiddle_cpx *tw;
248    kiss_twiddle_cpx ya,yb;
249    kiss_fft_cpx * Fout_beg = Fout;
250 
251 #ifdef FIXED_POINT
252    ya.r = 10126;
253    ya.i = -31164;
254    yb.r = -26510;
255    yb.i = -19261;
256 #else
257    ya = st->twiddles[fstride*m];
258    yb = st->twiddles[fstride*2*m];
259 #endif
260    tw=st->twiddles;
261 
262    for (i=0;i<N;i++)
263    {
264       Fout = Fout_beg + i*mm;
265       Fout0=Fout;
266       Fout1=Fout0+m;
267       Fout2=Fout0+2*m;
268       Fout3=Fout0+3*m;
269       Fout4=Fout0+4*m;
270 
271       /* For non-custom modes, m is guaranteed to be a multiple of 4. */
272       for ( u=0; u<m; ++u ) {
273          scratch[0] = *Fout0;
274 
275          C_MUL(scratch[1] ,*Fout1, tw[u*fstride]);
276          C_MUL(scratch[2] ,*Fout2, tw[2*u*fstride]);
277          C_MUL(scratch[3] ,*Fout3, tw[3*u*fstride]);
278          C_MUL(scratch[4] ,*Fout4, tw[4*u*fstride]);
279 
280          C_ADD( scratch[7],scratch[1],scratch[4]);
281          C_SUB( scratch[10],scratch[1],scratch[4]);
282          C_ADD( scratch[8],scratch[2],scratch[3]);
283          C_SUB( scratch[9],scratch[2],scratch[3]);
284 
285          Fout0->r += scratch[7].r + scratch[8].r;
286          Fout0->i += scratch[7].i + scratch[8].i;
287 
288          scratch[5].r = scratch[0].r + S_MUL(scratch[7].r,ya.r) + S_MUL(scratch[8].r,yb.r);
289          scratch[5].i = scratch[0].i + S_MUL(scratch[7].i,ya.r) + S_MUL(scratch[8].i,yb.r);
290 
291          scratch[6].r =  S_MUL(scratch[10].i,ya.i) + S_MUL(scratch[9].i,yb.i);
292          scratch[6].i = -S_MUL(scratch[10].r,ya.i) - S_MUL(scratch[9].r,yb.i);
293 
294          C_SUB(*Fout1,scratch[5],scratch[6]);
295          C_ADD(*Fout4,scratch[5],scratch[6]);
296 
297          scratch[11].r = scratch[0].r + S_MUL(scratch[7].r,yb.r) + S_MUL(scratch[8].r,ya.r);
298          scratch[11].i = scratch[0].i + S_MUL(scratch[7].i,yb.r) + S_MUL(scratch[8].i,ya.r);
299          scratch[12].r = - S_MUL(scratch[10].i,yb.i) + S_MUL(scratch[9].i,ya.i);
300          scratch[12].i = S_MUL(scratch[10].r,yb.i) - S_MUL(scratch[9].r,ya.i);
301 
302          C_ADD(*Fout2,scratch[11],scratch[12]);
303          C_SUB(*Fout3,scratch[11],scratch[12]);
304 
305          ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
306       }
307    }
308 }
309 #endif /* OVERRIDE_kf_bfly5 */
310 
311 
312 #endif
313 
314 
315 #ifdef CUSTOM_MODES
316 
317 static
compute_bitrev_table(int Fout,opus_int16 * f,const size_t fstride,int in_stride,opus_int16 * factors,const kiss_fft_state * st)318 void compute_bitrev_table(
319          int Fout,
320          opus_int16 *f,
321          const size_t fstride,
322          int in_stride,
323          opus_int16 * factors,
324          const kiss_fft_state *st
325             )
326 {
327    const int p=*factors++; /* the radix  */
328    const int m=*factors++; /* stage's fft length/p */
329 
330     /*printf ("fft %d %d %d %d %d %d\n", p*m, m, p, s2, fstride*in_stride, N);*/
331    if (m==1)
332    {
333       int j;
334       for (j=0;j<p;j++)
335       {
336          *f = Fout+j;
337          f += fstride*in_stride;
338       }
339    } else {
340       int j;
341       for (j=0;j<p;j++)
342       {
343          compute_bitrev_table( Fout , f, fstride*p, in_stride, factors,st);
344          f += fstride*in_stride;
345          Fout += m;
346       }
347    }
348 }
349 
350 /*  facbuf is populated by p1,m1,p2,m2, ...
351     where
352     p[i] * m[i] = m[i-1]
353     m0 = n                  */
354 static
kf_factor(int n,opus_int16 * facbuf)355 int kf_factor(int n,opus_int16 * facbuf)
356 {
357     int p=4;
358     int i;
359     int stages=0;
360     int nbak = n;
361 
362     /*factor out powers of 4, powers of 2, then any remaining primes */
363     do {
364         while (n % p) {
365             switch (p) {
366                 case 4: p = 2; break;
367                 case 2: p = 3; break;
368                 default: p += 2; break;
369             }
370             if (p>32000 || (opus_int32)p*(opus_int32)p > n)
371                 p = n;          /* no more factors, skip to end */
372         }
373         n /= p;
374 #ifdef RADIX_TWO_ONLY
375         if (p!=2 && p != 4)
376 #else
377         if (p>5)
378 #endif
379         {
380            return 0;
381         }
382         facbuf[2*stages] = p;
383         if (p==2 && stages > 1)
384         {
385            facbuf[2*stages] = 4;
386            facbuf[2] = 2;
387         }
388         stages++;
389     } while (n > 1);
390     n = nbak;
391     /* Reverse the order to get the radix 4 at the end, so we can use the
392        fast degenerate case. It turns out that reversing the order also
393        improves the noise behaviour. */
394     for (i=0;i<stages/2;i++)
395     {
396        int tmp;
397        tmp = facbuf[2*i];
398        facbuf[2*i] = facbuf[2*(stages-i-1)];
399        facbuf[2*(stages-i-1)] = tmp;
400     }
401     for (i=0;i<stages;i++)
402     {
403         n /= facbuf[2*i];
404         facbuf[2*i+1] = n;
405     }
406     return 1;
407 }
408 
compute_twiddles(kiss_twiddle_cpx * twiddles,int nfft)409 static void compute_twiddles(kiss_twiddle_cpx *twiddles, int nfft)
410 {
411    int i;
412 #ifdef FIXED_POINT
413    for (i=0;i<nfft;++i) {
414       opus_val32 phase = -i;
415       kf_cexp2(twiddles+i, DIV32(SHL32(phase,17),nfft));
416    }
417 #else
418    for (i=0;i<nfft;++i) {
419       const double pi=3.14159265358979323846264338327;
420       double phase = ( -2*pi /nfft ) * i;
421       kf_cexp(twiddles+i, phase );
422    }
423 #endif
424 }
425 
opus_fft_alloc_arch_c(kiss_fft_state * st)426 int opus_fft_alloc_arch_c(kiss_fft_state *st) {
427    (void)st;
428    return 0;
429 }
430 
431 /*
432  *
433  * Allocates all necessary storage space for the fft and ifft.
434  * The return value is a contiguous block of memory.  As such,
435  * It can be freed with free().
436  * */
opus_fft_alloc_twiddles(int nfft,void * mem,size_t * lenmem,const kiss_fft_state * base,int arch)437 kiss_fft_state *opus_fft_alloc_twiddles(int nfft,void * mem,size_t * lenmem,
438                                         const kiss_fft_state *base, int arch)
439 {
440     kiss_fft_state *st=NULL;
441     size_t memneeded = sizeof(struct kiss_fft_state); /* twiddle factors*/
442 
443     if ( lenmem==NULL ) {
444         st = ( kiss_fft_state*)KISS_FFT_MALLOC( memneeded );
445     }else{
446         if (mem != NULL && *lenmem >= memneeded)
447             st = (kiss_fft_state*)mem;
448         *lenmem = memneeded;
449     }
450     if (st) {
451         opus_int16 *bitrev;
452         kiss_twiddle_cpx *twiddles;
453 
454         st->nfft=nfft;
455 #ifdef FIXED_POINT
456         st->scale_shift = celt_ilog2(st->nfft);
457         if (st->nfft == 1<<st->scale_shift)
458            st->scale = Q15ONE;
459         else
460            st->scale = (1073741824+st->nfft/2)/st->nfft>>(15-st->scale_shift);
461 #else
462         st->scale = 1.f/nfft;
463 #endif
464         if (base != NULL)
465         {
466            st->twiddles = base->twiddles;
467            st->shift = 0;
468            while (st->shift < 32 && nfft<<st->shift != base->nfft)
469               st->shift++;
470            if (st->shift>=32)
471               goto fail;
472         } else {
473            st->twiddles = twiddles = (kiss_twiddle_cpx*)KISS_FFT_MALLOC(sizeof(kiss_twiddle_cpx)*nfft);
474            compute_twiddles(twiddles, nfft);
475            st->shift = -1;
476         }
477         if (!kf_factor(nfft,st->factors))
478         {
479            goto fail;
480         }
481 
482         /* bitrev */
483         st->bitrev = bitrev = (opus_int16*)KISS_FFT_MALLOC(sizeof(opus_int16)*nfft);
484         if (st->bitrev==NULL)
485             goto fail;
486         compute_bitrev_table(0, bitrev, 1,1, st->factors,st);
487 
488         /* Initialize architecture specific fft parameters */
489         if (opus_fft_alloc_arch(st, arch))
490             goto fail;
491     }
492     return st;
493 fail:
494     opus_fft_free(st, arch);
495     return NULL;
496 }
497 
opus_fft_alloc(int nfft,void * mem,size_t * lenmem,int arch)498 kiss_fft_state *opus_fft_alloc(int nfft,void * mem,size_t * lenmem, int arch)
499 {
500    return opus_fft_alloc_twiddles(nfft, mem, lenmem, NULL, arch);
501 }
502 
opus_fft_free_arch_c(kiss_fft_state * st)503 void opus_fft_free_arch_c(kiss_fft_state *st) {
504    (void)st;
505 }
506 
opus_fft_free(const kiss_fft_state * cfg,int arch)507 void opus_fft_free(const kiss_fft_state *cfg, int arch)
508 {
509    if (cfg)
510    {
511       opus_fft_free_arch((kiss_fft_state *)cfg, arch);
512       opus_free((opus_int16*)cfg->bitrev);
513       if (cfg->shift < 0)
514          opus_free((kiss_twiddle_cpx*)cfg->twiddles);
515       opus_free((kiss_fft_state*)cfg);
516    }
517 }
518 
519 #endif /* CUSTOM_MODES */
520 
opus_fft_impl(const kiss_fft_state * st,kiss_fft_cpx * fout)521 void opus_fft_impl(const kiss_fft_state *st,kiss_fft_cpx *fout)
522 {
523     int m2, m;
524     int p;
525     int L;
526     int fstride[MAXFACTORS];
527     int i;
528     int shift;
529 
530     /* st->shift can be -1 */
531     shift = st->shift>0 ? st->shift : 0;
532 
533     fstride[0] = 1;
534     L=0;
535     do {
536        p = st->factors[2*L];
537        m = st->factors[2*L+1];
538        fstride[L+1] = fstride[L]*p;
539        L++;
540     } while(m!=1);
541     m = st->factors[2*L-1];
542     for (i=L-1;i>=0;i--)
543     {
544        if (i!=0)
545           m2 = st->factors[2*i-1];
546        else
547           m2 = 1;
548        switch (st->factors[2*i])
549        {
550        case 2:
551           kf_bfly2(fout, m, fstride[i]);
552           break;
553        case 4:
554           kf_bfly4(fout,fstride[i]<<shift,st,m, fstride[i], m2);
555           break;
556  #ifndef RADIX_TWO_ONLY
557        case 3:
558           kf_bfly3(fout,fstride[i]<<shift,st,m, fstride[i], m2);
559           break;
560        case 5:
561           kf_bfly5(fout,fstride[i]<<shift,st,m, fstride[i], m2);
562           break;
563  #endif
564        }
565        m = m2;
566     }
567 }
568 
opus_fft_c(const kiss_fft_state * st,const kiss_fft_cpx * fin,kiss_fft_cpx * fout)569 void opus_fft_c(const kiss_fft_state *st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout)
570 {
571    int i;
572    opus_val16 scale;
573 #ifdef FIXED_POINT
574    /* Allows us to scale with MULT16_32_Q16(), which is faster than
575       MULT16_32_Q15() on ARM. */
576    int scale_shift = st->scale_shift-1;
577 #endif
578    scale = st->scale;
579 
580    celt_assert2 (fin != fout, "In-place FFT not supported");
581    /* Bit-reverse the input */
582    for (i=0;i<st->nfft;i++)
583    {
584       kiss_fft_cpx x = fin[i];
585       fout[st->bitrev[i]].r = SHR32(MULT16_32_Q16(scale, x.r), scale_shift);
586       fout[st->bitrev[i]].i = SHR32(MULT16_32_Q16(scale, x.i), scale_shift);
587    }
588    opus_fft_impl(st, fout);
589 }
590 
591 
opus_ifft_c(const kiss_fft_state * st,const kiss_fft_cpx * fin,kiss_fft_cpx * fout)592 void opus_ifft_c(const kiss_fft_state *st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout)
593 {
594    int i;
595    celt_assert2 (fin != fout, "In-place FFT not supported");
596    /* Bit-reverse the input */
597    for (i=0;i<st->nfft;i++)
598       fout[st->bitrev[i]] = fin[i];
599    for (i=0;i<st->nfft;i++)
600       fout[i].i = -fout[i].i;
601    opus_fft_impl(st, fout);
602    for (i=0;i<st->nfft;i++)
603       fout[i].i = -fout[i].i;
604 }
605