/**
 * Aften: A/52 audio encoder
 *
 * This file is derived from libvorbis lancer patch
 * Copyright (c) 2006-2007 prakash@punnoor.de
 * Copyright (c) 2006, blacksword8192@hotmail.com
 * Copyright (c) 2002, Xiph.org Foundation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the Xiph.org Foundation nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION
 * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/**
 * @file mdct.c
 * Modified Discrete Cosine Transform
 */

#include "common.h"

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>

#include "a52.h"
#include "mdct.h"

#include "x86_simd_support.h"
#include "x86_sse_mdct_common_init.h"

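/*
 * Sign masks for _mm_xor_ps: a 0x80000000 word flips the sign of the float in
 * that lane. Reading the suffix from the high lane down, 'R' marks a lane that
 * gets negated and 'N' a lane that passes through unchanged.
 */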
static const union __m128ui PCS_NNRR = {{0x80000000, 0x80000000, 0x00000000, 0x00000000}};
static const union __m128ui PCS_RNRN = {{0x00000000, 0x80000000, 0x00000000, 0x80000000}};
static const union __m128ui PCS_RRNN = {{0x00000000, 0x00000000, 0x80000000, 0x80000000}};
static const union __m128ui PCS_RNNR = {{0x80000000, 0x00000000, 0x00000000, 0x80000000}};
static const union __m128ui PCS_RRRR = {{0x80000000, 0x80000000, 0x80000000, 0x80000000}};

void
sse_mdct_ctx_init(MDCTContext *mdct, int n)
{
    int *bitrev = aligned_malloc((n/4) * sizeof(int));
    FLOAT *trig = aligned_malloc((n+n/4) * sizeof(FLOAT));
    int i;
    int n2 = (n >> 1);
    int log2n = mdct->log2n = log2i(n);
    mdct->n = n;
    mdct->trig = trig;
    mdct->bitrev = bitrev;

    // trig lookups
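    // layout of trig[]: [0, n/2) feeds the butterfly stages, [n/2, n) feeds
    // the forward rotation, and [n, n + n/4) holds the half-scaled pairs
    // packed for the bitreverse stage below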
    for(i=0;i<n/4;i++){
        trig[i*2]      =  AFT_COS((AFT_PI/n)*(4*i));
        trig[i*2+1]    = -AFT_SIN((AFT_PI/n)*(4*i));
        trig[n2+i*2]   =  AFT_COS((AFT_PI/(2*n))*(2*i+1));
        trig[n2+i*2+1] =  AFT_SIN((AFT_PI/(2*n))*(2*i+1));
    }
    for(i=0;i<n/8;i++){
        trig[n+i*2]    =  AFT_COS((AFT_PI/n)*(4*i+2))*0.5f;
        trig[n+i*2+1]  = -AFT_SIN((AFT_PI/n)*(4*i+2))*0.5f;
    }

    // bitreverse lookup
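    // bitrev[2*i+1] is the bit-reversal of i over (log2n-1) bits;
    // bitrev[2*i] is its complement under the same mask, minus one,
    // i.e. the paired index counted from the other end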
    {
        int j, acc;
        int mask = (1 << (log2n-1)) - 1;
        int msb = (1 << (log2n-2));
        for(i=0; i<n/8; i++) {
            acc = 0;
            for(j=0; msb>>j; j++) {
                if((msb>>j)&i) {
                    acc |= (1 << j);
                }
            }
            bitrev[i*2]= ((~acc) & mask) - 1;
            bitrev[i*2+1] = acc;
        }
    }

    // MDCT scale used in AC3
    mdct->scale = -2.0f / n;
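    // the block below pre-packs shuffled and pre-negated copies of the trig
    // data into dedicated aligned tables, so the SSE transform kernels can
    // work from plain aligned 128-bit loads instead of shuffling per iteration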
    {
        __m128  pscalem  = _mm_set_ps1(mdct->scale);
        float *T, *S;
        int n4   = n>>2;
        int n8   = n>>3;
        int j;
        /*
            for mdct_bitreverse
        */
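        /*
            each (cos, sin) pair of the half-scaled block is stored twice:
            as-is and as (sin, -cos), so the bitreverse kernel gets both
            rotations from consecutive aligned loads
        */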
        T    = aligned_malloc(sizeof(*T)*n2);
        mdct->trig_bitreverse    = T;
        S    = mdct->trig+n;
        for(i=0;i<n4;i+=8)
        {
            __m128  XMM0     = _mm_load_ps(S+i   );
            __m128  XMM1     = _mm_load_ps(S+i+ 4);
            __m128  XMM2     = XMM0;
            __m128  XMM3     = XMM1;
            XMM2     = _mm_shuffle_ps(XMM2, XMM2, _MM_SHUFFLE(2,3,0,1));
            XMM3     = _mm_shuffle_ps(XMM3, XMM3, _MM_SHUFFLE(2,3,0,1));
            XMM2     = _mm_xor_ps(XMM2, PCS_RNRN.v);
            XMM3     = _mm_xor_ps(XMM3, PCS_RNRN.v);
            _mm_store_ps(T+i*2   , XMM0);
            _mm_store_ps(T+i*2+ 4, XMM2);
            _mm_store_ps(T+i*2+ 8, XMM1);
            _mm_store_ps(T+i*2+12, XMM3);
        }
        /*
            for mdct_forward part 0
        */
        T    = aligned_malloc(sizeof(*T)*(n*2));
        mdct->trig_forward   = T;
        S    = mdct->trig;
        for(i=0,j=n2-4;i<n8;i+=4,j-=4)
        {
            __m128  XMM0, XMM1, XMM2, XMM3;
#ifdef __INTEL_COMPILER
#pragma warning(disable : 592)
#endif
            XMM0     = _mm_loadl_pi(XMM0, (__m64*)(S+j+2));
            XMM2     = _mm_loadl_pi(XMM2, (__m64*)(S+j  ));
            XMM0     = _mm_loadh_pi(XMM0, (__m64*)(S+i  ));
            XMM2     = _mm_loadh_pi(XMM2, (__m64*)(S+i+2));
#ifdef __INTEL_COMPILER
#pragma warning(default : 592)
#endif
            XMM1     = XMM0;
            XMM3     = XMM2;
            XMM0     = _mm_shuffle_ps(XMM0, XMM0, _MM_SHUFFLE(2,3,0,1));
            XMM2     = _mm_shuffle_ps(XMM2, XMM2, _MM_SHUFFLE(2,3,0,1));
            XMM0     = _mm_xor_ps(XMM0, PCS_RRNN.v);
            XMM1     = _mm_xor_ps(XMM1, PCS_RNNR.v);
            XMM2     = _mm_xor_ps(XMM2, PCS_RRNN.v);
            XMM3     = _mm_xor_ps(XMM3, PCS_RNNR.v);
            _mm_store_ps(T+i*4   , XMM0);
            _mm_store_ps(T+i*4+ 4, XMM1);
            _mm_store_ps(T+i*4+ 8, XMM2);
            _mm_store_ps(T+i*4+12, XMM3);
        }
        for(;i<n4;i+=4,j-=4)
        {
            __m128  XMM0, XMM1, XMM2, XMM3;
#ifdef __INTEL_COMPILER
#pragma warning(disable : 592)
#endif
            XMM0     = _mm_loadl_pi(XMM0, (__m64*)(S+j+2));
            XMM2     = _mm_loadl_pi(XMM2, (__m64*)(S+j  ));
            XMM0     = _mm_loadh_pi(XMM0, (__m64*)(S+i  ));
            XMM2     = _mm_loadh_pi(XMM2, (__m64*)(S+i+2));
#ifdef __INTEL_COMPILER
#pragma warning(default : 592)
#endif
            XMM1     = XMM0;
            XMM3     = XMM2;
            XMM0     = _mm_shuffle_ps(XMM0, XMM0, _MM_SHUFFLE(2,3,0,1));
            XMM2     = _mm_shuffle_ps(XMM2, XMM2, _MM_SHUFFLE(2,3,0,1));
            XMM0     = _mm_xor_ps(XMM0, PCS_NNRR.v);
            XMM2     = _mm_xor_ps(XMM2, PCS_NNRR.v);
            XMM1     = _mm_xor_ps(XMM1, PCS_RNNR.v);
            XMM3     = _mm_xor_ps(XMM3, PCS_RNNR.v);
            _mm_store_ps(T+i*4   , XMM0);
            _mm_store_ps(T+i*4+ 4, XMM1);
            _mm_store_ps(T+i*4+ 8, XMM2);
            _mm_store_ps(T+i*4+12, XMM3);
        }
        /*
            for mdct_forward part 1
        */
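        /*
            part 1 separates the rotation pairs into sin-only and cos-only
            groups, folds the -2/n output scale in via pscalem, and also
            stores a lane-reversed copy of each group
        */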
        T    = mdct->trig_forward+n;
        S    = mdct->trig+n2;
        for(i=0;i<n4;i+=4){
            __m128  XMM0, XMM1, XMM2;
            XMM0     = _mm_load_ps(S+4);
            XMM2     = _mm_load_ps(S  );
            XMM1     = XMM0;
            XMM0     = _mm_shuffle_ps(XMM0, XMM2,_MM_SHUFFLE(1,3,1,3));
            XMM1     = _mm_shuffle_ps(XMM1, XMM2,_MM_SHUFFLE(0,2,0,2));
            XMM0     = _mm_mul_ps(XMM0, pscalem);
            XMM1     = _mm_mul_ps(XMM1, pscalem);
            _mm_store_ps(T   , XMM0);
            _mm_store_ps(T+ 4, XMM1);
            XMM1     = _mm_shuffle_ps(XMM1, XMM1, _MM_SHUFFLE(0,1,2,3));
            XMM0     = _mm_shuffle_ps(XMM0, XMM0, _MM_SHUFFLE(0,1,2,3));
            _mm_store_ps(T+ 8, XMM1);
            _mm_store_ps(T+12, XMM0);
            S       += 8;
            T       += 16;
        }
        /*
            for mdct_butterfly_first
        */
        S    = mdct->trig;
        T    = aligned_malloc(sizeof(*T)*n*2);
        mdct->trig_butterfly_first   = T;
        for(i=0;i<n4;i+=4)
        {
            __m128  XMM0, XMM1, XMM2, XMM3, XMM4, XMM5;
            XMM2     = _mm_load_ps(S   );
            XMM0     = _mm_load_ps(S+ 4);
            XMM5     = _mm_load_ps(S+ 8);
            XMM3     = _mm_load_ps(S+12);
            XMM1     = XMM0;
            XMM4     = XMM3;
            XMM0     = _mm_shuffle_ps(XMM0, XMM2, _MM_SHUFFLE(0,1,0,1));
            XMM1     = _mm_shuffle_ps(XMM1, XMM2, _MM_SHUFFLE(1,0,1,0));
            XMM3     = _mm_shuffle_ps(XMM3, XMM5, _MM_SHUFFLE(0,1,0,1));
            XMM4     = _mm_shuffle_ps(XMM4, XMM5, _MM_SHUFFLE(1,0,1,0));
            XMM1     = _mm_xor_ps(XMM1, PCS_RNRN.v);
            XMM4     = _mm_xor_ps(XMM4, PCS_RNRN.v);
            _mm_store_ps(T   , XMM1);
            _mm_store_ps(T+ 4, XMM4);
            _mm_store_ps(T+ 8, XMM0);
            _mm_store_ps(T+12, XMM3);
            XMM1     = _mm_xor_ps(XMM1, PCS_RRRR.v);
            XMM4     = _mm_xor_ps(XMM4, PCS_RRRR.v);
            XMM0     = _mm_xor_ps(XMM0, PCS_RRRR.v);
            XMM3     = _mm_xor_ps(XMM3, PCS_RRRR.v);
            _mm_store_ps(T+n   , XMM1);
            _mm_store_ps(T+n+ 4, XMM4);
            _mm_store_ps(T+n+ 8, XMM0);
            _mm_store_ps(T+n+12, XMM3);
            S   += 16;
            T   += 16;
        }
        /*
            for mdct_butterfly_generic(trigint=8)
        */
        S    = mdct->trig;
        T    = aligned_malloc(sizeof(*T)*n2);
        mdct->trig_butterfly_generic8    = T;
        for(i=0;i<n;i+=32)
        {
            __m128  XMM0, XMM1, XMM2, XMM3, XMM4, XMM5;

            XMM0     = _mm_load_ps(S+ 24);
            XMM2     = _mm_load_ps(S+ 16);
            XMM3     = _mm_load_ps(S+  8);
            XMM5     = _mm_load_ps(S    );
            XMM1     = XMM0;
            XMM4     = XMM3;
            XMM0     = _mm_shuffle_ps(XMM0, XMM2, _MM_SHUFFLE(0,1,0,1));
            XMM1     = _mm_shuffle_ps(XMM1, XMM2, _MM_SHUFFLE(1,0,1,0));
            XMM3     = _mm_shuffle_ps(XMM3, XMM5, _MM_SHUFFLE(0,1,0,1));
            XMM4     = _mm_shuffle_ps(XMM4, XMM5, _MM_SHUFFLE(1,0,1,0));
            XMM1     = _mm_xor_ps(XMM1, PCS_RNRN.v);
            XMM4     = _mm_xor_ps(XMM4, PCS_RNRN.v);
            _mm_store_ps(T   , XMM0);
            _mm_store_ps(T+ 4, XMM1);
            _mm_store_ps(T+ 8, XMM3);
            _mm_store_ps(T+12, XMM4);
            S   += 32;
            T   += 16;
        }
        /*
            for mdct_butterfly_generic(trigint=16)
        */
        S    = mdct->trig;
        T    = aligned_malloc(sizeof(*T)*n4);
        mdct->trig_butterfly_generic16   = T;
        for(i=0;i<n;i+=64)
        {
            __m128  XMM0, XMM1, XMM2, XMM3, XMM4, XMM5;

            XMM0     = _mm_load_ps(S+ 48);
            XMM2     = _mm_load_ps(S+ 32);
            XMM3     = _mm_load_ps(S+ 16);
            XMM5     = _mm_load_ps(S    );
            XMM1     = XMM0;
            XMM4     = XMM3;
            XMM0     = _mm_shuffle_ps(XMM0, XMM2, _MM_SHUFFLE(0,1,0,1));
            XMM1     = _mm_shuffle_ps(XMM1, XMM2, _MM_SHUFFLE(1,0,1,0));
            XMM3     = _mm_shuffle_ps(XMM3, XMM5, _MM_SHUFFLE(0,1,0,1));
            XMM4     = _mm_shuffle_ps(XMM4, XMM5, _MM_SHUFFLE(1,0,1,0));
            XMM1     = _mm_xor_ps(XMM1, PCS_RNRN.v);
            XMM4     = _mm_xor_ps(XMM4, PCS_RNRN.v);
            _mm_store_ps(T   , XMM0);
            _mm_store_ps(T+ 4, XMM1);
            _mm_store_ps(T+ 8, XMM3);
            _mm_store_ps(T+12, XMM4);
            S   += 64;
            T   += 16;
        }
        /*
            for mdct_butterfly_generic(trigint=32)
        */
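        /*
            the stride-32 and stride-64 tables are only built when n is large
            enough for those butterfly strides to occur; otherwise the pointer
            stays NULL so the close path can skip it
        */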
        if(n<128)
            mdct->trig_butterfly_generic32   = NULL;
        else
        {
            S    = mdct->trig;
            T    = aligned_malloc(sizeof(*T)*n8);
            mdct->trig_butterfly_generic32   = T;
            for(i=0;i<n;i+=128)
            {
                __m128  XMM0, XMM1, XMM2, XMM3, XMM4, XMM5;

                XMM0     = _mm_load_ps(S+ 96);
                XMM2     = _mm_load_ps(S+ 64);
                XMM3     = _mm_load_ps(S+ 32);
                XMM5     = _mm_load_ps(S    );
                XMM1     = XMM0;
                XMM4     = XMM3;
                XMM0     = _mm_shuffle_ps(XMM0, XMM2, _MM_SHUFFLE(0,1,0,1));
                XMM1     = _mm_shuffle_ps(XMM1, XMM2, _MM_SHUFFLE(1,0,1,0));
                XMM3     = _mm_shuffle_ps(XMM3, XMM5, _MM_SHUFFLE(0,1,0,1));
                XMM4     = _mm_shuffle_ps(XMM4, XMM5, _MM_SHUFFLE(1,0,1,0));
                XMM1     = _mm_xor_ps(XMM1, PCS_RNRN.v);
                XMM4     = _mm_xor_ps(XMM4, PCS_RNRN.v);
                _mm_store_ps(T   , XMM0);
                _mm_store_ps(T+ 4, XMM1);
                _mm_store_ps(T+ 8, XMM3);
                _mm_store_ps(T+12, XMM4);
                S   += 128;
                T   += 16;
            }
        }
        /*
            for mdct_butterfly_generic(trigint=64)
        */
        if(n<256)
            mdct->trig_butterfly_generic64   = NULL;
        else
        {
            S    = mdct->trig;
            T    = aligned_malloc(sizeof(*T)*(n8>>1));
            mdct->trig_butterfly_generic64   = T;
            for(i=0;i<n;i+=256)
            {
                __m128  XMM0, XMM1, XMM2, XMM3, XMM4, XMM5;

                XMM0     = _mm_load_ps(S+192);
                XMM2     = _mm_load_ps(S+128);
                XMM3     = _mm_load_ps(S+ 64);
                XMM5     = _mm_load_ps(S    );
                XMM1     = XMM0;
                XMM4     = XMM3;
                XMM0     = _mm_shuffle_ps(XMM0, XMM2, _MM_SHUFFLE(0,1,0,1));
                XMM1     = _mm_shuffle_ps(XMM1, XMM2, _MM_SHUFFLE(1,0,1,0));
                XMM3     = _mm_shuffle_ps(XMM3, XMM5, _MM_SHUFFLE(0,1,0,1));
                XMM4     = _mm_shuffle_ps(XMM4, XMM5, _MM_SHUFFLE(1,0,1,0));
                XMM1     = _mm_xor_ps(XMM1, PCS_RNRN.v);
                XMM4     = _mm_xor_ps(XMM4, PCS_RNRN.v);
                _mm_store_ps(T   , XMM0);
                _mm_store_ps(T+ 4, XMM1);
                _mm_store_ps(T+ 8, XMM3);
                _mm_store_ps(T+12, XMM4);
                S   += 256;
                T   += 16;
            }
        }
    }
}

void
sse_mdct_ctx_close(MDCTContext *mdct)
{
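    // free every table allocated in sse_mdct_ctx_init; zeroing the context
    // afterwards leaves the pointers NULL, so a repeated close is harmless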
    if(mdct) {
        if(mdct->trig)   aligned_free(mdct->trig);
        if(mdct->bitrev) aligned_free(mdct->bitrev);
        if(mdct->trig_bitreverse) aligned_free(mdct->trig_bitreverse);
        if(mdct->trig_forward) aligned_free(mdct->trig_forward);
        if(mdct->trig_butterfly_first) aligned_free(mdct->trig_butterfly_first);
        if(mdct->trig_butterfly_generic8) aligned_free(mdct->trig_butterfly_generic8);
        if(mdct->trig_butterfly_generic16) aligned_free(mdct->trig_butterfly_generic16);
        if(mdct->trig_butterfly_generic32) aligned_free(mdct->trig_butterfly_generic32);
        if(mdct->trig_butterfly_generic64) aligned_free(mdct->trig_butterfly_generic64);
        memset(mdct, 0, sizeof(MDCTContext));
    }
}

void
sse_mdct_tctx_init(MDCTThreadContext *tmdct, int n)
{
    // internal mdct buffers
    tmdct->buffer = aligned_malloc((n+2) * sizeof(FLOAT)); /* +2 to prevent illegal read in bitreverse */
    tmdct->buffer1 = aligned_malloc(n * sizeof(FLOAT));
}

void
sse_mdct_tctx_close(MDCTThreadContext *tmdct)
{
    if(tmdct) {
        if(tmdct->buffer) aligned_free(tmdct->buffer);
        if(tmdct->buffer1) aligned_free(tmdct->buffer1);
    }
}