/**
 * Aften: A/52 audio encoder
 *
 * This file is derived from libvorbis lancer patch
 * Copyright (c) 2006-2007 prakash@punnoor.de
 * Copyright (c) 2006, blacksword8192@hotmail.com
 * Copyright (c) 2002, Xiph.org Foundation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in the
 *   documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the Xiph.org Foundation nor the names of its
 *   contributors may be used to endorse or promote products derived from
 *   this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION
 * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/**
 * @file mdct.c
 * Modified Discrete Cosine Transform
 */

#include "common.h"

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>

#include "a52.h"
#include "mdct.h"

#include "x86_simd_support.h"
#include "x86_sse_mdct_common_init.h"

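/*
 * Sign masks for _mm_xor_ps(): a 0x80000000 word flips the sign bit of the
 * corresponding float lane, 0x00000000 leaves it unchanged.  They are XORed
 * with the precomputed twiddle vectors below to negate selected lanes.
 */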
static const union __m128ui PCS_NNRR = {{0x80000000, 0x80000000, 0x00000000, 0x00000000}};
static const union __m128ui PCS_RNRN = {{0x00000000, 0x80000000, 0x00000000, 0x80000000}};
static const union __m128ui PCS_RRNN = {{0x00000000, 0x00000000, 0x80000000, 0x80000000}};
static const union __m128ui PCS_RNNR = {{0x80000000, 0x00000000, 0x00000000, 0x80000000}};
static const union __m128ui PCS_RRRR = {{0x80000000, 0x80000000, 0x80000000, 0x80000000}};

void
sse_mdct_ctx_init(MDCTContext *mdct, int n)
{
    int *bitrev = aligned_malloc((n/4) * sizeof(int));
    FLOAT *trig = aligned_malloc((n+n/4) * sizeof(FLOAT));
    int i;
    int n2 = (n >> 1);
    int log2n = mdct->log2n = log2i(n);
    mdct->n = n;
    mdct->trig = trig;
    mdct->bitrev = bitrev;

    // trig lookups
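    //   trig[0   .. n/2)  : (cos, -sin) pairs at angles (pi/n)*(4*i)
    //   trig[n/2 .. n)    : (cos,  sin) pairs at angles (pi/(2*n))*(2*i+1)
    //   trig[n   .. n+n/4): (cos, -sin) pairs at angles (pi/n)*(4*i+2), scaled by 0.5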
    for(i=0;i<n/4;i++){
        trig[i*2] = AFT_COS((AFT_PI/n)*(4*i));
        trig[i*2+1] = -AFT_SIN((AFT_PI/n)*(4*i));
        trig[n2+i*2] = AFT_COS((AFT_PI/(2*n))*(2*i+1));
        trig[n2+i*2+1] = AFT_SIN((AFT_PI/(2*n))*(2*i+1));
    }
    for(i=0;i<n/8;i++){
        trig[n+i*2] = AFT_COS((AFT_PI/n)*(4*i+2))*0.5f;
        trig[n+i*2+1] = -AFT_SIN((AFT_PI/n)*(4*i+2))*0.5f;
    }

    // bitreverse lookup
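    // bitrev[2*i+1] is i with its low (log2n-1) bits reversed; bitrev[2*i] is
    // the complement of that value within the same bit width, minus one.
    // These index pairs drive the reordering done after the butterfly stages.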
    {
        int j, acc;
        int mask = (1 << (log2n-1)) - 1;
        int msb = (1 << (log2n-2));
        for(i=0; i<n/8; i++) {
            acc = 0;
            for(j=0; msb>>j; j++) {
                if((msb>>j)&i) {
                    acc |= (1 << j);
                }
            }
            bitrev[i*2] = ((~acc) & mask) - 1;
            bitrev[i*2+1] = acc;
        }
    }

    // MDCT scale used in AC3
    mdct->scale = -2.0f / n;
    {
        __m128 pscalem = _mm_set_ps1(mdct->scale);
        float *T, *S;
        int n4 = n>>2;
        int n8 = n>>3;
        int j;
        /*
            for mdct_bitreverse
        */
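        /*
            Duplicate the n/4 half-scaled twiddles at trig+n into an n/2-float
            table: each aligned group of four is stored as loaded and again
            with adjacent elements swapped and alternate signs flipped, so the
            SSE reordering pass can work from aligned loads only.
        */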
        T = aligned_malloc(sizeof(*T)*n2);
        mdct->trig_bitreverse = T;
        S = mdct->trig+n;
        for(i=0;i<n4;i+=8)
        {
            __m128 XMM0 = _mm_load_ps(S+i   );
            __m128 XMM1 = _mm_load_ps(S+i+ 4);
            __m128 XMM2 = XMM0;
            __m128 XMM3 = XMM1;
            XMM2 = _mm_shuffle_ps(XMM2, XMM2, _MM_SHUFFLE(2,3,0,1));
            XMM3 = _mm_shuffle_ps(XMM3, XMM3, _MM_SHUFFLE(2,3,0,1));
            XMM2 = _mm_xor_ps(XMM2, PCS_RNRN.v);
            XMM3 = _mm_xor_ps(XMM3, PCS_RNRN.v);
            _mm_store_ps(T+i*2   , XMM0);
            _mm_store_ps(T+i*2+ 4, XMM2);
            _mm_store_ps(T+i*2+ 8, XMM1);
            _mm_store_ps(T+i*2+12, XMM3);
        }
        /*
            for mdct_forward part 0
        */
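        /*
            Gather twiddles from both ends of the table (S+i ascending, S+j
            descending), pre-swapped and sign-adjusted for the input fold of
            mdct_forward.  The two loops differ only in the sign mask applied
            to the swapped copies (PCS_RRNN vs PCS_NNRR), matching the sign
            pattern the fold uses in its first and second halves.
        */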
        T = aligned_malloc(sizeof(*T)*(n*2));
        mdct->trig_forward = T;
        S = mdct->trig;
        for(i=0,j=n2-4;i<n8;i+=4,j-=4)
        {
            __m128 XMM0, XMM1, XMM2, XMM3;
#ifdef __INTEL_COMPILER
#pragma warning(disable : 592)
#endif
            XMM0 = _mm_loadl_pi(XMM0, (__m64*)(S+j+2));
            XMM2 = _mm_loadl_pi(XMM2, (__m64*)(S+j  ));
            XMM0 = _mm_loadh_pi(XMM0, (__m64*)(S+i  ));
            XMM2 = _mm_loadh_pi(XMM2, (__m64*)(S+i+2));
#ifdef __INTEL_COMPILER
#pragma warning(default : 592)
#endif
            XMM1 = XMM0;
            XMM3 = XMM2;
            XMM0 = _mm_shuffle_ps(XMM0, XMM0, _MM_SHUFFLE(2,3,0,1));
            XMM2 = _mm_shuffle_ps(XMM2, XMM2, _MM_SHUFFLE(2,3,0,1));
            XMM0 = _mm_xor_ps(XMM0, PCS_RRNN.v);
            XMM1 = _mm_xor_ps(XMM1, PCS_RNNR.v);
            XMM2 = _mm_xor_ps(XMM2, PCS_RRNN.v);
            XMM3 = _mm_xor_ps(XMM3, PCS_RNNR.v);
            _mm_store_ps(T+i*4   , XMM0);
            _mm_store_ps(T+i*4+ 4, XMM1);
            _mm_store_ps(T+i*4+ 8, XMM2);
            _mm_store_ps(T+i*4+12, XMM3);
        }
        for(;i<n4;i+=4,j-=4)
        {
            __m128 XMM0, XMM1, XMM2, XMM3;
#ifdef __INTEL_COMPILER
#pragma warning(disable : 592)
#endif
            XMM0 = _mm_loadl_pi(XMM0, (__m64*)(S+j+2));
            XMM2 = _mm_loadl_pi(XMM2, (__m64*)(S+j  ));
            XMM0 = _mm_loadh_pi(XMM0, (__m64*)(S+i  ));
            XMM2 = _mm_loadh_pi(XMM2, (__m64*)(S+i+2));
#ifdef __INTEL_COMPILER
#pragma warning(default : 592)
#endif
            XMM1 = XMM0;
            XMM3 = XMM2;
            XMM0 = _mm_shuffle_ps(XMM0, XMM0, _MM_SHUFFLE(2,3,0,1));
            XMM2 = _mm_shuffle_ps(XMM2, XMM2, _MM_SHUFFLE(2,3,0,1));
            XMM0 = _mm_xor_ps(XMM0, PCS_NNRR.v);
            XMM2 = _mm_xor_ps(XMM2, PCS_NNRR.v);
            XMM1 = _mm_xor_ps(XMM1, PCS_RNNR.v);
            XMM3 = _mm_xor_ps(XMM3, PCS_RNNR.v);
            _mm_store_ps(T+i*4   , XMM0);
            _mm_store_ps(T+i*4+ 4, XMM1);
            _mm_store_ps(T+i*4+ 8, XMM2);
            _mm_store_ps(T+i*4+12, XMM3);
        }
        /*
            for mdct_forward part 1
        */
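        /*
            De-interleave the pre-rotation pairs at trig+n/2 into separate sin
            and cos vectors, fold in the AC-3 output scale (-2/n), and also
            store element-reversed copies for the mirrored half of the
            rotation.
        */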
        T = mdct->trig_forward+n;
        S = mdct->trig+n2;
        for(i=0;i<n4;i+=4){
            __m128 XMM0, XMM1, XMM2;
            XMM0 = _mm_load_ps(S+4);
            XMM2 = _mm_load_ps(S  );
            XMM1 = XMM0;
            XMM0 = _mm_shuffle_ps(XMM0, XMM2, _MM_SHUFFLE(1,3,1,3));
            XMM1 = _mm_shuffle_ps(XMM1, XMM2, _MM_SHUFFLE(0,2,0,2));
            XMM0 = _mm_mul_ps(XMM0, pscalem);
            XMM1 = _mm_mul_ps(XMM1, pscalem);
            _mm_store_ps(T   , XMM0);
            _mm_store_ps(T+ 4, XMM1);
            XMM1 = _mm_shuffle_ps(XMM1, XMM1, _MM_SHUFFLE(0,1,2,3));
            XMM0 = _mm_shuffle_ps(XMM0, XMM0, _MM_SHUFFLE(0,1,2,3));
            _mm_store_ps(T+ 8, XMM1);
            _mm_store_ps(T+12, XMM0);
            S += 8;
            T += 16;
        }
        /*
            for mdct_butterfly_first
        */
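        /*
            Twiddles for the first butterfly stage, pre-shuffled into the
            order the SSE kernel consumes; a fully negated copy of each block
            is stored n floats further on, so the kernel can pick either sign
            with a plain offset instead of run-time sign flips.
        */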
        S = mdct->trig;
        T = aligned_malloc(sizeof(*T)*n*2);
        mdct->trig_butterfly_first = T;
        for(i=0;i<n4;i+=4)
        {
            __m128 XMM0, XMM1, XMM2, XMM3, XMM4, XMM5;
            XMM2 = _mm_load_ps(S   );
            XMM0 = _mm_load_ps(S+ 4);
            XMM5 = _mm_load_ps(S+ 8);
            XMM3 = _mm_load_ps(S+12);
            XMM1 = XMM0;
            XMM4 = XMM3;
            XMM0 = _mm_shuffle_ps(XMM0, XMM2, _MM_SHUFFLE(0,1,0,1));
            XMM1 = _mm_shuffle_ps(XMM1, XMM2, _MM_SHUFFLE(1,0,1,0));
            XMM3 = _mm_shuffle_ps(XMM3, XMM5, _MM_SHUFFLE(0,1,0,1));
            XMM4 = _mm_shuffle_ps(XMM4, XMM5, _MM_SHUFFLE(1,0,1,0));
            XMM1 = _mm_xor_ps(XMM1, PCS_RNRN.v);
            XMM4 = _mm_xor_ps(XMM4, PCS_RNRN.v);
            _mm_store_ps(T   , XMM1);
            _mm_store_ps(T+ 4, XMM4);
            _mm_store_ps(T+ 8, XMM0);
            _mm_store_ps(T+12, XMM3);
            XMM1 = _mm_xor_ps(XMM1, PCS_RRRR.v);
            XMM4 = _mm_xor_ps(XMM4, PCS_RRRR.v);
            XMM0 = _mm_xor_ps(XMM0, PCS_RRRR.v);
            XMM3 = _mm_xor_ps(XMM3, PCS_RRRR.v);
            _mm_store_ps(T+n   , XMM1);
            _mm_store_ps(T+n+ 4, XMM4);
            _mm_store_ps(T+n+ 8, XMM0);
            _mm_store_ps(T+n+12, XMM3);
            S += 16;
            T += 16;
        }
        /*
            for mdct_butterfly_generic(trigint=8)
        */
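        /*
            The four tables below subsample trig[], taking one aligned group
            of four floats out of every trigint floats (trigint = 8, 16, 32,
            64) and pre-shuffling it for mdct_butterfly_generic.  The trigint
            32 and 64 tables are only built when n is large enough for those
            strides (n >= 128 and n >= 256 respectively).
        */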
        S = mdct->trig;
        T = aligned_malloc(sizeof(*T)*n2);
        mdct->trig_butterfly_generic8 = T;
        for(i=0;i<n;i+=32)
        {
            __m128 XMM0, XMM1, XMM2, XMM3, XMM4, XMM5;

            XMM0 = _mm_load_ps(S+ 24);
            XMM2 = _mm_load_ps(S+ 16);
            XMM3 = _mm_load_ps(S+  8);
            XMM5 = _mm_load_ps(S     );
            XMM1 = XMM0;
            XMM4 = XMM3;
            XMM0 = _mm_shuffle_ps(XMM0, XMM2, _MM_SHUFFLE(0,1,0,1));
            XMM1 = _mm_shuffle_ps(XMM1, XMM2, _MM_SHUFFLE(1,0,1,0));
            XMM3 = _mm_shuffle_ps(XMM3, XMM5, _MM_SHUFFLE(0,1,0,1));
            XMM4 = _mm_shuffle_ps(XMM4, XMM5, _MM_SHUFFLE(1,0,1,0));
            XMM1 = _mm_xor_ps(XMM1, PCS_RNRN.v);
            XMM4 = _mm_xor_ps(XMM4, PCS_RNRN.v);
            _mm_store_ps(T   , XMM0);
            _mm_store_ps(T+ 4, XMM1);
            _mm_store_ps(T+ 8, XMM3);
            _mm_store_ps(T+12, XMM4);
            S += 32;
            T += 16;
        }
        /*
            for mdct_butterfly_generic(trigint=16)
        */
        S = mdct->trig;
        T = aligned_malloc(sizeof(*T)*n4);
        mdct->trig_butterfly_generic16 = T;
        for(i=0;i<n;i+=64)
        {
            __m128 XMM0, XMM1, XMM2, XMM3, XMM4, XMM5;

            XMM0 = _mm_load_ps(S+ 48);
            XMM2 = _mm_load_ps(S+ 32);
            XMM3 = _mm_load_ps(S+ 16);
            XMM5 = _mm_load_ps(S     );
            XMM1 = XMM0;
            XMM4 = XMM3;
            XMM0 = _mm_shuffle_ps(XMM0, XMM2, _MM_SHUFFLE(0,1,0,1));
            XMM1 = _mm_shuffle_ps(XMM1, XMM2, _MM_SHUFFLE(1,0,1,0));
            XMM3 = _mm_shuffle_ps(XMM3, XMM5, _MM_SHUFFLE(0,1,0,1));
            XMM4 = _mm_shuffle_ps(XMM4, XMM5, _MM_SHUFFLE(1,0,1,0));
            XMM1 = _mm_xor_ps(XMM1, PCS_RNRN.v);
            XMM4 = _mm_xor_ps(XMM4, PCS_RNRN.v);
            _mm_store_ps(T   , XMM0);
            _mm_store_ps(T+ 4, XMM1);
            _mm_store_ps(T+ 8, XMM3);
            _mm_store_ps(T+12, XMM4);
            S += 64;
            T += 16;
        }
        /*
            for mdct_butterfly_generic(trigint=32)
        */
        if(n<128)
            mdct->trig_butterfly_generic32 = NULL;
        else
        {
            S = mdct->trig;
            T = aligned_malloc(sizeof(*T)*n8);
            mdct->trig_butterfly_generic32 = T;
            for(i=0;i<n;i+=128)
            {
                __m128 XMM0, XMM1, XMM2, XMM3, XMM4, XMM5;

                XMM0 = _mm_load_ps(S+ 96);
                XMM2 = _mm_load_ps(S+ 64);
                XMM3 = _mm_load_ps(S+ 32);
                XMM5 = _mm_load_ps(S     );
                XMM1 = XMM0;
                XMM4 = XMM3;
                XMM0 = _mm_shuffle_ps(XMM0, XMM2, _MM_SHUFFLE(0,1,0,1));
                XMM1 = _mm_shuffle_ps(XMM1, XMM2, _MM_SHUFFLE(1,0,1,0));
                XMM3 = _mm_shuffle_ps(XMM3, XMM5, _MM_SHUFFLE(0,1,0,1));
                XMM4 = _mm_shuffle_ps(XMM4, XMM5, _MM_SHUFFLE(1,0,1,0));
                XMM1 = _mm_xor_ps(XMM1, PCS_RNRN.v);
                XMM4 = _mm_xor_ps(XMM4, PCS_RNRN.v);
                _mm_store_ps(T   , XMM0);
                _mm_store_ps(T+ 4, XMM1);
                _mm_store_ps(T+ 8, XMM3);
                _mm_store_ps(T+12, XMM4);
                S += 128;
                T += 16;
            }
        }
        /*
            for mdct_butterfly_generic(trigint=64)
        */
        if(n<256)
            mdct->trig_butterfly_generic64 = NULL;
        else
        {
            S = mdct->trig;
            T = aligned_malloc(sizeof(*T)*(n8>>1));
            mdct->trig_butterfly_generic64 = T;
            for(i=0;i<n;i+=256)
            {
                __m128 XMM0, XMM1, XMM2, XMM3, XMM4, XMM5;

                XMM0 = _mm_load_ps(S+192);
                XMM2 = _mm_load_ps(S+128);
                XMM3 = _mm_load_ps(S+ 64);
                XMM5 = _mm_load_ps(S     );
                XMM1 = XMM0;
                XMM4 = XMM3;
                XMM0 = _mm_shuffle_ps(XMM0, XMM2, _MM_SHUFFLE(0,1,0,1));
                XMM1 = _mm_shuffle_ps(XMM1, XMM2, _MM_SHUFFLE(1,0,1,0));
                XMM3 = _mm_shuffle_ps(XMM3, XMM5, _MM_SHUFFLE(0,1,0,1));
                XMM4 = _mm_shuffle_ps(XMM4, XMM5, _MM_SHUFFLE(1,0,1,0));
                XMM1 = _mm_xor_ps(XMM1, PCS_RNRN.v);
                XMM4 = _mm_xor_ps(XMM4, PCS_RNRN.v);
                _mm_store_ps(T   , XMM0);
                _mm_store_ps(T+ 4, XMM1);
                _mm_store_ps(T+ 8, XMM3);
                _mm_store_ps(T+12, XMM4);
                S += 256;
                T += 16;
            }
        }
    }
}

void
sse_mdct_ctx_close(MDCTContext *mdct)
{
    if(mdct) {
        if(mdct->trig) aligned_free(mdct->trig);
        if(mdct->bitrev) aligned_free(mdct->bitrev);
        if(mdct->trig_bitreverse) aligned_free(mdct->trig_bitreverse);
        if(mdct->trig_forward) aligned_free(mdct->trig_forward);
        if(mdct->trig_butterfly_first) aligned_free(mdct->trig_butterfly_first);
        if(mdct->trig_butterfly_generic8) aligned_free(mdct->trig_butterfly_generic8);
        if(mdct->trig_butterfly_generic16) aligned_free(mdct->trig_butterfly_generic16);
        if(mdct->trig_butterfly_generic32) aligned_free(mdct->trig_butterfly_generic32);
        if(mdct->trig_butterfly_generic64) aligned_free(mdct->trig_butterfly_generic64);
        memset(mdct, 0, sizeof(MDCTContext));
    }
}

void
sse_mdct_tctx_init(MDCTThreadContext *tmdct, int n)
{
    // internal mdct buffers
    tmdct->buffer = aligned_malloc((n+2) * sizeof(FLOAT)); /* +2 to prevent illegal read in bitreverse */
    tmdct->buffer1 = aligned_malloc(n * sizeof(FLOAT));
}

void
sse_mdct_tctx_close(MDCTThreadContext *tmdct)
{
    if(tmdct) {
        if(tmdct->buffer) aligned_free(tmdct->buffer);
        if(tmdct->buffer1) aligned_free(tmdct->buffer1);
    }
}