1 /**
2 * Aften: A/52 audio encoder
3 * Copyright (c) 2007 Prakash Punnoor <prakash@punnoor.de>
4 * 2006 Justin Ruggles
5 *
6 * Based on "The simplest AC3 encoder" from FFmpeg
7 * Copyright (c) 2000 Fabrice Bellard.
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2 of the License
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24 /**
25 * @file x86_mmx_exponent.c
26 * A/52 mmx optimized exponent functions
27 */
28
29 #include "exponent_common.c"
30 #include "x86_simd_support.h"
31
32 #include <mmintrin.h>
33
34
35 /* set exp[i] to min(exp[i], exp1[i]) */
/**
 * Set exp[i] = min(exp[i], exp1[i]) for i in [0, n).
 *
 * The vector loop processes 8 bytes per iteration. Plain MMX has no
 * unsigned byte minimum, so one is emulated with a signed byte compare;
 * A/52 exponent values are small (well below 0x80), so signed and
 * unsigned comparisons agree.
 *
 * @param exp  first operand; receives the elementwise minimum
 * @param exp1 second operand (not modified)
 * @param n    number of bytes to process
 */
static void
exponent_min(uint8_t *exp, uint8_t *exp1, int n)
{
    int i;

    for (i = 0; i < (n & ~7); i += 8) {
        __m64 vexp  = *(__m64*)&exp[i];
        __m64 vexp1 = *(__m64*)&exp1[i];
        // 0xff in every byte lane where exp > exp1
        __m64 vmask = _mm_cmpgt_pi8(vexp, vexp1);
        vexp  = _mm_andnot_si64(vmask, vexp);  // keep exp  where exp <= exp1
        vexp1 = _mm_and_si64(vmask, vexp1);    // take exp1 where exp >  exp1
        *(__m64*)&exp[i] = _mm_or_si64(vexp, vexp1);
    }
    // Scalar tail for the remaining 0..7 bytes. A plain loop replaces the
    // original unannotated switch-fallthrough ladder, which is error-prone
    // and warned about by -Wimplicit-fallthrough; it also removes the
    // dependency on the external MIN macro.
    for (; i < n; i++) {
        if (exp1[i] < exp[i])
            exp[i] = exp1[i];
    }
}
75
76
77 /**
78 * Update the exponents so that they are the ones the decoder will decode.
79 * Constrain DC exponent, group exponents based on strategy, constrain delta
80 * between adjacent exponents to +2/-2.
81 */
/**
 * Update the exponents so that they are the ones the decoder will decode.
 * Constrain DC exponent, group exponents based on strategy, constrain delta
 * between adjacent exponents to +2/-2.
 *
 * @param exp          exponents for one block/channel, modified in place;
 *                     exp[0] is the DC exponent, exp[1..] the AC exponents
 * @param ncoefs       index into nexpgrptab selecting the group count
 * @param exp_strategy exponent strategy; presumably EXP_D15/D25/D45 with
 *                     EXP_REUSE excluded by the caller -- TODO confirm
 */
static void
encode_exp_blk_ch(uint8_t *exp, int ncoefs, int exp_strategy)
{
    int grpsize, ngrps, i, k, exp_min1, exp_min2;
    uint8_t v;

    // total number of delta-coded values (3 values per 7-bit exponent group)
    ngrps = nexpgrptab[exp_strategy-1][ncoefs] * 3;
    // coefficients per group: EXP_D45 bumps its strategy code by one, so
    // grpsize ends up 1, 2 or 4 -- assumes D15=1, D25=2, D45=3; TODO confirm
    grpsize = exp_strategy + (exp_strategy == EXP_D45);

    // for D15 strategy, there is no need to group/ungroup exponents
    switch (grpsize) {
    case 1: {
        // constraint for DC exponent
        exp[0] = MIN(exp[0], 15);

        // Decrease the delta between each groups to within 2
        // so that they can be differentially encoded
        for(i=1; i<=ngrps; i++)
            exp[i] = MIN(exp[i], exp[i-1]+2);
        // backward pass includes i==0, constraining DC from above as well
        for(i=ngrps-1; i>=0; i--)
            exp[i] = MIN(exp[i], exp[i+1]+2);

        return;
    }
    // for each group, compute the minimum exponent
    case 2: {
        // one 16-bit slot per group; min value lands in the low byte
        ALIGN16(uint16_t) exp1[256];
        ALIGN16(const union __m64ui) vmask = {{0x00ff00ff, 0x00ff00ff}};

        // Vector loop: each 8-byte load covers 4 groups of 2 exponents.
        // v2 = v1 >> 8 lines every exponent up with its pair mate; masking
        // v1 first zeroes the high byte of each 16-bit lane so that, after
        // the bytewise minimum, each lane holds exactly min(exp[k+2j],
        // exp[k+2j+1]). The cmpgt/andnot/and/or sequence emulates an
        // unsigned byte min (no _mm_min_pu8 in plain MMX; values < 0x80 so
        // signed compare is safe).
        i=0;k=1;
        for(; i<(ngrps & ~3); i+=4, k+=8) {
            __m64 v1 = *(__m64*)&exp[k];
            __m64 v2 = _mm_srli_si64(v1, 8);
            __m64 vcmask;
            v1 = _mm_and_si64(v1, vmask.v);
            //v1 = _mm_min_pi8(v1, v2);
            vcmask = _mm_cmpgt_pi8(v1, v2);
            v1 = _mm_andnot_si64(vcmask, v1);
            v2 = _mm_and_si64(vcmask, v2);
            v1 = _mm_or_si64(v1, v2);

            *(__m64*)&exp1[i] = v1;
        }
        // scalar tail: remaining 0..3 groups (intentional fallthrough)
        switch (ngrps & 3) {
        case 3:
            exp1[i] = MIN(exp[k], exp[k+1]);
            ++i;
            k += 2;
        case 2:
            exp1[i] = MIN(exp[k], exp[k+1]);
            ++i;
            k += 2;
        case 1:
            exp1[i] = MIN(exp[k], exp[k+1]);
        case 0:
            ;
        }
        // constraint for DC exponent
        exp[0] = MIN(exp[0], 15);
        // Decrease the delta between each groups to within 2
        // so that they can be differentially encoded
        exp1[0] = MIN(exp1[0], (uint16_t)exp[0]+2);
        for(i=1; i<ngrps; i++)
            exp1[i] = MIN(exp1[i], exp1[i-1]+2);
        for(i=ngrps-2; i>=0; i--)
            exp1[i] = MIN(exp1[i], exp1[i+1]+2);
        // now we have the exponent values the decoder will see
        exp[0] = MIN(exp[0], exp1[0]+2); // DC exponent is handled separately

        // Ungroup: duplicate each group minimum back over both of its
        // exponents (v1 | (v1 << 8) fills both bytes of each 16-bit lane).
        i=0;k=1;
        for(; i<(ngrps & ~3); i+=4, k+=8) {
            __m64 v1 = *(__m64*)&exp1[i];
            __m64 v2 = _mm_slli_si64(v1, 8);
            v1 = _mm_or_si64(v1, v2);
            *(__m64*)&exp[k] = v1;
        }
        // scalar ungroup tail (intentional fallthrough)
        switch (ngrps & 3) {
        case 3:
            v = exp1[i];
            exp[k] = v;
            exp[k+1] = v;
            ++i;
            k += 2;
        case 2:
            v = exp1[i];
            exp[k] = v;
            exp[k+1] = v;
            ++i;
            k += 2;
        case 1:
            v = exp1[i];
            exp[k] = v;
            exp[k+1] = v;
        case 0:
            ;
        }
        return;
    }
    default: {
        // grpsize == 4 (D45): one 32-bit slot per group of 4 exponents
        ALIGN16(uint32_t) exp1[256];
        ALIGN16(const union __m64ui) vmask2 = {{0x000000ff, 0x000000ff}};

        // Vector loop: each 8-byte load covers 2 groups of 4 exponents.
        // Two reduction rounds (shift by 8, then by 16) leave the group
        // minimum in the low byte of each 32-bit lane; the final mask
        // clears the leftover bytes before storing into exp1.
        i=0;k=1;
        for(; i<(ngrps & ~1); i+=2, k+=8) {
            __m64 v1 = *(__m64*)&exp[k];
            __m64 v2 = _mm_srli_si64(v1, 8);
            //v1 = _mm_min_pi8(v1, v2);
            __m64 vcmask = _mm_cmpgt_pi8(v1, v2);
            v1 = _mm_andnot_si64(vcmask, v1);
            v2 = _mm_and_si64(vcmask, v2);
            v1 = _mm_or_si64(v1, v2);

            v2 = _mm_srli_si64(v1, 16);
            //v1 = _mm_min_pi8(v1, v2);
            vcmask = _mm_cmpgt_pi8(v1, v2);
            v1 = _mm_andnot_si64(vcmask, v1);
            v2 = _mm_and_si64(vcmask, v2);
            v1 = _mm_or_si64(v1, v2);

            v1 = _mm_and_si64(v1, vmask2.v);
            *(__m64*)&exp1[i] = v1;
        }
        // scalar tail: at most one remaining group of 4
        if(ngrps & 1) {
            exp_min1 = MIN(exp[k  ], exp[k+1]);
            exp_min2 = MIN(exp[k+2], exp[k+3]);
            exp1[i]  = MIN(exp_min1, exp_min2);
        }
        // constraint for DC exponent
        exp[0] = MIN(exp[0], 15);
        // Decrease the delta between each groups to within 2
        // so that they can be differentially encoded
        exp1[0] = MIN(exp1[0], (uint32_t)exp[0]+2);
        for(i=1; i<ngrps; i++)
            exp1[i] = MIN(exp1[i], exp1[i-1]+2);
        for(i=ngrps-2; i>=0; i--)
            exp1[i] = MIN(exp1[i], exp1[i+1]+2);
        // now we have the exponent values the decoder will see
        exp[0] = MIN(exp[0], exp1[0]+2); // DC exponent is handled separately

        // Ungroup: replicate each group minimum across all 4 of its
        // exponents (<<8 then <<16 fills every byte of each 32-bit lane)
        i=0;k=1;
        for(; i<(ngrps & ~1); i+=2, k+=8) {
            __m64 v1 = *(__m64*)&exp1[i];
            __m64 v2 = _mm_slli_si64(v1, 8);
            v1 = _mm_or_si64(v1, v2);
            v2 = _mm_slli_si64(v1, 16);
            v1 = _mm_or_si64(v1, v2);
            *(__m64*)&exp[k] = v1;
        }
        // scalar ungroup tail
        if(ngrps & 1) {
            v = exp1[i];
            exp[k] = v;
            exp[k+1] = v;
            exp[k+2] = v;
            exp[k+3] = v;
        }
        return;
    }
    }
}
241
242
243 /**
244 * Determine a good exponent strategy for all blocks of a single channel.
245 * A pre-defined set of strategies is chosen based on the SSE between each set
246 * and the most accurate strategy set (all blocks EXP_D15).
247 */
/**
 * Determine a good exponent strategy for all blocks of a single channel.
 * A pre-defined set of strategies is chosen based on the SSE between each set
 * and the most accurate strategy set (all blocks EXP_D15).
 *
 * @param exp    per-block exponent arrays for the channel (not modified;
 *               work is done on local copies)
 * @param ncoefs number of exponents compared per block
 * @return index (1..5) into str_predef of the set with minimum squared error
 */
static int
compute_expstr_ch(uint8_t *exp[A52_NUM_BLOCKS], int ncoefs)
{
    ALIGN16(uint8_t) exponents[A52_NUM_BLOCKS][256];
    int blk, str, i, j, k;
    int min_error, exp_error[6];  // exp_error[0] is never written or read
    int err;

    min_error = 1;  // strategy set 1 is the baseline; sets 1..5 are scored
    for(str=1; str<6; str++) {
        // collect exponents
        for(blk=0; blk<A52_NUM_BLOCKS; blk++) {
            memcpy(exponents[blk], exp[blk], 256);
        }

        // encode exponents: for each run of EXP_REUSE blocks, fold the
        // per-coefficient minimum into the leading block, encode it once,
        // then copy the encoded result to every reusing block
        i = 0;
        while(i < A52_NUM_BLOCKS) {
            j = i + 1;
            while(j < A52_NUM_BLOCKS && str_predef[str][j]==EXP_REUSE) {
                exponent_min(exponents[i], exponents[j], ncoefs);
                j++;
            }
            encode_exp_blk_ch(exponents[i], ncoefs, str_predef[str][i]);
            for(k=i+1; k<j; k++) {
                memcpy(exponents[k], exponents[i], 256);
            }
            i = j;
        }

        // select strategy based on minimum error from unencoded exponents
        exp_error[str] = 0;
        for(blk=0; blk<A52_NUM_BLOCKS; blk++) {
            uint8_t *exp_blk = exp[blk];
            uint8_t *exponents_blk = exponents[blk];
            union {
                __m64 v;
                int32_t res[2];
            } ures;
            __m64 vzero = _mm_setzero_si64();
            __m64 vres = vzero;

            // Vector SSE: 8 bytes per iteration. The byte differences are
            // widened to 16 bits by unpacking against their own sign mask
            // (cmpgt(0, err) gives 0xff for negative bytes), then madd
            // squares and pairwise-sums them into two 32-bit accumulators.
            for(i=0; i<(ncoefs & ~7); i+=8) {
                __m64 vexp = *(__m64*)&exp_blk[i];
                __m64 vexp2 = *(__m64*)&exponents_blk[i];
#if 0
                //safer but needed?
                // alternative: widen the operands before subtracting, so
                // the 8-bit difference can never wrap (kept for reference)
                __m64 vexphi = _mm_unpackhi_pi8(vexp, vzero);
                __m64 vexp2hi = _mm_unpackhi_pi8(vexp2, vzero);
                __m64 vexplo = _mm_unpacklo_pi8(vexp, vzero);
                __m64 vexp2lo = _mm_unpacklo_pi8(vexp2, vzero);
                __m64 verrhi = _mm_sub_pi16(vexphi, vexp2hi);
                __m64 verrlo = _mm_sub_pi16(vexplo, vexp2lo);
#else
                __m64 verr = _mm_sub_pi8(vexp, vexp2);
                __m64 vsign = _mm_cmpgt_pi8(vzero, verr);
                __m64 verrhi = _mm_unpackhi_pi8(verr, vsign);
                __m64 verrlo = _mm_unpacklo_pi8(verr, vsign);
#endif
                verrhi = _mm_madd_pi16(verrhi, verrhi);
                verrlo = _mm_madd_pi16(verrlo, verrlo);
                verrhi = _mm_add_pi32(verrhi, verrlo);
                vres = _mm_add_pi32(vres, verrhi);
            }
            // fold the two 32-bit partial sums into the scalar total
            ures.v = vres;
            exp_error[str] += ures.res[0]+ures.res[1];
            // scalar tail for the remaining 0..7 coefficients
            // (intentional fallthrough)
            switch(ncoefs & 7) {
            case 7:
                err = exp_blk[i] - exponents_blk[i];
                exp_error[str] += (err * err);
                ++i;
            case 6:
                err = exp_blk[i] - exponents_blk[i];
                exp_error[str] += (err * err);
                ++i;
            case 5:
                err = exp_blk[i] - exponents_blk[i];
                exp_error[str] += (err * err);
                ++i;
            case 4:
                err = exp_blk[i] - exponents_blk[i];
                exp_error[str] += (err * err);
                ++i;
            case 3:
                err = exp_blk[i] - exponents_blk[i];
                exp_error[str] += (err * err);
                ++i;
            case 2:
                err = exp_blk[i] - exponents_blk[i];
                exp_error[str] += (err * err);
                ++i;
            case 1:
                err = exp_blk[i] - exponents_blk[i];
                exp_error[str] += (err * err);
            case 0:
                ;
            }
        }
        // strict < keeps the earlier (more accurate) set on ties
        if(exp_error[str] < exp_error[min_error]) {
            min_error = str;
        }
    }
    return min_error;
}
352
353
354 /**
355 * Runs all the processes in extracting, analyzing, and encoding exponents
356 */
/**
 * Runs all the processes in extracting, analyzing, and encoding exponents
 * for one thread context: extract raw exponents, pick per-block strategies,
 * encode to decoder-visible values, then pack them into groups.
 *
 * @param tctx per-thread encoder context, updated in place
 */
void
mmx_process_exponents(A52ThreadContext *tctx)
{
    extract_exponents(tctx);

    compute_exponent_strategy(tctx);

    encode_exponents(tctx);

    group_exponents(tctx);
    // emms: leave MMX mode so subsequent x87 floating-point code is safe
    _mm_empty();
}
369