1 /**
2  * Aften: A/52 audio encoder
3  * Copyright (c) 2007 Prakash Punnoor <prakash@punnoor.de>
4  *               2006 Justin Ruggles
5  *
6  * Based on "The simplest AC3 encoder" from FFmpeg
7  * Copyright (c) 2000 Fabrice Bellard.
8  *
9  * This library is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2 of the License
13  *
14  * This library is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with this library; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22  */
23 
24 /**
25  * @file x86_mmx_exponent.c
26  * A/52 mmx optimized exponent functions
27  */
28 
29 #include "exponent_common.c"
30 #include "x86_simd_support.h"
31 
32 #include <mmintrin.h>
33 
34 
35 /* set exp[i] to min(exp[i], exp1[i]) */
36 static void
exponent_min(uint8_t * exp,uint8_t * exp1,int n)37 exponent_min(uint8_t *exp, uint8_t *exp1, int n)
38 {
39     int i;
40 
41     for(i=0; i<(n & ~7); i+=8) {
42         __m64 vexp = *(__m64*)&exp[i];
43         __m64 vexp1 = *(__m64*)&exp1[i];
44         __m64 vmask = _mm_cmpgt_pi8(vexp, vexp1);
45         vexp = _mm_andnot_si64(vmask, vexp);
46         vexp1 = _mm_and_si64(vmask, vexp1);
47         vexp = _mm_or_si64(vexp, vexp1);
48         *(__m64*)&exp[i] = vexp;
49     }
50     switch(n & 7) {
51     case 7:
52         exp[i] = MIN(exp[i], exp1[i]);
53         ++i;
54     case 6:
55         exp[i] = MIN(exp[i], exp1[i]);
56         ++i;
57     case 5:
58         exp[i] = MIN(exp[i], exp1[i]);
59         ++i;
60     case 4:
61         exp[i] = MIN(exp[i], exp1[i]);
62         ++i;
63     case 3:
64         exp[i] = MIN(exp[i], exp1[i]);
65         ++i;
66     case 2:
67         exp[i] = MIN(exp[i], exp1[i]);
68         ++i;
69     case 1:
70         exp[i] = MIN(exp[i], exp1[i]);
71     case 0:
72         ;
73     }
74 }
75 
76 
77 /**
78  * Update the exponents so that they are the ones the decoder will decode.
79  * Constrain DC exponent, group exponents based on strategy, constrain delta
80  * between adjacent exponents to +2/-2.
81  */
82 static void
encode_exp_blk_ch(uint8_t * exp,int ncoefs,int exp_strategy)83 encode_exp_blk_ch(uint8_t *exp, int ncoefs, int exp_strategy)
84 {
85     int grpsize, ngrps, i, k, exp_min1, exp_min2;
86     uint8_t v;
87 
88     ngrps = nexpgrptab[exp_strategy-1][ncoefs] * 3;
89     grpsize = exp_strategy + (exp_strategy == EXP_D45);
90 
91     // for D15 strategy, there is no need to group/ungroup exponents
92     switch (grpsize) {
93     case 1: {
94         // constraint for DC exponent
95         exp[0] = MIN(exp[0], 15);
96 
97         // Decrease the delta between each groups to within 2
98         // so that they can be differentially encoded
99         for(i=1; i<=ngrps; i++)
100             exp[i] = MIN(exp[i], exp[i-1]+2);
101         for(i=ngrps-1; i>=0; i--)
102             exp[i] = MIN(exp[i], exp[i+1]+2);
103 
104         return;
105     }
106     // for each group, compute the minimum exponent
107     case 2: {
108         ALIGN16(uint16_t) exp1[256];
109         ALIGN16(const union __m64ui) vmask = {{0x00ff00ff, 0x00ff00ff}};
110 
111         i=0;k=1;
112         for(; i<(ngrps & ~3); i+=4, k+=8) {
113             __m64 v1 = *(__m64*)&exp[k];
114             __m64 v2 = _mm_srli_si64(v1, 8);
115             __m64 vcmask;
116             v1 = _mm_and_si64(v1, vmask.v);
117             //v1 = _mm_min_pi8(v1, v2);
118             vcmask = _mm_cmpgt_pi8(v1, v2);
119             v1 = _mm_andnot_si64(vcmask, v1);
120             v2 = _mm_and_si64(vcmask, v2);
121             v1 = _mm_or_si64(v1, v2);
122 
123             *(__m64*)&exp1[i] = v1;
124         }
125         switch (ngrps & 3) {
126         case 3:
127             exp1[i] = MIN(exp[k], exp[k+1]);
128             ++i;
129             k += 2;
130         case 2:
131             exp1[i] = MIN(exp[k], exp[k+1]);
132             ++i;
133             k += 2;
134         case 1:
135             exp1[i] = MIN(exp[k], exp[k+1]);
136         case 0:
137             ;
138         }
139         // constraint for DC exponent
140         exp[0] = MIN(exp[0], 15);
141         // Decrease the delta between each groups to within 2
142         // so that they can be differentially encoded
143         exp1[0] = MIN(exp1[0], (uint16_t)exp[0]+2);
144         for(i=1; i<ngrps; i++)
145             exp1[i] = MIN(exp1[i], exp1[i-1]+2);
146         for(i=ngrps-2; i>=0; i--)
147             exp1[i] = MIN(exp1[i], exp1[i+1]+2);
148        // now we have the exponent values the decoder will see
149         exp[0] = MIN(exp[0], exp1[0]+2); // DC exponent is handled separately
150 
151         i=0;k=1;
152         for(; i<(ngrps & ~3); i+=4, k+=8) {
153             __m64 v1 = *(__m64*)&exp1[i];
154             __m64 v2 = _mm_slli_si64(v1, 8);
155             v1 = _mm_or_si64(v1, v2);
156             *(__m64*)&exp[k] = v1;
157         }
158         switch (ngrps & 3) {
159         case 3:
160             v = exp1[i];
161             exp[k] = v;
162             exp[k+1] = v;
163             ++i;
164             k += 2;
165         case 2:
166             v = exp1[i];
167             exp[k] = v;
168             exp[k+1] = v;
169             ++i;
170             k += 2;
171         case 1:
172             v = exp1[i];
173             exp[k] = v;
174             exp[k+1] = v;
175         case 0:
176             ;
177         }
178         return;
179         }
180     default: {
181         ALIGN16(uint32_t) exp1[256];
182         ALIGN16(const union __m64ui) vmask2 = {{0x000000ff, 0x000000ff}};
183 
184         i=0;k=1;
185         for(; i<(ngrps & ~1); i+=2, k+=8) {
186             __m64 v1 = *(__m64*)&exp[k];
187             __m64 v2 = _mm_srli_si64(v1, 8);
188             //v1 = _mm_min_pi8(v1, v2);
189             __m64 vcmask = _mm_cmpgt_pi8(v1, v2);
190             v1 = _mm_andnot_si64(vcmask, v1);
191             v2 = _mm_and_si64(vcmask, v2);
192             v1 = _mm_or_si64(v1, v2);
193 
194             v2 = _mm_srli_si64(v1, 16);
195             //v1 = _mm_min_pi8(v1, v2);
196             vcmask = _mm_cmpgt_pi8(v1, v2);
197             v1 = _mm_andnot_si64(vcmask, v1);
198             v2 = _mm_and_si64(vcmask, v2);
199             v1 = _mm_or_si64(v1, v2);
200 
201             v1 = _mm_and_si64(v1, vmask2.v);
202             *(__m64*)&exp1[i] = v1;
203         }
204         if(ngrps & 1) {
205             exp_min1 = MIN(exp[k  ], exp[k+1]);
206             exp_min2 = MIN(exp[k+2], exp[k+3]);
207             exp1[i]  = MIN(exp_min1, exp_min2);
208         }
209         // constraint for DC exponent
210         exp[0] = MIN(exp[0], 15);
211         // Decrease the delta between each groups to within 2
212         // so that they can be differentially encoded
213         exp1[0] = MIN(exp1[0], (uint32_t)exp[0]+2);
214         for(i=1; i<ngrps; i++)
215             exp1[i] = MIN(exp1[i], exp1[i-1]+2);
216         for(i=ngrps-2; i>=0; i--)
217             exp1[i] = MIN(exp1[i], exp1[i+1]+2);
218        // now we have the exponent values the decoder will see
219         exp[0] = MIN(exp[0], exp1[0]+2); // DC exponent is handled separately
220 
221         i=0;k=1;
222         for(; i<(ngrps & ~1); i+=2, k+=8) {
223             __m64 v1 = *(__m64*)&exp1[i];
224             __m64 v2 = _mm_slli_si64(v1, 8);
225             v1 = _mm_or_si64(v1, v2);
226             v2 = _mm_slli_si64(v1, 16);
227             v1 = _mm_or_si64(v1, v2);
228             *(__m64*)&exp[k] = v1;
229         }
230         if(ngrps & 1) {
231             v = exp1[i];
232             exp[k] = v;
233             exp[k+1] = v;
234             exp[k+2] = v;
235             exp[k+3] = v;
236         }
237         return;
238     }
239     }
240 }
241 
242 
243 /**
244  * Determine a good exponent strategy for all blocks of a single channel.
245  * A pre-defined set of strategies is chosen based on the SSE between each set
246  * and the most accurate strategy set (all blocks EXP_D15).
247  */
248 static int
compute_expstr_ch(uint8_t * exp[A52_NUM_BLOCKS],int ncoefs)249 compute_expstr_ch(uint8_t *exp[A52_NUM_BLOCKS], int ncoefs)
250 {
251     ALIGN16(uint8_t) exponents[A52_NUM_BLOCKS][256];
252     int blk, str, i, j, k;
253     int min_error, exp_error[6];
254     int err;
255 
256     min_error = 1;
257     for(str=1; str<6; str++) {
258         // collect exponents
259         for(blk=0; blk<A52_NUM_BLOCKS; blk++) {
260             memcpy(exponents[blk], exp[blk], 256);
261         }
262 
263         // encode exponents
264         i = 0;
265         while(i < A52_NUM_BLOCKS) {
266             j = i + 1;
267             while(j < A52_NUM_BLOCKS && str_predef[str][j]==EXP_REUSE) {
268                 exponent_min(exponents[i], exponents[j], ncoefs);
269                 j++;
270             }
271             encode_exp_blk_ch(exponents[i], ncoefs, str_predef[str][i]);
272             for(k=i+1; k<j; k++) {
273                 memcpy(exponents[k], exponents[i], 256);
274             }
275             i = j;
276         }
277 
278         // select strategy based on minimum error from unencoded exponents
279         exp_error[str] = 0;
280         for(blk=0; blk<A52_NUM_BLOCKS; blk++) {
281             uint8_t *exp_blk = exp[blk];
282             uint8_t *exponents_blk = exponents[blk];
283             union {
284                 __m64 v;
285                 int32_t res[2];
286             } ures;
287             __m64 vzero = _mm_setzero_si64();
288             __m64 vres = vzero;
289 
290             for(i=0; i<(ncoefs & ~7); i+=8) {
291                 __m64 vexp = *(__m64*)&exp_blk[i];
292                 __m64 vexp2 = *(__m64*)&exponents_blk[i];
293 #if 0
294                 //safer but needed?
295                 __m64 vexphi = _mm_unpackhi_pi8(vexp, vzero);
296                 __m64 vexp2hi = _mm_unpackhi_pi8(vexp2, vzero);
297                 __m64 vexplo = _mm_unpacklo_pi8(vexp, vzero);
298                 __m64 vexp2lo = _mm_unpacklo_pi8(vexp2, vzero);
299                 __m64 verrhi = _mm_sub_pi16(vexphi, vexp2hi);
300                 __m64 verrlo = _mm_sub_pi16(vexplo, vexp2lo);
301 #else
302                 __m64 verr = _mm_sub_pi8(vexp, vexp2);
303                 __m64 vsign = _mm_cmpgt_pi8(vzero, verr);
304                 __m64 verrhi = _mm_unpackhi_pi8(verr, vsign);
305                 __m64 verrlo = _mm_unpacklo_pi8(verr, vsign);
306 #endif
307                 verrhi = _mm_madd_pi16(verrhi, verrhi);
308                 verrlo = _mm_madd_pi16(verrlo, verrlo);
309                 verrhi = _mm_add_pi32(verrhi, verrlo);
310                 vres = _mm_add_pi32(vres, verrhi);
311             }
312             ures.v = vres;
313             exp_error[str] += ures.res[0]+ures.res[1];
314             switch(ncoefs & 7) {
315             case 7:
316                 err = exp_blk[i] - exponents_blk[i];
317                 exp_error[str] += (err * err);
318                 ++i;
319             case 6:
320                 err = exp_blk[i] - exponents_blk[i];
321                 exp_error[str] += (err * err);
322                 ++i;
323             case 5:
324                 err = exp_blk[i] - exponents_blk[i];
325                 exp_error[str] += (err * err);
326                 ++i;
327             case 4:
328                 err = exp_blk[i] - exponents_blk[i];
329                 exp_error[str] += (err * err);
330                 ++i;
331             case 3:
332                 err = exp_blk[i] - exponents_blk[i];
333                 exp_error[str] += (err * err);
334                 ++i;
335             case 2:
336                 err = exp_blk[i] - exponents_blk[i];
337                 exp_error[str] += (err * err);
338                 ++i;
339             case 1:
340                 err = exp_blk[i] - exponents_blk[i];
341                 exp_error[str] += (err * err);
342             case 0:
343                 ;
344             }
345         }
346         if(exp_error[str] < exp_error[min_error]) {
347             min_error = str;
348         }
349     }
350     return min_error;
351 }
352 
353 
354 /**
355  * Runs all the processes in extracting, analyzing, and encoding exponents
356  */
357 void
mmx_process_exponents(A52ThreadContext * tctx)358 mmx_process_exponents(A52ThreadContext *tctx)
359 {
360     extract_exponents(tctx);
361 
362     compute_exponent_strategy(tctx);
363 
364     encode_exponents(tctx);
365 
366     group_exponents(tctx);
367     _mm_empty();
368 }
369