1 /*
2 	dct64_i386.c: DCT64, a C variant for i386
3 
4 	copyright ?-2006 by the mpg123 project - free software under the terms of the LGPL 2.1
5 	see COPYING and AUTHORS files in distribution or http://mpg123.org
6 	initially written by Michael Hipp
7 */
8 
9 /*
10  * Discrete Cosine Tansform (DCT) for subband synthesis
11  * optimized for machines with no auto-increment.
12  * The performance is highly compiler dependend. Maybe
13  * the dct64.c version for 'normal' processor may be faster
14  * even for Intel processors.
15  */
16 
17 #include "mpg123lib_intern.h"
18 
19 static void dct64_1(real *out0,real *out1,real *b1,real *b2,real *samples)
20 {
21  {
22   register real *costab = pnts[0];
23 
24   b1[0x00] = samples[0x00] + samples[0x1F];
25   b1[0x01] = samples[0x01] + samples[0x1E];
26   b1[0x1F] = REAL_MUL(samples[0x00] - samples[0x1F], costab[0x0]);
27   b1[0x1E] = REAL_MUL(samples[0x01] - samples[0x1E], costab[0x1]);
28 
29   b1[0x02] = samples[0x02] + samples[0x1D];
30   b1[0x03] = samples[0x03] + samples[0x1C];
31   b1[0x1D] = REAL_MUL(samples[0x02] - samples[0x1D], costab[0x2]);
32   b1[0x1C] = REAL_MUL(samples[0x03] - samples[0x1C], costab[0x3]);
33 
34   b1[0x04] = samples[0x04] + samples[0x1B];
35   b1[0x05] = samples[0x05] + samples[0x1A];
36   b1[0x1B] = REAL_MUL(samples[0x04] - samples[0x1B], costab[0x4]);
37   b1[0x1A] = REAL_MUL(samples[0x05] - samples[0x1A], costab[0x5]);
38 
39   b1[0x06] = samples[0x06] + samples[0x19];
40   b1[0x07] = samples[0x07] + samples[0x18];
41   b1[0x19] = REAL_MUL(samples[0x06] - samples[0x19], costab[0x6]);
42   b1[0x18] = REAL_MUL(samples[0x07] - samples[0x18], costab[0x7]);
43 
44   b1[0x08] = samples[0x08] + samples[0x17];
45   b1[0x09] = samples[0x09] + samples[0x16];
46   b1[0x17] = REAL_MUL(samples[0x08] - samples[0x17], costab[0x8]);
47   b1[0x16] = REAL_MUL(samples[0x09] - samples[0x16], costab[0x9]);
48 
49   b1[0x0A] = samples[0x0A] + samples[0x15];
50   b1[0x0B] = samples[0x0B] + samples[0x14];
51   b1[0x15] = REAL_MUL(samples[0x0A] - samples[0x15], costab[0xA]);
52   b1[0x14] = REAL_MUL(samples[0x0B] - samples[0x14], costab[0xB]);
53 
54   b1[0x0C] = samples[0x0C] + samples[0x13];
55   b1[0x0D] = samples[0x0D] + samples[0x12];
56   b1[0x13] = REAL_MUL(samples[0x0C] - samples[0x13], costab[0xC]);
57   b1[0x12] = REAL_MUL(samples[0x0D] - samples[0x12], costab[0xD]);
58 
59   b1[0x0E] = samples[0x0E] + samples[0x11];
60   b1[0x0F] = samples[0x0F] + samples[0x10];
61   b1[0x11] = REAL_MUL(samples[0x0E] - samples[0x11], costab[0xE]);
62   b1[0x10] = REAL_MUL(samples[0x0F] - samples[0x10], costab[0xF]);
63 
64  }
65 
66 
67  {
68   register real *costab = pnts[1];
69 
70   b2[0x00] = b1[0x00] + b1[0x0F];
71   b2[0x01] = b1[0x01] + b1[0x0E];
72   b2[0x0F] = REAL_MUL(b1[0x00] - b1[0x0F], costab[0]);
73   b2[0x0E] = REAL_MUL(b1[0x01] - b1[0x0E], costab[1]);
74 
75   b2[0x02] = b1[0x02] + b1[0x0D];
76   b2[0x03] = b1[0x03] + b1[0x0C];
77   b2[0x0D] = REAL_MUL(b1[0x02] - b1[0x0D], costab[2]);
78   b2[0x0C] = REAL_MUL(b1[0x03] - b1[0x0C], costab[3]);
79 
80   b2[0x04] = b1[0x04] + b1[0x0B];
81   b2[0x05] = b1[0x05] + b1[0x0A];
82   b2[0x0B] = REAL_MUL(b1[0x04] - b1[0x0B], costab[4]);
83   b2[0x0A] = REAL_MUL(b1[0x05] - b1[0x0A], costab[5]);
84 
85   b2[0x06] = b1[0x06] + b1[0x09];
86   b2[0x07] = b1[0x07] + b1[0x08];
87   b2[0x09] = REAL_MUL(b1[0x06] - b1[0x09], costab[6]);
88   b2[0x08] = REAL_MUL(b1[0x07] - b1[0x08], costab[7]);
89 
90   /* */
91 
92   b2[0x10] = b1[0x10] + b1[0x1F];
93   b2[0x11] = b1[0x11] + b1[0x1E];
94   b2[0x1F] = REAL_MUL(b1[0x1F] - b1[0x10], costab[0]);
95   b2[0x1E] = REAL_MUL(b1[0x1E] - b1[0x11], costab[1]);
96 
97   b2[0x12] = b1[0x12] + b1[0x1D];
98   b2[0x13] = b1[0x13] + b1[0x1C];
99   b2[0x1D] = REAL_MUL(b1[0x1D] - b1[0x12], costab[2]);
100   b2[0x1C] = REAL_MUL(b1[0x1C] - b1[0x13], costab[3]);
101 
102   b2[0x14] = b1[0x14] + b1[0x1B];
103   b2[0x15] = b1[0x15] + b1[0x1A];
104   b2[0x1B] = REAL_MUL(b1[0x1B] - b1[0x14], costab[4]);
105   b2[0x1A] = REAL_MUL(b1[0x1A] - b1[0x15], costab[5]);
106 
107   b2[0x16] = b1[0x16] + b1[0x19];
108   b2[0x17] = b1[0x17] + b1[0x18];
109   b2[0x19] = REAL_MUL(b1[0x19] - b1[0x16], costab[6]);
110   b2[0x18] = REAL_MUL(b1[0x18] - b1[0x17], costab[7]);
111  }
112 
113  {
114   register real *costab = pnts[2];
115 
116   b1[0x00] = b2[0x00] + b2[0x07];
117   b1[0x07] = REAL_MUL(b2[0x00] - b2[0x07], costab[0]);
118   b1[0x01] = b2[0x01] + b2[0x06];
119   b1[0x06] = REAL_MUL(b2[0x01] - b2[0x06], costab[1]);
120   b1[0x02] = b2[0x02] + b2[0x05];
121   b1[0x05] = REAL_MUL(b2[0x02] - b2[0x05], costab[2]);
122   b1[0x03] = b2[0x03] + b2[0x04];
123   b1[0x04] = REAL_MUL(b2[0x03] - b2[0x04], costab[3]);
124 
125   b1[0x08] = b2[0x08] + b2[0x0F];
126   b1[0x0F] = REAL_MUL(b2[0x0F] - b2[0x08], costab[0]);
127   b1[0x09] = b2[0x09] + b2[0x0E];
128   b1[0x0E] = REAL_MUL(b2[0x0E] - b2[0x09], costab[1]);
129   b1[0x0A] = b2[0x0A] + b2[0x0D];
130   b1[0x0D] = REAL_MUL(b2[0x0D] - b2[0x0A], costab[2]);
131   b1[0x0B] = b2[0x0B] + b2[0x0C];
132   b1[0x0C] = REAL_MUL(b2[0x0C] - b2[0x0B], costab[3]);
133 
134   b1[0x10] = b2[0x10] + b2[0x17];
135   b1[0x17] = REAL_MUL(b2[0x10] - b2[0x17], costab[0]);
136   b1[0x11] = b2[0x11] + b2[0x16];
137   b1[0x16] = REAL_MUL(b2[0x11] - b2[0x16], costab[1]);
138   b1[0x12] = b2[0x12] + b2[0x15];
139   b1[0x15] = REAL_MUL(b2[0x12] - b2[0x15], costab[2]);
140   b1[0x13] = b2[0x13] + b2[0x14];
141   b1[0x14] = REAL_MUL(b2[0x13] - b2[0x14], costab[3]);
142 
143   b1[0x18] = b2[0x18] + b2[0x1F];
144   b1[0x1F] = REAL_MUL(b2[0x1F] - b2[0x18], costab[0]);
145   b1[0x19] = b2[0x19] + b2[0x1E];
146   b1[0x1E] = REAL_MUL(b2[0x1E] - b2[0x19], costab[1]);
147   b1[0x1A] = b2[0x1A] + b2[0x1D];
148   b1[0x1D] = REAL_MUL(b2[0x1D] - b2[0x1A], costab[2]);
149   b1[0x1B] = b2[0x1B] + b2[0x1C];
150   b1[0x1C] = REAL_MUL(b2[0x1C] - b2[0x1B], costab[3]);
151  }
152 
153  {
154   register real const cos0 = pnts[3][0];
155   register real const cos1 = pnts[3][1];
156 
157   b2[0x00] = b1[0x00] + b1[0x03];
158   b2[0x03] = REAL_MUL(b1[0x00] - b1[0x03], cos0);
159   b2[0x01] = b1[0x01] + b1[0x02];
160   b2[0x02] = REAL_MUL(b1[0x01] - b1[0x02], cos1);
161 
162   b2[0x04] = b1[0x04] + b1[0x07];
163   b2[0x07] = REAL_MUL(b1[0x07] - b1[0x04], cos0);
164   b2[0x05] = b1[0x05] + b1[0x06];
165   b2[0x06] = REAL_MUL(b1[0x06] - b1[0x05], cos1);
166 
167   b2[0x08] = b1[0x08] + b1[0x0B];
168   b2[0x0B] = REAL_MUL(b1[0x08] - b1[0x0B], cos0);
169   b2[0x09] = b1[0x09] + b1[0x0A];
170   b2[0x0A] = REAL_MUL(b1[0x09] - b1[0x0A], cos1);
171 
172   b2[0x0C] = b1[0x0C] + b1[0x0F];
173   b2[0x0F] = REAL_MUL(b1[0x0F] - b1[0x0C], cos0);
174   b2[0x0D] = b1[0x0D] + b1[0x0E];
175   b2[0x0E] = REAL_MUL(b1[0x0E] - b1[0x0D], cos1);
176 
177   b2[0x10] = b1[0x10] + b1[0x13];
178   b2[0x13] = REAL_MUL(b1[0x10] - b1[0x13], cos0);
179   b2[0x11] = b1[0x11] + b1[0x12];
180   b2[0x12] = REAL_MUL(b1[0x11] - b1[0x12], cos1);
181 
182   b2[0x14] = b1[0x14] + b1[0x17];
183   b2[0x17] = REAL_MUL(b1[0x17] - b1[0x14], cos0);
184   b2[0x15] = b1[0x15] + b1[0x16];
185   b2[0x16] = REAL_MUL(b1[0x16] - b1[0x15], cos1);
186 
187   b2[0x18] = b1[0x18] + b1[0x1B];
188   b2[0x1B] = REAL_MUL(b1[0x18] - b1[0x1B], cos0);
189   b2[0x19] = b1[0x19] + b1[0x1A];
190   b2[0x1A] = REAL_MUL(b1[0x19] - b1[0x1A], cos1);
191 
192   b2[0x1C] = b1[0x1C] + b1[0x1F];
193   b2[0x1F] = REAL_MUL(b1[0x1F] - b1[0x1C], cos0);
194   b2[0x1D] = b1[0x1D] + b1[0x1E];
195   b2[0x1E] = REAL_MUL(b1[0x1E] - b1[0x1D], cos1);
196  }
197 
198  {
199   register real const cos0 = pnts[4][0];
200 
201   b1[0x00] = b2[0x00] + b2[0x01];
202   b1[0x01] = REAL_MUL(b2[0x00] - b2[0x01], cos0);
203   b1[0x02] = b2[0x02] + b2[0x03];
204   b1[0x03] = REAL_MUL(b2[0x03] - b2[0x02], cos0);
205   b1[0x02] += b1[0x03];
206 
207   b1[0x04] = b2[0x04] + b2[0x05];
208   b1[0x05] = REAL_MUL(b2[0x04] - b2[0x05], cos0);
209   b1[0x06] = b2[0x06] + b2[0x07];
210   b1[0x07] = REAL_MUL(b2[0x07] - b2[0x06], cos0);
211   b1[0x06] += b1[0x07];
212   b1[0x04] += b1[0x06];
213   b1[0x06] += b1[0x05];
214   b1[0x05] += b1[0x07];
215 
216   b1[0x08] = b2[0x08] + b2[0x09];
217   b1[0x09] = REAL_MUL(b2[0x08] - b2[0x09], cos0);
218   b1[0x0A] = b2[0x0A] + b2[0x0B];
219   b1[0x0B] = REAL_MUL(b2[0x0B] - b2[0x0A], cos0);
220   b1[0x0A] += b1[0x0B];
221 
222   b1[0x0C] = b2[0x0C] + b2[0x0D];
223   b1[0x0D] = REAL_MUL(b2[0x0C] - b2[0x0D], cos0);
224   b1[0x0E] = b2[0x0E] + b2[0x0F];
225   b1[0x0F] = REAL_MUL(b2[0x0F] - b2[0x0E], cos0);
226   b1[0x0E] += b1[0x0F];
227   b1[0x0C] += b1[0x0E];
228   b1[0x0E] += b1[0x0D];
229   b1[0x0D] += b1[0x0F];
230 
231   b1[0x10] = b2[0x10] + b2[0x11];
232   b1[0x11] = REAL_MUL(b2[0x10] - b2[0x11], cos0);
233   b1[0x12] = b2[0x12] + b2[0x13];
234   b1[0x13] = REAL_MUL(b2[0x13] - b2[0x12], cos0);
235   b1[0x12] += b1[0x13];
236 
237   b1[0x14] = b2[0x14] + b2[0x15];
238   b1[0x15] = REAL_MUL(b2[0x14] - b2[0x15], cos0);
239   b1[0x16] = b2[0x16] + b2[0x17];
240   b1[0x17] = REAL_MUL(b2[0x17] - b2[0x16], cos0);
241   b1[0x16] += b1[0x17];
242   b1[0x14] += b1[0x16];
243   b1[0x16] += b1[0x15];
244   b1[0x15] += b1[0x17];
245 
246   b1[0x18] = b2[0x18] + b2[0x19];
247   b1[0x19] = REAL_MUL(b2[0x18] - b2[0x19], cos0);
248   b1[0x1A] = b2[0x1A] + b2[0x1B];
249   b1[0x1B] = REAL_MUL(b2[0x1B] - b2[0x1A], cos0);
250   b1[0x1A] += b1[0x1B];
251 
252   b1[0x1C] = b2[0x1C] + b2[0x1D];
253   b1[0x1D] = REAL_MUL(b2[0x1C] - b2[0x1D], cos0);
254   b1[0x1E] = b2[0x1E] + b2[0x1F];
255   b1[0x1F] = REAL_MUL(b2[0x1F] - b2[0x1E], cos0);
256   b1[0x1E] += b1[0x1F];
257   b1[0x1C] += b1[0x1E];
258   b1[0x1E] += b1[0x1D];
259   b1[0x1D] += b1[0x1F];
260  }
261 
262  out0[0x10*16] = REAL_SCALE_DCT64(b1[0x00]);
263  out0[0x10*12] = REAL_SCALE_DCT64(b1[0x04]);
264  out0[0x10* 8] = REAL_SCALE_DCT64(b1[0x02]);
265  out0[0x10* 4] = REAL_SCALE_DCT64(b1[0x06]);
266  out0[0x10* 0] = REAL_SCALE_DCT64(b1[0x01]);
267  out1[0x10* 0] = REAL_SCALE_DCT64(b1[0x01]);
268  out1[0x10* 4] = REAL_SCALE_DCT64(b1[0x05]);
269  out1[0x10* 8] = REAL_SCALE_DCT64(b1[0x03]);
270  out1[0x10*12] = REAL_SCALE_DCT64(b1[0x07]);
271 
272 #if 1
273  out0[0x10*14] = REAL_SCALE_DCT64(b1[0x08] + b1[0x0C]);
274  out0[0x10*10] = REAL_SCALE_DCT64(b1[0x0C] + b1[0x0a]);
275  out0[0x10* 6] = REAL_SCALE_DCT64(b1[0x0A] + b1[0x0E]);
276  out0[0x10* 2] = REAL_SCALE_DCT64(b1[0x0E] + b1[0x09]);
277  out1[0x10* 2] = REAL_SCALE_DCT64(b1[0x09] + b1[0x0D]);
278  out1[0x10* 6] = REAL_SCALE_DCT64(b1[0x0D] + b1[0x0B]);
279  out1[0x10*10] = REAL_SCALE_DCT64(b1[0x0B] + b1[0x0F]);
280  out1[0x10*14] = REAL_SCALE_DCT64(b1[0x0F]);
281 #else
282  b1[0x08] += b1[0x0C];
283  out0[0x10*14] = REAL_SCALE_DCT64(b1[0x08]);
284  b1[0x0C] += b1[0x0a];
285  out0[0x10*10] = REAL_SCALE_DCT64(b1[0x0C]);
286  b1[0x0A] += b1[0x0E];
287  out0[0x10* 6] = REAL_SCALE_DCT64(b1[0x0A]);
288  b1[0x0E] += b1[0x09];
289  out0[0x10* 2] = REAL_SCALE_DCT64(b1[0x0E]);
290  b1[0x09] += b1[0x0D];
291  out1[0x10* 2] = REAL_SCALE_DCT64(b1[0x09]);
292  b1[0x0D] += b1[0x0B];
293  out1[0x10* 6] = REAL_SCALE_DCT64(b1[0x0D]);
294  b1[0x0B] += b1[0x0F];
295  out1[0x10*10] = REAL_SCALE_DCT64(b1[0x0B]);
296  out1[0x10*14] = REAL_SCALE_DCT64(b1[0x0F]);
297 #endif
298 
299  {
300  real tmp;
301  tmp = b1[0x18] + b1[0x1C];
302  out0[0x10*15] = REAL_SCALE_DCT64(tmp + b1[0x10]);
303  out0[0x10*13] = REAL_SCALE_DCT64(tmp + b1[0x14]);
304  tmp = b1[0x1C] + b1[0x1A];
305  out0[0x10*11] = REAL_SCALE_DCT64(tmp + b1[0x14]);
306  out0[0x10* 9] = REAL_SCALE_DCT64(tmp + b1[0x12]);
307  tmp = b1[0x1A] + b1[0x1E];
308  out0[0x10* 7] = REAL_SCALE_DCT64(tmp + b1[0x12]);
309  out0[0x10* 5] = REAL_SCALE_DCT64(tmp + b1[0x16]);
310  tmp = b1[0x1E] + b1[0x19];
311  out0[0x10* 3] = REAL_SCALE_DCT64(tmp + b1[0x16]);
312  out0[0x10* 1] = REAL_SCALE_DCT64(tmp + b1[0x11]);
313  tmp = b1[0x19] + b1[0x1D];
314  out1[0x10* 1] = REAL_SCALE_DCT64(tmp + b1[0x11]);
315  out1[0x10* 3] = REAL_SCALE_DCT64(tmp + b1[0x15]);
316  tmp = b1[0x1D] + b1[0x1B];
317  out1[0x10* 5] = REAL_SCALE_DCT64(tmp + b1[0x15]);
318  out1[0x10* 7] = REAL_SCALE_DCT64(tmp + b1[0x13]);
319  tmp = b1[0x1B] + b1[0x1F];
320  out1[0x10* 9] = REAL_SCALE_DCT64(tmp + b1[0x13]);
321  out1[0x10*11] = REAL_SCALE_DCT64(tmp + b1[0x17]);
322  out1[0x10*13] = REAL_SCALE_DCT64(b1[0x17] + b1[0x1F]);
323  out1[0x10*15] = REAL_SCALE_DCT64(b1[0x1F]);
324  }
325 }
326 
327 /*
328  * the call via dct64 is a trick to force GCC to use
329  * (new) registers for the b1,b2 pointer to the bufs[xx] field
330  */
331 void dct64_i386(real *a,real *b,real *c)
332 {
333   real bufs[0x40];
334   dct64_1(a,b,bufs,bufs+0x20,c);
335 }
336 
337