1 #include <string.h>
2 #include "idct_cli.h"
3 #define IDCT_REFERENCE_SSE_C
4 #include "idct_ref.h"
5 #include "mmintrin.h"
6
7 /* Perform the IEEE 1180 reference-style Inverse Discrete Cosine Transform
8 * (separable 8x1 direct matrix multiply) using 32-bit single-precision
9 * floating point SSE arithmetic. */
10
/* Forward declaration of the in-place 8x8 inverse DCT defined later in this
 * file; `block` points to the 64 shorts that are read and then overwritten. */
11 void __stdcall idct_reference_sse(short *block);
12
/* Cosine basis table used by both passes of idct_reference_sse(): every dot
 * product computed there pairs one 8-float row of this table with eight
 * coefficients.
 *
 * The entries match
 *     ref_dct_matrix_t[x][u] = 0.5 * c(u) * cos((2*x + 1) * u * pi / 16)
 * with c(0) = 1/sqrt(2) and c(u) = 1 otherwise, rounded to six decimal
 * places. */
13 static const float ref_dct_matrix_t[8][8] =
14 {
15 {/* [0][0-7] */ 0.353553, 0.490393, 0.461940, 0.415735, 0.353553, 0.277785, 0.191342, 0.097545},
16 {/* [1][0-7] */ 0.353553, 0.415735, 0.191342, -0.097545, -0.353553, -0.490393, -0.461940, -0.277785},
17 {/* [2][0-7] */ 0.353553, 0.277785, -0.191342, -0.490393, -0.353553, 0.097545, 0.461940, 0.415735},
18 {/* [3][0-7] */ 0.353553, 0.097545, -0.461940, -0.277785, 0.353553, 0.415735, -0.191342, -0.490393},
19 {/* [4][0-7] */ 0.353553, -0.097545, -0.461940, 0.277785, 0.353553, -0.415735, -0.191342, 0.490393},
20 {/* [5][0-7] */ 0.353553, -0.277785, -0.191342, 0.490393, -0.353553, -0.097545, 0.461940, -0.415735},
21 {/* [6][0-7] */ 0.353553, -0.415735, 0.191342, 0.097545, -0.353553, 0.490393, -0.461940, 0.277785},
22 {/* [7][0-7] */ 0.353553, -0.490393, 0.461940, -0.415735, 0.353553, -0.277785, 0.191342, -0.097545}
23 };
24
/*
 * idct_reference_sse - in-place 8x8 inverse DCT computed with MMX/SSE.
 *
 * block: points to 64 shorts. All 64 are read (copied into a local array)
 *        and all 64 are overwritten with the result.
 *
 * Steps:
 *   1. Widen the 64 shorts to single-precision floats (fblock).
 *   2. First pass: dot every 8-float row of fblock with every row of
 *      ref_dct_matrix_t and store the results transposed in tmp.
 *   3. Second pass: dot every row of tmp with every row of
 *      ref_dct_matrix_t, convert the sum to an integer, map it through
 *      idct_clip_table and store the 16-bit table entry into block.
 *
 * The body consists of __asm blocks that use 32-bit x86 registers together
 * with MMX and SSE instructions. idct_clip_table and IDCT_CLIP_TABLE_OFFSET
 * are not defined in this file (presumably they come from one of the
 * project headers included at the top - confirm).
 */
idct_reference_sse(short * block)25 void __stdcall idct_reference_sse(short *block)
26 {
27 int i, j;
/* Local copy of the input; the conversion asm below reads from it. */
28 short tmp_block[64];
/* Output of the first pass, written transposed (see the first loop nest). */
29 float tmp[64];
/* The input coefficients converted to float. */
30 float fblock[64];
31
32 memcpy(tmp_block,block,sizeof(short)*64);
33
/* Step 1: short -> float conversion, unrolled into 16 identical groups.
 * Each group consumes 8 bytes of tmp_block (4 shorts) and produces 16 bytes
 * of fblock (4 floats); the constants added to tmp_block / fblock in the
 * lea instructions are therefore byte offsets, not element indices. */
34 __asm{
/* Group 0: tmp_block[0..3] -> fblock[0..3]. */
/* Zero mm1/mm3 so the compares below test "0 > value". */
35 pxor mm1,mm1;
36 pxor mm3, mm3;
37 lea edx, [tmp_block];
38 lea edi, [fblock];
/* Load two shorts (32 bits) into each of mm0 and mm2. */
39 movd mm0, dword ptr [edx];
40 movd mm2, dword ptr [edx+4];
/* mm1/mm3 become all-ones in every 16-bit lane whose short is negative. */
41 pcmpgtw mm1, mm0;
42 pcmpgtw mm3, mm2;
/* Interleave value words with the sign masks: two sign-extended int32s. */
43 punpcklwd mm0, mm1;
44 punpcklwd mm2, mm3;
/* Convert each pair of int32 to two floats in the low half of xmm1/xmm2. */
45 cvtpi2ps xmm1, mm0;
46 cvtpi2ps xmm2, mm2;
/* Store the low two floats (8 bytes) of each register. */
47 movlps [edi], xmm1;
48 movlps [edi+8], xmm2;
49
/* Group 1: tmp_block[4..7] -> fblock[4..7] (same sequence as group 0). */
50 pxor mm1,mm1;
51 pxor mm3, mm3;
52 lea edx, [tmp_block +8];
53 lea edi, [fblock +16];
54 movd mm0, dword ptr [edx];
55 movd mm2, dword ptr [edx+4];
56 pcmpgtw mm1, mm0;
57 pcmpgtw mm3, mm2;
58 punpcklwd mm0, mm1;
59 punpcklwd mm2, mm3;
60 cvtpi2ps xmm1, mm0;
61 cvtpi2ps xmm2, mm2;
62 movlps [edi], xmm1;
63 movlps [edi+8], xmm2;
64
/* Group 2: tmp_block[8..11] -> fblock[8..11]. */
65 pxor mm1,mm1;
66 pxor mm3, mm3;
67 lea edx, [tmp_block +16];
68 lea edi, [fblock +32];
69 movd mm0, dword ptr [edx];
70 movd mm2, dword ptr [edx+4];
71 pcmpgtw mm1, mm0;
72 pcmpgtw mm3, mm2;
73 punpcklwd mm0, mm1;
74 punpcklwd mm2, mm3;
75 cvtpi2ps xmm1, mm0;
76 cvtpi2ps xmm2, mm2;
77 movlps [edi], xmm1;
78 movlps [edi+8], xmm2;
79
/* Group 3: tmp_block[12..15] -> fblock[12..15]. */
80 pxor mm1,mm1;
81 pxor mm3, mm3;
82 lea edx, [tmp_block +24];
83 lea edi, [fblock +48];
84 movd mm0, dword ptr [edx];
85 movd mm2, dword ptr [edx+4];
86 pcmpgtw mm1, mm0;
87 pcmpgtw mm3, mm2;
88 punpcklwd mm0, mm1;
89 punpcklwd mm2, mm3;
90 cvtpi2ps xmm1, mm0;
91 cvtpi2ps xmm2, mm2;
92 movlps [edi], xmm1;
93 movlps [edi+8], xmm2;
94
/* Group 4: tmp_block[16..19] -> fblock[16..19]. */
95 pxor mm1,mm1;
96 pxor mm3, mm3;
97 lea edx, [tmp_block +32];
98 lea edi, [fblock +64];
99 movd mm0, dword ptr [edx];
100 movd mm2, dword ptr [edx+4];
101 pcmpgtw mm1, mm0;
102 pcmpgtw mm3, mm2;
103 punpcklwd mm0, mm1;
104 punpcklwd mm2, mm3;
105 cvtpi2ps xmm1, mm0;
106 cvtpi2ps xmm2, mm2;
107 movlps [edi], xmm1;
108 movlps [edi+8], xmm2;
109
/* Group 5: tmp_block[20..23] -> fblock[20..23]. */
110 pxor mm1,mm1;
111 pxor mm3, mm3;
112 lea edx, [tmp_block +40];
113 lea edi, [fblock +80];
114 movd mm0, dword ptr [edx];
115 movd mm2, dword ptr [edx+4];
116 pcmpgtw mm1, mm0;
117 pcmpgtw mm3, mm2;
118 punpcklwd mm0, mm1;
119 punpcklwd mm2, mm3;
120 cvtpi2ps xmm1, mm0;
121 cvtpi2ps xmm2, mm2;
122 movlps [edi], xmm1;
123 movlps [edi+8], xmm2;
124
/* Group 6: tmp_block[24..27] -> fblock[24..27]. */
125 pxor mm1,mm1;
126 pxor mm3, mm3;
127 lea edx, [tmp_block +48];
128 lea edi, [fblock +96];
129 movd mm0, dword ptr [edx];
130 movd mm2, dword ptr [edx+4];
131 pcmpgtw mm1, mm0;
132 pcmpgtw mm3, mm2;
133 punpcklwd mm0, mm1;
134 punpcklwd mm2, mm3;
135 cvtpi2ps xmm1, mm0;
136 cvtpi2ps xmm2, mm2;
137 movlps [edi], xmm1;
138 movlps [edi+8], xmm2;
139
/* Group 7: tmp_block[28..31] -> fblock[28..31]. */
140 pxor mm1,mm1;
141 pxor mm3, mm3;
142 lea edx, [tmp_block +56];
143 lea edi, [fblock +112];
144 movd mm0, dword ptr [edx];
145 movd mm2, dword ptr [edx+4];
146 pcmpgtw mm1, mm0;
147 pcmpgtw mm3, mm2;
148 punpcklwd mm0, mm1;
149 punpcklwd mm2, mm3;
150 cvtpi2ps xmm1, mm0;
151 cvtpi2ps xmm2, mm2;
152 movlps [edi], xmm1;
153 movlps [edi+8], xmm2;
154
/* Group 8: tmp_block[32..35] -> fblock[32..35]. */
155 pxor mm1,mm1;
156 pxor mm3, mm3;
157 lea edx, [tmp_block +64];
158 lea edi, [fblock +128];
159 movd mm0, dword ptr [edx];
160 movd mm2, dword ptr [edx+4];
161 pcmpgtw mm1, mm0;
162 pcmpgtw mm3, mm2;
163 punpcklwd mm0, mm1;
164 punpcklwd mm2, mm3;
165 cvtpi2ps xmm1, mm0;
166 cvtpi2ps xmm2, mm2;
167 movlps [edi], xmm1;
168 movlps [edi+8], xmm2;
169
/* Group 9: tmp_block[36..39] -> fblock[36..39]. */
170 pxor mm1,mm1;
171 pxor mm3, mm3;
172 lea edx, [tmp_block +72];
173 lea edi, [fblock +144];
174 movd mm0, dword ptr [edx];
175 movd mm2, dword ptr [edx+4];
176 pcmpgtw mm1, mm0;
177 pcmpgtw mm3, mm2;
178 punpcklwd mm0, mm1;
179 punpcklwd mm2, mm3;
180 cvtpi2ps xmm1, mm0;
181 cvtpi2ps xmm2, mm2;
182 movlps [edi], xmm1;
183 movlps [edi+8], xmm2;
184
/* Group 10: tmp_block[40..43] -> fblock[40..43]. */
185 pxor mm1,mm1;
186 pxor mm3, mm3;
187 lea edx, [tmp_block +80];
188 lea edi, [fblock +160];
189 movd mm0, dword ptr [edx];
190 movd mm2, dword ptr [edx+4];
191 pcmpgtw mm1, mm0;
192 pcmpgtw mm3, mm2;
193 punpcklwd mm0, mm1;
194 punpcklwd mm2, mm3;
195 cvtpi2ps xmm1, mm0;
196 cvtpi2ps xmm2, mm2;
197 movlps [edi], xmm1;
198 movlps [edi+8], xmm2;
199
/* Group 11: tmp_block[44..47] -> fblock[44..47]. */
200 pxor mm1,mm1;
201 pxor mm3, mm3;
202 lea edx, [tmp_block +88];
203 lea edi, [fblock +176];
204 movd mm0, dword ptr [edx];
205 movd mm2, dword ptr [edx+4];
206 pcmpgtw mm1, mm0;
207 pcmpgtw mm3, mm2;
208 punpcklwd mm0, mm1;
209 punpcklwd mm2, mm3;
210 cvtpi2ps xmm1, mm0;
211 cvtpi2ps xmm2, mm2;
212 movlps [edi], xmm1;
213 movlps [edi+8], xmm2;
214
/* Group 12: tmp_block[48..51] -> fblock[48..51]. */
215 pxor mm1,mm1;
216 pxor mm3, mm3;
217 lea edx, [tmp_block +96];
218 lea edi, [fblock +192];
219 movd mm0, dword ptr [edx];
220 movd mm2, dword ptr [edx+4];
221 pcmpgtw mm1, mm0;
222 pcmpgtw mm3, mm2;
223 punpcklwd mm0, mm1;
224 punpcklwd mm2, mm3;
225 cvtpi2ps xmm1, mm0;
226 cvtpi2ps xmm2, mm2;
227 movlps [edi], xmm1;
228 movlps [edi+8], xmm2;
229
/* Group 13: tmp_block[52..55] -> fblock[52..55]. */
230 pxor mm1,mm1;
231 pxor mm3, mm3;
232 lea edx, [tmp_block +104];
233 lea edi, [fblock +208];
234 movd mm0, dword ptr [edx];
235 movd mm2, dword ptr [edx+4];
236 pcmpgtw mm1, mm0;
237 pcmpgtw mm3, mm2;
238 punpcklwd mm0, mm1;
239 punpcklwd mm2, mm3;
240 cvtpi2ps xmm1, mm0;
241 cvtpi2ps xmm2, mm2;
242 movlps [edi], xmm1;
243 movlps [edi+8], xmm2;
244
/* Group 14: tmp_block[56..59] -> fblock[56..59]. */
245 pxor mm1,mm1;
246 pxor mm3, mm3;
247 lea edx, [tmp_block +112];
248 lea edi, [fblock +224];
249 movd mm0, dword ptr [edx];
250 movd mm2, dword ptr [edx+4];
251 pcmpgtw mm1, mm0;
252 pcmpgtw mm3, mm2;
253 punpcklwd mm0, mm1;
254 punpcklwd mm2, mm3;
255 cvtpi2ps xmm1, mm0;
256 cvtpi2ps xmm2, mm2;
257 movlps [edi], xmm1;
258 movlps [edi+8], xmm2;
259
/* Group 15: tmp_block[60..63] -> fblock[60..63]. */
260 pxor mm1,mm1;
261 pxor mm3, mm3;
262 lea edx, [tmp_block +120];
263 lea edi, [fblock +240];
264 movd mm0, dword ptr [edx];
265 movd mm2, dword ptr [edx+4];
266 pcmpgtw mm1, mm0;
267 pcmpgtw mm3, mm2;
268 punpcklwd mm0, mm1;
269 punpcklwd mm2, mm3;
270 cvtpi2ps xmm1, mm0;
271 cvtpi2ps xmm2, mm2;
272 movlps [edi], xmm1;
273 movlps [edi+8], xmm2;
274 }
275
/* Step 2 (first pass): for every pair (i, j)
 *     tmp[j*8 + i] = sum over k of fblock[i*8 + k] * ref_dct_matrix_t[j][k]
 * i.e. the result for input row i is written down column i of tmp. */
276 for (i=0; i<8; i++)
277 {
278 for (j=0; j<8; j++)
279 {
280 __asm{
281 // Compute the byte offsets and clear the accumulator (xmm7).
282 mov eax, dword ptr [i];
283 xorps xmm7, xmm7;
284
285 mov ebx, dword ptr [j];
/* eax = i * 32: byte offset of row i in fblock (8 floats per row). */
286 shl eax, 5;
287
/* ebx = j * 32: byte offset of row j in ref_dct_matrix_t. */
288 shl ebx, 5;
289 lea edx, [fblock +eax];
290
291 lea edi, [ref_dct_matrix_t +ebx];
/* xmm1/xmm3 = the two halves of fblock row i,
 * xmm2/xmm4 = the two halves of matrix row j (unaligned loads). */
292 movups xmm1, [edx] ;
293
294 movups xmm2, [edi] ;
295 movups xmm3, [edx +16] ;
296
297 mulps xmm1, xmm2 ;
298 movups xmm4, [edi +16] ;
299 ;
300 mulps xmm3, xmm4 ;
301 ;
302 ;
/* xmm7 = four partial sums (products 0..3 plus products 4..7, lane-wise). */
303 addps xmm7, xmm1 ;
304 ;
305 addps xmm7, xmm3 ;
306
/* Horizontal add of the four lanes x0..x3 of xmm7. */
307 movaps xmm1, xmm7 ;
308 mov eax, dword ptr [i];
309
/* Rotate lanes: xmm7 = [x1, x2, x3, x0]. */
310 shufps xmm7, xmm1, 0x39 ;
/* eax = i * 4: byte offset of column i inside a row of tmp. */
311 shl eax, 2;
312
/* xmm7 = [x0+x1, x1+x2, x2+x3, x3+x0]. */
313 addps xmm7, xmm1 ;
/* eax = j*32 + i*4, the byte offset of tmp[j*8 + i]. */
314 add eax, ebx;
315
316 movaps xmm1, xmm7 ;
317 lea edi, [tmp +eax];
318
/* Bring lane 2 (x2+x3) into lane 0 ... */
319 shufps xmm7, xmm1,0x2 ;
320
/* ... and add the saved lane 0 (x0+x1): lane 0 now holds the full sum. */
321 addss xmm7, xmm1 ;
322
323 movss [edi], xmm7
324 }
325 }
326 }
327
/* Step 3 (second pass): for every pair (j, i)
 *     s = sum over k of tmp[i*8 + k] * ref_dct_matrix_t[j][k]
 *     block[j*8 + i] = idct_clip_table[(int)s + IDCT_CLIP_TABLE_OFFSET]
 * where (int)s is produced by cvtss2si. */
328 for (j=0; j<8; j++)
329 {
330 for (i=0; i<8; i++)
331 {
332 __asm{
333 mov eax, dword ptr [i];
/* Clear the accumulator. */
334 xorps xmm7, xmm7;
335
/* eax = i * 32: byte offset of row i in tmp. */
336 shl eax, 5 ;
337 mov ebx, dword ptr [j];
338
339 lea edx, [tmp +eax] ;
/* ebx = j * 32: byte offset of row j in ref_dct_matrix_t. */
340 shl ebx, 5 ;
341
342 movups xmm1, [edx] ;
343 lea edi, [ref_dct_matrix_t +ebx] ;
344
345 movups xmm3, [edx +16] ;
346 movups xmm2, [edi] ;
347
348 mulps xmm1, xmm2 ;
349 movups xmm4, [edi +16] ;
350
351 mulps xmm3, xmm4 ;
352 addps xmm7, xmm1 ;
353
354 ;
355 addps xmm7, xmm3 ;
356
/* Same four-lane horizontal add as in the first pass. */
357 movaps xmm1, xmm7 ;
358
359 shufps xmm7, xmm1, 0x39 ;
360
361 addps xmm7, xmm1 ;
362
363 movaps xmm1, xmm7 ;
364
365 shufps xmm7, xmm1,0x2 ;
366
367 addss xmm7, xmm1 ;
368
/* Convert the sum to a 32-bit integer; cvtss2si rounds according to the
 * current MXCSR rounding mode. */
369 cvtss2si eax, xmm7 ;
370
/* ecx = table index. NOTE(review): there is no range check on the value
 * before the lookup; confirm idct_clip_table covers every reachable index. */
371 lea ecx, [eax +IDCT_CLIP_TABLE_OFFSET];
372 mov eax, dword ptr [j];
373
/* ebx = address of the 16-bit table entry (index scaled by 2 bytes). */
374 lea ebx, [idct_clip_table +ecx*2];
375 mov ecx, dword ptr [i];
376
/* edx = j*8 + i, the element index of the output sample. */
377 lea edx, [ecx+eax*8];
378
379 mov eax, dword ptr [block];
380 mov cx, word ptr [ebx];
381
/* block[j*8 + i] = looked-up table entry. */
382 mov word ptr [eax+edx*2],cx;
383 }
384 }
385 }
/* Clear the MMX state left behind by the conversion block (emits EMMS). */
386 _mm_empty();
387 }
388