#include <string.h>
#include "idct_cli.h"
#define IDCT_REFERENCE_SSE_C
#include "idct_ref.h"
#include "mmintrin.h"
#include <xmmintrin.h>
6 
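/*  SSE variant of the IEEE 1180 reference Inverse Discrete Cosine Transform
 *  (separable 8x1 direct matrix multiply).  The reference algorithm is
 *  described with 64-bit floating point; the code in this file evaluates it
 *  in 32-bit single precision.
 */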
10 
11 void __stdcall idct_reference_sse(short *block);
12 
/*
 * 8-point inverse-DCT basis in single precision.
 *
 * Entry [x][u] is 0.5 * C(u) * cos((2x + 1) * u * pi / 16), where
 * C(0) = 1/sqrt(2) and C(u) = 1 for u > 0, rounded to six decimal places.
 * Row x therefore holds the eight weights that turn one line of
 * coefficients into output sample x.
 */
static const float ref_dct_matrix_t[8][8] = {
    /* x = 0 */
    { 0.353553,  0.490393,  0.461940,  0.415735,
      0.353553,  0.277785,  0.191342,  0.097545 },
    /* x = 1 */
    { 0.353553,  0.415735,  0.191342, -0.097545,
     -0.353553, -0.490393, -0.461940, -0.277785 },
    /* x = 2 */
    { 0.353553,  0.277785, -0.191342, -0.490393,
     -0.353553,  0.097545,  0.461940,  0.415735 },
    /* x = 3 */
    { 0.353553,  0.097545, -0.461940, -0.277785,
      0.353553,  0.415735, -0.191342, -0.490393 },
    /* x = 4 */
    { 0.353553, -0.097545, -0.461940,  0.277785,
      0.353553, -0.415735, -0.191342,  0.490393 },
    /* x = 5 */
    { 0.353553, -0.277785, -0.191342,  0.490393,
     -0.353553, -0.097545,  0.461940, -0.415735 },
    /* x = 6 */
    { 0.353553, -0.415735,  0.191342,  0.097545,
     -0.353553,  0.490393, -0.461940,  0.277785 },
    /* x = 7 */
    { 0.353553, -0.490393,  0.461940, -0.415735,
      0.353553, -0.277785,  0.191342, -0.097545 }
};
24 
idct_reference_sse(short * block)25 void __stdcall idct_reference_sse(short *block)
26 {
27 	int i, j;
28 	short tmp_block[64];
29 	float tmp[64];
30 	float fblock[64];
31 
32 	memcpy(tmp_block,block,sizeof(short)*64);
33 
34 	__asm{
35 		pxor mm1,mm1;
36 		pxor mm3, mm3;
37 		lea edx, [tmp_block];
38 		lea edi, [fblock];
39 		movd mm0, dword ptr [edx];
40 		movd mm2, dword ptr [edx+4];
41 		pcmpgtw mm1, mm0;
42 		pcmpgtw mm3, mm2;
43 		punpcklwd mm0, mm1;
44 		punpcklwd mm2, mm3;
45 		cvtpi2ps xmm1, mm0;
46 		cvtpi2ps xmm2, mm2;
47 		movlps [edi], xmm1;
48 		movlps [edi+8], xmm2;
49 
50 		pxor mm1,mm1;
51 		pxor mm3, mm3;
52 		lea edx, [tmp_block +8];
53 		lea edi, [fblock +16];
54 		movd mm0, dword ptr [edx];
55 		movd mm2, dword ptr [edx+4];
56 		pcmpgtw mm1, mm0;
57 		pcmpgtw mm3, mm2;
58 		punpcklwd mm0, mm1;
59 		punpcklwd mm2, mm3;
60 		cvtpi2ps xmm1, mm0;
61 		cvtpi2ps xmm2, mm2;
62 		movlps [edi], xmm1;
63 		movlps [edi+8], xmm2;
64 
65 		pxor mm1,mm1;
66 		pxor mm3, mm3;
67 		lea edx, [tmp_block +16];
68 		lea edi, [fblock +32];
69 		movd mm0, dword ptr [edx];
70 		movd mm2, dword ptr [edx+4];
71 		pcmpgtw mm1, mm0;
72 		pcmpgtw mm3, mm2;
73 		punpcklwd mm0, mm1;
74 		punpcklwd mm2, mm3;
75 		cvtpi2ps xmm1, mm0;
76 		cvtpi2ps xmm2, mm2;
77 		movlps [edi], xmm1;
78 		movlps [edi+8], xmm2;
79 
80 		pxor mm1,mm1;
81 		pxor mm3, mm3;
82 		lea edx, [tmp_block +24];
83 		lea edi, [fblock +48];
84 		movd mm0, dword ptr [edx];
85 		movd mm2, dword ptr [edx+4];
86 		pcmpgtw mm1, mm0;
87 		pcmpgtw mm3, mm2;
88 		punpcklwd mm0, mm1;
89 		punpcklwd mm2, mm3;
90 		cvtpi2ps xmm1, mm0;
91 		cvtpi2ps xmm2, mm2;
92 		movlps [edi], xmm1;
93 		movlps [edi+8], xmm2;
94 
95 		pxor mm1,mm1;
96 		pxor mm3, mm3;
97 		lea edx, [tmp_block +32];
98 		lea edi, [fblock +64];
99 		movd mm0, dword ptr [edx];
100 		movd mm2, dword ptr [edx+4];
101 		pcmpgtw mm1, mm0;
102 		pcmpgtw mm3, mm2;
103 		punpcklwd mm0, mm1;
104 		punpcklwd mm2, mm3;
105 		cvtpi2ps xmm1, mm0;
106 		cvtpi2ps xmm2, mm2;
107 		movlps [edi], xmm1;
108 		movlps [edi+8], xmm2;
109 
110 		pxor mm1,mm1;
111 		pxor mm3, mm3;
112 		lea edx, [tmp_block +40];
113 		lea edi, [fblock +80];
114 		movd mm0, dword ptr [edx];
115 		movd mm2, dword ptr [edx+4];
116 		pcmpgtw mm1, mm0;
117 		pcmpgtw mm3, mm2;
118 		punpcklwd mm0, mm1;
119 		punpcklwd mm2, mm3;
120 		cvtpi2ps xmm1, mm0;
121 		cvtpi2ps xmm2, mm2;
122 		movlps [edi], xmm1;
123 		movlps [edi+8], xmm2;
124 
125 		pxor mm1,mm1;
126 		pxor mm3, mm3;
127 		lea edx, [tmp_block +48];
128 		lea edi, [fblock +96];
129 		movd mm0, dword ptr [edx];
130 		movd mm2, dword ptr [edx+4];
131 		pcmpgtw mm1, mm0;
132 		pcmpgtw mm3, mm2;
133 		punpcklwd mm0, mm1;
134 		punpcklwd mm2, mm3;
135 		cvtpi2ps xmm1, mm0;
136 		cvtpi2ps xmm2, mm2;
137 		movlps [edi], xmm1;
138 		movlps [edi+8], xmm2;
139 
140 		pxor mm1,mm1;
141 		pxor mm3, mm3;
142 		lea edx, [tmp_block +56];
143 		lea edi, [fblock +112];
144 		movd mm0, dword ptr [edx];
145 		movd mm2, dword ptr [edx+4];
146 		pcmpgtw mm1, mm0;
147 		pcmpgtw mm3, mm2;
148 		punpcklwd mm0, mm1;
149 		punpcklwd mm2, mm3;
150 		cvtpi2ps xmm1, mm0;
151 		cvtpi2ps xmm2, mm2;
152 		movlps [edi], xmm1;
153 		movlps [edi+8], xmm2;
154 
155 		pxor mm1,mm1;
156 		pxor mm3, mm3;
157 		lea edx, [tmp_block +64];
158 		lea edi, [fblock +128];
159 		movd mm0, dword ptr [edx];
160 		movd mm2, dword ptr [edx+4];
161 		pcmpgtw mm1, mm0;
162 		pcmpgtw mm3, mm2;
163 		punpcklwd mm0, mm1;
164 		punpcklwd mm2, mm3;
165 		cvtpi2ps xmm1, mm0;
166 		cvtpi2ps xmm2, mm2;
167 		movlps [edi], xmm1;
168 		movlps [edi+8], xmm2;
169 
170 		pxor mm1,mm1;
171 		pxor mm3, mm3;
172 		lea edx, [tmp_block +72];
173 		lea edi, [fblock +144];
174 		movd mm0, dword ptr [edx];
175 		movd mm2, dword ptr [edx+4];
176 		pcmpgtw mm1, mm0;
177 		pcmpgtw mm3, mm2;
178 		punpcklwd mm0, mm1;
179 		punpcklwd mm2, mm3;
180 		cvtpi2ps xmm1, mm0;
181 		cvtpi2ps xmm2, mm2;
182 		movlps [edi], xmm1;
183 		movlps [edi+8], xmm2;
184 
185 		pxor mm1,mm1;
186 		pxor mm3, mm3;
187 		lea edx, [tmp_block +80];
188 		lea edi, [fblock +160];
189 		movd mm0, dword ptr [edx];
190 		movd mm2, dword ptr [edx+4];
191 		pcmpgtw mm1, mm0;
192 		pcmpgtw mm3, mm2;
193 		punpcklwd mm0, mm1;
194 		punpcklwd mm2, mm3;
195 		cvtpi2ps xmm1, mm0;
196 		cvtpi2ps xmm2, mm2;
197 		movlps [edi], xmm1;
198 		movlps [edi+8], xmm2;
199 
200 		pxor mm1,mm1;
201 		pxor mm3, mm3;
202 		lea edx, [tmp_block +88];
203 		lea edi, [fblock +176];
204 		movd mm0, dword ptr [edx];
205 		movd mm2, dword ptr [edx+4];
206 		pcmpgtw mm1, mm0;
207 		pcmpgtw mm3, mm2;
208 		punpcklwd mm0, mm1;
209 		punpcklwd mm2, mm3;
210 		cvtpi2ps xmm1, mm0;
211 		cvtpi2ps xmm2, mm2;
212 		movlps [edi], xmm1;
213 		movlps [edi+8], xmm2;
214 
215 		pxor mm1,mm1;
216 		pxor mm3, mm3;
217 		lea edx, [tmp_block +96];
218 		lea edi, [fblock +192];
219 		movd mm0, dword ptr [edx];
220 		movd mm2, dword ptr [edx+4];
221 		pcmpgtw mm1, mm0;
222 		pcmpgtw mm3, mm2;
223 		punpcklwd mm0, mm1;
224 		punpcklwd mm2, mm3;
225 		cvtpi2ps xmm1, mm0;
226 		cvtpi2ps xmm2, mm2;
227 		movlps [edi], xmm1;
228 		movlps [edi+8], xmm2;
229 
230 		pxor mm1,mm1;
231 		pxor mm3, mm3;
232 		lea edx, [tmp_block +104];
233 		lea edi, [fblock +208];
234 		movd mm0, dword ptr [edx];
235 		movd mm2, dword ptr [edx+4];
236 		pcmpgtw mm1, mm0;
237 		pcmpgtw mm3, mm2;
238 		punpcklwd mm0, mm1;
239 		punpcklwd mm2, mm3;
240 		cvtpi2ps xmm1, mm0;
241 		cvtpi2ps xmm2, mm2;
242 		movlps [edi], xmm1;
243 		movlps [edi+8], xmm2;
244 
245 		pxor mm1,mm1;
246 		pxor mm3, mm3;
247 		lea edx, [tmp_block +112];
248 		lea edi, [fblock +224];
249 		movd mm0, dword ptr [edx];
250 		movd mm2, dword ptr [edx+4];
251 		pcmpgtw mm1, mm0;
252 		pcmpgtw mm3, mm2;
253 		punpcklwd mm0, mm1;
254 		punpcklwd mm2, mm3;
255 		cvtpi2ps xmm1, mm0;
256 		cvtpi2ps xmm2, mm2;
257 		movlps [edi], xmm1;
258 		movlps [edi+8], xmm2;
259 
260 		pxor mm1,mm1;
261 		pxor mm3, mm3;
262 		lea edx, [tmp_block +120];
263 		lea edi, [fblock +240];
264 		movd mm0, dword ptr [edx];
265 		movd mm2, dword ptr [edx+4];
266 		pcmpgtw mm1, mm0;
267 		pcmpgtw mm3, mm2;
268 		punpcklwd mm0, mm1;
269 		punpcklwd mm2, mm3;
270 		cvtpi2ps xmm1, mm0;
271 		cvtpi2ps xmm2, mm2;
272 		movlps [edi], xmm1;
273 		movlps [edi+8], xmm2;
274 	}
275 
276 	for (i=0; i<8; i++)
277 	{
278 		for (j=0; j<8; j++)
279 		{
280 			__asm{
281 				//�I�t�Z�b�g�̌v�Z�Ə������B
282 				mov eax, dword ptr [i];
283 				xorps xmm7, xmm7;
284 
285 				mov ebx, dword ptr [j];
286 				shl eax, 5;
287 
288 				shl ebx, 5;
289 				lea edx, [fblock +eax];
290 
291 				lea edi, [ref_dct_matrix_t +ebx];
292 				movups xmm1, [edx] ;
293 
294 				movups xmm2, [edi] ;
295 				movups xmm3, [edx +16] ;
296 
297 				mulps xmm1, xmm2 ;
298 				movups xmm4, [edi +16] ;
299 				;
300 				mulps xmm3, xmm4 ;
301 				;
302 				;
303 				addps xmm7, xmm1 ;
304 				;
305 				addps xmm7, xmm3 ;
306 
307 				movaps xmm1, xmm7 ;
308 				mov eax, dword ptr [i];
309 
310 				shufps xmm7, xmm1, 0x39 ;
311 				shl eax, 2;
312 
313 				addps xmm7, xmm1 ;
314 				add eax, ebx;
315 
316 				movaps xmm1, xmm7 ;
317 				lea edi, [tmp +eax];
318 
319 				shufps xmm7, xmm1,0x2 ;
320 
321 				addss xmm7, xmm1 ;
322 
323 				movss [edi], xmm7
324 			}
325 		}
326 	}
327 
328 	for (j=0; j<8; j++)
329 	{
330 		for (i=0; i<8; i++)
331 		{
332 			__asm{
333 				mov eax, dword ptr [i];
334 				xorps xmm7, xmm7;
335 
336 				shl eax, 5 ;
337 				mov ebx, dword ptr [j];
338 
339 				lea edx, [tmp +eax] ;
340 				shl ebx, 5 ;
341 
342 				movups xmm1, [edx] ;
343 				lea edi, [ref_dct_matrix_t +ebx] ;
344 
345 				movups xmm3, [edx +16] ;
346 				movups xmm2, [edi] ;
347 
348 				mulps xmm1, xmm2 ;
349 				movups xmm4, [edi +16] ;
350 
351 				mulps xmm3, xmm4 ;
352 				addps xmm7, xmm1 ;
353 
354 				;
355 				addps xmm7, xmm3 ;
356 
357 				movaps xmm1, xmm7 ;
358 
359 				shufps xmm7, xmm1, 0x39 ;
360 
361 				addps xmm7, xmm1 ;
362 
363 				movaps xmm1, xmm7 ;
364 
365 				shufps xmm7, xmm1,0x2 ;
366 
367 				addss xmm7, xmm1 ;
368 
369 				cvtss2si eax, xmm7 ;
370 
371 				lea ecx, [eax +IDCT_CLIP_TABLE_OFFSET];
372 				mov eax, dword ptr [j];
373 
374 				lea ebx, [idct_clip_table +ecx*2];
375 				mov ecx, dword ptr [i];
376 
377 				lea edx, [ecx+eax*8];
378 
379 				mov eax, dword ptr [block];
380 				mov cx, word ptr [ebx];
381 
382 				mov word ptr [eax+edx*2],cx;
383 			}
384 		}
385 	}
386 	_mm_empty();
387 }
388