

#if defined(i386) && defined(USE_MMX)


/*
 * the input data is transposed and each 16 bit element in the 8x8 matrix
 * is left aligned:
 * for example in 11...1110000 format
 * If the iDCT is of an I macroblock then 0.5 needs to be added to the DC
 * component (element[0][0] of the matrix)
 */

/* extrn re_matrix */

16.data
17	.align 16
18	.type	 preSC,@object
19preSC:  .short  16384,22725,21407,19266,16384,12873,8867,4520
20        .short  22725,31521,29692,26722,22725,17855,12299,6270
21        .short  21407,29692,27969,25172,21407,16819,11585,5906
22        .short  19266,26722,25172,22654,19266,15137,10426,5315
23        .short  16384,22725,21407,19266,16384,12873,8867,4520
24        .short  12873,17855,16819,15137,25746,20228,13933,7103
25        .short  17734,24598,23170,20853,17734,13933,9597,4892
26        .short  18081,25080,23624,21261,18081,14206,9785,4988
27	.size	 preSC,128
28	.align 8
29	.type	x0005000200010001,@object
30	.size	x0005000200010001,8
31x0005000200010001:
32	.long	0x00010001,0x00050002
33	.align 8
34	.type	x0040000000000000,@object
35	.size	x0040000000000000,8
36x0040000000000000:
37	.long	0, 0x00400000
38	.align 8
39	.type	x5a825a825a825a82,@object
40	.size	x5a825a825a825a82,8
41x5a825a825a825a82:
42	.long	0x5a825a82, 0x5a825a82
43	.align 8
44	.type	x539f539f539f539f,@object
45	.size	x539f539f539f539f,8
46x539f539f539f539f:
47	.long	0x539f539f,0x539f539f
48	.align 8
49	.type	x4546454645464546,@object
50	.size	x4546454645464546,8
51x4546454645464546:
52	.long	0x45464546,0x45464546
53	.align 8
54	.type	x61f861f861f861f8,@object
55	.size	x61f861f861f861f8,8
56x61f861f861f861f8:
57	.long	0x61f861f8,0x61f861f8
58	.align 8
59	.type	 scratch1,@object
60	.size	 scratch1,8
61scratch1:
62	.long 0,0
63	.align 8
64	.type	 scratch3,@object
65	.size	 scratch3,8
66scratch3:
67	.long 0,0
68	.align 8
69	.type	 scratch5,@object
70	.size	 scratch5,8
71scratch5:
72	.long 0,0
73	.align 8
74	.type	 scratch7,@object
75	.size	 scratch7,8
76scratch7:
77	.long 0,0
78	.type	 x0,@object
79	.size	 x0,8
80x0:
81	.long 0,0
82	.align 8
83.text
84	.align 4
85.globl IDCT_mmx
86	.type	 IDCT_mmx,@function
87IDCT_mmx:
88	pushl %ebp
89	movl %esp,%ebp
90	pushl %ebx
91	pushl %ecx
92	pushl %edx
93	pushl %esi
94	pushl %edi
95	movl 8(%ebp),%esi		/* source matrix */
96	leal preSC, %ecx
97/* column 0: even part
98 * use V4, V12, V0, V8 to produce V22..V25
99 */
100	movq 8*12(%ecx), %mm0	/* maybe the first mul can be done together */
101				/* with the dequantization in iHuff module */
102	pmulhw 8*12(%esi), %mm0		/* V12 */
103	movq 8*4(%ecx), %mm1
104	pmulhw 8*4(%esi), %mm1		/* V4 */
105	movq (%ecx), %mm3
106	psraw $1, %mm0			/* t64=t66 */
107	pmulhw (%esi), %mm3		/* V0 */
108	movq 8*8(%ecx), %mm5		/* duplicate V4 */
109	movq %mm1, %mm2			/* added 11/1/96 */
110	pmulhw 8*8(%esi),%mm5		/* V8 */
111	psubsw %mm0, %mm1		/* V16 */
112	pmulhw x5a825a825a825a82, %mm1	/* 23170 ->V18 */
113	paddsw %mm0, %mm2		/* V17 */
114	movq %mm2, %mm0			/* duplicate V17 */
115	psraw $1, %mm2			/* t75=t82 */
116	psraw $2, %mm0			/* t72 */
117	movq %mm3, %mm4			/* duplicate V0 */
118	paddsw %mm5, %mm3		/* V19 */
119	psubsw %mm5, %mm4		/* V20 ;mm5 free */
120/* moved from the block below */
121	movq 8*10(%ecx), %mm7
122	psraw $1, %mm3			/* t74=t81 */
123	movq %mm3, %mm6			/* duplicate t74=t81 */
124	psraw $2, %mm4			/* t77=t79 */
125	psubsw %mm0, %mm1		/* V21 ; mm0 free */
126	paddsw %mm2, %mm3		/* V22 */
127	movq %mm1, %mm5			/* duplicate V21 */
128	paddsw %mm4, %mm1		/* V23 */
129	movq %mm3, 8*4(%esi)		/* V22 */
130	psubsw %mm5, %mm4		/* V24; mm5 free */
131	movq %mm1, 8*12(%esi)		/* V23 */
132	psubsw %mm2, %mm6		/* V25; mm2 free */
133	movq %mm4, (%esi)		/* V24 */
134/* keep mm6 alive all along the next block */
135	/* movq %mm6, 8*8(%esi) 	V25 */
136/* column 0: odd part
137 * use V2, V6, V10, V14 to produce V31, V39, V40, V41
138 */
139/* moved above: movq 8*10(%ecx), %mm7 */
140
141	pmulhw 8*10(%esi), %mm7		/* V10 */
142	movq 8*6(%ecx), %mm0
143	pmulhw 8*6(%esi), %mm0		/* V6 */
144	movq 8*2(%ecx), %mm5
145	movq %mm7, %mm3			/* duplicate V10 */
146	pmulhw 8*2(%esi), %mm5		/* V2 */
147	movq 8*14(%ecx), %mm4
148	psubsw %mm0, %mm7		/* V26 */
149	pmulhw 8*14(%esi), %mm4		/* V14 */
150	paddsw %mm0, %mm3		/* V29 ; free mm0 */
151	movq %mm7, %mm1			/* duplicate V26 */
152	psraw $1, %mm3			/* t91=t94 */
153	pmulhw x539f539f539f539f,%mm7	/* V33 */
154	psraw $1, %mm1			/* t96 */
155	movq %mm5, %mm0			/* duplicate V2 */
156	psraw $2, %mm4			/* t85=t87 */
157	paddsw %mm4,%mm5		/* V27 */
158	psubsw %mm4, %mm0		/* V28 ; free mm4 */
159	movq %mm0, %mm2			/* duplicate V28 */
160	psraw $1, %mm5			/* t90=t93 */
161	pmulhw x4546454645464546,%mm0	/* V35 */
162	psraw $1, %mm2			/* t97 */
163	movq %mm5, %mm4			/* duplicate t90=t93 */
164	psubsw %mm2, %mm1		/* V32 ; free mm2 */
165	pmulhw x61f861f861f861f8,%mm1	/* V36 */
166	psllw $1, %mm7			/* t107 */
167	paddsw %mm3, %mm5		/* V31 */
168	psubsw %mm3, %mm4		/* V30 ; free mm3 */
169	pmulhw x5a825a825a825a82,%mm4	/* V34 */
170	nop
171	psubsw %mm1, %mm0		/* V38 */
172	psubsw %mm7, %mm1		/* V37 ; free mm7 */
173	psllw $1, %mm1			/* t114 */
174/* move from the next block */
175	movq %mm6, %mm3			/* duplicate V25 */
176/* move from the next block */
177	movq 8*4(%esi), %mm7		/* V22 */
178	psllw $1, %mm0			/* t110 */
179	psubsw %mm5, %mm0		/* V39 (mm5 needed for next block) */
180	psllw $2, %mm4			/* t112 */
181/* moved from the next block */
182	movq 8*12(%esi), %mm2		/* V23 */
183	psubsw %mm0, %mm4		/* V40 */
184	paddsw %mm4, %mm1		/* V41; free mm0 */
185/* moved from the next block */
186	psllw $1, %mm2			/* t117=t125 */
187/* column 0: output butterfly */
188/* moved above:
189 * movq %mm6, %mm3			duplicate V25
190 * movq 8*4(%esi), %mm7			V22
191 * movq 8*12(%esi), %mm2		V23
192 * psllw $1, %mm2			t117=t125
193 */
194	psubsw %mm1, %mm6		/* tm6 */
195	paddsw %mm1, %mm3		/* tm8; free mm1 */
196	movq %mm7, %mm1			/* duplicate V22 */
197	paddsw %mm5, %mm7		/* tm0 */
198	movq %mm3, 8*8(%esi)		/* tm8; free mm3 */
199	psubsw %mm5, %mm1		/* tm14; free mm5 */
200	movq %mm6, 8*6(%esi)		/* tm6; free mm6 */
201	movq %mm2, %mm3			/* duplicate t117=t125 */
202	movq (%esi), %mm6		/* V24 */
203	paddsw %mm0, %mm2		/* tm2 */
204	movq %mm7, (%esi)		/* tm0; free mm7 */
205	psubsw %mm0, %mm3		/* tm12; free mm0 */
206	movq %mm1, 8*14(%esi)		/* tm14; free mm1 */
207	psllw $1, %mm6			/* t119=t123 */
208	movq %mm2, 8*2(%esi)		/* tm2; free mm2 */
209	movq %mm6, %mm0			/* duplicate t119=t123 */
210	movq %mm3, 8*12(%esi)		/* tm12; free mm3 */
211	paddsw %mm4, %mm6		/* tm4 */
212/* moved from next block */
213	movq 8*5(%ecx), %mm1
214	psubsw %mm4, %mm0		/* tm10; free mm4 */
215/* moved from next block */
216	pmulhw 8*5(%esi), %mm1		/* V5 */
217	movq %mm6, 8*4(%esi)		/* tm4; free mm6 */
218	movq %mm0, 8*10(%esi)		/* tm10; free mm0 */
219/* column 1: even part
220 * use V5, V13, V1, V9 to produce V56..V59
221 */
222/* moved to prev block:
223 *	movq 8*5(%ecx), %mm1
224 *	pmulhw 8*5(%esi), %mm1		 V5
225 */
226	movq 8*13(%ecx), %mm7
227	psllw $1, %mm1			/* t128=t130 */
228	pmulhw 8*13(%esi), %mm7		/* V13 */
229	movq %mm1, %mm2			/* duplicate t128=t130 */
230	movq 8(%ecx), %mm3
231	pmulhw 8(%esi), %mm3		/* V1 */
232	movq 8*9(%ecx), %mm5
233	psubsw %mm7, %mm1		/* V50 */
234	pmulhw 8*9(%esi), %mm5		/* V9 */
235	paddsw %mm7, %mm2		/* V51 */
236	pmulhw x5a825a825a825a82, %mm1	/* 23170 ->V52 */
237	movq %mm2, %mm6			/* duplicate V51 */
238	psraw $1, %mm2			/* t138=t144 */
239	movq %mm3, %mm4			/* duplicate V1 */
240	psraw $2, %mm6			/* t136 */
241	paddsw %mm5, %mm3		/* V53 */
242	psubsw %mm5, %mm4		/* V54 ;mm5 free */
243	movq %mm3, %mm7			/* duplicate V53 */
244/* moved from next block */
245	movq 8*11(%ecx), %mm0
246	psraw $1, %mm4			/* t140=t142 */
247	psubsw %mm6, %mm1		/* V55 ; mm6 free */
248	paddsw %mm2, %mm3		/* V56 */
249	movq %mm4, %mm5			/* duplicate t140=t142 */
250	paddsw %mm1, %mm4		/* V57 */
251	movq %mm3, 8*5(%esi)		/* V56 */
252	psubsw %mm1, %mm5		/* V58; mm1 free */
253	movq %mm4, 8*13(%esi)		/* V57 */
254	psubsw %mm2, %mm7		/* V59; mm2 free */
255	movq %mm5, 8*9(%esi)		/* V58 */
256/* keep mm7 alive all along the next block
257 *	movq %mm7, 8(%esi)		V59
258 * moved above
259 *	movq 8*11(%ecx), %mm0
260 */
261	pmulhw 8*11(%esi), %mm0		/* V11 */
262	movq 8*7(%ecx), %mm6
263	pmulhw 8*7(%esi), %mm6		/* V7 */
264	movq 8*15(%ecx), %mm4
265	movq %mm0, %mm3			/* duplicate V11 */
266	pmulhw 8*15(%esi), %mm4		/* V15 */
267	movq 8*3(%ecx), %mm5
268	psllw $1, %mm6			/* t146=t152 */
269	pmulhw 8*3(%esi), %mm5		/* V3 */
270	paddsw %mm6, %mm0		/* V63 */
271/* note that V15 computation has a correction step:
272 * this is a 'magic' constant that rebiases the results to be closer to the
273 * expected result.  this magic constant can be refined to reduce the error
274 * even more by doing the correction step in a later stage when the number
275 * is actually multiplied by 16
276 */
277	paddw x0005000200010001, %mm4
278	psubsw %mm6, %mm3		/* V60 ; free mm6 */
279	psraw $1, %mm0			/* t154=t156 */
280	movq %mm3, %mm1			/* duplicate V60 */
281	pmulhw x539f539f539f539f, %mm1	/* V67 */
282	movq %mm5, %mm6			/* duplicate V3 */
283	psraw $2, %mm4			/* t148=t150 */
284	paddsw %mm4, %mm5		/* V61 */
285	psubsw %mm4, %mm6		/* V62 ; free mm4 */
286	movq %mm5, %mm4			/* duplicate V61 */
287	psllw $1, %mm1			/* t169 */
288	paddsw %mm0, %mm5		/* V65 -> result */
289	psubsw %mm0, %mm4		/* V64 ; free mm0 */
290	pmulhw x5a825a825a825a82, %mm4	/* V68 */
291	psraw $1, %mm3			/* t158 */
292	psubsw %mm6, %mm3		/* V66 */
293	movq %mm5, %mm2			/* duplicate V65 */
294	pmulhw x61f861f861f861f8, %mm3	/* V70 */
295	psllw $1, %mm6			/* t165 */
296	pmulhw x4546454645464546, %mm6	/* V69 */
297	psraw $1, %mm2			/* t172 */
298/* moved from next block */
299	movq 8*5(%esi), %mm0		/* V56 */
300	psllw $1, %mm4			/* t174 */
301/* moved from next block */
302	psraw $1, %mm0			/* t177=t188 */
303	nop
304	psubsw %mm3, %mm6		/* V72 */
305	psubsw %mm1, %mm3		/* V71 ; free mm1 */
306	psubsw %mm2, %mm6		/* V73 ; free mm2 */
307/* moved from next block */
308	psraw $1, %mm5			/* t178=t189 */
309	psubsw %mm6, %mm4		/* V74 */
310/* moved from next block */
311	movq %mm0, %mm1			/* duplicate t177=t188 */
312	paddsw %mm4, %mm3		/* V75 */
313/* moved from next block */
314	paddsw %mm5, %mm0		/* tm1 */
315/* location
316 *  5 - V56
317 * 13 - V57
318 *  9 - V58
319 *  X - V59, mm7
320 *  X - V65, mm5
321 *  X - V73, mm6
322 *  X - V74, mm4
323 *  X - V75, mm3
324 * free mm0, mm1 & mm2
325 * moved above
326 *	movq 8*5(%esi), %mm0		V56
327 *	psllw $1, %mm0			t177=t188 ! new !!
328 *	psllw $1, %mm5			t178=t189 ! new !!
329 *	movq %mm0, %mm1			duplicate t177=t188
330 *	paddsw %mm5, %mm0		tm1
331 */
332	movq 8*13(%esi), %mm2		/* V57 */
333	psubsw %mm5, %mm1		/* tm15; free mm5 */
334	movq %mm0, 8(%esi)		/* tm1; free mm0 */
335	psraw $1, %mm7			/* t182=t184 ! new !! */
336/* save the store as used directly in the transpose
337 *	movq %mm1, 120(%esi)		tm15; free mm1
338 */
339	movq %mm7, %mm5			/* duplicate t182=t184 */
340	psubsw %mm3, %mm7		/* tm7 */
341	paddsw %mm3, %mm5		/* tm9; free mm3 */
342	movq 8*9(%esi), %mm0		/* V58 */
343	movq %mm2, %mm3			/* duplicate V57 */
344	movq %mm7, 8*7(%esi)		/* tm7; free mm7 */
345	psubsw %mm6, %mm3		/* tm13 */
346	paddsw %mm6, %mm2		/* tm3 ; free mm6 */
347/* moved up from the transpose */
348	movq %mm3, %mm7
349/* moved up from the transpose */
350	punpcklwd %mm1, %mm3
351	movq %mm0, %mm6			/* duplicate V58 */
352	movq %mm2, 8*3(%esi)		/* tm3; free mm2 */
353	paddsw %mm4, %mm0		/* tm5 */
354	psubsw %mm4, %mm6		/* tm11; free mm4 */
355/* moved up from the transpose */
356	punpckhwd %mm1, %mm7
357	movq %mm0, 8*5(%esi)		/* tm5; free mm0 */
358/* moved up from the transpose */
359	movq %mm5, %mm2
360/* transpose - M4 part
361 *  ---------       ---------
362 * | M1 | M2 |     | M1'| M3'|
363 *  ---------  -->  ---------
364 * | M3 | M4 |     | M2'| M4'|
365 *  ---------       ---------
366 * Two alternatives: use full mmword approach so the following code can be
367 * scheduled before the transpose is done without stores, or use the faster
368 * half mmword stores (when possible)
369 */
370	movd %mm3, 8*9+4(%esi)		/* MS part of tmt9 */
371	punpcklwd %mm6, %mm5
372	movd %mm7, 8*13+4(%esi)		/* MS part of tmt13 */
373	punpckhwd %mm6, %mm2
374	movd %mm5, 8*9(%esi)		/* LS part of tmt9 */
375	punpckhdq %mm3, %mm5		/* free mm3 */
376	movd %mm2, 8*13(%esi)		/* LS part of tmt13 */
377	punpckhdq %mm7, %mm2		/* free mm7 */
378/* moved up from the M3 transpose */
379	movq 8*8(%esi), %mm0
380/* moved up from the M3 transpose */
381	movq 8*10(%esi), %mm1
382/* moved up from the M3 transpose */
383	movq %mm0, %mm3
384/* shuffle the rest of the data, and write it with 2 mmword writes */
385	movq %mm5, 8*11(%esi)		/* tmt11 */
386/* moved up from the M3 transpose */
387	punpcklwd %mm1, %mm0
388	movq %mm2, 8*15(%esi)		/* tmt15 */
389/* moved up from the M3 transpose */
390	punpckhwd %mm1, %mm3
391/* transpose - M3 part
392 * moved up to previous code section
393 *	movq 8*8(%esi), %mm0
394 *	movq 8*10(%esi), %mm1
395 *	movq %mm0, %mm3
396 *	punpcklwd %mm1, %mm0
397 *	punpckhwd %mm1, %mm3
398 */
399	movq 8*12(%esi), %mm6
400	movq 8*14(%esi), %mm4
401	movq %mm6, %mm2
402/* shuffle the data and write the lower parts of the transposed in 4 dwords */
403	punpcklwd %mm4, %mm6
404	movq %mm0, %mm1
405	punpckhdq %mm6, %mm1
406	movq %mm3, %mm7
407	punpckhwd %mm4, %mm2		/* free mm4 */
408	punpckldq %mm6, %mm0		/* free mm6 */
409/* moved from next block */
410	movq 8*13(%esi), %mm4		/* tmt13 */
411	punpckldq %mm2, %mm3
412	punpckhdq %mm2, %mm7		/* free mm2 */
413/* moved from next block */
414	movq %mm3, %mm5			/* duplicate tmt5 */
415/* column 1: even part (after transpose)
416* moved above
417*	movq %mm3, %mm5			duplicate tmt5
418*	movq 8*13(%esi), %mm4		tmt13
419*/
420	psubsw %mm4, %mm3		/* V134 */
421	pmulhw x5a825a825a825a82, %mm3	/* 23170 ->V136 */
422	movq 8*9(%esi), %mm6		/* tmt9 */
423	paddsw %mm4, %mm5		/* V135 ; mm4 free */
424	movq %mm0, %mm4			/* duplicate tmt1 */
425	paddsw %mm6, %mm0		/* V137 */
426	psubsw %mm6, %mm4		/* V138 ; mm6 free */
427	psllw $2, %mm3			/* t290 */
428	psubsw %mm5, %mm3		/* V139 */
429	movq %mm0, %mm6			/* duplicate V137 */
430	paddsw %mm5, %mm0		/* V140 */
431	movq %mm4, %mm2			/* duplicate V138 */
432	paddsw %mm3, %mm2		/* V141 */
433	psubsw %mm3, %mm4		/* V142 ; mm3 free */
434	movq %mm0, 8*9(%esi)		/* V140 */
435	psubsw %mm5, %mm6		/* V143 ; mm5 free */
436/* moved from next block */
437	movq 8*11(%esi), %mm0		/* tmt11 */
438	movq %mm2, 8*13(%esi)		/* V141 */
439/* moved from next block */
440	movq %mm0, %mm2			/* duplicate tmt11 */
441/* column 1: odd part (after transpose) */
442/* moved up to the prev block
443 *	movq 8*11(%esi), %mm0		tmt11
444 *	movq %mm0, %mm2			duplicate tmt11
445 */
446	movq 8*15(%esi), %mm5		/* tmt15 */
447	psubsw %mm7, %mm0		/* V144 */
448	movq %mm0, %mm3			/* duplicate V144 */
449	paddsw %mm7, %mm2		/* V147 ; free mm7 */
450	pmulhw x539f539f539f539f, %mm0	/* 21407-> V151 */
451	movq %mm1, %mm7			/* duplicate tmt3 */
452	paddsw %mm5, %mm7		/* V145 */
453	psubsw %mm5, %mm1		/* V146 ; free mm5 */
454	psubsw %mm1, %mm3		/* V150 */
455	movq %mm7, %mm5			/* duplicate V145 */
456	pmulhw x4546454645464546, %mm1	/* 17734-> V153 */
457	psubsw %mm2, %mm5		/* V148 */
458	pmulhw x61f861f861f861f8, %mm3	/* 25080-> V154 */
459	psllw $2, %mm0			/* t311 */
460	pmulhw x5a825a825a825a82, %mm5	/* 23170-> V152 */
461	paddsw %mm2, %mm7		/* V149 ; free mm2 */
462	psllw $1, %mm1			/* t313 */
463	nop	/* without the nop - freeze here for one clock */
464	movq %mm3, %mm2			/* duplicate V154 */
465	psubsw %mm0, %mm3		/* V155 ; free mm0 */
466	psubsw %mm2, %mm1		/* V156 ; free mm2 */
467/* moved from the next block */
468	movq %mm6, %mm2			/* duplicate V143 */
469/* moved from the next block */
470	movq 8*13(%esi), %mm0		/* V141 */
471	psllw $1, %mm1			/* t315 */
472	psubsw %mm7, %mm1		/* V157 (keep V149) */
473	psllw $2, %mm5			/* t317 */
474	psubsw %mm1, %mm5		/* V158 */
475	psllw $1, %mm3			/* t319 */
476	paddsw %mm5, %mm3		/* V159 */
477/* column 1: output butterfly (after transform)
478 * moved to the prev block
479 *	movq %mm6, %mm2			duplicate V143
480 *	movq 8*13(%esi), %mm0		V141
481 */
482	psubsw %mm3, %mm2		/* V163 */
483	paddsw %mm3, %mm6		/* V164 ; free mm3 */
484	movq %mm4, %mm3			/* duplicate V142 */
485	psubsw %mm5, %mm4		/* V165 ; free mm5 */
486	movq %mm2, scratch7		/* out7 */
487	psraw $4, %mm6
488	psraw $4, %mm4
489	paddsw %mm5, %mm3		/* V162 */
490	movq 8*9(%esi), %mm2		/* V140 */
491	movq %mm0, %mm5			/* duplicate V141 */
492/* in order not to perculate this line up,
493 * we read 72(%esi) very near to this location
494 */
495	movq %mm6, 8*9(%esi)		/* out9 */
496	paddsw %mm1, %mm0		/* V161 */
497	movq %mm3, scratch5		/* out5 */
498	psubsw %mm1, %mm5		/* V166 ; free mm1 */
499	movq %mm4, 8*11(%esi)		/* out11 */
500	psraw $4, %mm5
501	movq %mm0, scratch3		/* out3 */
502	movq %mm2, %mm4			/* duplicate V140 */
503	movq %mm5, 8*13(%esi)		/* out13 */
504	paddsw %mm7, %mm2		/* V160 */
505/* moved from the next block */
506	movq 8(%esi), %mm0
507	psubsw %mm7, %mm4		/* V167 ; free mm7 */
508/* moved from the next block */
509	movq 8*3(%esi), %mm7
510	psraw $4, %mm4
511	movq %mm2, scratch1		/* out1 */
512/* moved from the next block */
513	movq %mm0, %mm1
514	movq %mm4, 8*15(%esi)		/* out15 */
515/* moved from the next block */
516	punpcklwd %mm7, %mm0
517/* transpose - M2 parts
518 * moved up to the prev block
519 *	movq 8(%esi), %mm0
520 *	movq 8*3(%esi), %mm7
521 *	movq %mm0, %mm1
522 *	punpcklwd %mm7, %mm0
523 */
524	movq 8*5(%esi), %mm5
525	punpckhwd %mm7, %mm1
526	movq 8*7(%esi), %mm4
527	movq %mm5, %mm3
528/* shuffle the data and write the lower parts of the trasposed in 4 dwords */
529	movd %mm0, 8*8(%esi)		/* LS part of tmt8 */
530	punpcklwd %mm4, %mm5
531	movd %mm1, 8*12(%esi)		/* LS part of tmt12 */
532	punpckhwd %mm4, %mm3
533	movd %mm5, 8*8+4(%esi)		/* MS part of tmt8 */
534	punpckhdq %mm5, %mm0		/* tmt10 */
535	movd %mm3, 8*12+4(%esi)		/* MS part of tmt12 */
536	punpckhdq %mm3, %mm1		/* tmt14 */
537/* transpose - M1 parts */
538	movq (%esi), %mm7
539	movq 8*2(%esi), %mm2
540	movq %mm7, %mm6
541	movq 8*4(%esi), %mm5
542	punpcklwd %mm2, %mm7
543	movq 8*6(%esi), %mm4
544	punpckhwd %mm2, %mm6		/* free mm2 */
545	movq %mm5, %mm3
546	punpcklwd %mm4, %mm5
547	punpckhwd %mm4, %mm3		/* free mm4 */
548	movq %mm7, %mm2
549	movq %mm6, %mm4
550	punpckldq %mm5, %mm7		/* tmt0 */
551	punpckhdq %mm5, %mm2		/* tmt2 ; free mm5 */
552/* shuffle the rest of the data, and write it with 2 mmword writes */
553	punpckldq %mm3, %mm6		/* tmt4 */
554/* moved from next block */
555	movq %mm2, %mm5			/* duplicate tmt2 */
556	punpckhdq %mm3, %mm4		/* tmt6 ; free mm3 */
557/* moved from next block */
558	movq %mm0, %mm3			/* duplicate tmt10 */
559/* column 0: odd part (after transpose)
560 *moved up to prev block
561 *	movq %mm0, %mm3			duplicate tmt10
562 *	movq %mm2, %mm5			duplicate tmt2
563 */
564	psubsw %mm4, %mm0		/* V110 */
565	paddsw %mm4, %mm3		/* V113 ; free mm4 */
566	movq %mm0, %mm4			/* duplicate V110 */
567	paddsw %mm1, %mm2		/* V111 */
568	pmulhw x539f539f539f539f, %mm0	/* 21407-> V117 */
569	psubsw %mm1, %mm5		/* V112 ; free mm1 */
570	psubsw %mm5, %mm4		/* V116 */
571	movq %mm2, %mm1			/* duplicate V111 */
572	pmulhw x4546454645464546, %mm5	/* 17734-> V119 */
573	psubsw %mm3, %mm2		/* V114 */
574	pmulhw x61f861f861f861f8, %mm4	/* 25080-> V120 */
575	paddsw %mm3, %mm1		/* V115 ; free mm3 */
576	pmulhw x5a825a825a825a82, %mm2	/* 23170-> V118 */
577	psllw $2, %mm0			/* t266 */
578	movq %mm1, (%esi)		/* save V115 */
579	psllw $1, %mm5			/* t268 */
580	psubsw %mm4, %mm5		/* V122 */
581	psubsw %mm0, %mm4		/* V121 ; free mm0 */
582	psllw $1, %mm5			/* t270 */
583	psubsw %mm1, %mm5		/* V123 ; free mm1 */
584	psllw $2, %mm2			/* t272 */
585	psubsw %mm5, %mm2		/* V124 (keep V123) */
586	psllw $1, %mm4			/* t274 */
587	movq %mm5, 8*2(%esi)		/* save V123 ; free mm5 */
588	paddsw %mm2, %mm4		/* V125 (keep V124) */
589/* column 0: even part (after transpose) */
590	movq 8*12(%esi), %mm0		/* tmt12 */
591	movq %mm6, %mm3			/* duplicate tmt4 */
592	psubsw %mm0, %mm6		/* V100 */
593	paddsw %mm0, %mm3		/* V101 ; free mm0 */
594	pmulhw x5a825a825a825a82, %mm6	/* 23170 ->V102 */
595	movq %mm7, %mm5			/* duplicate tmt0 */
596	movq 8*8(%esi), %mm1		/* tmt8 */
597	paddsw %mm1, %mm7		/* V103 */
598	psubsw %mm1, %mm5		/* V104 ; free mm1 */
599	movq %mm7, %mm0			/* duplicate V103 */
600	psllw $2, %mm6			/* t245 */
601	paddsw %mm3, %mm7		/* V106 */
602	movq %mm5, %mm1			/* duplicate V104 */
603	psubsw %mm3, %mm6		/* V105 */
604	psubsw %mm3, %mm0		/* V109; free mm3 */
605	paddsw %mm6, %mm5		/* V107 */
606	psubsw %mm6, %mm1		/* V108 ; free mm6 */
607/* column 0: output butterfly (after transform) */
608	movq %mm1, %mm3			/* duplicate V108 */
609	paddsw %mm2, %mm1		/* out4 */
610	psraw $4, %mm1
611	psubsw %mm2, %mm3		/* out10 ; free mm2 */
612	psraw $4, %mm3
613	movq %mm0, %mm6			/* duplicate V109 */
614	movq %mm1, 8*4(%esi)		/* out4 ; free mm1 */
615	psubsw %mm4, %mm0		/* out6 */
616	movq %mm3, 8*10(%esi)		/* out10 ; free mm3 */
617	psraw $4, %mm0
618	paddsw %mm4, %mm6		/* out8 ; free mm4 */
619	movq %mm7, %mm1			/* duplicate V106 */
620	movq %mm0, 8*6(%esi)		/* out6 ; free mm0 */
621	psraw $4, %mm6
622	movq (%esi), %mm4		/* V115 */
623	movq %mm6, 8*8(%esi)		/* out8 ; free mm6 */
624	movq %mm5, %mm2			/* duplicate V107 */
625	movq 8*2(%esi), %mm3		/* V123 */
626	paddsw %mm4, %mm7		/* out0 */
627/* moved up from next block */
628	movq scratch3, %mm0
629	psraw $4, %mm7
630/* moved up from next block */
631	movq scratch5, %mm6
632	psubsw %mm4, %mm1		/* out14 ; free mm4 */
633	paddsw %mm3, %mm5		/* out2 */
634	psraw $4, %mm1
635	movq %mm7, (%esi)		/* out0 ; free mm7 */
636	psraw $4, %mm5
637	movq %mm1, 8*14(%esi)		/* out14 ; free mm1 */
638	psubsw %mm3, %mm2		/* out12 ; free mm3 */
639	movq %mm5, 8*2(%esi)		/* out2 ; free mm5 */
640	psraw $4, %mm2
641/* moved up to the prev block */
642	movq scratch7, %mm4
643/* moved up to the prev block */
644	psraw $4, %mm0
645	movq %mm2, 8*12(%esi)		/* out12 ; free mm2 */
646/* moved up to the prev block */
647	psraw $4, %mm6
648/* move back the data to its correct place
649* moved up to the prev block
650 *	movq scratch3, %mm0
651 *	movq scratch5, %mm6
652 *	movq scratch7, %mm4
653 *	psraw $4, %mm0
654 *	psraw $4, %mm6
655*/
656	movq scratch1, %mm1
657	psraw $4, %mm4
658	movq %mm0, 8*3(%esi)		/* out3 */
659	psraw $4, %mm1
660	movq %mm6, 8*5(%esi)		/* out5 */
661	movq %mm4, 8*7(%esi)		/* out7 */
662	movq %mm1, 8(%esi)		/* out1 */
663	popl %edi
664	popl %esi
665	popl %edx
666	popl %ecx
667	popl %ebx
668	movl %ebp,%esp
669	popl %ebp
670	ret
671.Lfe1:
672	.size	 IDCT_mmx,.Lfe1-IDCT_mmx


#endif /* i386 && USE_MMX */
