1; XVID MPEG-4 VIDEO CODEC
2;
3; Conversion from gcc syntax to x264asm syntax with modifications
4; by Christophe Gisquet <christophe.gisquet@gmail.com>
5;
6; ===========     SSE2 inverse discrete cosine transform     ===========
7;
8; Copyright(C) 2003 Pascal Massimino <skal@planet-d.net>
9;
10; Conversion to gcc syntax with modifications
11; by Alexander Strange <astrange@ithinksw.com>
12;
13; Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid.
14;
15; Vertical pass is an implementation of the scheme:
16;  Loeffler C., Ligtenberg A., and Moschytz C.S.:
17;  Practical Fast 1D DCT Algorithm with Eleven Multiplications,
18;  Proc. ICASSP 1989, 988-991.
19;
20; Horizontal pass is a double 4x4 vector/matrix multiplication,
21; (see also Intel's Application Note 922:
22;  http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
23;  Copyright (C) 1999 Intel Corporation)
24;
25; More details at http://skal.planet-d.net/coding/dct.html
26;
27; =======     MMX and XMM forward discrete cosine transform     =======
28;
29; Copyright(C) 2001 Peter Ross <pross@xvid.org>
30;
31; Originally provided by Intel at AP-922
32; http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
33; (See more app notes at http://developer.intel.com/vtune/cbts/strmsimd/appnotes.htm)
34; but in a limited edition.
35; New macro implements a column part for precise iDCT
36; The routine precision now satisfies IEEE standard 1180-1990.
37;
38; Copyright(C) 2000-2001 Peter Gubanov <peter@elecard.net.ru>
39; Rounding trick Copyright(C) 2000 Michel Lespinasse <walken@zoy.org>
40;
41; http://www.elecard.com/peter/idct.html
42; http://www.linuxvideo.org/mpeg2dec/
43;
44; These examples contain code fragments for first stage iDCT 8x8
45; (for rows) and first stage DCT 8x8 (for columns)
46;
47; conversion to gcc syntax by Michael Niedermayer
48;
49; ======================================================================
50;
51; This file is part of FFmpeg.
52;
53; FFmpeg is free software; you can redistribute it and/or
54; modify it under the terms of the GNU Lesser General Public
55; License as published by the Free Software Foundation; either
56; version 2.1 of the License, or (at your option) any later version.
57;
58; FFmpeg is distributed in the hope that it will be useful,
59; but WITHOUT ANY WARRANTY; without even the implied warranty of
60; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
61; Lesser General Public License for more details.
62;
63; You should have received a copy of the GNU Lesser General Public License
64; along with FFmpeg; if not, write to the Free Software Foundation,
65; Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
66
67%include "libavutil/x86/x86util.asm"
68
69SECTION_RODATA
; Similar to tg_1_16 in MMX code
; Fixed-point constants for the column pass, replicated across 8 words.
; All are used as pmulhw multipliers: (x * c) >> 16 with c read as a signed word.
tan1:   times 8 dw 13036 ; tan(1*pi/16) * 2^16
tan2:   times 8 dw 27146 ; tan(2*pi/16) * 2^16
tan3:   times 8 dw 43790 ; tan(3*pi/16) * 2^16; as a signed word this is
                         ; (tan3 - 1), so users add x back after pmulhw
sqrt2:  times 8 dw 23170 ; cos(4*pi/16) * 2^15; users double the product
75
; SSE2 tables
; Row-pass coefficients for iMTX_MULT. Each table is four 16-byte vectors of
; signed words, used as pmaddwd operands at byte offsets 0, 16, 32 and 48.
; IDCT_SSE2 uses iTab1 for rows 0/4, iTab2 for rows 1/7, iTab3 for rows 2/6
; and iTab4 for rows 3/5.
iTab1:  dw 0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d
        dw 0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61
        dw 0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7
        dw 0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b
iTab2:  dw 0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5
        dw 0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04
        dw 0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41
        dw 0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df
iTab3:  dw 0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf
        dw 0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf
        dw 0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d
        dw 0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04
iTab4:  dw 0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746
        dw 0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac
        dw 0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df
        dw 0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e
93
94%if ARCH_X86_32
95; -----------------------------------------------------------------------------
96;
97; The first stage iDCT 8x8 - inverse DCTs of rows
98;
99; -----------------------------------------------------------------------------
100; The 8-point inverse DCT direct algorithm
101; -----------------------------------------------------------------------------
102;
103; static const short w[32] = {
104;     FIX(cos_4_16),  FIX(cos_2_16),  FIX(cos_4_16),  FIX(cos_6_16),
105;     FIX(cos_4_16),  FIX(cos_6_16), -FIX(cos_4_16), -FIX(cos_2_16),
106;     FIX(cos_4_16), -FIX(cos_6_16), -FIX(cos_4_16),  FIX(cos_2_16),
107;     FIX(cos_4_16), -FIX(cos_2_16),  FIX(cos_4_16), -FIX(cos_6_16),
108;     FIX(cos_1_16),  FIX(cos_3_16),  FIX(cos_5_16),  FIX(cos_7_16),
109;     FIX(cos_3_16), -FIX(cos_7_16), -FIX(cos_1_16), -FIX(cos_5_16),
110;     FIX(cos_5_16), -FIX(cos_1_16),  FIX(cos_7_16),  FIX(cos_3_16),
111;     FIX(cos_7_16), -FIX(cos_5_16),  FIX(cos_3_16), -FIX(cos_1_16) };
112;
113; #define DCT_8_INV_ROW(x, y)
114; {
115;     int a0, a1, a2, a3, b0, b1, b2, b3;
116;
117;     a0 = x[0] * w[0]  + x[2] * w[1]  + x[4] * w[2]  + x[6] * w[3];
118;     a1 = x[0] * w[4]  + x[2] * w[5]  + x[4] * w[6]  + x[6] * w[7];
119;     a2 = x[0] * w[8]  + x[2] * w[9]  + x[4] * w[10] + x[6] * w[11];
120;     a3 = x[0] * w[12] + x[2] * w[13] + x[4] * w[14] + x[6] * w[15];
121;     b0 = x[1] * w[16] + x[3] * w[17] + x[5] * w[18] + x[7] * w[19];
122;     b1 = x[1] * w[20] + x[3] * w[21] + x[5] * w[22] + x[7] * w[23];
123;     b2 = x[1] * w[24] + x[3] * w[25] + x[5] * w[26] + x[7] * w[27];
124;     b3 = x[1] * w[28] + x[3] * w[29] + x[5] * w[30] + x[7] * w[31];
125;
126;     y[0] = SHIFT_ROUND(a0 + b0);
127;     y[1] = SHIFT_ROUND(a1 + b1);
128;     y[2] = SHIFT_ROUND(a2 + b2);
129;     y[3] = SHIFT_ROUND(a3 + b3);
130;     y[4] = SHIFT_ROUND(a3 - b3);
131;     y[5] = SHIFT_ROUND(a2 - b2);
132;     y[6] = SHIFT_ROUND(a1 - b1);
133;     y[7] = SHIFT_ROUND(a0 - b0);
134; }
135;
136; -----------------------------------------------------------------------------
137;
138; In this implementation the outputs of the iDCT-1D are multiplied
139;     for rows 0,4 - by cos_4_16,
140;     for rows 1,7 - by cos_1_16,
141;     for rows 2,6 - by cos_2_16,
142;     for rows 3,5 - by cos_3_16
143; and are shifted to the left for better accuracy.
144;
145; For the constants used,
146;     FIX(float_const) = (short) (float_const * (1 << 15) + 0.5)
147;
148; -----------------------------------------------------------------------------
149
150; -----------------------------------------------------------------------------
151; Tables for mmx processors
152; -----------------------------------------------------------------------------
153
; Four consecutive 64-byte sub-tables (8 quadwords each); only the first one
; carries a label.
; NOTE(review): the sub-tables for rows 1/7, 2/6 and 3/5 are presumably reached
; by offset from tab_i_04_mmx; the users are not visible in this part of the file.
; Table for rows 0,4 - constants are multiplied by cos_4_16
tab_i_04_mmx: dw  16384,  16384,  16384, -16384 ; movq-> w06 w04 w02 w00
              dw  21407,   8867,   8867, -21407 ; w07 w05 w03 w01
              dw  16384, -16384,  16384,  16384 ; w14 w12 w10 w08
              dw  -8867,  21407, -21407,  -8867 ; w15 w13 w11 w09
              dw  22725,  12873,  19266, -22725 ; w22 w20 w18 w16
              dw  19266,   4520,  -4520, -12873 ; w23 w21 w19 w17
              dw  12873,   4520,   4520,  19266 ; w30 w28 w26 w24
              dw -22725,  19266, -12873, -22725 ; w31 w29 w27 w25
; Table for rows 1,7 - constants are multiplied by cos_1_16
              dw  22725,  22725,  22725, -22725 ; movq-> w06 w04 w02 w00
              dw  29692,  12299,  12299, -29692 ; w07 w05 w03 w01
              dw  22725, -22725,  22725,  22725 ; w14 w12 w10 w08
              dw -12299,  29692, -29692, -12299 ; w15 w13 w11 w09
              dw  31521,  17855,  26722, -31521 ; w22 w20 w18 w16
              dw  26722,   6270,  -6270, -17855 ; w23 w21 w19 w17
              dw  17855,   6270,   6270,  26722 ; w30 w28 w26 w24
              dw -31521,  26722, -17855, -31521 ; w31 w29 w27 w25
; Table for rows 2,6 - constants are multiplied by cos_2_16
              dw  21407,  21407,  21407, -21407 ; movq-> w06 w04 w02 w00
              dw  27969,  11585,  11585, -27969 ; w07 w05 w03 w01
              dw  21407, -21407,  21407,  21407 ; w14 w12 w10 w08
              dw -11585,  27969, -27969, -11585 ; w15 w13 w11 w09
              dw  29692,  16819,  25172, -29692 ; w22 w20 w18 w16
              dw  25172,   5906,  -5906, -16819 ; w23 w21 w19 w17
              dw  16819,   5906,   5906,  25172 ; w30 w28 w26 w24
              dw -29692,  25172, -16819, -29692 ; w31 w29 w27 w25
; Table for rows 3,5 - constants are multiplied by cos_3_16
              dw  19266,  19266,  19266, -19266 ; movq-> w06 w04 w02 w00
              dw  25172,  10426,  10426, -25172 ; w07 w05 w03 w01
              dw  19266, -19266,  19266,  19266 ; w14 w12 w10 w08
              dw -10426,  25172, -25172, -10426 ; w15 w13 w11 w09
              dw  26722,  15137,  22654, -26722 ; w22 w20 w18 w16
              dw  22654,   5315,  -5315, -15137 ; w23 w21 w19 w17
              dw  15137,   5315,   5315,  22654 ; w30 w28 w26 w24
              dw -26722,  22654, -15137, -26722 ; w31 w29 w27 w25
190
191; -----------------------------------------------------------------------------
192; Tables for xmm processors
193; -----------------------------------------------------------------------------
194
; Same four-sub-table arrangement as the MMX tables, with the words permuted
; as given by the wNN annotations on each line.
; NOTE(review): presumably this is the layout expected by the pshufw-based
; (mmxext) branch of DCT_8_INV_ROW, whose weight comments use the same wNN
; order; confirm against the callers.
; Table for rows 0,4 - constants are multiplied by cos_4_16
tab_i_04_xmm: dw  16384,  21407,  16384,   8867 ; movq-> w05 w04 w01 w00
              dw  16384,   8867, -16384, -21407 ; w07 w06 w03 w02
              dw  16384,  -8867,  16384, -21407 ; w13 w12 w09 w08
              dw -16384,  21407,  16384,  -8867 ; w15 w14 w11 w10
              dw  22725,  19266,  19266,  -4520 ; w21 w20 w17 w16
              dw  12873,   4520, -22725, -12873 ; w23 w22 w19 w18
              dw  12873, -22725,   4520, -12873 ; w29 w28 w25 w24
              dw   4520,  19266,  19266, -22725 ; w31 w30 w27 w26
; Table for rows 1,7 - constants are multiplied by cos_1_16
              dw  22725,  29692,  22725,  12299 ; movq-> w05 w04 w01 w00
              dw  22725,  12299, -22725, -29692 ; w07 w06 w03 w02
              dw  22725, -12299,  22725, -29692 ; w13 w12 w09 w08
              dw -22725,  29692,  22725, -12299 ; w15 w14 w11 w10
              dw  31521,  26722,  26722,  -6270 ; w21 w20 w17 w16
              dw  17855,   6270, -31521, -17855 ; w23 w22 w19 w18
              dw  17855, -31521,   6270, -17855 ; w29 w28 w25 w24
              dw   6270,  26722,  26722, -31521 ; w31 w30 w27 w26
; Table for rows 2,6 - constants are multiplied by cos_2_16
              dw  21407,  27969,  21407,  11585 ; movq-> w05 w04 w01 w00
              dw  21407,  11585, -21407, -27969 ; w07 w06 w03 w02
              dw  21407, -11585,  21407, -27969 ; w13 w12 w09 w08
              dw -21407,  27969,  21407, -11585 ; w15 w14 w11 w10
              dw  29692,  25172,  25172,  -5906 ; w21 w20 w17 w16
              dw  16819,   5906, -29692, -16819 ; w23 w22 w19 w18
              dw  16819, -29692,   5906, -16819 ; w29 w28 w25 w24
              dw   5906,  25172,  25172, -29692 ; w31 w30 w27 w26
; Table for rows 3,5 - constants are multiplied by cos_3_16
              dw  19266,  25172,  19266,  10426 ; movq-> w05 w04 w01 w00
              dw  19266,  10426, -19266, -25172 ; w07 w06 w03 w02
              dw  19266, -10426,  19266, -25172 ; w13 w12 w09 w08
              dw -19266,  25172,  19266, -10426 ; w15 w14 w11 w10
              dw  26722,  22654,  22654,  -5315 ; w21 w20 w17 w16
              dw  15137,   5315, -26722, -15137 ; w23 w22 w19 w18
              dw  15137, -26722,   5315, -15137 ; w29 w28 w25 w24
              dw   5315,  22654,  22654, -26722 ; w31 w30 w27 w26
231%endif ; ~ARCH_X86_32
232
; Similar to rounder_0 in MMX code
; 4 first similar, then: 4*8->6*16  5*8->4*16  6/7*8->5*16
; Rounding biases for the row pass, one 16-byte vector (4 dwords) per entry,
; addressed as walkenIdctRounders + n*16. iMTX_MULT adds the selected entry to
; one partial sum before the butterfly and the final >> 11, so it biases both
; halves of the output row.
walkenIdctRounders: times 4 dd 65536 ; 0*16
                    times 4 dd  3597 ; 1*16
                    times 4 dd  2260 ; 2*16
                    times 4 dd  1203 ; 3*16
                    times 4 dd   120 ; 4*16
                    times 4 dd   512 ; 5*16
                    times 2 dd     0 ; 6*16, 8 bytes only; the SSE2 code passes
                                     ; no rounder for row 4 instead of using it

; Byte bias for the zero-row tests: paddusb with 127 sets bit 7 of every
; non-zero byte, so a following pmovmskb is non-zero iff the data is non-zero.
pb_127: times 8 db 127
244
245SECTION .text
246
; Temporary storage before the column pass
; The odd rows are kept in xmm4-xmm7 on both architectures. iMTX_MULT only
; writes xmm0-xmm3, so these survive the remaining row transforms.
%define ROW1 xmm6
%define ROW3 xmm4
%define ROW5 xmm5
%define ROW7 xmm7
252
; CLEAR_ODD reg
; Zero the register that holds an odd row.
%macro CLEAR_ODD 1
    pxor      %1, %1
%endmacro
; PUT_ODD reg
; Store the row produced by iMTX_MULT (left in xmm2) into reg.
; 0x1B reverses the four high words; the four low words are copied unchanged.
%macro PUT_ODD 1
    pshufhw   %1, xmm2, 0x1B
%endmacro
259
; MOV32 src, dst
; x86_32 only: movdqa dst, src. Note the operand order: the FIRST macro
; argument is the source and the second the destination.
; Expands to nothing unless ARCH_X86_32 is set.
%macro MOV32 2
%if ARCH_X86_32
    movdqa    %2, %1
%endif
%endmacro
265
; CLEAR_EVEN reg
; Zero the register of an even row. Only x86_64 keeps even rows in registers;
; otherwise the ROWn names are memory operands and nothing is emitted.
%macro CLEAR_EVEN 1
%if ARCH_X86_64
    pxor      %1, %1
%endif
%endmacro
271
; PUT_EVEN dst
; Store the row produced by iMTX_MULT (left in xmm2) as an even row, with its
; four high words reversed (shuffle control 0x1B).
%macro PUT_EVEN 1
%if ARCH_X86_64
    ; dst is a register: shuffle straight into it, xmm2 is left untouched
    pshufhw   %1, xmm2, 0x1B
%else
    ; dst is a memory operand: shuffle xmm2 in place, then store it
    pshufhw xmm2, xmm2, 0x1B
    movdqa    %1, xmm2
%endif
%endmacro
280
; Register map for the even rows and the column-pass temporaries.
;   ROWn  - where even row n lives after the row pass
;   REGn  - register the column pass uses for row n
;   XMMS  - scratch register for the butterflies
;   SREG2 - register holding row 2 in the sparse column pass
;   TAN3/TAN1 - registers loaded by iLLM_HEAD
%if ARCH_X86_64
; Even rows stay in xmm8-xmm11, so REGn is the same register as ROWn.
%define ROW0  xmm8
%define REG0  ROW0
%define ROW2  xmm9
%define REG2  ROW2
%define ROW4  xmm10
%define REG4  ROW4
%define ROW6  xmm11
%define REG6  ROW6
%define XMMS  xmm12
%define SREG2 REG2
%define TAN3  xmm13
%define TAN1  xmm14
%else
; Only xmm0-xmm7 exist: even rows are written back to the block and reloaded
; with MOV32. REG0/REG2 share xmm4 and REG4/REG6 share xmm6. XMMS and TAN1
; share xmm2, so the column pass parks TAN1 in [BLOCK] while XMMS is in use.
%define ROW0  [BLOCK + 0*16]
%define REG0  xmm4
%define ROW2  [BLOCK + 2*16]
%define REG2  xmm4
%define ROW4  [BLOCK + 4*16]
%define REG4  xmm6
%define ROW6  [BLOCK + 6*16]
%define REG6  xmm6
%define XMMS  xmm2
%define SREG2 xmm7
%define TAN3  xmm0
%define TAN1  xmm2
%endif
308
; JZ reg, label
; Jump to the local label .label when reg is zero. Clobbers flags.
%macro JZ  2
    test      %1, %1
    jz       .%2
%endmacro
313
; JNZ reg, label
; Jump to the local label .label when reg is non-zero. Clobbers flags.
%macro JNZ  2
    test      %1, %1
    jnz      .%2
%endmacro
318
; TEST_ONE_ROW src, reg, clear, arg
; First invokes `clear arg` (pre-zeroes the row's register), then sets reg to
; a byte mask that is non-zero iff any byte of the 16-byte row at [src] is
; non-zero. Expects mm0 = pb_127; clobbers mm1.
%macro TEST_ONE_ROW 4 ; src, reg, clear, arg
    %3        %4
    movq     mm1, [%1]
    por      mm1, [%1 + 8]    ; OR both halves of the row together
    paddusb  mm1, mm0         ; non-zero bytes saturate into bit 7
    pmovmskb  %2, mm1
%endmacro
326
; Two-row version of TEST_ONE_ROW: invokes `clear1 arg1` and `clear2 arg2`,
; then sets reg1/reg2 to masks that are non-zero iff row1/row2 contain a
; non-zero byte. Expects mm0 = pb_127; clobbers mm1 and mm2.
;row1, row2, reg1, reg2, clear1, arg1, clear2, arg2
%macro  TEST_TWO_ROWS  8
    %5         %6
    %7         %8
    movq      mm1, [%1 + 0]
    por       mm1, [%1 + 8]
    movq      mm2, [%2 + 0]
    por       mm2, [%2 + 8]
    paddusb   mm1, mm0
    paddusb   mm2, mm0
    pmovmskb   %3, mm1
    pmovmskb   %4, mm2
%endmacro
340
; IDCT pass on rows.
; iMTX_MULT src, table, put, arg [, rounder]
;   src     - address expression of the input row (8 words, read with movdqa)
;   table   - 64-byte coefficient table (iTab1..iTab4)
;   put,arg - macro and argument invoked last to store the result row, which
;             is left in xmm2 (PUT_ODD / PUT_EVEN)
;   rounder - optional byte offset into walkenIdctRounders
; Clobbers xmm0-xmm3.
%macro iMTX_MULT   4-5 ; src, table, put, arg, rounder
    movdqa       xmm3, [%1]
    movdqa       xmm0, xmm3
    pshufd       xmm1, xmm3, 0x11 ; 4602
    punpcklqdq   xmm0, xmm0       ; 0246
    pmaddwd      xmm0, [%2]
    pmaddwd      xmm1, [%2+16]
    pshufd       xmm2, xmm3, 0xBB ; 5713
    punpckhqdq   xmm3, xmm3       ; 1357
    pmaddwd      xmm2, [%2+32]
    pmaddwd      xmm3, [%2+48]
    paddd        xmm0, xmm1
    paddd        xmm2, xmm3
%if %0 == 5
    ; rounder is added once, to the sum that feeds both butterfly outputs
    paddd        xmm0, [walkenIdctRounders+%5]
%endif
    movdqa       xmm3, xmm2
    paddd        xmm2, xmm0       ; sum
    psubd        xmm0, xmm3       ; difference
    psrad        xmm2, 11
    psrad        xmm0, 11
    packssdw     xmm2, xmm0       ; low 4 words = sums, high 4 = differences
    %3           %4
%endmacro
366
; Load the tan3 and tan1 constant vectors into TAN3/TAN1 for the column pass.
%macro iLLM_HEAD 0
    movdqa   TAN3, [tan3]
    movdqa   TAN1, [tan1]
%endmacro
371
; FIRST_HALF dct, type
; Final >> 6 for output rows 1, 2, 5 and 6, held in TAN3, xmm3, REG0 and xmm5.
;   type 0 (normal): store the four rows back into the coefficient block.
;   type 1 (put):    set up DEST/stride/r3q, pack to bytes and write rows 1
;                    and 2; rows 6/5 stay packed in xmm5 for SECOND_HALF.
;   type 2 (add):    set up DEST/stride/r3q; on x86_32 the rows are spilled to
;                    the block, on x86_64 they stay in registers.
; For types 1 and 2 this (re)defines DEST and sets r3q = 3*stride, both of
; which SECOND_HALF relies on.
%macro FIRST_HALF 2  ; %1=dct  %2=type(normal,put,add)
    psraw    xmm5, 6
    psraw    REG0, 6
    psraw    TAN3, 6
    psraw    xmm3, 6
    ; dct coeffs must still be written for AC prediction
%if %2 == 0
    movdqa   [%1+1*16], TAN3
    movdqa   [%1+2*16], xmm3
    movdqa   [%1+5*16], REG0
    movdqa   [%1+6*16], xmm5
%else
    ; Must now load args as gprs are no longer used for masks
    ; DEST is set to where address of dest was loaded
    %if ARCH_X86_32
        %if %2 == 2 ; Not enough xmms, store
    movdqa   [%1+1*16], TAN3
    movdqa   [%1+2*16], xmm3
    movdqa   [%1+5*16], REG0
    movdqa   [%1+6*16], xmm5
        %endif
    %xdefine DEST r2q ; BLOCK is r0, stride r1
    movifnidn DEST, destm
    movifnidn strideq, stridem
    %else
    %xdefine DEST r0q
    %endif
    lea      r3q, [3*strideq]
    %if %2 == 1
    packuswb TAN3, xmm3      ; low 8 bytes = row 1, high 8 bytes = row 2
    packuswb xmm5, REG0      ; low 8 bytes = row 6, high 8 bytes = row 5
    movq     [DEST + strideq], TAN3
    movhps   [DEST + 2*strideq], TAN3
    ; REG0 and TAN3 are now available (and likely used in second half)
    %endif
%endif
%endmacro
409
; SECOND_HALF dct, type, r0reg, r7reg, r3reg, r4reg
; Final >> 6 for output rows 0, 7, 3 and 4 (in %3, %4, %5, %6), then output.
;   type 0 (normal): store the four rows into the coefficient block.
;   type 1 (put):    pack to bytes and write rows 0, 3, 4, 7, then rows 5 and
;                    6 left packed in xmm5 by FIRST_HALF.
;   type 2 (add):    add all eight rows to the destination pixels with signed
;                    saturation, pack with unsigned saturation and store.
; Types 1 and 2 need DEST, strideq and r3q = 3*stride as set by FIRST_HALF,
; and advance DEST by 4*stride.
%macro SECOND_HALF 6 ; %1=dct  %2=type(normal,put,add) 3-6: xmms
    psraw    %3, 6
    psraw    %4, 6
    psraw    %5, 6
    psraw    %6, 6
    ; dct coeffs must still be written for AC prediction
%if %2 == 0
    movdqa   [%1+0*16], %3
    movdqa   [%1+3*16], %5
    movdqa   [%1+4*16], %6
    movdqa   [%1+7*16], %4
%elif %2 == 1
    packuswb %3, %5          ; low = row 0, high = row 3
    packuswb %6, %4          ; low = row 4, high = row 7
    ; address of dest may have been loaded
    movq     [DEST], %3
    movhps   [DEST + r3q], %3
    lea      DEST, [DEST + 4*strideq]
    movq     [DEST], %6
    movhps   [DEST + r3q], %6
    ; and now write remainder of first half
    movq     [DEST + 2*strideq], xmm5
    movhps   [DEST + strideq], xmm5
%elif %2 == 2
    pxor        xmm0, xmm0   ; zero, for byte -> word unpacking
    %if ARCH_X86_32
    ; free: m3 REG0=m4 m5
    ; input: m1, m7, m2, m6
    ; Rows 1, 2, 5 and 6 are read back from the block, where FIRST_HALF
    ; spilled them; %3 is reused as a temporary once row 0 is consumed.
    movq        xmm3, [DEST+0*strideq]
    movq        xmm4, [DEST+1*strideq]
    punpcklbw   xmm3, xmm0
    punpcklbw   xmm4, xmm0
    paddsw      xmm3, %3
    paddsw      xmm4, [%1 + 1*16]
    movq          %3, [DEST+2*strideq]
    movq        xmm5, [DEST+      r3q]
    punpcklbw     %3, xmm0
    punpcklbw   xmm5, xmm0
    paddsw        %3, [%1 + 2*16]
    paddsw      xmm5, %5
    packuswb    xmm3, xmm4
    packuswb      %3, xmm5
    movq    [DEST+0*strideq], xmm3
    movhps  [DEST+1*strideq], xmm3
    movq    [DEST+2*strideq], %3
    movhps  [DEST+      r3q], %3
    lea         DEST, [DEST+4*strideq]
    movq        xmm3, [DEST+0*strideq]
    movq        xmm4, [DEST+1*strideq]
    movq          %3, [DEST+2*strideq]
    movq        xmm5, [DEST+      r3q]
    punpcklbw   xmm3, xmm0
    punpcklbw   xmm4, xmm0
    punpcklbw     %3, xmm0
    punpcklbw   xmm5, xmm0
    paddsw      xmm3, %6
    paddsw      xmm4, [%1 + 5*16]
    paddsw        %3, [%1 + 6*16]
    paddsw      xmm5, %4
    packuswb    xmm3, xmm4
    packuswb      %3, xmm5
    movq    [DEST+0*strideq], xmm3
    movhps  [DEST+1*strideq], xmm3
    movq    [DEST+2*strideq], %3
    movhps  [DEST+      r3q], %3
    %else
    ; l1:TAN3=m13  l2:m3  l5:REG0=m8 l6=m5
    ; input: m1, m7/SREG2=m9, TAN1=m14, REG4=m10
    ; Rows 1, 2, 5 and 6 are still in registers; xmm2, xmm4, xmm11 and xmm12
    ; are used as temporaries for the destination pixels.
    movq        xmm2, [DEST+0*strideq]
    movq        xmm4, [DEST+1*strideq]
    movq       xmm12, [DEST+2*strideq]
    movq       xmm11, [DEST+      r3q]
    punpcklbw   xmm2, xmm0
    punpcklbw   xmm4, xmm0
    punpcklbw  xmm12, xmm0
    punpcklbw  xmm11, xmm0
    paddsw      xmm2, %3
    paddsw      xmm4, TAN3
    paddsw     xmm12, xmm3
    paddsw     xmm11, %5
    packuswb    xmm2, xmm4
    packuswb   xmm12, xmm11
    movq    [DEST+0*strideq], xmm2
    movhps  [DEST+1*strideq], xmm2
    movq    [DEST+2*strideq], xmm12
    movhps  [DEST+      r3q], xmm12
    lea         DEST, [DEST+4*strideq]
    movq        xmm2, [DEST+0*strideq]
    movq        xmm4, [DEST+1*strideq]
    movq       xmm12, [DEST+2*strideq]
    movq       xmm11, [DEST+      r3q]
    punpcklbw   xmm2, xmm0
    punpcklbw   xmm4, xmm0
    punpcklbw  xmm12, xmm0
    punpcklbw  xmm11, xmm0
    paddsw      xmm2, %6
    paddsw      xmm4, REG0
    paddsw     xmm12, xmm5
    paddsw     xmm11, %4
    packuswb    xmm2, xmm4
    packuswb   xmm12, xmm11
    movq    [DEST+0*strideq], xmm2
    movhps  [DEST+1*strideq], xmm2
    movq    [DEST+2*strideq], xmm12
    movhps  [DEST+      r3q], xmm12
    %endif
%endif
%endmacro
518
519
; IDCT pass on columns.
; iLLM_PASS dct, type
; Inputs: odd rows in xmm6 (row 1), xmm4 (row 3), xmm5 (row 5), xmm7 (row 7);
; TAN3/TAN1 loaded by iLLM_HEAD; even rows in REGn (x86_64) or reloaded from
; the block through MOV32 (x86_32). Output goes through FIRST_HALF (rows
; 1, 2, 5, 6) and SECOND_HALF (rows 0, 3, 4, 7).
; The tp/tm/a/b/t names below follow the comments of the MMX DCT_8_INV_COL macro.
%macro iLLM_PASS  2  ; %1=dct  %2=type(normal,put,add)
    ; --- odd part ---
    movdqa   xmm1, TAN3
    movdqa   xmm3, TAN1
    pmulhw   TAN3, xmm4
    pmulhw   xmm1, xmm5
    paddsw   TAN3, xmm4
    paddsw   xmm1, xmm5
    psubsw   TAN3, xmm5      ; x3*tg_3_16 - x5 = tm35
    paddsw   xmm1, xmm4      ; x3 + x5*tg_3_16 = tp35
    pmulhw   xmm3, xmm7
    pmulhw   TAN1, xmm6
    paddsw   xmm3, xmm6      ; x1 + x7*tg_1_16 = tp17
    psubsw   TAN1, xmm7      ; x1*tg_1_16 - x7 = tm17
    movdqa   xmm7, xmm3
    movdqa   xmm6, TAN1
    psubsw   xmm3, xmm1      ; tp17 - tp35 = t1
    psubsw   TAN1, TAN3      ; tm17 - tm35 = b3
    paddsw   xmm1, xmm7      ; tp17 + tp35 = b0
    paddsw   TAN3, xmm6      ; tm17 + tm35 = t2
    movdqa   xmm6, xmm3
    psubsw   xmm3, TAN3      ; t1 - t2
    paddsw   TAN3, xmm6      ; t1 + t2
    movdqa   xmm4, [sqrt2]
    pmulhw   xmm3, xmm4
    pmulhw   TAN3, xmm4
    paddsw   TAN3, TAN3      ; b1
    paddsw   xmm3, xmm3      ; b2
    ; --- even part ---
    movdqa   xmm7, [tan2]
    MOV32    ROW2, REG2
    MOV32    ROW6, REG6
    movdqa   xmm5, xmm7
    pmulhw   xmm7, REG6
    pmulhw   xmm5, REG2
    paddsw   xmm7, REG2      ; x2 + x6*tg_2_16 = tp26
    psubsw   xmm5, REG6      ; x2*tg_2_16 - x6 = tm26
    MOV32    ROW0, REG0
    MOV32    ROW4, REG4
    ; x86_32: XMMS aliases TAN1, so park TAN1 (b3) in the row 0 slot, which
    ; has just been loaded into REG0.
    MOV32    TAN1, [BLOCK]
    movdqa   XMMS, REG0
    psubsw   REG0, REG4      ; x0 - x4 = tm04
    paddsw   REG4, XMMS      ; x0 + x4 = tp04
    movdqa   XMMS, REG4
    psubsw   REG4, xmm7      ; tp04 - tp26 = a3
    paddsw   xmm7, XMMS      ; tp04 + tp26 = a0
    movdqa   XMMS, REG0
    psubsw   REG0, xmm5      ; tm04 - tm26 = a2
    paddsw   xmm5, XMMS      ; tm04 + tm26 = a1
    movdqa   XMMS, xmm5
    psubsw   xmm5, TAN3      ; a1 - b1 -> row 6
    paddsw   TAN3, XMMS      ; a1 + b1 -> row 1
    movdqa   XMMS, REG0
    psubsw   REG0, xmm3      ; a2 - b2 -> row 5
    paddsw   xmm3, XMMS      ; a2 + b2 -> row 2
    MOV32    [BLOCK], TAN1   ; x86_32: restore TAN1 (b3)

    FIRST_HALF %1, %2

    movdqa   xmm0, xmm7
    movdqa   xmm4, REG4
    psubsw   xmm7, xmm1      ; a0 - b0 -> row 7
    psubsw   REG4, TAN1      ; a3 - b3 -> row 4
    paddsw   xmm1, xmm0      ; a0 + b0 -> row 0
    paddsw   TAN1, xmm4      ; a3 + b3 -> row 3

    SECOND_HALF %1, %2, xmm1, xmm7, TAN1, REG4
%endmacro
587
; IDCT pass on columns, assuming rows 4-7 are zero
; iLLM_PASS_SPARSE dct, type
; Same data flow as iLLM_PASS with every term involving rows 4-7 dropped.
; Inputs: xmm6 = row 1, xmm4 = row 3, TAN3/TAN1 loaded by iLLM_HEAD, rows 0
; and 2 in REG0/SREG2 (reloaded from the block through MOV32 on x86_32).
%macro iLLM_PASS_SPARSE   2 ; %1=dct   %2=type(normal,put,add)
    ; --- odd part (x5 = x7 = 0) ---
    pmulhw   TAN3, xmm4
    paddsw   TAN3, xmm4      ; x3*tg_3_16 = tm35
    movdqa   xmm3, xmm6      ; x1 = tp17
    pmulhw   TAN1, xmm6      ; x1*tg_1_16 = tm17
    movdqa   xmm1, xmm4      ; x3 = tp35
    psubsw   xmm3, xmm1      ; tp17 - tp35 = t1
    paddsw   xmm1, xmm6      ; tp17 + tp35 = b0
    movdqa   xmm6, TAN1
    psubsw   TAN1, TAN3      ; tm17 - tm35 = b3
    paddsw   TAN3, xmm6      ; tm17 + tm35 = t2
    movdqa   xmm6, xmm3
    psubsw   xmm3, TAN3      ; t1 - t2
    paddsw   TAN3, xmm6      ; t1 + t2
    movdqa   xmm4, [sqrt2]
    pmulhw   xmm3, xmm4
    pmulhw   TAN3, xmm4
    paddsw   TAN3, TAN3      ; b1
    paddsw   xmm3, xmm3      ; b2
    ; --- even part (x4 = x6 = 0) ---
    movdqa   xmm5, [tan2]
    MOV32    ROW2, SREG2
    pmulhw   xmm5, SREG2     ; x2*tg_2_16 = tm26
    MOV32    ROW0, REG0
    movdqa   xmm6, REG0
    psubsw   xmm6, SREG2     ; x0 - x2 = a3
    paddsw  SREG2, REG0      ; x0 + x2 = a0
    ; x86_32: XMMS aliases TAN1, so park TAN1 (b3) in the row 0 slot
    MOV32    TAN1, [BLOCK]
    movdqa   XMMS, REG0
    psubsw   REG0, xmm5      ; x0 - tm26 = a2
    paddsw   xmm5, XMMS      ; x0 + tm26 = a1
    movdqa   XMMS, xmm5
    psubsw   xmm5, TAN3      ; a1 - b1 -> row 6
    paddsw   TAN3, XMMS      ; a1 + b1 -> row 1
    movdqa   XMMS, REG0
    psubsw   REG0, xmm3      ; a2 - b2 -> row 5
    paddsw   xmm3, XMMS      ; a2 + b2 -> row 2
    MOV32    [BLOCK], TAN1   ; x86_32: restore TAN1 (b3)

    FIRST_HALF %1, %2

    movdqa   xmm0, SREG2
    movdqa   xmm4, xmm6
    psubsw  SREG2, xmm1      ; a0 - b0 -> row 7
    psubsw   xmm6, TAN1      ; a3 - b3 -> row 4
    paddsw   xmm1, xmm0      ; a0 + b0 -> row 0
    paddsw   TAN1, xmm4      ; a3 + b3 -> row 3

    SECOND_HALF %1, %2, xmm1, SREG2, TAN1, xmm6
%endmacro
638
; IDCT_SSE2 type
; Emits one complete 8x8 IDCT function:
;   type 0 -> xvid_idct(block)                    in-place transform
;   type 1 -> xvid_idct_put(dest, stride, block)  transform and store pixels
;   type 2 -> xvid_idct_add(dest, stride, block)  transform and add to pixels
; Rows 0-2 are always transformed. Rows 3-7 are first tested for being all
; zero; zero rows are skipped, and when rows 4-7 are all zero the cheaper
; sparse column pass is used.
; GPR0-GPR3 receive the zero-test masks. On x86_64 put/add, r0-r2 hold the
; arguments, so the masks use r3-r6; otherwise they use r1-r4.
; NOTE(review): the zero tests use mm0-mm2 and no explicit emms appears in
; this macro; presumably the callers take care of the MMX state. Confirm.
%macro IDCT_SSE2 1 ; 0=normal  1=put  2=add
%if %1 == 0 || ARCH_X86_32
    %define GPR0  r1d
    %define GPR1  r2d
    %define GPR2  r3d
    %define GPR3  r4d
    %define NUM_GPRS 5
%else
    %define GPR0  r3d
    %define GPR1  r4d
    %define GPR2  r5d
    %define GPR3  r6d
    %define NUM_GPRS 7
%endif
%if %1 == 0
cglobal xvid_idct, 1, NUM_GPRS, 8+7*ARCH_X86_64, block
%xdefine BLOCK blockq
%else
    %if %1 == 1
cglobal xvid_idct_put, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block
    %else
cglobal xvid_idct_add, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block
    %endif
    %if ARCH_X86_64
    %xdefine BLOCK blockq
    %else
    ; only block is needed for now; dest and stride are loaded in FIRST_HALF
    mov    r0q, blockm
    %xdefine BLOCK r0q
    %endif
%endif
    movq           mm0, [pb_127]   ; bias for the TEST_* zero checks
    iMTX_MULT      BLOCK + 0*16, iTab1, PUT_EVEN, ROW0, 0*16
    iMTX_MULT      BLOCK + 1*16, iTab2, PUT_ODD, ROW1,  1*16
    iMTX_MULT      BLOCK + 2*16, iTab3, PUT_EVEN, ROW2, 2*16

    ; GPR0 = row 3 mask, GPR1 = row 4 mask
    TEST_TWO_ROWS  BLOCK + 3*16, BLOCK + 4*16, GPR0, GPR1, CLEAR_ODD, ROW3, CLEAR_EVEN, ROW4 ; a, c
    JZ   GPR0, col1
    iMTX_MULT      BLOCK + 3*16, iTab4, PUT_ODD, ROW3,  3*16
.col1:
    ; GPR0 is reused: GPR0 = row 5 mask, GPR2 = row 6 mask, GPR3 = row 7 mask
    TEST_TWO_ROWS  BLOCK + 5*16, BLOCK + 6*16, GPR0, GPR2, CLEAR_ODD, ROW5, CLEAR_EVEN, ROW6 ; a, d
    TEST_ONE_ROW   BLOCK + 7*16, GPR3, CLEAR_ODD, ROW7 ; esi

    iLLM_HEAD
    ; Enter the row 4-7 chain at the first non-zero row; fall through to the
    ; sparse pass only when all four rows are zero.
    JNZ  GPR1, 2
    JNZ  GPR0, 3
    JNZ  GPR2, 4
    JNZ  GPR3, 5
    iLLM_PASS_SPARSE BLOCK, %1
    jmp .6
.2:
    ; row 4 is transformed without a rounder
    iMTX_MULT     BLOCK + 4*16, iTab1, PUT_EVEN, ROW4
.3:
    ; reached by fall-through from .2 as well, so row 5 is transformed here
    ; whether or not its mask was zero
    iMTX_MULT     BLOCK + 5*16, iTab4, PUT_ODD, ROW5,  4*16
    JZ   GPR2, col2
.4:
    iMTX_MULT     BLOCK + 6*16, iTab3, PUT_EVEN, ROW6, 5*16
.col2:
    JZ   GPR3, col3
.5:
    iMTX_MULT     BLOCK + 7*16, iTab2, PUT_ODD, ROW7,  5*16
.col3:
%if ARCH_X86_32
    ; TAN3/TAN1 are xmm0/xmm2 here, which iMTX_MULT has overwritten
    iLLM_HEAD
%endif
    iLLM_PASS     BLOCK, %1
.6:
    RET
%endmacro
707
; Instantiate the three SSE2 entry points: plain, put and add.
INIT_XMM sse2
IDCT_SSE2 0
IDCT_SSE2 1
IDCT_SSE2 2
712
713%if ARCH_X86_32
714
; DCT_8_INV_ROW: in-place 1-D inverse DCT of one row using MMX registers.
; The row at [r0 + 16*%1] is read, transformed and written back to the same
; place. Clobbers mm0-mm7.
; The two branches shuffle the input differently (pshufw vs. punpck*), and
; their weight comments assume correspondingly different table layouts.
; %1=offset  %2=tab_offset
; %3=rnd_offset where 4*8->6*16  5*8->4*16  6/7*8->5*16
%macro DCT_8_INV_ROW  3
    movq       mm0, [r0+16*%1+0]  ; 0 ; x3 x2 x1 x0
    movq       mm1, [r0+16*%1+8]  ; 1 ; x7 x6 x5 x4
    movq       mm2, mm0       ; 2 ; x3 x2 x1 x0
    movq       mm3, [%2+ 0]   ; 3 ; w06 w04 w02 w00
%if cpuflag(mmxext)
    pshufw     mm0, mm0, 0x88 ; x2 x0 x2 x0
    movq       mm4, [%2+ 8]   ; 4 ; w07 w06 w03 w02
    movq       mm5, mm1       ; 5 ; x7 x6 x5 x4
    pmaddwd    mm3, mm0       ; x2*w05+x0*w04 x2*w01+x0*w00
    movq       mm6, [%2+32]   ; 6 ; w21 w20 w17 w16
    pshufw     mm1, mm1, 0x88 ; x6 x4 x6 x4
    pmaddwd    mm4, mm1       ; x6*w07+x4*w06 x6*w03+x4*w02
    movq       mm7, [%2+40]   ; 7; w23 w22 w19 w18
    pshufw     mm2, mm2, 0xdd ; x3 x1 x3 x1
    pmaddwd    mm6, mm2       ; x3*w21+x1*w20 x3*w17+x1*w16
    pshufw     mm5, mm5, 0xdd ; x7 x5 x7 x5
    pmaddwd    mm7, mm5       ; x7*w23+x5*w22 x7*w19+x5*w18
    paddd      mm3, [walkenIdctRounders + %3]      ; +%3
    pmaddwd    mm0, [%2+16]   ; x2*w13+x0*w12 x2*w09+x0*w08
    paddd      mm3, mm4       ; 4 ; a1=sum(even1) a0=sum(even0)
    pmaddwd    mm1, [%2+24]   ; x6*w15+x4*w14 x6*w11+x4*w10
    movq       mm4, mm3       ; 4 ; a1 a0
    pmaddwd    mm2, [%2+48]   ; x3*w29+x1*w28 x3*w25+x1*w24
    paddd      mm6, mm7       ; 7 ; b1=sum(odd1) b0=sum(odd0)
    pmaddwd    mm5, [%2+56]   ; x7*w31+x5*w30 x7*w27+x5*w26
    paddd      mm3, mm6       ; a1+b1 a0+b0
    paddd      mm0, [walkenIdctRounders + %3]      ; +%3
    psrad      mm3, 11        ; y1=a1+b1 y0=a0+b0
    paddd      mm0, mm1       ; 1 ; a3=sum(even3) a2=sum(even2)
    psubd      mm4, mm6       ; 6 ; a1-b1 a0-b0
    movq       mm7, mm0       ; 7 ; a3 a2
    paddd      mm2, mm5       ; 5 ; b3=sum(odd3) b2=sum(odd2)
    paddd      mm0, mm2       ; a3+b3 a2+b2
    psrad      mm4, 11        ; y6=a1-b1 y7=a0-b0
    psubd      mm7, mm2       ; 2 ; a3-b3 a2-b2
    psrad      mm0, 11        ; y3=a3+b3 y2=a2+b2
    psrad      mm7, 11        ; y4=a3-b3 y5=a2-b2
    packssdw   mm3, mm0       ; 0 ; y3 y2 y1 y0
    packssdw   mm7, mm4       ; 4 ; y6 y7 y4 y5
    movq  [r0+16*%1+0], mm3       ; 3 ; save y3 y2 y1 y0
    pshufw     mm7, mm7, 0xb1 ; y7 y6 y5 y4
%else
    punpcklwd  mm0, mm1       ; x5 x1 x4 x0
    movq       mm5, mm0       ; 5 ; x5 x1 x4 x0
    punpckldq  mm0, mm0       ; x4 x0 x4 x0
    movq       mm4, [%2+ 8]   ; 4 ; w07 w05 w03 w01
    punpckhwd  mm2, mm1       ; 1 ; x7 x3 x6 x2
    pmaddwd    mm3, mm0       ; x4*w06+x0*w04 x4*w02+x0*w00
    movq       mm6, mm2       ; 6 ; x7 x3 x6 x2
    movq       mm1, [%2+32]   ; 1 ; w22 w20 w18 w16
    punpckldq  mm2, mm2       ; x6 x2 x6 x2
    pmaddwd    mm4, mm2       ; x6*w07+x2*w05 x6*w03+x2*w01
    punpckhdq  mm5, mm5       ; x5 x1 x5 x1
    pmaddwd    mm0, [%2+16]   ; x4*w14+x0*w12 x4*w10+x0*w08
    punpckhdq  mm6, mm6       ; x7 x3 x7 x3
    movq       mm7, [%2+40]   ; 7 ; w23 w21 w19 w17
    pmaddwd    mm1, mm5       ; x5*w22+x1*w20 x5*w18+x1*w16
    paddd      mm3, [walkenIdctRounders + %3]     ; +%3
    pmaddwd    mm7, mm6       ; x7*w23+x3*w21 x7*w19+x3*w17
    pmaddwd    mm2, [%2+24]   ; x6*w15+x2*w13 x6*w11+x2*w09
    paddd      mm3, mm4       ; 4 ; a1=sum(even1) a0=sum(even0)
    pmaddwd    mm5, [%2+48]   ; x5*w30+x1*w28 x5*w26+x1*w24
    movq       mm4, mm3       ; 4 ; a1 a0
    pmaddwd    mm6, [%2+56]   ; x7*w31+x3*w29 x7*w27+x3*w25
    paddd      mm1, mm7       ; 7 ; b1=sum(odd1) b0=sum(odd0)
    paddd      mm0, [walkenIdctRounders + %3]     ; +%3
    psubd      mm3, mm1       ; a1-b1 a0-b0
    psrad      mm3, 11        ; y6=a1-b1 y7=a0-b0
    paddd      mm1, mm4       ; 4 ; a1+b1 a0+b0
    paddd      mm0, mm2       ; 2 ; a3=sum(even3) a2=sum(even2)
    psrad      mm1, 11        ; y1=a1+b1 y0=a0+b0
    paddd      mm5, mm6       ; 6 ; b3=sum(odd3) b2=sum(odd2)
    movq       mm4, mm0       ; 4 ; a3 a2
    paddd      mm0, mm5       ; a3+b3 a2+b2
    psubd      mm4, mm5       ; 5 ; a3-b3 a2-b2
    psrad      mm0, 11        ; y3=a3+b3 y2=a2+b2
    psrad      mm4, 11        ; y4=a3-b3 y5=a2-b2
    packssdw   mm1, mm0       ; 0 ; y3 y2 y1 y0
    packssdw   mm4, mm3       ; 3 ; y6 y7 y4 y5
    movq       mm7, mm4       ; 7 ; y6 y7 y4 y5
    psrld      mm4, 16        ; 0 y6 0 y4
    pslld      mm7, 16        ; y7 0 y5 0
    movq  [r0+16*%1+0], mm1   ; 1 ; save y3 y2 y1 y0
    por        mm7, mm4       ; 4 ; y7 y6 y5 y4
%endif
    movq  [r0+16*%1+8], mm7   ; 7 ; save y7 y6 y5 y4
%endmacro
805
806; -----------------------------------------------------------------------------
807;
808; The first stage DCT 8x8 - forward DCTs of columns
809;
; The outputs are multiplied
; for rows 0,4 - by cos_4_16,
; for rows 1,7 - by cos_1_16,
; for rows 2,6 - by cos_2_16,
; for rows 3,5 - by cos_3_16
; and are shifted to the left for better accuracy.
816;
817; -----------------------------------------------------------------------------
818;
819; The 8-point scaled forward DCT algorithm (26a8m)
820;
821; -----------------------------------------------------------------------------
822;
823;#define DCT_8_FRW_COL(x, y)
824; {
825;     short t0, t1, t2, t3, t4, t5, t6, t7;
826;     short tp03, tm03, tp12, tm12, tp65, tm65;
827;     short tp465, tm465, tp765, tm765;
828;
829;     t0 = LEFT_SHIFT(x[0] + x[7]);
830;     t1 = LEFT_SHIFT(x[1] + x[6]);
831;     t2 = LEFT_SHIFT(x[2] + x[5]);
832;     t3 = LEFT_SHIFT(x[3] + x[4]);
833;     t4 = LEFT_SHIFT(x[3] - x[4]);
834;     t5 = LEFT_SHIFT(x[2] - x[5]);
835;     t6 = LEFT_SHIFT(x[1] - x[6]);
836;     t7 = LEFT_SHIFT(x[0] - x[7]);
837;
838;     tp03 = t0 + t3;
839;     tm03 = t0 - t3;
840;     tp12 = t1 + t2;
841;     tm12 = t1 - t2;
842;
843;     y[0] = tp03 + tp12;
844;     y[4] = tp03 - tp12;
845;
846;     y[2] = tm03 + tm12 * tg_2_16;
847;     y[6] = tm03 * tg_2_16 - tm12;
848;
849;     tp65 = (t6 + t5) * cos_4_16;
850;     tm65 = (t6 - t5) * cos_4_16;
851;
852;     tp765 = t7 + tp65;
853;     tm765 = t7 - tp65;
854;     tp465 = t4 + tm65;
855;     tm465 = t4 - tm65;
856;
857;     y[1] = tp765 + tp465 * tg_1_16;
858;     y[7] = tp765 * tg_1_16 - tp465;
859;     y[5] = tm765 * tg_3_16 + tm465;
860;     y[3] = tm765 - tm465 * tg_3_16;
861; }
862;
863; -----------------------------------------------------------------------------
864
865; -----------------------------------------------------------------------------
; DCT_8_INV_COL  %1 = base address of the rows (read and written through %1)
867; -----------------------------------------------------------------------------
868%macro DCT_8_INV_COL 1
869    movq        mm0, [tan3]
870    movq        mm3, [%1+16*3]
871    movq        mm1, mm0 ; tg_3_16
872    movq        mm5, [%1+16*5]
873    pmulhw      mm0, mm3 ; x3*(tg_3_16-1)
874    movq        mm4, [tan1]
875    pmulhw      mm1, mm5 ; x5*(tg_3_16-1)
876    movq        mm7, [%1+16*7]
877    movq        mm2, mm4 ; tg_1_16
878    movq        mm6, [%1+16*1]
879    pmulhw      mm4, mm7 ; x7*tg_1_16
880    paddsw      mm0, mm3 ; x3*tg_3_16
881    pmulhw      mm2, mm6 ; x1*tg_1_16
882    paddsw      mm1, mm3 ; x3+x5*(tg_3_16-1)
883    psubsw      mm0, mm5 ; x3*tg_3_16-x5 = tm35
884    movq        mm3, [sqrt2]
885    paddsw      mm1, mm5 ; x3+x5*tg_3_16 = tp35
886    paddsw      mm4, mm6 ; x1+tg_1_16*x7 = tp17
887    psubsw      mm2, mm7 ; x1*tg_1_16-x7 = tm17
888    movq        mm5, mm4 ; tp17
889    movq        mm6, mm2 ; tm17
890    paddsw      mm5, mm1 ; tp17+tp35 = b0
891    psubsw      mm6, mm0 ; tm17-tm35 = b3
892    psubsw      mm4, mm1 ; tp17-tp35 = t1
893    paddsw      mm2, mm0 ; tm17+tm35 = t2
894    movq        mm7, [tan2]
895    movq        mm1, mm4 ; t1
896    movq  [%1+3*16], mm5 ; save b0
897    paddsw      mm1, mm2 ; t1+t2
898    movq  [%1+5*16], mm6 ; save b3
899    psubsw      mm4, mm2 ; t1-t2
900    movq        mm5, [%1+2*16]
901    movq        mm0, mm7 ; tg_2_16
902    movq        mm6, [%1+6*16]
903    pmulhw      mm0, mm5 ; x2*tg_2_16
904    pmulhw      mm7, mm6 ; x6*tg_2_16
905    pmulhw      mm1, mm3 ; ocos_4_16*(t1+t2) = b1/2
906    movq        mm2, [%1+0*16]
907    pmulhw      mm4, mm3 ; ocos_4_16*(t1-t2) = b2/2
908    psubsw      mm0, mm6 ; t2*tg_2_16-x6 = tm26
909    movq        mm3, mm2 ; x0
910    movq        mm6, [%1+4*16]
911    paddsw      mm7, mm5 ; x2+x6*tg_2_16 = tp26
912    paddsw      mm2, mm6 ; x0+x4 = tp04
913    psubsw      mm3, mm6 ; x0-x4 = tm04
914    movq        mm5, mm2 ; tp04
915    movq        mm6, mm3 ; tm04
916    psubsw      mm2, mm7 ; tp04-tp26 = a3
917    paddsw      mm3, mm0 ; tm04+tm26 = a1
918    paddsw      mm1, mm1 ; b1
919    paddsw      mm4, mm4 ; b2
920    paddsw      mm5, mm7 ; tp04+tp26 = a0
921    psubsw      mm6, mm0 ; tm04-tm26 = a2
922    movq        mm7, mm3 ; a1
923    movq        mm0, mm6 ; a2
924    paddsw      mm3, mm1 ; a1+b1
925    paddsw      mm6, mm4 ; a2+b2
926    psraw       mm3, 6   ; dst1
927    psubsw      mm7, mm1 ; a1-b1
928    psraw       mm6, 6   ; dst2
929    psubsw      mm0, mm4 ; a2-b2
930    movq        mm1, [%1+3*16] ; load b0
931    psraw       mm7, 6   ; dst6
932    movq        mm4, mm5 ; a0
933    psraw       mm0, 6   ; dst5
934    movq  [%1+1*16], mm3
935    paddsw      mm5, mm1 ; a0+b0
936    movq  [%1+2*16], mm6
937    psubsw      mm4, mm1 ; a0-b0
938    movq        mm3, [%1+5*16] ; load b3
939    psraw       mm5, 6   ; dst0
940    movq        mm6, mm2 ; a3
941    psraw       mm4, 6   ; dst7
942    movq  [%1+5*16], mm0
943    paddsw      mm2, mm3 ; a3+b3
944    movq  [%1+6*16], mm7
945    psubsw      mm6, mm3 ; a3-b3
946    movq  [%1+0*16], mm5
947    psraw       mm2, 6   ; dst3
948    movq  [%1+7*16], mm4
949    psraw       mm6, 6   ; dst4
950    movq  [%1+3*16], mm2
951    movq  [%1+4*16], mm6
952%endmacro
953
; -----------------------------------------------------------------------------
; XVID_IDCT_MMX
;
; Emits the xvid_idct function for the instruction set currently selected
; with INIT_MMX. The function takes a single argument, the coefficient block
; pointer (declared as "block", available in r0), and transforms it in place:
; first the eight rows, then the columns in two quadword-wide halves.
;
; TAB selects the row coefficient table: tab_i_04_xmm when the mmxext cpuflag
; is set, tab_i_04_mmx otherwise. TAB stays defined after the macro ends and is
; simply redefined by the next instantiation.
; -----------------------------------------------------------------------------
%macro XVID_IDCT_MMX 0
cglobal xvid_idct, 1, 1, 0, block
%if cpuflag(mmxext)
%define TAB tab_i_04_xmm
%else
%define TAB tab_i_04_mmx
%endif
    ; Process each row - beware of rounder offset
    ; Arguments: row index, coefficient table, rounder offset.
    ; Tables are 64 bytes apart and shared by row pairs 0/4, 1/7, 2/6 and 3/5.
    ; The rounder offsets are not in row order, and rows 6 and 7 both use 5*16.
    DCT_8_INV_ROW  0, TAB + 64 * 0, 0*16
    DCT_8_INV_ROW  1, TAB + 64 * 1, 1*16
    DCT_8_INV_ROW  2, TAB + 64 * 2, 2*16
    DCT_8_INV_ROW  3, TAB + 64 * 3, 3*16
    DCT_8_INV_ROW  4, TAB + 64 * 0, 6*16
    DCT_8_INV_ROW  5, TAB + 64 * 3, 4*16
    DCT_8_INV_ROW  6, TAB + 64 * 2, 5*16
    DCT_8_INV_ROW  7, TAB + 64 * 1, 5*16

    ; Process the columns (4 at a time): each call covers one quadword-wide
    ; half of the 16-byte rows.
    DCT_8_INV_COL  r0+0
    DCT_8_INV_COL  r0+8

    RET
%endmacro
977
; Instantiate xvid_idct twice: once for plain MMX and once for MMXEXT. The
; macro picks its row coefficient table from the cpuflag set by INIT_MMX.
INIT_MMX mmx
XVID_IDCT_MMX
INIT_MMX mmxext
XVID_IDCT_MMX
982
983%endif ; ~ARCH_X86_32
984