1;
2; Simple IDCT MMX
3;
4; Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
5;
6; Conversion from gcc syntax to x264asm syntax with minimal modifications
7; by James Darnley <jdarnley@obe.tv>.
8;
9; This file is part of FFmpeg.
10;
11; FFmpeg is free software; you can redistribute it and/or
12; modify it under the terms of the GNU Lesser General Public
13; License as published by the Free Software Foundation; either
14; version 2.1 of the License, or (at your option) any later version.
15;
16; FFmpeg is distributed in the hope that it will be useful,
17; but WITHOUT ANY WARRANTY; without even the implied warranty of
18; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19; Lesser General Public License for more details.
20;
21; You should have received a copy of the GNU Lesser General Public
22; License along with FFmpeg; if not, write to the Free Software
23; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24;/
25
26%include "libavutil/x86/x86util.asm"
27
; Read-only constants used by the IDCT macros that follow.
28SECTION_RODATA
29
30cextern pb_80
31
; Word mask: words 1 and 3 are all-ones, words 0 and 2 are zero.
; DC_COND_IDCT ANDs it with its first input qword before the "all zero?" test.
32wm1010: dw 0, 0xffff, 0, 0xffff
; Low dword = 4 << 16, high dword = 0; added on the DC-only path of DC_COND_IDCT.
33d40000: dd 4 << 16, 0
34
; Unrounded values of cos(i*M_PI/16)*sqrt(2)*(1<<14) for i = 0..7:
35; 23170.475006
36; 22725.260826
37; 21406.727617
38; 19265.545870
39; 16384.000000
40; 12872.826198
41; 8866.956905
42; 4520.335430
43
; 16-bit integer versions of the values above. All are rounded to nearest
; except C4, which is one less than the exact value 16384 (note the "- 0.5").
44%define C0 23170 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
45%define C1 22725 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
46%define C2 21407 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
47%define C3 19266 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
48%define C4 16383 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
49%define C5 12873 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
50%define C6 8867  ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
51%define C7 4520  ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
52
53%define ROW_SHIFT 11
54%define COL_SHIFT 20 ; 6
55
; The macros address this table with hard-coded byte offsets ([coeffs + N]),
; so the order and size of the rows must not change. Every group of four
; words is 8 bytes; words are listed lowest address first, i.e. in the
; opposite order to the "high word first" register comments in the macros.
56coeffs:
57    dw 1 << (ROW_SHIFT - 1), 0          ; +0:   rounder, each dword = 1 << (ROW_SHIFT - 1)
58    dw 1 << (ROW_SHIFT - 1), 0
59    dw 1 << (ROW_SHIFT - 1), 1          ; +8:   rounder, low dword additionally has bit 16 set
60    dw 1 << (ROW_SHIFT - 1), 0          ;       NOTE(review): the extra bit presumably pre-adds rounding for a later pass - confirm
61
62    dw C4,  C4,  C4,  C4                ; +16:  multiplies {x0, x4} -> C4*x0 + C4*x4
63    dw C4, -C4,  C4, -C4                ; +24:  multiplies {x0, x4} -> C4*x0 - C4*x4
64
65    dw C2,  C6,  C2,  C6                ; +32:  multiplies {x2, x6} -> C2*x2 + C6*x6
66    dw C6, -C2,  C6, -C2                ; +40:  multiplies {x2, x6} -> C6*x2 - C2*x6
67
68    dw C1,  C3,  C1,  C3                ; +48:  multiplies {x1, x3}, part of B0
69    dw C5,  C7,  C5,  C7                ; +56:  multiplies {x5, x7}, part of B0
70
71    dw C3, -C7,  C3, -C7                ; +64:  multiplies {x1, x3}, part of B1
72    dw -C1, -C5, -C1, -C5               ; +72:  multiplies {x5, x7}, part of B1
73
74    dw C5, -C1,  C5, -C1                ; +80:  multiplies {x1, x3}, part of B2
75    dw C7,  C3,  C7,  C3                ; +88:  multiplies {x5, x7}, part of B2
76
77    dw C7, -C5,  C7, -C5                ; +96:  multiplies {x1, x3}, part of B3
78    dw C3, -C1,  C3, -C1                ; +104: multiplies {x5, x7}, part of B3
79
80SECTION .text
81
; DC_COND_IDCT off0, off1, off2, off3, dst, (unused), shift
;
; IDCT butterfly over two 32-bit lanes with a shortcut for DC-only input.
;   %1..%4  byte offsets added to blockq; each selects one input qword whose
;           word layout is given in the load comments (high word first)
;   %5      destination address expression; 32 bytes are written at %5..%5+31
;   %6      not referenced by the macro body
;   %7      shift count applied with psrad to every result of the full path
;
; Comment notation: upper-case / lower-case names denote the upper / lower
; 32-bit lane of an MMX register; the two lanes are computed independently.
;
; Zero test: words 0 and 2 of the %1 qword (r0 / R0) are masked out with
; wm1010 and everything else is ORed together. If the result is zero the
; short path at %%1 runs, otherwise the full butterfly runs.
;
; Full path: even part A0..A3 from %1/%2 (with the rounder at [coeffs + 8]
; added), odd part B0..B3 from %3/%4, then (An +/- Bn) >> %7 packed with
; signed saturation. Output qwords, high word first:
;   [%5]      A1+B1  a1+b1  A0+B0  a0+b0
;   [8 + %5]  A3+B3  a3+b3  A2+B2  a2+b2
;   [16 + %5] A2-B2  a2-b2  A3-B3  a3-b3
;   [24 + %5] A0-B0  a0-b0  A1-B1  a1-b1
;
; Clobbers mm0-mm7, t0d and the flags.
82%macro DC_COND_IDCT 7
83    movq            mm0, [blockq + %1]  ; R4     R0      r4      r0
84    movq            mm1, [blockq + %2]  ; R6     R2      r6      r2
85    movq            mm2, [blockq + %3]  ; R3     R1      r3      r1
86    movq            mm3, [blockq + %4]  ; R7     R5      r7      r5
    ; OR together every input word except r0 / R0; all-zero means DC only.
87    movq            mm4, [wm1010]
88    pand            mm4, mm0
89    por             mm4, mm1
90    por             mm4, mm2
91    por             mm4, mm3
92    packssdw        mm4, mm4            ; saturating pack keeps "non-zero" per dword, so 32 bits suffice
93    movd            t0d, mm4
94    or              t0d, t0d            ; sets ZF when t0d is zero
95    jz              %%1
    ; Full path: even-part products, odd-part products, then butterflies.
96    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
97    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
98    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
99    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
100    movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
101    pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
102    movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
103    pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
104    movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
105    pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
106    paddd           mm4, [coeffs + 8]   ; add rounder to the C4R4+C4R0 term
107    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
108    paddd           mm4, mm5            ; A0             a0
109    psubd           mm6, mm5            ; A3             a3
110    movq            mm5, [coeffs + 56]  ; C7     C5      C7      C5
111    pmaddwd         mm5, mm3            ; C7R7+C5R5      C7r7+C5r5
112    paddd           mm0, [coeffs + 8]   ; add rounder to the -C4R4+C4R0 term
113    paddd           mm1, mm0            ; A1             a1
114    paddd           mm0, mm0            ; 2 * (-C4R4+C4R0 + rounder)
115    psubd           mm0, mm1            ; A2             a2
116    pmaddwd         mm2, [coeffs + 64]  ; -C7R3+C3R1     -C7r3+C3r1
117    paddd           mm7, mm5            ; B0             b0
118    movq            mm5, [coeffs + 72]  ; -C5    -C1     -C5     -C1
119    pmaddwd         mm5, mm3            ; -C5R7-C1R5     -C5r7-C1r5
120    paddd           mm7, mm4            ; A0+B0          a0+b0
121    paddd           mm4, mm4            ; 2A0            2a0
122    psubd           mm4, mm7            ; A0-B0          a0-b0
123    paddd           mm5, mm2            ; B1             b1
124    psrad           mm7, %7
125    psrad           mm4, %7
126    movq            mm2, mm1            ; A1             a1
127    paddd           mm1, mm5            ; A1+B1          a1+b1
128    psubd           mm2, mm5            ; A1-B1          a1-b1
129    psrad           mm1, %7
130    psrad           mm2, %7
131    packssdw        mm7, mm1            ; A1+B1  a1+b1   A0+B0   a0+b0
132    packssdw        mm2, mm4            ; A0-B0  a0-b0   A1-B1   a1-b1
133    movq           [%5], mm7
134    movq            mm1, [blockq + %3]  ; R3     R1      r3      r1 (reloaded: mm2 was overwritten above)
135    movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
136    movq      [24 + %5], mm2
137    pmaddwd         mm4, mm1            ; -C1R3+C5R1     -C1r3+C5r1
138    movq            mm7, [coeffs + 88]  ; C3     C7      C3      C7
139    pmaddwd         mm1, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
140    pmaddwd         mm7, mm3            ; C3R7+C7R5      C3r7+C7r5
141    movq            mm2, mm0            ; A2             a2
142    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
143    paddd           mm4, mm7            ; B2             b2
144    paddd           mm2, mm4            ; A2+B2          a2+b2
145    psubd           mm0, mm4            ; A2-B2          a2-b2
146    psrad           mm2, %7
147    psrad           mm0, %7
148    movq            mm4, mm6            ; A3             a3
149    paddd           mm3, mm1            ; B3             b3
150    paddd           mm6, mm3            ; A3+B3          a3+b3
151    psubd           mm4, mm3            ; A3-B3          a3-b3
152    psrad           mm6, %7
153    packssdw        mm2, mm6            ; A3+B3  a3+b3   A2+B2   a2+b2
154    movq       [8 + %5], mm2
155    psrad           mm4, %7
156    packssdw        mm4, mm0            ; A2-B2  a2-b2   A3-B3   a3-b3
157    movq      [16 + %5], mm4
158    jmp             %%2
    ; DC-only path: mm0 still holds the %1 qword. Per dword: move r0 / R0 into
    ; the upper half, add [d40000], shift right arithmetically by 13, pack,
    ; and store the same qword to all four output slots.
    ; NOTE(review): this path uses a fixed shift of 13, not %7 - presumably it
    ; is only correct for the shift value the callers pass; confirm.
159%%1:
160    pslld           mm0, 16
161    paddd           mm0, [d40000]
162    psrad           mm0, 13
163    packssdw        mm0, mm0
164    movq           [%5], mm0
165    movq       [8 + %5], mm0
166    movq      [16 + %5], mm0
167    movq      [24 + %5], mm0
168%%2:
169%endmacro
170
; Z_COND_IDCT off0, off1, off2, off3, dst, (unused), shift, skip_label
;
; IDCT butterfly over two 32-bit lanes that is skipped entirely when all
; inputs are zero.
;   %1..%4  byte offsets added to blockq; each selects one input qword whose
;           word layout is given in the load comments (high word first)
;   %5      destination address expression; 32 bytes are written at %5..%5+31
;   %6      not referenced by the macro body
;   %7      shift count applied with psrad to every result
;   %8      label jumped to, without storing anything, when all four input
;           qwords are zero
;
; Comment notation: upper-case / lower-case names denote the upper / lower
; 32-bit lane of an MMX register; the two lanes are computed independently.
;
; Unlike DC_COND_IDCT, the zero test includes every input word and the
; rounder is taken from [coeffs] rather than [coeffs + 8]. The output layout
; is the same:
;   [%5]      A1+B1  a1+b1  A0+B0  a0+b0
;   [8 + %5]  A3+B3  a3+b3  A2+B2  a2+b2
;   [16 + %5] A2-B2  a2-b2  A3-B3  a3-b3
;   [24 + %5] A0-B0  a0-b0  A1-B1  a1-b1
;
; Clobbers mm0-mm7, t0d and the flags.
171%macro Z_COND_IDCT 8
172    movq            mm0, [blockq + %1]  ; R4     R0      r4      r0
173    movq            mm1, [blockq + %2]  ; R6     R2      r6      r2
174    movq            mm2, [blockq + %3]  ; R3     R1      r3      r1
175    movq            mm3, [blockq + %4]  ; R7     R5      r7      r5
    ; OR together all sixteen input words; branch away if they are all zero.
176    movq            mm4, mm0
177    por             mm4, mm1
178    por             mm4, mm2
179    por             mm4, mm3
180    packssdw        mm4, mm4            ; saturating pack keeps "non-zero" per dword, so 32 bits suffice
181    movd            t0d, mm4
182    or              t0d, t0d            ; sets ZF when t0d is zero
183    jz               %8
184    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
185    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
186    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
187    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
188    movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
189    pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
190    movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
191    pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
192    movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
193    pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
194    paddd           mm4, [coeffs]       ; add rounder to the C4R4+C4R0 term
195    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
196    paddd           mm4, mm5            ; A0             a0
197    psubd           mm6, mm5            ; A3             a3
198    movq            mm5, [coeffs + 56]  ; C7     C5      C7      C5
199    pmaddwd         mm5, mm3            ; C7R7+C5R5      C7r7+C5r5
200    paddd           mm0, [coeffs]       ; add rounder to the -C4R4+C4R0 term
201    paddd           mm1, mm0            ; A1             a1
202    paddd           mm0, mm0            ; 2 * (-C4R4+C4R0 + rounder)
203    psubd           mm0, mm1            ; A2             a2
204    pmaddwd         mm2, [coeffs + 64]  ; -C7R3+C3R1     -C7r3+C3r1
205    paddd           mm7, mm5            ; B0             b0
206    movq            mm5, [coeffs + 72]  ; -C5    -C1     -C5     -C1
207    pmaddwd         mm5, mm3            ; -C5R7-C1R5     -C5r7-C1r5
208    paddd           mm7, mm4            ; A0+B0          a0+b0
209    paddd           mm4, mm4            ; 2A0            2a0
210    psubd           mm4, mm7            ; A0-B0          a0-b0
211    paddd           mm5, mm2            ; B1             b1
212    psrad           mm7, %7
213    psrad           mm4, %7
214    movq            mm2, mm1            ; A1             a1
215    paddd           mm1, mm5            ; A1+B1          a1+b1
216    psubd           mm2, mm5            ; A1-B1          a1-b1
217    psrad           mm1, %7
218    psrad           mm2, %7
219    packssdw        mm7, mm1            ; A1+B1  a1+b1   A0+B0   a0+b0
220    packssdw        mm2, mm4            ; A0-B0  a0-b0   A1-B1   a1-b1
221    movq           [%5], mm7
222    movq            mm1, [blockq + %3]  ; R3     R1      r3      r1 (reloaded: mm2 was overwritten above)
223    movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
224    movq      [24 + %5], mm2
225    pmaddwd         mm4, mm1            ; -C1R3+C5R1     -C1r3+C5r1
226    movq            mm7, [coeffs + 88]  ; C3     C7      C3      C7
227    pmaddwd         mm1, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
228    pmaddwd         mm7, mm3            ; C3R7+C7R5      C3r7+C7r5
229    movq            mm2, mm0            ; A2             a2
230    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
231    paddd           mm4, mm7            ; B2             b2
232    paddd           mm2, mm4            ; A2+B2          a2+b2
233    psubd           mm0, mm4            ; A2-B2          a2-b2
234    psrad           mm2, %7
235    psrad           mm0, %7
236    movq            mm4, mm6            ; A3             a3
237    paddd           mm3, mm1            ; B3             b3
238    paddd           mm6, mm3            ; A3+B3          a3+b3
239    psubd           mm4, mm3            ; A3-B3          a3-b3
240    psrad           mm6, %7
241    packssdw        mm2, mm6            ; A3+B3  a3+b3   A2+B2   a2+b2
242    movq       [8 + %5], mm2
243    psrad           mm4, %7
244    packssdw        mm4, mm0            ; A2-B2  a2-b2   A3-B3   a3-b3
245    movq      [16 + %5], mm4
246%endmacro
247
; IDCT1 in04, in26, in13, in57, dst, shift
;
; Full IDCT butterfly over two 32-bit lanes, using all four input qwords.
;   %1..%4  source operands for movq (used as written, not wrapped in []);
;           word layout is given in the load comments (high word first)
;   %5      destination address expression
;   %6      shift count applied with psrad to every result
;
; Comment notation: upper-case / lower-case names denote the upper / lower
; 32-bit lane of an MMX register; the two lanes are computed independently.
;
; No rounding constant is added in this macro. Each result pair is packed
; with signed saturation into two words and stored with movd (4 bytes) at a
; 16-byte stride:
;   [%5]       A0+B0      [16 + %5]  A1+B1
;   [32 + %5]  A2+B2      [48 + %5]  A3+B3
;   [64 + %5]  A3-B3      [80 + %5]  A2-B2
;   [96 + %5]  A1-B1      [112 + %5] A0-B0
;
; Clobbers mm0-mm7.
248%macro IDCT1 6
249    movq            mm0, %1             ; R4     R0      r4      r0
250    movq            mm1, %2             ; R6     R2      r6      r2
251    movq            mm2, %3             ; R3     R1      r3      r1
252    movq            mm3, %4             ; R7     R5      r7      r5
253    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
254    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
255    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
256    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
257    movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
258    pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
259    movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
260    pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
261    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
262    movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
263    pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
264    paddd           mm4, mm5            ; A0             a0
265    psubd           mm6, mm5            ; A3             a3
266    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
267    paddd           mm0, mm1            ; A1             a1
268    psubd           mm5, mm1            ; A2             a2
269    movq            mm1, [coeffs + 56]  ; C7     C5      C7      C5
270    pmaddwd         mm1, mm3            ; C7R7+C5R5      C7r7+C5r5
271    pmaddwd         mm2, [coeffs + 64]  ; -C7R3+C3R1     -C7r3+C3r1
272    paddd           mm7, mm1            ; B0             b0
273    movq            mm1, [coeffs + 72]  ; -C5    -C1     -C5     -C1
274    pmaddwd         mm1, mm3            ; -C5R7-C1R5     -C5r7-C1r5
275    paddd           mm7, mm4            ; A0+B0          a0+b0
276    paddd           mm4, mm4            ; 2A0            2a0
277    psubd           mm4, mm7            ; A0-B0          a0-b0
278    paddd           mm1, mm2            ; B1             b1
279    psrad           mm7, %6
280    psrad           mm4, %6
281    movq            mm2, mm0            ; A1             a1
282    paddd           mm0, mm1            ; A1+B1          a1+b1
283    psubd           mm2, mm1            ; A1-B1          a1-b1
284    psrad           mm0, %6
285    psrad           mm2, %6
286    packssdw        mm7, mm7            ; A0+B0  a0+b0
287    movd           [%5], mm7
288    packssdw        mm0, mm0            ; A1+B1  a1+b1
289    movd      [16 + %5], mm0
290    packssdw        mm2, mm2            ; A1-B1  a1-b1
291    movd      [96 + %5], mm2
292    packssdw        mm4, mm4            ; A0-B0  a0-b0
293    movd     [112 + %5], mm4
294    movq            mm0, %3             ; R3     R1      r3      r1 (reloaded: mm2 was overwritten above)
295    movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
296    pmaddwd         mm4, mm0            ; -C1R3+C5R1     -C1r3+C5r1
297    movq            mm7, [coeffs + 88]  ; C3     C7      C3      C7
298    pmaddwd         mm0, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
299    pmaddwd         mm7, mm3            ; C3R7+C7R5      C3r7+C7r5
300    movq            mm2, mm5            ; A2             a2
301    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
302    paddd           mm4, mm7            ; B2             b2
303    paddd           mm2, mm4            ; A2+B2          a2+b2
304    psubd           mm5, mm4            ; A2-B2          a2-b2
305    psrad           mm2, %6
306    psrad           mm5, %6
307    movq            mm4, mm6            ; A3             a3
308    paddd           mm3, mm0            ; B3             b3
309    paddd           mm6, mm3            ; A3+B3          a3+b3
310    psubd           mm4, mm3            ; A3-B3          a3-b3
311    psrad           mm6, %6
312    psrad           mm4, %6
313    packssdw        mm2, mm2            ; A2+B2  a2+b2
314    packssdw        mm6, mm6            ; A3+B3  a3+b3
315    movd      [32 + %5], mm2
316    packssdw        mm4, mm4            ; A3-B3  a3-b3
317    packssdw        mm5, mm5            ; A2-B2  a2-b2
318    movd      [48 + %5], mm6
319    movd      [64 + %5], mm4
320    movd      [80 + %5], mm5
321%endmacro
322
; IDCT2 in04, in26, (unused), in57, dst, shift
;
; Reduced IDCT butterfly over two 32-bit lanes: %3 (the R3/R1 qword in the
; other variants) is never read, so the result is what the full butterfly
; would produce if those inputs were zero.
;   %1, %2, %4  source operands for movq (used as written, not wrapped in []);
;               word layout is given in the load comments (high word first)
;   %3          not referenced by the macro body
;   %5          destination address expression
;   %6          shift count applied with psrad to every result
;
; Comment notation: upper-case / lower-case names denote the upper / lower
; 32-bit lane of an MMX register; the two lanes are computed independently.
; Here each Bn consists only of its %4 (R5/R7) product.
;
; No rounding constant is added in this macro. Each result pair is packed
; with signed saturation into two words and stored with movd (4 bytes) at a
; 16-byte stride:
;   [%5]       A0+B0      [16 + %5]  A1+B1
;   [32 + %5]  A2+B2      [48 + %5]  A3+B3
;   [64 + %5]  A3-B3      [80 + %5]  A2-B2
;   [96 + %5]  A1-B1      [112 + %5] A0-B0
;
; Clobbers mm0-mm7.
323%macro IDCT2 6
324    movq            mm0, %1             ; R4     R0      r4      r0
325    movq            mm1, %2             ; R6     R2      r6      r2
326    movq            mm3, %4             ; R7     R5      r7      r5
327    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
328    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
329    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
330    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
331    movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
332    pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
333    movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
334    pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
335    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
336    paddd           mm4, mm5            ; A0             a0
337    psubd           mm6, mm5            ; A3             a3
338    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
339    paddd           mm0, mm1            ; A1             a1
340    psubd           mm5, mm1            ; A2             a2
341    movq            mm1, [coeffs + 56]  ; C7     C5      C7      C5
342    pmaddwd         mm1, mm3            ; C7R7+C5R5      C7r7+C5r5   (= B0 here)
343    movq            mm7, [coeffs + 72]  ; -C5    -C1     -C5     -C1
344    pmaddwd         mm7, mm3            ; -C5R7-C1R5     -C5r7-C1r5  (= B1 here)
345    paddd           mm1, mm4            ; A0+B0          a0+b0
346    paddd           mm4, mm4            ; 2A0            2a0
347    psubd           mm4, mm1            ; A0-B0          a0-b0
348    psrad           mm1, %6
349    psrad           mm4, %6
350    movq            mm2, mm0            ; A1             a1
351    paddd           mm0, mm7            ; A1+B1          a1+b1
352    psubd           mm2, mm7            ; A1-B1          a1-b1
353    psrad           mm0, %6
354    psrad           mm2, %6
355    packssdw        mm1, mm1            ; A0+B0  a0+b0
356    movd           [%5], mm1
357    packssdw        mm0, mm0            ; A1+B1  a1+b1
358    movd      [16 + %5], mm0
359    packssdw        mm2, mm2            ; A1-B1  a1-b1
360    movd      [96 + %5], mm2
361    packssdw        mm4, mm4            ; A0-B0  a0-b0
362    movd     [112 + %5], mm4
363    movq            mm1, [coeffs + 88]  ; C3     C7      C3      C7
364    pmaddwd         mm1, mm3            ; C3R7+C7R5      C3r7+C7r5   (= B2 here)
365    movq            mm2, mm5            ; A2             a2
366    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5  (= B3 here)
367    paddd           mm2, mm1            ; A2+B2          a2+b2
368    psubd           mm5, mm1            ; A2-B2          a2-b2
369    psrad           mm2, %6
370    psrad           mm5, %6
371    movq            mm1, mm6            ; A3             a3
372    paddd           mm6, mm3            ; A3+B3          a3+b3
373    psubd           mm1, mm3            ; A3-B3          a3-b3
374    psrad           mm6, %6
375    psrad           mm1, %6
376    packssdw        mm2, mm2            ; A2+B2  a2+b2
377    packssdw        mm6, mm6            ; A3+B3  a3+b3
378    movd      [32 + %5], mm2
379    packssdw        mm1, mm1            ; A3-B3  a3-b3
380    packssdw        mm5, mm5            ; A2-B2  a2-b2
381    movd      [48 + %5], mm6
382    movd      [64 + %5], mm1
383    movd      [80 + %5], mm5
384%endmacro
385
; IDCT3 in04, (unused), (unused), in57, dst, shift
;
; Reduced IDCT butterfly over two 32-bit lanes: %2 and %3 (the R6/R2 and
; R3/R1 qwords in the other variants) are never read, so the result is what
; the full butterfly would produce if those inputs were zero.
;   %1, %4  source operands for movq (used as written, not wrapped in []);
;           word layout is given in the load comments (high word first)
;   %2, %3  not referenced by the macro body
;   %5      destination address expression
;   %6      shift count applied with psrad to every result
;
; Comment notation: upper-case / lower-case names denote the upper / lower
; 32-bit lane of an MMX register; the two lanes are computed independently.
; Here A0 = A3 = C4R4+C4R0, A1 = A2 = -C4R4+C4R0, and each Bn consists only
; of its %4 (R5/R7) product.
;
; No rounding constant is added in this macro. Each result pair is packed
; with signed saturation into two words and stored with movd (4 bytes) at a
; 16-byte stride:
;   [%5]       A0+B0      [16 + %5]  A1+B1
;   [32 + %5]  A2+B2      [48 + %5]  A3+B3
;   [64 + %5]  A3-B3      [80 + %5]  A2-B2
;   [96 + %5]  A1-B1      [112 + %5] A0-B0
;
; Clobbers mm0-mm7.
386%macro IDCT3 6
387    movq            mm0, %1             ; R4     R0      r4      r0
388    movq            mm3, %4             ; R7     R5      r7      r5
389    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
390    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0   (= A0 here)
391    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
392    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0  (= A1 here)
393    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0   (= A3 here)
394    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0  (= A2 here)
395    movq            mm1, [coeffs + 56]  ; C7     C5      C7      C5
396    pmaddwd         mm1, mm3            ; C7R7+C5R5      C7r7+C5r5   (= B0 here)
397    movq            mm7, [coeffs + 72]  ; -C5    -C1     -C5     -C1
398    pmaddwd         mm7, mm3            ; -C5R7-C1R5     -C5r7-C1r5  (= B1 here)
399    paddd           mm1, mm4            ; A0+B0          a0+b0
400    paddd           mm4, mm4            ; 2A0            2a0
401    psubd           mm4, mm1            ; A0-B0          a0-b0
402    psrad           mm1, %6
403    psrad           mm4, %6
404    movq            mm2, mm0            ; A1             a1
405    paddd           mm0, mm7            ; A1+B1          a1+b1
406    psubd           mm2, mm7            ; A1-B1          a1-b1
407    psrad           mm0, %6
408    psrad           mm2, %6
409    packssdw        mm1, mm1            ; A0+B0  a0+b0
410    movd           [%5], mm1
411    packssdw        mm0, mm0            ; A1+B1  a1+b1
412    movd      [16 + %5], mm0
413    packssdw        mm2, mm2            ; A1-B1  a1-b1
414    movd      [96 + %5], mm2
415    packssdw        mm4, mm4            ; A0-B0  a0-b0
416    movd     [112 + %5], mm4
417    movq            mm1, [coeffs + 88]  ; C3     C7      C3      C7
418    pmaddwd         mm1, mm3            ; C3R7+C7R5      C3r7+C7r5   (= B2 here)
419    movq            mm2, mm5            ; A2             a2
420    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5  (= B3 here)
421    paddd           mm2, mm1            ; A2+B2          a2+b2
422    psubd           mm5, mm1            ; A2-B2          a2-b2
423    psrad           mm2, %6
424    psrad           mm5, %6
425    movq            mm1, mm6            ; A3             a3
426    paddd           mm6, mm3            ; A3+B3          a3+b3
427    psubd           mm1, mm3            ; A3-B3          a3-b3
428    psrad           mm6, %6
429    psrad           mm1, %6
430    packssdw        mm2, mm2            ; A2+B2  a2+b2
431    packssdw        mm6, mm6            ; A3+B3  a3+b3
432    movd      [32 + %5], mm2
433    packssdw        mm1, mm1            ; A3-B3  a3-b3
434    packssdw        mm5, mm5            ; A2-B2  a2-b2
435    movd      [48 + %5], mm6
436    movd      [64 + %5], mm1
437    movd      [80 + %5], mm5
438%endmacro
439
; IDCT4 in04, (unused), in13, in57, dst, shift
;
; Reduced IDCT butterfly over two 32-bit lanes: %2 (the R6/R2 qword in the
; other variants) is never read, so the result is what the full butterfly
; would produce if those inputs were zero.
;   %1, %3, %4  source operands for movq (used as written, not wrapped in []);
;               word layout is given in the load comments (high word first)
;   %2          not referenced by the macro body
;   %5          destination address expression
;   %6          shift count applied with psrad to every result
;
; Comment notation: upper-case / lower-case names denote the upper / lower
; 32-bit lane of an MMX register; the two lanes are computed independently.
; Here A0 = A3 = C4R4+C4R0 and A1 = A2 = -C4R4+C4R0.
;
; No rounding constant is added in this macro. Each result pair is packed
; with signed saturation into two words and stored with movd (4 bytes) at a
; 16-byte stride:
;   [%5]       A0+B0      [16 + %5]  A1+B1
;   [32 + %5]  A2+B2      [48 + %5]  A3+B3
;   [64 + %5]  A3-B3      [80 + %5]  A2-B2
;   [96 + %5]  A1-B1      [112 + %5] A0-B0
;
; Clobbers mm0-mm7.
440%macro IDCT4 6
441    movq            mm0, %1             ; R4     R0      r4      r0
442    movq            mm2, %3             ; R3     R1      r3      r1
443    movq            mm3, %4             ; R7     R5      r7      r5
444    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
445    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0   (= A0 here)
446    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
447    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0  (= A1 here)
448    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0   (= A3 here)
449    movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
450    pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
451    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0  (= A2 here)
452    movq            mm1, [coeffs + 56]  ; C7     C5      C7      C5
453    pmaddwd         mm1, mm3            ; C7R7+C5R5      C7r7+C5r5
454    pmaddwd         mm2, [coeffs + 64]  ; -C7R3+C3R1     -C7r3+C3r1
455    paddd           mm7, mm1            ; B0             b0
456    movq            mm1, [coeffs + 72]  ; -C5    -C1     -C5     -C1
457    pmaddwd         mm1, mm3            ; -C5R7-C1R5     -C5r7-C1r5
458    paddd           mm7, mm4            ; A0+B0          a0+b0
459    paddd           mm4, mm4            ; 2A0            2a0
460    psubd           mm4, mm7            ; A0-B0          a0-b0
461    paddd           mm1, mm2            ; B1             b1
462    psrad           mm7, %6
463    psrad           mm4, %6
464    movq            mm2, mm0            ; A1             a1
465    paddd           mm0, mm1            ; A1+B1          a1+b1
466    psubd           mm2, mm1            ; A1-B1          a1-b1
467    psrad           mm0, %6
468    psrad           mm2, %6
469    packssdw        mm7, mm7            ; A0+B0  a0+b0
470    movd           [%5], mm7
471    packssdw        mm0, mm0            ; A1+B1  a1+b1
472    movd      [16 + %5], mm0
473    packssdw        mm2, mm2            ; A1-B1  a1-b1
474    movd      [96 + %5], mm2
475    packssdw        mm4, mm4            ; A0-B0  a0-b0
476    movd     [112 + %5], mm4
477    movq            mm0, %3             ; R3     R1      r3      r1 (reloaded: mm2 was overwritten above)
478    movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
479    pmaddwd         mm4, mm0            ; -C1R3+C5R1     -C1r3+C5r1
480    movq            mm7, [coeffs + 88]  ; C3     C7      C3      C7
481    pmaddwd         mm0, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
482    pmaddwd         mm7, mm3            ; C3R7+C7R5      C3r7+C7r5
483    movq            mm2, mm5            ; A2             a2
484    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
485    paddd           mm4, mm7            ; B2             b2
486    paddd           mm2, mm4            ; A2+B2          a2+b2
487    psubd           mm5, mm4            ; A2-B2          a2-b2
488    psrad           mm2, %6
489    psrad           mm5, %6
490    movq            mm4, mm6            ; A3             a3
491    paddd           mm3, mm0            ; B3             b3
492    paddd           mm6, mm3            ; A3+B3          a3+b3
493    psubd           mm4, mm3            ; A3-B3          a3-b3
494    psrad           mm6, %6
495    psrad           mm4, %6
496    packssdw        mm2, mm2            ; A2+B2  a2+b2
497    packssdw        mm6, mm6            ; A3+B3  a3+b3
498    movd      [32 + %5], mm2
499    packssdw        mm4, mm4            ; A3-B3  a3-b3
500    packssdw        mm5, mm5            ; A2-B2  a2-b2
501    movd      [48 + %5], mm6
502    movd      [64 + %5], mm4
503    movd      [80 + %5], mm5
504%endmacro
505
; IDCT5 in04, (unused), in13, (unused), dst, shift
;
; Reduced IDCT butterfly over two 32-bit lanes: %2 and %4 (the R6/R2 and
; R7/R5 qwords in the other variants) are never read, so the result is what
; the full butterfly would produce if those inputs were zero.
;   %1, %3  source operands for movq (used as written, not wrapped in []);
;           word layout is given in the load comments (high word first)
;   %2, %4  not referenced by the macro body
;   %5      destination address expression
;   %6      shift count applied with psrad to every result
;
; Comment notation: upper-case / lower-case names denote the upper / lower
; 32-bit lane of an MMX register; the two lanes are computed independently.
; Here A0 = A3 = C4R4+C4R0, A1 = A2 = -C4R4+C4R0, and each Bn consists only
; of its %3 (R1/R3) product.
;
; No rounding constant is added in this macro. Each result pair is packed
; with signed saturation into two words and stored with movd (4 bytes) at a
; 16-byte stride:
;   [%5]       A0+B0      [16 + %5]  A1+B1
;   [32 + %5]  A2+B2      [48 + %5]  A3+B3
;   [64 + %5]  A3-B3      [80 + %5]  A2-B2
;   [96 + %5]  A1-B1      [112 + %5] A0-B0
;
; Clobbers mm0-mm7.
506%macro IDCT5 6
507    movq            mm0, %1             ; R4     R0      r4      r0
508    movq            mm2, %3             ; R3     R1      r3      r1
509    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
510    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0   (= A0 here)
511    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
512    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0  (= A1 here)
513    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0   (= A3 here)
514    movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
515    pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1   (= B0 here)
516    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0  (= A2 here)
517    movq            mm3, [coeffs + 64]  ; -C7    C3      -C7     C3
518    pmaddwd         mm3, mm2            ; -C7R3+C3R1     -C7r3+C3r1  (= B1 here)
519    paddd           mm7, mm4            ; A0+B0          a0+b0
520    paddd           mm4, mm4            ; 2A0            2a0
521    psubd           mm4, mm7            ; A0-B0          a0-b0
522    psrad           mm7, %6
523    psrad           mm4, %6
524    movq            mm1, mm0            ; A1             a1
525    paddd           mm0, mm3            ; A1+B1          a1+b1
526    psubd           mm1, mm3            ; A1-B1          a1-b1
527    psrad           mm0, %6
528    psrad           mm1, %6
529    packssdw        mm7, mm7            ; A0+B0  a0+b0
530    movd           [%5], mm7
531    packssdw        mm0, mm0            ; A1+B1  a1+b1
532    movd      [16 + %5], mm0
533    packssdw        mm1, mm1            ; A1-B1  a1-b1
534    movd      [96 + %5], mm1
535    packssdw        mm4, mm4            ; A0-B0  a0-b0
536    movd     [112 + %5], mm4
537    movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
538    pmaddwd         mm4, mm2            ; -C1R3+C5R1     -C1r3+C5r1  (= B2 here)
539    pmaddwd         mm2, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1  (= B3 here)
540    movq            mm1, mm5            ; A2             a2
541    paddd           mm1, mm4            ; A2+B2          a2+b2
542    psubd           mm5, mm4            ; A2-B2          a2-b2
543    psrad           mm1, %6
544    psrad           mm5, %6
545    movq            mm4, mm6            ; A3             a3
546    paddd           mm6, mm2            ; A3+B3          a3+b3
547    psubd           mm4, mm2            ; A3-B3          a3-b3
548    psrad           mm6, %6
549    psrad           mm4, %6
550    packssdw        mm1, mm1            ; A2+B2  a2+b2
551    packssdw        mm6, mm6            ; A3+B3  a3+b3
552    movd      [32 + %5], mm1
553    packssdw        mm4, mm4            ; A3-B3  a3-b3
554    packssdw        mm5, mm5            ; A2-B2  a2-b2
555    movd      [48 + %5], mm6
556    movd      [64 + %5], mm4
557    movd      [80 + %5], mm5
558%endmacro
559
; IDCT6 src04, src26, (unused), (unused), dst, shift
;
; Even-part-only IDCT butterfly over four 32-bit lanes at once. %3 and %4
; (the odd-input qwords in the other variants) are never read, so no B terms
; exist and each output pair (An+Bn, An-Bn) collapses to the same value An.
;   %1, %2  address expressions; unlike IDCT1-IDCT5 the macro wraps them in
;           [] itself and reads 16 bytes from each ([%1], [8 + %1], [%2],
;           [8 + %2])
;   %3, %4  not referenced by the macro body
;   %5      destination address expression
;   %6      shift count applied with psrad to every result
;
; The first qword pair is processed in mm0/mm1/mm4-mm6 and the second pair
; (at +8) in mm2/mm3/mm7; results are packed together with signed
; saturation so each store is a full qword (four words):
;   [%5]      and [112 + %5]  A0
;   [16 + %5] and [96 + %5]   A1
;   [32 + %5] and [80 + %5]   A2
;   [48 + %5] and [64 + %5]   A3
;
; No rounding constant is added in this macro. Clobbers mm0-mm7.
560%macro IDCT6 6
561    movq            mm0, [%1]           ; R4     R0      r4      r0
562    movq            mm1, [%2]           ; R6     R2      r6      r2
563    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
564    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
565    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
566    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
567    movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
568    pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
569    movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
570    pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
571    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
572    paddd           mm4, mm5            ; A0             a0
573    psubd           mm6, mm5            ; A3             a3
574    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
575    paddd           mm0, mm1            ; A1             a1
576    psubd           mm5, mm1            ; A2             a2
    ; Same even-part computation for the second qword pair (at +8).
577    movq            mm2, [8 + %1]       ; R4     R0      r4      r0
578    movq            mm3, [8 + %2]       ; R6     R2      r6      r2
579    movq            mm1, [coeffs + 16]  ; C4     C4      C4      C4
580    pmaddwd         mm1, mm2            ; C4R4+C4R0      C4r4+C4r0
581    movq            mm7, [coeffs + 24]  ; -C4    C4      -C4     C4
582    pmaddwd         mm2, mm7            ; -C4R4+C4R0     -C4r4+C4r0
583    movq            mm7, [coeffs + 32]  ; C6     C2      C6      C2
584    pmaddwd         mm7, mm3            ; C6R6+C2R2      C6r6+C2r2
585    pmaddwd         mm3, [coeffs + 40]  ; -C2R6+C6R2     -C2r6+C6r2
586    paddd           mm7, mm1            ; A0             a0
587    paddd           mm1, mm1            ; 2C0            2c0
588    psubd           mm1, mm7            ; A3             a3
589    paddd           mm3, mm2            ; A1             a1
590    paddd           mm2, mm2            ; 2C1            2c1
591    psubd           mm2, mm3            ; A2             a2
    ; Shift, pack first-pair and second-pair results together, and store
    ; each An to its two mirrored output positions.
592    psrad           mm4, %6
593    psrad           mm7, %6
594    psrad           mm3, %6
595    packssdw        mm4, mm7            ; A0 of second pair (high), A0 of first pair (low)
596    movq           [%5], mm4
597    psrad           mm0, %6
598    packssdw        mm0, mm3            ; A1 of second pair (high), A1 of first pair (low)
599    movq      [16 + %5], mm0
600    movq      [96 + %5], mm0
601    movq     [112 + %5], mm4
602    psrad           mm5, %6
603    psrad           mm6, %6
604    psrad           mm2, %6
605    packssdw        mm5, mm2            ; A2 of second pair (high), A2 of first pair (low)
606    movq      [32 + %5], mm5
607    psrad           mm1, %6
608    packssdw        mm6, mm1            ; A3 of second pair (high), A3 of first pair (low)
609    movq      [48 + %5], mm6
610    movq      [64 + %5], mm6
611    movq      [80 + %5], mm5
612%endmacro
613
; IDCT7 in04, in26, in13, (unused), dst, shift
;
; Column kernel for the case where the odd part is built from the R1/R3 pair
; alone: %4 is accepted so the signature matches the other IDCTn kernels, but
; it is never read.  Every register carries two columns, one per 32-bit lane
; after pmaddwd.
;   %1, %2, %3  64-bit source operands holding the word pairs R4/R0, R6/R2
;               and R3/R1 for both columns
;   %5          destination address; row N is written with movd at %5 + 16*N
;   %6          arithmetic right shift applied to each sum before packing
; Clobbers mm0-mm7.
%macro IDCT7 6
    ; Load the three input pairs that are used.
    movq            mm0, %1             ; mm0 = R4 R0 (both columns)
    movq            mm1, %2             ; mm1 = R6 R2
    movq            mm2, %3             ; mm2 = R3 R1

    ; Even part A0..A3 from the R0/R4 and R2/R6 products; B0 is started early.
    movq            mm4, [coeffs + 16]  ; C4, C4
    pmaddwd         mm4, mm0            ; mm4 = C4*R4 + C4*R0
    movq            mm5, [coeffs + 24]  ; -C4, C4
    pmaddwd         mm0, mm5            ; mm0 = -C4*R4 + C4*R0
    movq            mm5, [coeffs + 32]  ; C6, C2
    pmaddwd         mm5, mm1            ; mm5 = C6*R6 + C2*R2
    movq            mm6, [coeffs + 40]  ; -C2, C6
    pmaddwd         mm1, mm6            ; mm1 = -C2*R6 + C6*R2
    movq            mm6, mm4            ; keep C4*R4 + C4*R0 for A3
    movq            mm7, [coeffs + 48]  ; C3, C1
    pmaddwd         mm7, mm2            ; mm7 = B0 = C3*R3 + C1*R1
    paddd           mm4, mm5            ; mm4 = A0
    psubd           mm6, mm5            ; mm6 = A3
    movq            mm5, mm0            ; keep -C4*R4 + C4*R0 for A2
    paddd           mm0, mm1            ; mm0 = A1
    psubd           mm5, mm1            ; mm5 = A2

    ; Rows 0/7 and 1/6.
    movq            mm1, [coeffs + 64]  ; -C7, C3 (as implied by the product below)
    pmaddwd         mm1, mm2            ; mm1 = B1 = -C7*R3 + C3*R1
    paddd           mm7, mm4            ; mm7 = A0 + B0
    paddd           mm4, mm4            ; mm4 = 2*A0
    psubd           mm4, mm7            ; mm4 = 2*A0 - (A0 + B0) = A0 - B0
    psrad           mm7, %6
    psrad           mm4, %6
    movq            mm3, mm0            ; mm3 = A1
    paddd           mm0, mm1            ; mm0 = A1 + B1
    psubd           mm3, mm1            ; mm3 = A1 - B1
    psrad           mm0, %6
    psrad           mm3, %6
    packssdw        mm7, mm7            ; saturate to words; low dword holds both columns
    movd            [%5], mm7           ; row 0 = A0 + B0
    packssdw        mm0, mm0
    movd       [%5 + 16], mm0           ; row 1 = A1 + B1
    packssdw        mm3, mm3
    movd       [%5 + 96], mm3           ; row 6 = A1 - B1
    packssdw        mm4, mm4
    movd      [%5 + 112], mm4           ; row 7 = A0 - B0

    ; Rows 2/5 and 3/4.
    movq            mm4, [coeffs + 80]  ; -C1, C5
    pmaddwd         mm4, mm2            ; mm4 = B2 = -C1*R3 + C5*R1
    pmaddwd         mm2, [coeffs + 96]  ; mm2 = B3 = -C5*R3 + C7*R1
    movq            mm3, mm5            ; mm3 = A2
    paddd           mm3, mm4            ; mm3 = A2 + B2
    psubd           mm5, mm4            ; mm5 = A2 - B2
    psrad           mm3, %6
    psrad           mm5, %6
    movq            mm4, mm6            ; mm4 = A3
    paddd           mm6, mm2            ; mm6 = A3 + B3
    psubd           mm4, mm2            ; mm4 = A3 - B3
    psrad           mm6, %6
    packssdw        mm3, mm3
    movd       [%5 + 32], mm3           ; row 2 = A2 + B2
    psrad           mm4, %6
    packssdw        mm6, mm6
    movd       [%5 + 48], mm6           ; row 3 = A3 + B3
    packssdw        mm4, mm4
    packssdw        mm5, mm5
    movd       [%5 + 64], mm4           ; row 4 = A3 - B3
    movd       [%5 + 80], mm5           ; row 5 = A2 - B2
%endmacro
676
; IDCT8 src, (unused), (unused), (unused), dst, shift
;
; Column kernel for the case where only the R0/R4 pair is used: the two
; sums A0 = C4*R4 + C4*R0 and A1 = -C4*R4 + C4*R0 are the only values computed
; and they are replicated over the eight output rows.  Four columns are
; handled per invocation (two qwords of source, movq stores).
;   %1   source ADDRESS (not a memory operand); reads [%1] and [8 + %1]
;   %2-%4 accepted for signature parity with the other IDCTn kernels, unused
;   %5   destination address; row N is written with movq at %5 + 16*N
;   %6   arithmetic right shift applied before packing to words
; Clobbers mm0, mm1, mm2, mm4, mm5, mm7.
;
; The former "movq mm7, [coeffs + 32]" after the last pmaddwd was removed:
; its result was never consumed inside this macro.
%macro IDCT8 6
    ; Columns 0-1.
    movq            mm0, [%1]           ; R4     R0      r4      r0
    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
    psrad           mm4, %6
    psrad           mm0, %6

    ; Columns 2-3 (the next qword of the same source row pair).
    movq            mm2, [8 + %1]       ; R4     R0      r4      r0  (columns 2-3)
    movq            mm1, [coeffs + 16]  ; C4     C4      C4      C4
    pmaddwd         mm1, mm2            ; C4R4+C4R0      C4r4+C4r0
    movq            mm7, [coeffs + 24]  ; -C4    C4      -C4     C4
    pmaddwd         mm2, mm7            ; -C4R4+C4R0     -C4r4+C4r0
    psrad           mm1, %6
    packssdw        mm4, mm1            ; A0 for columns 0-3
    movq           [%5], mm4            ; row 0 = A0
    psrad           mm2, %6
    packssdw        mm0, mm2            ; A1 for columns 0-3

    ; With no other inputs, rows 0/3/4/7 equal A0 and rows 1/2/5/6 equal A1.
    movq      [16 + %5], mm0            ; row 1 = A1
    movq      [96 + %5], mm0            ; row 6 = A1
    movq     [112 + %5], mm4            ; row 7 = A0
    movq      [32 + %5], mm0            ; row 2 = A1
    movq      [48 + %5], mm4            ; row 3 = A0
    movq      [64 + %5], mm4            ; row 4 = A0
    movq      [80 + %5], mm0            ; row 5 = A1
%endmacro
704
; Invoke column kernel %1 four times, once per pair of columns.  The four
; source arguments are passed as memory operands into the rsp scratch area
; and the destination as an address inside the block.
%macro IDCT_COLUMNS_BY_OPERAND 1
    %1 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
    %1 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
    %1 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
    %1 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
%endmacro

; Invoke column kernel %1 twice, once per group of four columns.  Sources and
; destination are passed as bare addresses; the kernel adds its own brackets.
%macro IDCT_COLUMNS_BY_ADDRESS 1
    %1 rsp +  0, rsp + 64, rsp + 32, rsp +  96, blockq +  0, 20
    %1 rsp + 16, rsp + 80, rsp + 48, rsp + 112, blockq +  8, 20
%endmacro

; IDCT
;
; Shared body of every simple_idct* entry point.  DC_COND_IDCT / Z_COND_IDCT
; (defined earlier in the file) are run on the four 32-byte groups of the
; block with the rsp scratch area as their target, then exactly one column
; kernel IDCT1..IDCT8 reads that scratch area and stores to blockq with a
; shift of 20.  The last argument of Z_COND_IDCT is the label it may branch
; to -- presumably when the tested group is all zero; confirm against its
; definition.  Label %%zA_B is reached when the tests for groups A and B
; branched; the combination selects the kernel:
;
;   branches taken      kernel
;   (none)              IDCT1
;   32                  IDCT2
;   32, 64              IDCT3
;   64                  IDCT4
;   64, 96              IDCT5
;   32, 96              IDCT6
;   96                  IDCT7
;   32, 64, 96          IDCT8
;
; Expects blockq, t0 and 128 bytes at rsp to be provided by the enclosing
; cglobal.
%macro IDCT 0
    DC_COND_IDCT  0,   8,  16,  24, rsp +  0, null, 11
    Z_COND_IDCT  32,  40,  48,  56, rsp + 32, null, 11, %%z32
    Z_COND_IDCT  64,  72,  80,  88, rsp + 64, null, 11, %%z64
    Z_COND_IDCT  96, 104, 112, 120, rsp + 96, null, 11, %%z96

    IDCT_COLUMNS_BY_OPERAND IDCT1
    jmp %%done

    ALIGN 16
    %%z32:
    Z_COND_IDCT 64,  72,  80,  88, rsp + 64, null, 11, %%z32_64
    Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%z32_96

    IDCT_COLUMNS_BY_OPERAND IDCT2
    jmp %%done

    ALIGN 16
    %%z32_64:
    Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%z32_64_96

    IDCT_COLUMNS_BY_OPERAND IDCT3
    jmp %%done

    ALIGN 16
    %%z64:
    Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%z64_96

    IDCT_COLUMNS_BY_OPERAND IDCT4
    jmp %%done

    ALIGN 16
    %%z64_96:

    IDCT_COLUMNS_BY_OPERAND IDCT5
    jmp %%done

    ALIGN 16
    %%z32_96:

    IDCT_COLUMNS_BY_ADDRESS IDCT6
    jmp %%done

    ALIGN 16
    %%z96:

    IDCT_COLUMNS_BY_OPERAND IDCT7
    jmp %%done

    ALIGN 16
    %%z32_64_96:

    IDCT_COLUMNS_BY_ADDRESS IDCT8

    %%done:
%endmacro
781
; PUT_PIXELS_CLAMPED_HALF byte_offset
;
; Convert four rows of 16-bit coefficients, starting at blockq + %1, to
; unsigned-saturated bytes and store them as four 8-pixel rows at pixelsq,
; pixelsq + lsizeq, pixelsq + 2*lsizeq and pixelsq + lsize3q.
; A block row is 16 bytes, so four rows consume 64 bytes of the block.
; Expects blockq, pixelsq, lsizeq and lsize3q from the enclosing function.
; Clobbers m0-m3 (MMX) or m0-m1 (XMM).
%macro PUT_PIXELS_CLAMPED_HALF 1
%if mmsize == 8
    ; MMX: one register per output row; each row is two 8-byte halves.
    mova     m0, [blockq+%1+mmsize*0]
    mova     m1, [blockq+%1+mmsize*2]
    mova     m2, [blockq+%1+mmsize*4]
    mova     m3, [blockq+%1+mmsize*6]
    packuswb m0, [blockq+%1+mmsize*1]   ; row 0: low half | high half
    packuswb m1, [blockq+%1+mmsize*3]   ; row 1
    packuswb m2, [blockq+%1+mmsize*5]   ; row 2
    packuswb m3, [blockq+%1+mmsize*7]   ; row 3
    movq           [pixelsq], m0
    movq    [lsizeq+pixelsq], m1
    movq  [2*lsizeq+pixelsq], m2
    movq   [lsize3q+pixelsq], m3
%else
    ; XMM: one register per pair of rows (low qword = first row of the pair).
    mova     m0, [blockq+%1+mmsize*0]
    mova     m1, [blockq+%1+mmsize*2]
    packuswb m0, [blockq+%1+mmsize*1]   ; rows 0 | 1
    packuswb m1, [blockq+%1+mmsize*3]   ; rows 2 | 3
    movq           [pixelsq], m0
    movhps  [lsizeq+pixelsq], m0
    movq  [2*lsizeq+pixelsq], m1
    movhps [lsize3q+pixelsq], m1
%endif
%endmacro
805
; ADD_PIXELS_CLAMPED byte_offset
;
; Add two rows of 16-bit coefficients, starting at blockq + %1, to the two
; 8-pixel rows at pixelsq and pixelsq + lsizeq.  The sum is formed with
; signed word saturation and written back as unsigned-saturated bytes.
; A block row is 16 bytes, so two rows consume 32 bytes of the block.
; Requires m4 == 0 on entry (it is the zero source for the byte->word unpack).
; Expects blockq, pixelsq and lsizeq from the enclosing function.
; Clobbers m0-m3, and additionally m5-m7 in the MMX build.
%macro ADD_PIXELS_CLAMPED 1
%if mmsize == 8
    ; MMX: each row is split into a low and a high group of four words.
    mova       m0, [blockq+%1+mmsize*0] ; row 0, words 0-3
    mova       m1, [blockq+%1+mmsize*1] ; row 0, words 4-7
    mova       m5, [blockq+%1+mmsize*2] ; row 1, words 0-3
    mova       m6, [blockq+%1+mmsize*3] ; row 1, words 4-7
    movq       m2, [pixelsq]
    movq       m3, [pixelsq+lsizeq]
    mova       m7, m2
    punpcklbw  m2, m4                   ; row 0 pixels 0-3 as words
    punpckhbw  m7, m4                   ; row 0 pixels 4-7 as words
    paddsw     m0, m2
    paddsw     m1, m7
    mova       m7, m3
    punpcklbw  m3, m4                   ; row 1 pixels 0-3 as words
    punpckhbw  m7, m4                   ; row 1 pixels 4-7 as words
    paddsw     m5, m3
    paddsw     m6, m7
    packuswb   m0, m1
    packuswb   m5, m6
    movq       [pixelsq], m0
    movq       [pixelsq+lsizeq], m5
%else
    ; XMM: one register holds a whole row of eight words.
    mova       m0, [blockq+%1+mmsize*0] ; row 0
    mova       m1, [blockq+%1+mmsize*1] ; row 1
    movq       m2, [pixelsq]
    movq       m3, [pixelsq+lsizeq]
    punpcklbw  m2, m4                   ; row 0 pixels as words
    punpcklbw  m3, m4                   ; row 1 pixels as words
    paddsw     m0, m2
    paddsw     m1, m3
    packuswb   m0, m1                   ; low qword = row 0, high qword = row 1
    movq       [pixelsq], m0
    movhps     [pixelsq+lsizeq], m0
%endif
%endmacro
842
INIT_MMX mmx

;-----------------------------------------------------------------------
; simple_idct(block)                                              [MMX]
; In-place inverse DCT of the 64 16-bit coefficients (128 bytes) at block.
; cglobal operands: 1 argument, 2 GPRs (block, t0), 8 vector registers,
; 128 bytes of stack scratch that IDCT addresses through rsp.
; Clobbers: mm0-mm7, t0.
; IDCT works in the MMX registers, so the x87/MMX state is cleared with
; emms before returning, as the calling convention requires.
;-----------------------------------------------------------------------
cglobal simple_idct, 1, 2, 8, 128, block, t0
    IDCT
    emms
RET
848
;-----------------------------------------------------------------------
; simple_idct_put(pixels, lsize, block)                           [MMX]
; Inverse DCT of block (in place), then store the 8x8 result to pixels as
; unsigned-saturated bytes, one 8-byte row every lsize bytes.
; cglobal operands: 3 arguments, 5 GPRs (lsize3 and t0 are scratch),
; 8 vector registers, 128 bytes of stack scratch for IDCT.
; Clobbers: mm0-mm7, lsize3, t0; pixels is advanced by 4*lsize.
; IDCT and the MMX store path use the MMX registers, so emms is issued
; before returning.
;-----------------------------------------------------------------------
cglobal simple_idct_put, 3, 5, 8, 128, pixels, lsize, block, lsize3, t0
    IDCT
    lea lsize3q, [lsizeq*3]             ; offset of the 4th row of each half
    PUT_PIXELS_CLAMPED_HALF 0           ; rows 0-3 (block bytes 0..63)
    lea pixelsq, [pixelsq+lsizeq*4]
    PUT_PIXELS_CLAMPED_HALF 64          ; rows 4-7 (block bytes 64..127)
    emms
RET
856
;-----------------------------------------------------------------------
; simple_idct_add(pixels, lsize, block)                           [MMX]
; Inverse DCT of block (in place), then add the 8x8 result to the bytes at
; pixels (row stride lsize) with saturation, two rows per step.
; cglobal operands: 3 arguments, 4 GPRs (t0 is scratch), 8 vector
; registers, 128 bytes of stack scratch for IDCT.
; Clobbers: mm0-mm7, t0; pixels is advanced by 6*lsize.
; IDCT and the MMX add path use the MMX registers, so emms is issued
; before returning.
;-----------------------------------------------------------------------
cglobal simple_idct_add, 3, 4, 8, 128, pixels, lsize, block, t0
    IDCT
    pxor       m4, m4                   ; zero register required by ADD_PIXELS_CLAMPED
    ADD_PIXELS_CLAMPED 0                ; rows 0-1
    lea        pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 32               ; rows 2-3
    lea        pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 64               ; rows 4-5
    lea        pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 96               ; rows 6-7
    emms
RET
868
INIT_XMM sse2

;-----------------------------------------------------------------------
; simple_idct_put(pixels, lsize, block)                          [SSE2]
; Inverse DCT of block (in place), then store the 8x8 result to pixels as
; unsigned-saturated bytes, one 8-byte row every lsize bytes.  Only the
; store stage uses XMM registers; IDCT is written against mm0-mm7 and is
; therefore still MMX code in this build.
; cglobal operands: 3 arguments, 5 GPRs (lsize3 and t0 are scratch),
; 8 vector registers, 128 bytes of stack scratch for IDCT.
; Clobbers: mm0-mm7, xmm0-xmm1, lsize3, t0; pixels is advanced by 4*lsize.
; Because IDCT leaves the MMX state in use, emms is issued before
; returning so callers of this SSE2 entry point get a clean x87 state.
;-----------------------------------------------------------------------
cglobal simple_idct_put, 3, 5, 8, 128, pixels, lsize, block, lsize3, t0
    IDCT
    lea lsize3q, [lsizeq*3]             ; offset of the 4th row of each half
    PUT_PIXELS_CLAMPED_HALF 0           ; rows 0-3 (block bytes 0..63)
    lea pixelsq, [pixelsq+lsizeq*4]
    PUT_PIXELS_CLAMPED_HALF 64          ; rows 4-7 (block bytes 64..127)
    emms
RET
878
;-----------------------------------------------------------------------
; simple_idct_add(pixels, lsize, block)                          [SSE2]
; Inverse DCT of block (in place), then add the 8x8 result to the bytes at
; pixels (row stride lsize) with saturation, two rows per step.  Only the
; add stage uses XMM registers; IDCT is written against mm0-mm7 and is
; therefore still MMX code in this build.
; cglobal operands: 3 arguments, 4 GPRs (t0 is scratch), 8 vector
; registers, 128 bytes of stack scratch for IDCT.
; Clobbers: mm0-mm7, xmm0-xmm4, t0; pixels is advanced by 6*lsize.
; Because IDCT leaves the MMX state in use, emms is issued before
; returning so callers of this SSE2 entry point get a clean x87 state.
;-----------------------------------------------------------------------
cglobal simple_idct_add, 3, 4, 8, 128, pixels, lsize, block, t0
    IDCT
    pxor       m4, m4                   ; zero register required by ADD_PIXELS_CLAMPED
    ADD_PIXELS_CLAMPED 0                ; rows 0-1
    lea        pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 32               ; rows 2-3
    lea        pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 64               ; rows 4-5
    lea        pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 96               ; rows 6-7
    emms
RET
890