;******************************************************************************
;* MMX/SSE2-optimized functions for the VP3 decoder
;* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

; MMX-optimized functions cribbed from the original VP3 source code.

SECTION_RODATA

vp3_idct_data: times 8 dw 64277
               times 8 dw 60547
               times 8 dw 54491
               times 8 dw 46341
               times 8 dw 36410
               times 8 dw 25080
               times 8 dw 12785
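; Rows of the table hold the Theora/VP3 IDCT cosine constants
; C(n) = round(cos(n*pi/16) * 65536), n = 1..7, each replicated across
; 8 words.  C(1)..C(5) exceed 0x7fff, so they are effectively stored as
; C(n) - 65536, and a signed pmulhw by them yields C(n)*x/65536 - x;
; the IDCT macros below add x back afterwards (hence the "c3*i3 - i3"
; style comments in BeginIDCT).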

pb_7:  times 8 db 0x07
pb_1F: times 8 db 0x1f
pb_81: times 8 db 0x81

cextern pb_1
cextern pb_3
cextern pb_80
cextern pb_FE

cextern pw_8

SECTION .text

; this is off by one or two for some cases when filter_limit is greater than 63
; in:  p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1
; out: p1 in mm4, p2 in mm3
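; As a scalar reference (cf. the C version in vp3dsp.c), the filter is
; roughly:
;   f  = bound((p0 - p3 + 3 * (p2 - p1) + 4) >> 3)
;   p1 = av_clip_uint8(p1 + f)
;   p2 = av_clip_uint8(p2 - f)
; where bound() passes f through below the filter limit and ramps it
; back down to 0 at twice the limit.  The 8 bytes at [r2+516] are
; expected to hold 2*filter_limit replicated per byte, since the
; pminub-based bounding below works bytewise.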
%macro VP3_LOOP_FILTER 0
    movq          m7, m6
    pand          m6, [pb_7]    ; p0&7
    psrlw         m7, 3
    pand          m7, [pb_1F]   ; p0>>3
    movq          m3, m2        ; p2
    pxor          m2, m4
    pand          m2, [pb_1]    ; (p2^p1)&1
    movq          m5, m2
    paddb         m2, m2
    paddb         m2, m5        ; 3*(p2^p1)&1
    paddb         m2, m6        ; extra bits lost in shifts
    pcmpeqb       m0, m0
    pxor          m1, m0        ; 255 - p3
    pavgb         m1, m2        ; (256 - p3 + extrabits) >> 1
    pxor          m0, m4        ; 255 - p1
    pavgb         m0, m3        ; (256 + p2-p1) >> 1
    paddb         m1, [pb_3]
    pavgb         m1, m0        ; 128+2+(   p2-p1  - p3) >> 2
    pavgb         m1, m0        ; 128+1+(3*(p2-p1) - p3) >> 3
    paddusb       m7, m1        ; d+128+1
    movq          m6, [pb_81]
    psubusb       m6, m7
    psubusb       m7, [pb_81]

    movq          m5, [r2+516]  ; flim
    pminub        m6, m5
    pminub        m7, m5
    movq          m0, m6
    movq          m1, m7
    paddb         m6, m6
    paddb         m7, m7
    pminub        m6, m5
    pminub        m7, m5
    psubb         m6, m0
    psubb         m7, m1
    paddusb       m4, m7
    psubusb       m4, m6
    psubusb       m3, m7
    paddusb       m3, m6
%endmacro

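; Scatter the four words of %1 (one filtered p1/p2 pixel pair per row)
; to four consecutive rows, starting one byte left of r0; r1 = stride,
; r3 = 3*stride.  Clobbers r2.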
%macro STORE_4_WORDS 1
    movd         r2d, %1
    mov  [r0     -1], r2w
    psrlq         %1, 32
    shr           r2, 16
    mov  [r0+r1  -1], r2w
    movd         r2d, %1
    mov  [r0+r1*2-1], r2w
    shr           r2, 16
    mov  [r0+r3  -1], r2w
%endmacro

INIT_MMX mmxext
cglobal vp3_v_loop_filter, 3, 4
    mov           r3, r1
    neg           r1
    movq          m6, [r0+r1*2]
    movq          m4, [r0+r1  ]
    movq          m2, [r0     ]
    movq          m1, [r0+r3  ]

    VP3_LOOP_FILTER

    movq     [r0+r1], m4
    movq     [r0   ], m3
    RET

cglobal vp3_h_loop_filter, 3, 4
    lea           r3, [r1*3]

    movd          m6, [r0     -2]
    movd          m4, [r0+r1  -2]
    movd          m2, [r0+r1*2-2]
    movd          m1, [r0+r3  -2]
    lea           r0, [r0+r1*4  ]
    punpcklbw     m6, [r0     -2]
    punpcklbw     m4, [r0+r1  -2]
    punpcklbw     m2, [r0+r1*2-2]
    punpcklbw     m1, [r0+r3  -2]
    sub           r0, r3
    sub           r0, r1

    TRANSPOSE4x4B  6, 4, 2, 1, 0
    VP3_LOOP_FILTER
    SBUTTERFLY    bw, 4, 3, 5

    STORE_4_WORDS m4
    lea           r0, [r0+r1*4  ]
    STORE_4_WORDS m3
    RET

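; Byte-wise average without rounding, for two register pairs at once:
;   avg(a, b) = (a & b) + (((a ^ b) & 0xfe) >> 1)  ; == (a + b) >> 1
; (pavgb always rounds up, so it cannot be used directly).
; In:  m0/m1 and m2/m3 are the input pairs, m6 = pb_FE.
; Out: the averages in m4 and m5.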
%macro PAVGB_NO_RND 0
    mova   m4, m0
    mova   m5, m2
    pand   m4, m1
    pand   m5, m3
    pxor   m1, m0
    pxor   m3, m2
    pand   m1, m6
    pand   m3, m6
    psrlq  m1, 1
    psrlq  m3, 1
    paddb  m4, m1
    paddb  m5, m3
%endmacro

INIT_MMX mmx
cglobal put_vp_no_rnd_pixels8_l2, 5, 6, 0, dst, src1, src2, stride, h, stride3
    mova   m6, [pb_FE]
    lea    stride3q, [strideq+strideq*2]
.loop:
    mova   m0, [src1q]
    mova   m1, [src2q]
    mova   m2, [src1q+strideq]
    mova   m3, [src2q+strideq]
    PAVGB_NO_RND
    mova   [dstq], m4
    mova   [dstq+strideq], m5

    mova   m0, [src1q+strideq*2]
    mova   m1, [src2q+strideq*2]
    mova   m2, [src1q+stride3q]
    mova   m3, [src2q+stride3q]
    PAVGB_NO_RND
    mova   [dstq+strideq*2], m4
    mova   [dstq+stride3q],  m5

    lea    src1q, [src1q+strideq*4]
    lea    src2q, [src2q+strideq*4]
    lea    dstq,  [dstq+strideq*4]
    sub    hd, 4
    jnz .loop
    RET

; from original comments: The Macro does IDct on 4 1-D Dcts
%macro BeginIDCT 0
    movq          m2, I(3)
    movq          m6, C(3)
    movq          m4, m2
    movq          m7, J(5)
    pmulhw        m4, m6        ; r4 = c3*i3 - i3
    movq          m1, C(5)
    pmulhw        m6, m7        ; r6 = c3*i5 - i5
    movq          m5, m1
    pmulhw        m1, m2        ; r1 = c5*i3 - i3
    movq          m3, I(1)
    pmulhw        m5, m7        ; r5 = c5*i5 - i5
    movq          m0, C(1)
    paddw         m4, m2        ; r4 = c3*i3
    paddw         m6, m7        ; r6 = c3*i5
    paddw         m2, m1        ; r2 = c5*i3
    movq          m1, J(7)
    paddw         m7, m5        ; r7 = c5*i5
    movq          m5, m0        ; r5 = c1
    pmulhw        m0, m3        ; r0 = c1*i1 - i1
    paddsw        m4, m7        ; r4 = C = c3*i3 + c5*i5
    pmulhw        m5, m1        ; r5 = c1*i7 - i7
    movq          m7, C(7)
    psubsw        m6, m2        ; r6 = D = c3*i5 - c5*i3
    paddw         m0, m3        ; r0 = c1*i1
    pmulhw        m3, m7        ; r3 = c7*i1
    movq          m2, I(2)
    pmulhw        m7, m1        ; r7 = c7*i7
    paddw         m5, m1        ; r5 = c1*i7
    movq          m1, m2        ; r1 = i2
    pmulhw        m2, C(2)      ; r2 = c2*i2 - i2
    psubsw        m3, m5        ; r3 = B = c7*i1 - c1*i7
    movq          m5, J(6)
    paddsw        m0, m7        ; r0 = A = c1*i1 + c7*i7
    movq          m7, m5        ; r7 = i6
    psubsw        m0, m4        ; r0 = A - C
    pmulhw        m5, C(2)      ; r5 = c2*i6 - i6
    paddw         m2, m1        ; r2 = c2*i2
    pmulhw        m1, C(6)      ; r1 = c6*i2
    paddsw        m4, m4        ; r4 = C + C
    paddsw        m4, m0        ; r4 = C. = A + C
    psubsw        m3, m6        ; r3 = B - D
    paddw         m5, m7        ; r5 = c2*i6
    paddsw        m6, m6        ; r6 = D + D
    pmulhw        m7, C(6)      ; r7 = c6*i6
    paddsw        m6, m3        ; r6 = D. = B + D
    movq        I(1), m4        ; save C. at I(1)
    psubsw        m1, m5        ; r1 = H = c6*i2 - c2*i6
    movq          m4, C(4)
    movq          m5, m3        ; r5 = B - D
    pmulhw        m3, m4        ; r3 = (c4 - 1) * (B - D)
    paddsw        m7, m2        ; r7 = G = c2*i2 + c6*i6
    movq        I(2), m6        ; save D. at I(2)
    movq          m2, m0        ; r2 = A - C
    movq          m6, I(0)
    pmulhw        m0, m4        ; r0 = (c4 - 1) * (A - C)
    paddw         m5, m3        ; r5 = B. = c4 * (B - D)
    movq          m3, J(4)
    psubsw        m5, m1        ; r5 = B.. = B. - H
    paddw         m2, m0        ; r2 = A. = c4 * (A - C)
    psubsw        m6, m3        ; r6 = i0 - i4
    movq          m0, m6
    pmulhw        m6, m4        ; r6 = (c4 - 1) * (i0 - i4)
    paddsw        m3, m3        ; r3 = i4 + i4
    paddsw        m1, m1        ; r1 = H + H
    paddsw        m3, m0        ; r3 = i0 + i4
    paddsw        m1, m5        ; r1 = H. = B. + H
    pmulhw        m4, m3        ; r4 = (c4 - 1) * (i0 + i4)
    paddsw        m6, m0        ; r6 = F = c4 * (i0 - i4)
    psubsw        m6, m2        ; r6 = F. = F - A.
    paddsw        m2, m2        ; r2 = A. + A.
    movq          m0, I(1)      ; r0 = C.
    paddsw        m2, m6        ; r2 = A.. = F + A.
    paddw         m4, m3        ; r4 = E = c4 * (i0 + i4)
    psubsw        m2, m1        ; r2 = R2 = A.. - H.
%endmacro
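
; Register state at the end of BeginIDCT, per the running comments
; above (relied on by RowIDCT and ColumnIDCT below):
;   m0 = C.,  m1 = H. = B. + H,  m2 = R2 = A.. - H.
;   m4 = E,   m5 = B..,          m6 = F.,  m7 = G
; with C. also saved at I(1) and D. saved at I(2).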

; RowIDCT gets ready to transpose
%macro RowIDCT 0
    BeginIDCT
    movq          m3, I(2)      ; r3 = D.
    psubsw        m4, m7        ; r4 = E. = E - G
    paddsw        m1, m1        ; r1 = H. + H.
    paddsw        m7, m7        ; r7 = G + G
    paddsw        m1, m2        ; r1 = R1 = A.. + H.
    paddsw        m7, m4        ; r7 = G. = E + G
    psubsw        m4, m3        ; r4 = R4 = E. - D.
    paddsw        m3, m3
    psubsw        m6, m5        ; r6 = R6 = F. - B..
    paddsw        m5, m5
    paddsw        m3, m4        ; r3 = R3 = E. + D.
    paddsw        m5, m6        ; r5 = R5 = F. + B..
    psubsw        m7, m0        ; r7 = R7 = G. - C.
    paddsw        m0, m0
    movq        I(1), m1        ; save R1
    paddsw        m0, m7        ; r0 = R0 = G. + C.
%endmacro

; Column IDCT normalizes and stores final results
%macro ColumnIDCT 0
    BeginIDCT
    paddsw        m2, OC_8      ; adjust R2 (and R1) for shift
    paddsw        m1, m1        ; r1 = H. + H.
    paddsw        m1, m2        ; r1 = R1 = A.. + H.
    psraw         m2, 4         ; r2 = NR2
    psubsw        m4, m7        ; r4 = E. = E - G
    psraw         m1, 4         ; r1 = NR1
    movq          m3, I(2)      ; r3 = D.
    paddsw        m7, m7        ; r7 = G + G
    movq        I(2), m2        ; store NR2 at I2
    paddsw        m7, m4        ; r7 = G. = E + G
    movq        I(1), m1        ; store NR1 at I1
    psubsw        m4, m3        ; r4 = R4 = E. - D.
    paddsw        m4, OC_8      ; adjust R4 (and R3) for shift
    paddsw        m3, m3        ; r3 = D. + D.
    paddsw        m3, m4        ; r3 = R3 = E. + D.
    psraw         m4, 4         ; r4 = NR4
    psubsw        m6, m5        ; r6 = R6 = F. - B..
    psraw         m3, 4         ; r3 = NR3
    paddsw        m6, OC_8      ; adjust R6 (and R5) for shift
    paddsw        m5, m5        ; r5 = B.. + B..
    paddsw        m5, m6        ; r5 = R5 = F. + B..
    psraw         m6, 4         ; r6 = NR6
    movq        J(4), m4        ; store NR4 at J4
    psraw         m5, 4         ; r5 = NR5
    movq        I(3), m3        ; store NR3 at I3
    psubsw        m7, m0        ; r7 = R7 = G. - C.
    paddsw        m7, OC_8      ; adjust R7 (and R0) for shift
    paddsw        m0, m0        ; r0 = C. + C.
    paddsw        m0, m7        ; r0 = R0 = G. + C.
    psraw         m7, 4         ; r7 = NR7
    movq        J(6), m6        ; store NR6 at J6
    psraw         m0, 4         ; r0 = NR0
    movq        J(5), m5        ; store NR5 at J5
    movq        J(7), m7        ; store NR7 at J7
    movq        I(0), m0        ; store NR0 at I0
%endmacro

; Following macro does two 4x4 transposes in place.
;
; At entry (we assume):
;
;   r0 = a3 a2 a1 a0
;   I(1) = b3 b2 b1 b0
;   r2 = c3 c2 c1 c0
;   r3 = d3 d2 d1 d0
;
;   r4 = e3 e2 e1 e0
;   r5 = f3 f2 f1 f0
;   r6 = g3 g2 g1 g0
;   r7 = h3 h2 h1 h0
;
; At exit, we have:
;
;   I(0) = d0 c0 b0 a0
;   I(1) = d1 c1 b1 a1
;   I(2) = d2 c2 b2 a2
;   I(3) = d3 c3 b3 a3
;
;   J(4) = h0 g0 f0 e0
;   J(5) = h1 g1 f1 e1
;   J(6) = h2 g2 f2 e2
;   J(7) = h3 g3 f3 e3
;
;  I(0) I(1) I(2) I(3)  is the transpose of r0 I(1) r2 r3.
;  J(4) J(5) J(6) J(7)  is the transpose of r4 r5 r6 r7.
;
;  Since r1 is free at entry, we calculate the Js first.
%macro Transpose 0
    movq          m1, m4        ; r1 = e3 e2 e1 e0
    punpcklwd     m4, m5        ; r4 = f1 e1 f0 e0
    movq        I(0), m0        ; save a3 a2 a1 a0
    punpckhwd     m1, m5        ; r1 = f3 e3 f2 e2
    movq          m0, m6        ; r0 = g3 g2 g1 g0
    punpcklwd     m6, m7        ; r6 = h1 g1 h0 g0
    movq          m5, m4        ; r5 = f1 e1 f0 e0
    punpckldq     m4, m6        ; r4 = h0 g0 f0 e0 = R4
    punpckhdq     m5, m6        ; r5 = h1 g1 f1 e1 = R5
    movq          m6, m1        ; r6 = f3 e3 f2 e2
    movq        J(4), m4
    punpckhwd     m0, m7        ; r0 = h3 g3 h2 g2
    movq        J(5), m5
    punpckhdq     m6, m0        ; r6 = h3 g3 f3 e3 = R7
    movq          m4, I(0)      ; r4 = a3 a2 a1 a0
    punpckldq     m1, m0        ; r1 = h2 g2 f2 e2 = R6
    movq          m5, I(1)      ; r5 = b3 b2 b1 b0
    movq          m0, m4        ; r0 = a3 a2 a1 a0
    movq        J(7), m6
    punpcklwd     m0, m5        ; r0 = b1 a1 b0 a0
    movq        J(6), m1
    punpckhwd     m4, m5        ; r4 = b3 a3 b2 a2
    movq          m5, m2        ; r5 = c3 c2 c1 c0
    punpcklwd     m2, m3        ; r2 = d1 c1 d0 c0
    movq          m1, m0        ; r1 = b1 a1 b0 a0
    punpckldq     m0, m2        ; r0 = d0 c0 b0 a0 = R0
    punpckhdq     m1, m2        ; r1 = d1 c1 b1 a1 = R1
    movq          m2, m4        ; r2 = b3 a3 b2 a2
    movq        I(0), m0
    punpckhwd     m5, m3        ; r5 = d3 c3 d2 c2
    movq        I(1), m1
    punpckhdq     m4, m5        ; r4 = d3 c3 b3 a3 = R3
    punpckldq     m2, m5        ; r2 = d2 c2 b2 a2 = R2
    movq        I(3), m4
    movq        I(2), m2
%endmacro

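; One full 8-point 1-D IDCT, SSE2: same dataflow as BeginIDCT +
; Row/ColumnIDCT above, but with all eight rows (or columns) in the
; xmm lanes at once.  ADD() and SHIFT() are hooks defined by the
; caller: empty for the first (row) pass, "paddsw x, [pw_8]" and
; "psraw x, 4" for the final, normalizing column pass (see VP3_IDCT).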
%macro VP3_1D_IDCT_SSE2 0
    movdqa        m2, I(3)      ; xmm2 = i3
    movdqa        m6, C(3)      ; xmm6 = c3
    movdqa        m4, m2        ; xmm4 = i3
    movdqa        m7, I(5)      ; xmm7 = i5
    pmulhw        m4, m6        ; xmm4 = c3 * i3 - i3
    movdqa        m1, C(5)      ; xmm1 = c5
    pmulhw        m6, m7        ; xmm6 = c3 * i5 - i5
    movdqa        m5, m1        ; xmm5 = c5
    pmulhw        m1, m2        ; xmm1 = c5 * i3 - i3
    movdqa        m3, I(1)      ; xmm3 = i1
    pmulhw        m5, m7        ; xmm5 = c5 * i5 - i5
    movdqa        m0, C(1)      ; xmm0 = c1
    paddw         m4, m2        ; xmm4 = c3 * i3
    paddw         m6, m7        ; xmm6 = c3 * i5
    paddw         m2, m1        ; xmm2 = c5 * i3
    movdqa        m1, I(7)      ; xmm1 = i7
    paddw         m7, m5        ; xmm7 = c5 * i5
    movdqa        m5, m0        ; xmm5 = c1
    pmulhw        m0, m3        ; xmm0 = c1 * i1 - i1
    paddsw        m4, m7        ; xmm4 = c3 * i3 + c5 * i5 = C
    pmulhw        m5, m1        ; xmm5 = c1 * i7 - i7
    movdqa        m7, C(7)      ; xmm7 = c7
    psubsw        m6, m2        ; xmm6 = c3 * i5 - c5 * i3 = D
    paddw         m0, m3        ; xmm0 = c1 * i1
    pmulhw        m3, m7        ; xmm3 = c7 * i1
    movdqa        m2, I(2)      ; xmm2 = i2
    pmulhw        m7, m1        ; xmm7 = c7 * i7
    paddw         m5, m1        ; xmm5 = c1 * i7
    movdqa        m1, m2        ; xmm1 = i2
    pmulhw        m2, C(2)      ; xmm2 = c2 * i2 - i2
    psubsw        m3, m5        ; xmm3 = c7 * i1 - c1 * i7 = B
    movdqa        m5, I(6)      ; xmm5 = i6
    paddsw        m0, m7        ; xmm0 = c1 * i1 + c7 * i7 = A
    movdqa        m7, m5        ; xmm7 = i6
    psubsw        m0, m4        ; xmm0 = A - C
    pmulhw        m5, C(2)      ; xmm5 = c2 * i6 - i6
    paddw         m2, m1        ; xmm2 = c2 * i2
    pmulhw        m1, C(6)      ; xmm1 = c6 * i2
    paddsw        m4, m4        ; xmm4 = C + C
    paddsw        m4, m0        ; xmm4 = A + C = C.
    psubsw        m3, m6        ; xmm3 = B - D
    paddw         m5, m7        ; xmm5 = c2 * i6
    paddsw        m6, m6        ; xmm6 = D + D
    pmulhw        m7, C(6)      ; xmm7 = c6 * i6
    paddsw        m6, m3        ; xmm6 = B + D = D.
    movdqa      I(1), m4        ; save C. at I(1)
    psubsw        m1, m5        ; xmm1 = c6 * i2 - c2 * i6 = H
    movdqa        m4, C(4)      ; xmm4 = c4
    movdqa        m5, m3        ; xmm5 = B - D
    pmulhw        m3, m4        ; xmm3 = ( c4 - 1 ) * ( B - D )
    paddsw        m7, m2        ; xmm7 = c2 * i2 + c6 * i6 = G
    movdqa      I(2), m6        ; save D. at I(2)
    movdqa        m2, m0        ; xmm2 = A - C
    movdqa        m6, I(0)      ; xmm6 = i0
    pmulhw        m0, m4        ; xmm0 = ( c4 - 1 ) * ( A - C )
    paddw         m5, m3        ; xmm5 = c4 * ( B - D ) = B.
    movdqa        m3, I(4)      ; xmm3 = i4
    psubsw        m5, m1        ; xmm5 = B. - H = B..
    paddw         m2, m0        ; xmm2 = c4 * ( A - C ) = A.
    psubsw        m6, m3        ; xmm6 = i0 - i4
    movdqa        m0, m6        ; xmm0 = i0 - i4
    pmulhw        m6, m4        ; xmm6 = ( c4 - 1 ) * ( i0 - i4 )
    paddsw        m3, m3        ; xmm3 = i4 + i4
    paddsw        m1, m1        ; xmm1 = H + H
    paddsw        m3, m0        ; xmm3 = i0 + i4
    paddsw        m1, m5        ; xmm1 = B. + H = H.
    pmulhw        m4, m3        ; xmm4 = ( c4 - 1 ) * ( i0 + i4 )
    paddw         m6, m0        ; xmm6 = c4 * ( i0 - i4 ) = F
    psubsw        m6, m2        ; xmm6 = F - A. = F.
    paddsw        m2, m2        ; xmm2 = A. + A.
    movdqa        m0, I(1)      ; load C. from I(1)
    paddsw        m2, m6        ; xmm2 = F + A. = A..
    paddw         m4, m3        ; xmm4 = c4 * ( i0 + i4 ) = E
    psubsw        m2, m1        ; xmm2 = A.. - H. = R2
    ADD(m2)                     ; adjust R2 (and R1) before shifting
    paddsw        m1, m1        ; xmm1 = H. + H.
    paddsw        m1, m2        ; xmm1 = A.. + H. = R1
    SHIFT(m2)                   ; xmm2 = op2
    psubsw        m4, m7        ; xmm4 = E - G = E.
    SHIFT(m1)                   ; xmm1 = op1
    movdqa        m3, I(2)      ; load D. from I(2)
    paddsw        m7, m7        ; xmm7 = G + G
    paddsw        m7, m4        ; xmm7 = E + G = G.
    psubsw        m4, m3        ; xmm4 = E. - D. = R4
    ADD(m4)                     ; adjust R4 (and R3) before shifting
    paddsw        m3, m3        ; xmm3 = D. + D.
    paddsw        m3, m4        ; xmm3 = E. + D. = R3
    SHIFT(m4)                   ; xmm4 = op4
    psubsw        m6, m5        ; xmm6 = F. - B.. = R6
    SHIFT(m3)                   ; xmm3 = op3
    ADD(m6)                     ; adjust R6 (and R5) before shifting
    paddsw        m5, m5        ; xmm5 = B.. + B..
    paddsw        m5, m6        ; xmm5 = F. + B.. = R5
    SHIFT(m6)                   ; xmm6 = op6
    SHIFT(m5)                   ; xmm5 = op5
    psubsw        m7, m0        ; xmm7 = G. - C. = R7
    ADD(m7)                     ; adjust R7 (and R0) before shifting
    paddsw        m0, m0        ; xmm0 = C. + C.
    paddsw        m0, m7        ; xmm0 = G. + C. = R0
    SHIFT(m7)                   ; xmm7 = op7
    SHIFT(m0)                   ; xmm0 = op0
%endmacro

%macro PUT_BLOCK 8
    movdqa      O(0), m%1
    movdqa      O(1), m%2
    movdqa      O(2), m%3
    movdqa      O(3), m%4
    movdqa      O(4), m%5
    movdqa      O(5), m%6
    movdqa      O(6), m%7
    movdqa      O(7), m%8
%endmacro

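; Full 2-D IDCT of the 8x8 block of 16-bit coefficients at %1, in
; place: a 1-D pass, a transpose, then a second 1-D pass that rounds
; (+8) and shifts (>>4).  The SSE2 path handles all eight lines per
; pass and transposes with TRANSPOSE8x8W; the MMX path works in 4x4
; quadrants using the Transpose macro above.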
%macro VP3_IDCT 1
%if mmsize == 16
%define I(x) [%1+16*x]
%define O(x) [%1+16*x]
%define C(x) [vp3_idct_data+16*(x-1)]
%define SHIFT(x)
%define ADD(x)
        VP3_1D_IDCT_SSE2
%if ARCH_X86_64
        TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
        TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%1], [%1+16]
%endif
        PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7

%define SHIFT(x) psraw  x, 4
%define ADD(x)   paddsw x, [pw_8]
        VP3_1D_IDCT_SSE2
        PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
%else ; mmsize == 8
    ; register notes kept from the original VP3 x86-32 source:
    ; eax = quantized input
    ; ebx = dequantizer matrix
    ; ecx = IDCT constants
    ;  M(I) = ecx + MaskOffset(0) + I * 8
    ;  C(I) = ecx + CosineOffset(32) + (I-1) * 8
    ; edx = output
    ; r0..r7 = mm0..mm7
%define OC_8 [pw_8]
%define C(x) [vp3_idct_data+16*(x-1)]

    ; at this point, the function has completed dequantization, dezigzag and
    ; partial transposition; now do the idct itself
%define I(x) [%1+16*x]
%define J(x) [%1+16*x]
    RowIDCT
    Transpose

%define I(x) [%1+16*x+8]
%define J(x) [%1+16*x+8]
    RowIDCT
    Transpose

%define I(x) [%1+16* x]
%define J(x) [%1+16*(x-4)+8]
    ColumnIDCT

%define I(x) [%1+16* x   +64]
%define J(x) [%1+16*(x-4)+72]
    ColumnIDCT
%endif ; mmsize == 16/8
%endmacro

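; Instantiate vp3_idct_put/vp3_idct_add for the current instruction
; set.  The put variant has no prediction to add to: packsswb clamps
; the residual to [-128,127] and the paddb with pb_80 flips the sign
; bit, which together compute clip_uint8(residual + 128).  Both
; variants zero the 128-byte coefficient block before returning.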
%macro vp3_idct_funcs 0
cglobal vp3_idct_put, 3, 4, 9
    VP3_IDCT      r2

    mova          m4, [pb_80]
    lea           r3, [r1*3]
%assign %%i 0
%rep 16/mmsize
    mova          m0, [r2+mmsize*0+%%i]
    mova          m1, [r2+mmsize*2+%%i]
    mova          m2, [r2+mmsize*4+%%i]
    mova          m3, [r2+mmsize*6+%%i]
%if mmsize == 8
    packsswb      m0, [r2+mmsize*8+%%i]
    packsswb      m1, [r2+mmsize*10+%%i]
    packsswb      m2, [r2+mmsize*12+%%i]
    packsswb      m3, [r2+mmsize*14+%%i]
%else
    packsswb      m0, [r2+mmsize*1+%%i]
    packsswb      m1, [r2+mmsize*3+%%i]
    packsswb      m2, [r2+mmsize*5+%%i]
    packsswb      m3, [r2+mmsize*7+%%i]
%endif
    paddb         m0, m4
    paddb         m1, m4
    paddb         m2, m4
    paddb         m3, m4
    movq   [r0     ], m0
%if mmsize == 8
    movq   [r0+r1  ], m1
    movq   [r0+r1*2], m2
    movq   [r0+r3  ], m3
%else
    movhps [r0+r1  ], m0
    movq   [r0+r1*2], m1
    movhps [r0+r3  ], m1
%endif
%if %%i == 0
    lea           r0, [r0+r1*4]
%endif
%if mmsize == 16
    movq   [r0     ], m2
    movhps [r0+r1  ], m2
    movq   [r0+r1*2], m3
    movhps [r0+r3  ], m3
%endif
%assign %%i %%i+8
%endrep

    pxor          m0, m0
%assign %%offset 0
%rep 128/mmsize
    mova [r2+%%offset], m0
%assign %%offset %%offset+mmsize
%endrep
    RET

cglobal vp3_idct_add, 3, 4, 9
    VP3_IDCT      r2

    lea           r3, [r1*3]
    pxor          m4, m4
%if mmsize == 16
%assign %%i 0
%rep 2
    movq          m0, [r0]
    movq          m1, [r0+r1]
    movq          m2, [r0+r1*2]
    movq          m3, [r0+r3]
    punpcklbw     m0, m4
    punpcklbw     m1, m4
    punpcklbw     m2, m4
    punpcklbw     m3, m4
    paddsw        m0, [r2+ 0+%%i]
    paddsw        m1, [r2+16+%%i]
    paddsw        m2, [r2+32+%%i]
    paddsw        m3, [r2+48+%%i]
    packuswb      m0, m1
    packuswb      m2, m3
    movq   [r0     ], m0
    movhps [r0+r1  ], m0
    movq   [r0+r1*2], m2
    movhps [r0+r3  ], m2
%if %%i == 0
    lea           r0, [r0+r1*4]
%endif
%assign %%i %%i+64
%endrep
%else
%assign %%i 0
%rep 2
    movq          m0, [r0]
    movq          m1, [r0+r1]
    movq          m2, [r0+r1*2]
    movq          m3, [r0+r3]
    movq          m5, m0
    movq          m6, m1
    movq          m7, m2
    punpcklbw     m0, m4
    punpcklbw     m1, m4
    punpcklbw     m2, m4
    punpckhbw     m5, m4
    punpckhbw     m6, m4
    punpckhbw     m7, m4
    paddsw        m0, [r2+ 0+%%i]
    paddsw        m1, [r2+16+%%i]
    paddsw        m2, [r2+32+%%i]
    paddsw        m5, [r2+64+%%i]
    paddsw        m6, [r2+80+%%i]
    paddsw        m7, [r2+96+%%i]
    packuswb      m0, m5
    movq          m5, m3
    punpcklbw     m3, m4
    punpckhbw     m5, m4
    packuswb      m1, m6
    paddsw        m3, [r2+48+%%i]
    paddsw        m5, [r2+112+%%i]
    packuswb      m2, m7
    packuswb      m3, m5
    movq   [r0     ], m0
    movq   [r0+r1  ], m1
    movq   [r0+r1*2], m2
    movq   [r0+r3  ], m3
%if %%i == 0
    lea           r0, [r0+r1*4]
%endif
%assign %%i %%i+8
%endrep
%endif
%assign %%i 0
%rep 128/mmsize
    mova    [r2+%%i], m4
%assign %%i %%i+mmsize
%endrep
    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
vp3_idct_funcs
%endif

INIT_XMM sse2
vp3_idct_funcs

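; Add a splatted, possibly negative DC value to four rows at r0
; (r1 = stride, r2 = 3*stride) with unsigned saturation: m0 holds
; max(dc, 0) and m1 holds max(-dc, 0) in every byte (prepared in
; vp3_idct_dc_add below), so paddusb followed by psubusb computes
; clip_uint8(pixel + dc).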
%macro DC_ADD 0
    movq          m2, [r0     ]
    movq          m3, [r0+r1  ]
    paddusb       m2, m0
    movq          m4, [r0+r1*2]
    paddusb       m3, m0
    movq          m5, [r0+r2  ]
    paddusb       m4, m0
    paddusb       m5, m0
    psubusb       m2, m1
    psubusb       m3, m1
    movq   [r0     ], m2
    psubusb       m4, m1
    movq   [r0+r1  ], m3
    psubusb       m5, m1
    movq   [r0+r1*2], m4
    movq   [r0+r2  ], m5
%endmacro

INIT_MMX mmxext
cglobal vp3_idct_dc_add, 3, 4
    movsx         r3, word [r2]
    mov    word [r2], 0
    lea           r2, [r1*3]
    add           r3, 15
    sar           r3, 5
    movd          m0, r3d
    pshufw        m0, m0, 0x0
    pxor          m1, m1
    psubw         m1, m0
    packuswb      m0, m0
    packuswb      m1, m1
    DC_ADD
    lea           r0, [r0+r1*4]
    DC_ADD
    RET