;/****************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  - SSE2 forward discrete cosine transform -
; *
; *  Copyright(C) 2003 Pascal Massimino <skal@planet-d.net>
; *
; *  This program is free software; you can redistribute it and/or modify it
; *  under the terms of the GNU General Public License as published by
; *  the Free Software Foundation; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
; *
; * $Id: fdct_sse2_skal.asm,v 1.15 2009-09-16 17:07:58 Isibaar Exp $
; *
; ***************************************************************************/

%include "nasm.inc"

;-----------------------------------------------------------------------------
;
;                          -=FDCT=-
;
; Vertical pass is an implementation of the scheme:
;  Loeffler C., Ligtenberg A., and Moschytz C.S.:
;  Practical Fast 1D DCT Algorithm with Eleven Multiplications,
;  Proc. ICASSP 1989, 988-991.
;
; Horizontal pass is a double 4x4 vector/matrix multiplication
; (see also Intel's Application Note 922:
;  http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
;  Copyright (C) 1999 Intel Corporation)
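;
; As a rough reference model (illustrative only, not part of the build):
; with a[i] = x[i] + x[7-i] and b[i] = x[i] - x[7-i], each output of the
; horizontal pass is a 4-term dot product in Q16 fixed point,
;
;   out[k] = ( T[k][0]*h[0] + T[k][1]*h[1] + T[k][2]*h[2] + T[k][3]*h[3] ) >> 16
;
; where h[] is the a[] half for even k and the b[] half for odd k. The
; fTab* tables below store the T coefficients pre-interleaved for pmaddwd
; (see fMTX_MULT and its layout comment).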
;
; Notes:
;  * tan(3pi/16) is greater than 0.5, so it would overflow into the sign
;    bit once converted to 16-bit fixed point. We therefore use the
;    identity: x*tan3 = x*(tan3-1) + x, as sketched below.
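;
;    A minimal illustration (the same sequence appears in iLLM_PASS and
;    fLLM_PASS below); pmulhw keeps the high word of a signed 16x16
;    product, i.e. (x*c)>>16, so any constant c must fit a signed word:
;
;      pmulhw xmm0, xmm3    ; xmm0 = x*(tan3-1)   (xmm0 = [tan3], xmm3 = x)
;      paddsw xmm0, xmm3    ; xmm0 = x*(tan3-1) + x = x*tan3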
;
;  * The only shuffling instructions used are pshuflw/pshufhw/pshufd,
;    all of them SSE2.
;
;  * There are still 1 or 2 ticks to save in fLLM_PASS, but
;    I prefer having readable code instead of a tightly
;    scheduled one...
;
;  * The quantization stage (as well as the pre-transposition for the
;    idct way back) could be folded into the fTab* constants
;    (at the cost of some precision).
;
;  * Some more details at: http://skal.planet-d.net/coding/dct.html
;
;-----------------------------------------------------------------------------
;
;                          -=IDCT=-
;
; A little slower than the fdct, because the final stages (butterflies and
; descaling) require some unpairable shifting and packing, all on
; the same CPU unit.
;
;-----------------------------------------------------------------------------

;=============================================================================
; Read-only data
;=============================================================================

DATA

ALIGN SECTION_ALIGN
tan1:    times 8 dw 0x32ec    ; tan( pi/16)
tan2:    times 8 dw 0x6a0a    ; tan(2pi/16)  (= sqrt(2)-1)
tan3:    times 8 dw 0xab0e    ; tan(3pi/16)-1
sqrt2:   times 8 dw 0x5a82    ; 0.5/sqrt(2)
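; For reference, these are the usual Q16 fixed-point encodings
; (approximate roundings):
;   0x32ec =  13036 ~= 65536*tan(pi/16)                    ( 0.1989)
;   0x6a0a =  27146 ~= 65536*(sqrt(2)-1)                   ( 0.4142)
;   0xab0e = -21746 (two's complement)
;                   ~= 65536*(tan(3pi/16)-1)               (-0.3318)
;   0x5a82 =  23170 ~= 65536*0.5/sqrt(2)                   ( 0.3536)
; All of them fit a signed word, which is why tan3 is stored minus one
; and the sqrt(2) factor is halved (both compensated for in the passes).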

;-----------------------------------------------------------------------------
; Inverse DCT tables
;-----------------------------------------------------------------------------

ALIGN SECTION_ALIGN
iTab1:
  dw 0x4000, 0x539f, 0x4000, 0x22a3
  dw 0x4000, 0xdd5d, 0x4000, 0xac61
  dw 0x4000, 0x22a3, 0xc000, 0xac61
  dw 0xc000, 0x539f, 0x4000, 0xdd5d
  dw 0x58c5, 0x4b42, 0x4b42, 0xee58
  dw 0x3249, 0xa73b, 0x11a8, 0xcdb7
  dw 0x3249, 0x11a8, 0xa73b, 0xcdb7
  dw 0x11a8, 0x4b42, 0x4b42, 0xa73b

iTab2:
  dw 0x58c5, 0x73fc, 0x58c5, 0x300b
  dw 0x58c5, 0xcff5, 0x58c5, 0x8c04
  dw 0x58c5, 0x300b, 0xa73b, 0x8c04
  dw 0xa73b, 0x73fc, 0x58c5, 0xcff5
  dw 0x7b21, 0x6862, 0x6862, 0xe782
  dw 0x45bf, 0x84df, 0x187e, 0xba41
  dw 0x45bf, 0x187e, 0x84df, 0xba41
  dw 0x187e, 0x6862, 0x6862, 0x84df

iTab3:
  dw 0x539f, 0x6d41, 0x539f, 0x2d41
  dw 0x539f, 0xd2bf, 0x539f, 0x92bf
  dw 0x539f, 0x2d41, 0xac61, 0x92bf
  dw 0xac61, 0x6d41, 0x539f, 0xd2bf
  dw 0x73fc, 0x6254, 0x6254, 0xe8ee
  dw 0x41b3, 0x8c04, 0x1712, 0xbe4d
  dw 0x41b3, 0x1712, 0x8c04, 0xbe4d
  dw 0x1712, 0x6254, 0x6254, 0x8c04

iTab4:
  dw 0x4b42, 0x6254, 0x4b42, 0x28ba
  dw 0x4b42, 0xd746, 0x4b42, 0x9dac
  dw 0x4b42, 0x28ba, 0xb4be, 0x9dac
  dw 0xb4be, 0x6254, 0x4b42, 0xd746
  dw 0x6862, 0x587e, 0x587e, 0xeb3d
  dw 0x3b21, 0x979e, 0x14c3, 0xc4df
  dw 0x3b21, 0x14c3, 0x979e, 0xc4df
  dw 0x14c3, 0x587e, 0x587e, 0x979e

ALIGN SECTION_ALIGN
Walken_Idct_Rounders:
  dd  65536, 65536, 65536, 65536
  dd   3597,  3597,  3597,  3597
  dd   2260,  2260,  2260,  2260
  dd   1203,  1203,  1203,  1203
  dd      0,     0,     0,     0
  dd    120,   120,   120,   120
  dd    512,   512,   512,   512
  dd    512,   512,   512,   512

  times 8 dw  (65536>>11)
  times 8 dw  ( 3597>>11)
  times 8 dw  ( 2260>>11)
  ; other rounders are zero...
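;
; Note on the three dw rows above: they are the precomputed result of
; descaling a null row's rounder, i.e. (rounder >> 11) replicated eight
; times (65536>>11 = 32, 3597>>11 = 1, 2260>>11 = 1). The null-row
; shortcut in idct_sse2_skal stores them directly; from row 3 on,
; rounder>>11 is zero, so a null row can simply be left untouched.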

;-----------------------------------------------------------------------------
; Forward DCT tables
;-----------------------------------------------------------------------------

ALIGN SECTION_ALIGN
fTab1:
  dw 0x4000, 0x4000, 0x58c5, 0x4b42
  dw 0xdd5d, 0xac61, 0xa73b, 0xcdb7
  dw 0x4000, 0x4000, 0x3249, 0x11a8
  dw 0x539f, 0x22a3, 0x4b42, 0xee58
  dw 0x4000, 0xc000, 0x3249, 0xa73b
  dw 0x539f, 0xdd5d, 0x4b42, 0xa73b
  dw 0xc000, 0x4000, 0x11a8, 0x4b42
  dw 0x22a3, 0xac61, 0x11a8, 0xcdb7

fTab2:
  dw 0x58c5, 0x58c5, 0x7b21, 0x6862
  dw 0xcff5, 0x8c04, 0x84df, 0xba41
  dw 0x58c5, 0x58c5, 0x45bf, 0x187e
  dw 0x73fc, 0x300b, 0x6862, 0xe782
  dw 0x58c5, 0xa73b, 0x45bf, 0x84df
  dw 0x73fc, 0xcff5, 0x6862, 0x84df
  dw 0xa73b, 0x58c5, 0x187e, 0x6862
  dw 0x300b, 0x8c04, 0x187e, 0xba41

fTab3:
  dw 0x539f, 0x539f, 0x73fc, 0x6254
  dw 0xd2bf, 0x92bf, 0x8c04, 0xbe4d
  dw 0x539f, 0x539f, 0x41b3, 0x1712
  dw 0x6d41, 0x2d41, 0x6254, 0xe8ee
  dw 0x539f, 0xac61, 0x41b3, 0x8c04
  dw 0x6d41, 0xd2bf, 0x6254, 0x8c04
  dw 0xac61, 0x539f, 0x1712, 0x6254
  dw 0x2d41, 0x92bf, 0x1712, 0xbe4d

fTab4:
  dw 0x4b42, 0x4b42, 0x6862, 0x587e
  dw 0xd746, 0x9dac, 0x979e, 0xc4df
  dw 0x4b42, 0x4b42, 0x3b21, 0x14c3
  dw 0x6254, 0x28ba, 0x587e, 0xeb3d
  dw 0x4b42, 0xb4be, 0x3b21, 0x979e
  dw 0x6254, 0xd746, 0x587e, 0x979e
  dw 0xb4be, 0x4b42, 0x14c3, 0x587e
  dw 0x28ba, 0x9dac, 0x14c3, 0xc4df

ALIGN SECTION_ALIGN
Fdct_Rnd0: dw  6,8,8,8, 6,8,8,8
Fdct_Rnd1: dw  8,8,8,8, 8,8,8,8
Fdct_Rnd2: dw 10,8,8,8, 8,8,8,8
Rounder1:  dw  1,1,1,1, 1,1,1,1

;=============================================================================
; Code
;=============================================================================

TEXT

cglobal idct_sse2_skal
cglobal fdct_sse2_skal

;-----------------------------------------------------------------------------
; Helper macro iMTX_MULT
;-----------------------------------------------------------------------------

%macro iMTX_MULT 4   ; %1=src, %2=table to use, %3=rounder, %4=shift

  movdqa  xmm0, [_ECX+%1*16]     ; xmm0 = [01234567]

  pshuflw xmm0, xmm0, 11011000b ; [02134567]  ; these two shuffles could be
  pshufhw xmm0, xmm0, 11011000b ; [02134657]  ; folded into the zig-zag order

  pshufd  xmm4, xmm0, 00000000b ; [02020202]
  pshufd  xmm5, xmm0, 10101010b ; [46464646]
  pshufd  xmm6, xmm0, 01010101b ; [13131313]
  pshufd  xmm7, xmm0, 11111111b ; [57575757]

  pmaddwd xmm4, [%2+ 0]   ; dot [M00,M01][M04,M05][M08,M09][M12,M13]
  pmaddwd xmm5, [%2+16]   ; dot [M02,M03][M06,M07][M10,M11][M14,M15]
  pmaddwd xmm6, [%2+32]   ; dot [M16,M17][M20,M21][M24,M25][M28,M29]
  pmaddwd xmm7, [%2+48]   ; dot [M18,M19][M22,M23][M26,M27][M30,M31]
  paddd   xmm4, [%3]      ; Round

  paddd   xmm6, xmm7      ; [b0|b1|b2|b3]
  paddd   xmm4, xmm5      ; [a0|a1|a2|a3]

  movdqa  xmm7, xmm6
  paddd   xmm6, xmm4      ; xmm6 = a+b
  psubd   xmm4, xmm7      ; xmm4 = a-b
  psrad   xmm6, %4        ; => out [0123]
  psrad   xmm4, %4        ; => out [7654]

  packssdw xmm6, xmm4     ; [01237654]

  pshufhw xmm6, xmm6, 00011011b ; [01234567]

  movdqa  [_ECX+%1*16], xmm6

%endmacro

;-----------------------------------------------------------------------------
; Helper macro iLLM_PASS
;-----------------------------------------------------------------------------

%macro iLLM_PASS 1  ; %1: src/dst

  movdqa xmm0, [tan3]     ; t3-1
  movdqa xmm3, [%1+16*3]  ; x3
  movdqa xmm1, xmm0       ; t3-1
  movdqa xmm5, [%1+16*5]  ; x5

  movdqa xmm4, [tan1]     ; t1
  movdqa xmm6, [%1+16*1]  ; x1
  movdqa xmm7, [%1+16*7]  ; x7
  movdqa xmm2, xmm4       ; t1

  pmulhw xmm0, xmm3       ; x3*(t3-1)
  pmulhw xmm1, xmm5       ; x5*(t3-1)
  paddsw xmm0, xmm3       ; x3*t3
  paddsw xmm1, xmm5       ; x5*t3
  psubsw xmm0, xmm5       ; x3*t3-x5 = tm35
  paddsw xmm1, xmm3       ; x3+x5*t3 = tp35

  pmulhw xmm4, xmm7       ; x7*t1
  pmulhw xmm2, xmm6       ; x1*t1
  paddsw xmm4, xmm6       ; x1+t1*x7 = tp17
  psubsw xmm2, xmm7       ; x1*t1-x7 = tm17

  movdqa xmm3, [sqrt2]
  movdqa xmm7, xmm4
  movdqa xmm6, xmm2
  psubsw xmm4, xmm1       ; tp17-tp35 = t1
  psubsw xmm2, xmm0       ; tm17-tm35 = b3
  paddsw xmm1, xmm7       ; tp17+tp35 = b0
  paddsw xmm0, xmm6       ; tm17+tm35 = t2

    ; xmm1 = b0, xmm2 = b3: preserved

  movdqa xmm6, xmm4
  psubsw xmm4, xmm0       ; t1-t2
  paddsw xmm0, xmm6       ; t1+t2

  pmulhw xmm4, xmm3       ; (t1-t2)/(2.sqrt2)
  pmulhw xmm0, xmm3       ; (t1+t2)/(2.sqrt2)

  paddsw xmm0, xmm0       ; 2.(t1+t2) = b1
  paddsw xmm4, xmm4       ; 2.(t1-t2) = b2
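
    ; note: [sqrt2] holds 0.5/sqrt(2) (1/sqrt(2) ~= 0.707 would not fit a
    ; signed Q16 word), so the two paddsw above double the pmulhw results
    ; back to (t1+t2)/sqrt(2) and (t1-t2)/sqrt(2)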

  movdqa xmm7, [tan2]     ; t2
  movdqa xmm3, [%1+2*16]  ; x2
  movdqa xmm6, [%1+6*16]  ; x6
  movdqa xmm5, xmm7       ; t2

  pmulhw xmm7, xmm6       ; x6*t2
  pmulhw xmm5, xmm3       ; x2*t2

  paddsw xmm7, xmm3       ; x2+x6*t2 = tp26
  psubsw xmm5, xmm6       ; x2*t2-x6 = tm26

   ; use: xmm3,xmm5,xmm6,xmm7   frozen: xmm0,xmm4,xmm1,xmm2

  movdqa xmm3, [%1+0*16] ; x0
  movdqa xmm6, [%1+4*16] ; x4

  movdqa [%1   ], xmm2  ; we spill 1 reg (b3) to perform safe butterflies

  movdqa xmm2, xmm3
  psubsw xmm3, xmm6   ; x0-x4 = tm04
  paddsw xmm6, xmm2   ; x0+x4 = tp04

  movdqa xmm2, xmm6
  psubsw xmm6, xmm7   ; tp04-tp26 = a3
  paddsw xmm7, xmm2   ; tp04+tp26 = a0
  movdqa xmm2, xmm3
  psubsw xmm3, xmm5   ; tm04-tm26 = a2
  paddsw xmm5, xmm2   ; tm04+tm26 = a1

  movdqa xmm2, xmm5
  psubsw xmm5, xmm0   ; a1-b1 -> out6
  paddsw xmm0, xmm2   ; a1+b1 -> out1
  movdqa xmm2, xmm3
  psubsw xmm3, xmm4   ; a2-b2 -> out5
  paddsw xmm4, xmm2   ; a2+b2 -> out2

  movdqa xmm2, [%1]   ; restore the spilled b3

  psraw  xmm5, 6      ; out6
  psraw  xmm3, 6      ; out5
  psraw  xmm0, 6      ; out1
  psraw  xmm4, 6      ; out2

  movdqa [%1+6*16], xmm5
  movdqa [%1+5*16], xmm3
  movdqa [%1+1*16], xmm0
  movdqa [%1+2*16], xmm4

    ; reminder: xmm1=b0, xmm2=b3, xmm7=a0, xmm6=a3

  movdqa xmm0, xmm7
  movdqa xmm4, xmm6
  psubsw xmm7, xmm1   ; a0-b0
  psubsw xmm6, xmm2   ; a3-b3
  paddsw xmm1, xmm0   ; a0+b0
  paddsw xmm2, xmm4   ; a3+b3

  psraw  xmm1, 6      ; out0
  psraw  xmm7, 6      ; out7
  psraw  xmm2, 6      ; out3
  psraw  xmm6, 6      ; out4

    ; store result

  movdqa [%1+0*16], xmm1
  movdqa [%1+3*16], xmm2
  movdqa [%1+4*16], xmm6
  movdqa [%1+7*16], xmm7

%endmacro

;-----------------------------------------------------------------------------
; Helper macro TEST_ROW (test a null row)
;-----------------------------------------------------------------------------

%macro TEST_ROW 2     ; %1:src,  %2:label x8
  mov _EAX, [%1   ]
  mov _EDX, [%1+ 8]
  or  _EAX, [%1+ 4]
  or  _EDX, [%1+12]
  or  _EAX, _EDX
  jz near %2
%endmacro
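
; For reference (illustrative only): on 32-bit builds, where _EAX/_EDX map
; to eax/edx, TEST_ROW is equivalent to this C fragment:
;
;   const uint32_t *r = (const uint32_t *)src;   /* one row = 8 coeffs */
;   if ((r[0] | r[1] | r[2] | r[3]) == 0)        /* all 16 bytes zero? */
;       goto skip_row;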

;-----------------------------------------------------------------------------
; Function idct (this one skips null rows)
;-----------------------------------------------------------------------------
; IEEE1180 and Walken compatible version

ALIGN SECTION_ALIGN
idct_sse2_skal:

  PUSH_XMM6_XMM7

  mov _ECX, prm1  ; Src

  TEST_ROW _ECX, .Row0_Round
  iMTX_MULT  0, iTab1, Walken_Idct_Rounders + 16*0, 11
  jmp .Row1
.Row0_Round:
  movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 16*0]
  movdqa [_ECX], xmm0

.Row1:
  TEST_ROW _ECX+16, .Row1_Round
  iMTX_MULT  1, iTab2, Walken_Idct_Rounders + 16*1, 11
  jmp .Row2
.Row1_Round:
  movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 16*1]
  movdqa [_ECX+16], xmm0

.Row2:
  TEST_ROW _ECX+32, .Row2_Round
  iMTX_MULT  2, iTab3, Walken_Idct_Rounders + 16*2, 11
  jmp .Row3
.Row2_Round:
  movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 16*2]
  movdqa [_ECX+32], xmm0

.Row3:
  TEST_ROW _ECX+48, .Row4   ; null rows 3..7 stay zero (rounder>>11 == 0)
  iMTX_MULT  3, iTab4, Walken_Idct_Rounders + 16*3, 11

.Row4:
  TEST_ROW _ECX+64, .Row5
  iMTX_MULT  4, iTab1, Walken_Idct_Rounders + 16*4, 11

.Row5:
  TEST_ROW _ECX+80, .Row6
  iMTX_MULT  5, iTab4, Walken_Idct_Rounders + 16*5, 11

.Row6:
  TEST_ROW _ECX+96, .Row7
  iMTX_MULT  6, iTab3, Walken_Idct_Rounders + 16*6, 11

.Row7:
  TEST_ROW _ECX+112, .End
  iMTX_MULT  7, iTab2, Walken_Idct_Rounders + 16*7, 11
.End:

  iLLM_PASS _ECX

  POP_XMM6_XMM7
  ret
ENDFUNC

;-----------------------------------------------------------------------------
; Helper macro fLLM_PASS
;-----------------------------------------------------------------------------

%macro fLLM_PASS 2  ; %1: src/dst, %2: shift

  movdqa xmm0, [%1+0*16]   ; In0
  movdqa xmm2, [%1+2*16]   ; In2
  movdqa xmm3, xmm0
  movdqa xmm4, xmm2
  movdqa xmm7, [%1+7*16]   ; In7
  movdqa xmm5, [%1+5*16]   ; In5

  psubsw xmm0, xmm7         ; t7 = In0-In7
  paddsw xmm7, xmm3         ; t0 = In0+In7
  psubsw xmm2, xmm5         ; t5 = In2-In5
  paddsw xmm5, xmm4         ; t2 = In2+In5

  movdqa xmm3, [%1+3*16]   ; In3
  movdqa xmm4, [%1+4*16]   ; In4
  movdqa xmm1, xmm3
  psubsw xmm3, xmm4         ; t4 = In3-In4
  paddsw xmm4, xmm1         ; t3 = In3+In4
  movdqa xmm6, [%1+6*16]   ; In6
  movdqa xmm1, [%1+1*16]   ; In1
  psubsw xmm1, xmm6         ; t6 = In1-In6
  paddsw xmm6, [%1+1*16]   ; t1 = In1+In6

  psubsw xmm7, xmm4         ; tm03 = t0-t3
  psubsw xmm6, xmm5         ; tm12 = t1-t2
  paddsw xmm4, xmm4         ; 2.t3
  paddsw xmm5, xmm5         ; 2.t2
  paddsw xmm4, xmm7         ; tp03 = t0+t3
  paddsw xmm5, xmm6         ; tp12 = t1+t2

  psllw  xmm2, %2+1        ; shift t5 (shift +1 to..
  psllw  xmm1, %2+1        ; shift t6  ..compensate cos4/2)
  psllw  xmm4, %2          ; shift t3
  psllw  xmm5, %2          ; shift t2
  psllw  xmm7, %2          ; shift t0
  psllw  xmm6, %2          ; shift t1
  psllw  xmm3, %2          ; shift t4
  psllw  xmm0, %2          ; shift t7

  psubsw xmm4, xmm5         ; out4 = tp03-tp12
  psubsw xmm1, xmm2         ; xmm1: t6-t5
  paddsw xmm5, xmm5
  paddsw xmm2, xmm2
  paddsw xmm5, xmm4         ; out0 = tp03+tp12
  movdqa [%1+4*16], xmm4   ; => out4
  paddsw xmm2, xmm1         ; xmm2: t6+t5
  movdqa [%1+0*16], xmm5   ; => out0

  movdqa xmm4, [tan2]      ; xmm4 <= tan2
  pmulhw xmm4, xmm7         ; tm03*tan2
  movdqa xmm5, [tan2]      ; xmm5 <= tan2
  psubsw xmm4, xmm6         ; out6 = tm03*tan2 - tm12
  pmulhw xmm5, xmm6         ; tm12*tan2
  paddsw xmm5, xmm7         ; out2 = tm12*tan2 + tm03

  movdqa xmm6, [sqrt2]
  movdqa xmm7, [Rounder1]

  pmulhw xmm2, xmm6         ; xmm2: tp65 = (t6 + t5)*cos4
  por    xmm5, xmm7         ; correct out2
  por    xmm4, xmm7         ; correct out6
  pmulhw xmm1, xmm6         ; xmm1: tm65 = (t6 - t5)*cos4
  por    xmm2, xmm7         ; correct tp65
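
    ; note: Rounder1 is all ones, so the three por above just force the
    ; LSB of each word -- per the "correct" comments, a cheap fix for the
    ; downward bias of the truncating pmulhw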

  movdqa [%1+2*16], xmm5   ; => out2
  movdqa xmm5, xmm3         ; save t4
  movdqa [%1+6*16], xmm4   ; => out6
  movdqa xmm4, xmm0         ; save t7

  psubsw xmm3, xmm1         ; xmm3: tm465 = t4 - tm65
  psubsw xmm0, xmm2         ; xmm0: tm765 = t7 - tp65
  paddsw xmm2, xmm4         ; xmm2: tp765 = t7 + tp65
  paddsw xmm1, xmm5         ; xmm1: tp465 = t4 + tm65

  movdqa xmm4, [tan3]      ; tan3 - 1
  movdqa xmm5, [tan1]      ; tan1

  movdqa xmm7, xmm3         ; save tm465
  pmulhw xmm3, xmm4         ; tm465*(tan3-1)
  movdqa xmm6, xmm1         ; save tp465
  pmulhw xmm1, xmm5         ; tp465*tan1

  paddsw xmm3, xmm7         ; tm465*tan3
  pmulhw xmm4, xmm0         ; tm765*(tan3-1)
  paddsw xmm4, xmm0         ; tm765*tan3
  pmulhw xmm5, xmm2         ; tp765*tan1

  paddsw xmm1, xmm2         ; out1 = tp765 + tp465*tan1
  psubsw xmm0, xmm3         ; out3 = tm765 - tm465*tan3
  paddsw xmm7, xmm4         ; out5 = tm465 + tm765*tan3
  psubsw xmm5, xmm6         ; out7 = -tp465 + tp765*tan1

  movdqa [%1+1*16], xmm1   ; => out1
  movdqa [%1+3*16], xmm0   ; => out3
  movdqa [%1+5*16], xmm7   ; => out5
  movdqa [%1+7*16], xmm5   ; => out7

%endmacro

;-----------------------------------------------------------------------------
; Helper macro fMTX_MULT
;-----------------------------------------------------------------------------

%macro fMTX_MULT 3   ; %1=src, %2=Coeffs, %3=rounders

  movdqa   xmm0, [_ECX+%1*16+0]   ; xmm0 = [0123][4567]
  pshufhw  xmm1, xmm0, 00011011b ; xmm1 = [----][7654]
  pshufd   xmm0, xmm0, 01000100b ; xmm0 = [0123][0123]
  pshufd   xmm1, xmm1, 11101110b ; xmm1 = [7654][7654]

  movdqa  xmm2, xmm0
  paddsw  xmm0, xmm1              ; xmm0 = [a0 a1 a2 a3]
  psubsw  xmm2, xmm1              ; xmm2 = [b0 b1 b2 b3]

  punpckldq xmm0, xmm2            ; xmm0 = [a0 a1 b0 b1][a2 a3 b2 b3]
  pshufd    xmm2, xmm0, 01001110b ; xmm2 = [a2 a3 b2 b3][a0 a1 b0 b1]

    ;  [M00 M01    M16 M17] [M06 M07    M22 M23]  x xmm0 = [0 /1 /2'/3']
    ;  [M02 M03    M18 M19] [M04 M05    M20 M21]  x xmm2 = [0'/1'/2 /3 ]
    ;  [M08 M09    M24 M25] [M14 M15    M30 M31]  x xmm0 = [4 /5 /6'/7']
    ;  [M10 M11    M26 M27] [M12 M13    M28 M29]  x xmm2 = [4'/5'/6 /7 ]

  movdqa  xmm1, [%2+16]
  movdqa  xmm3, [%2+32]
  pmaddwd xmm1, xmm2
  pmaddwd xmm3, xmm0
  pmaddwd xmm2, [%2+48]
  pmaddwd xmm0, [%2+ 0]

  paddd   xmm0, xmm1             ;  [ out0 | out1 ][ out2 | out3 ]
  paddd   xmm2, xmm3             ;  [ out4 | out5 ][ out6 | out7 ]
  psrad   xmm0, 16
  psrad   xmm2, 16

  packssdw xmm0, xmm2            ;  [ out0 .. out7 ]
  paddsw   xmm0, [%3]            ;  Round

  psraw    xmm0, 4               ; => [-2048, 2047]

  movdqa  [_ECX+%1*16+0], xmm0
%endmacro

;-----------------------------------------------------------------------------
; Function Forward DCT
;-----------------------------------------------------------------------------

ALIGN SECTION_ALIGN
fdct_sse2_skal:
  PUSH_XMM6_XMM7
  mov _ECX, prm1
  fLLM_PASS _ECX+0, 3
  fMTX_MULT  0, fTab1, Fdct_Rnd0
  fMTX_MULT  1, fTab2, Fdct_Rnd2
  fMTX_MULT  2, fTab3, Fdct_Rnd1
  fMTX_MULT  3, fTab4, Fdct_Rnd1
  fMTX_MULT  4, fTab1, Fdct_Rnd0
  fMTX_MULT  5, fTab4, Fdct_Rnd1
  fMTX_MULT  6, fTab3, Fdct_Rnd1
  fMTX_MULT  7, fTab2, Fdct_Rnd1

  POP_XMM6_XMM7
  ret
ENDFUNC

; Mac-specific workaround for misaligned DCT tables
ALIGN SECTION_ALIGN
  times 8 dw 0

NON_EXEC_STACK