;/****************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  - SSE2 forward discrete cosine transform -
; *
; *  Copyright(C) 2003 Pascal Massimino <skal@planet-d.net>
; *
; *  This program is free software; you can redistribute it and/or modify it
; *  under the terms of the GNU General Public License as published by
; *  the Free Software Foundation; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
; *
; * $Id: fdct_sse2_skal.asm,v 1.15 2009-09-16 17:07:58 Isibaar Exp $
; *
; ***************************************************************************/

%include "nasm.inc"

;-----------------------------------------------------------------------------
;
;                          -=FDCT=-
;
; Vertical pass is an implementation of the scheme:
;  Loeffler C., Ligtenberg A., and Moschytz C.S.:
;  Practical Fast 1D DCT Algorithm with Eleven Multiplications,
;  Proc. ICASSP 1989, 988-991.
;
; Horizontal pass is a double 4x4 vector/matrix multiplication,
; (see also Intel's Application Note 922:
;  http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
;  Copyright (C) 1999 Intel Corporation)
;
; Notes:
;  * tan(3pi/16) is greater than 0.5, and would use the
;    sign bit when turned into 16b fixed-point precision. So,
;    we use the trick: x*tan3 = x*(tan3-1)+x
;
;  * There's only one SSE-specific instruction (pshufw).
;    NOTE(review): this remark predates the SSE2 port; the code below
;    uses the SSE2 shuffles pshuflw/pshufhw/pshufd, not pshufw - confirm
;    against the MMX original before relying on it.
;
;  * There's still 1 or 2 ticks to save in fLLM_PASS, but
;    I prefer having a readable code, instead of a tightly
;    scheduled one...
;
;  * Quantization stage (as well as pre-transposition for the
;    idct way back) can be included in the fTab* constants
;    (with induced loss of precision, somehow)
;
;  * Some more details at: http://skal.planet-d.net/coding/dct.html
;
;-----------------------------------------------------------------------------
;
;                          -=IDCT=-
;
; A little slower than fdct, because the final stages (butterflies and
; descaling) require some unpairable shifting and packing, all on
; the same CPU unit.
;
;-----------------------------------------------------------------------------

;=============================================================================
; Read only data
;=============================================================================

DATA

; Fixed-point trigonometric constants, broadcast to all 8 words of an
; XMM register so they can be used directly with pmulhw (16x16->high-16
; multiply, i.e. an implicit >>16).
ALIGN SECTION_ALIGN
tan1:   times 8 dw 0x32ec    ; tan( pi/16)
tan2:   times 8 dw 0x6a0a    ; tan(2pi/16) (=sqrt(2)-1)
tan3:   times 8 dw 0xab0e    ; tan(3pi/16)-1  (negative; see x*tan3 trick above)
sqrt2:  times 8 dw 0x5a82    ; 0.5/sqrt(2)

;-----------------------------------------------------------------------------
; Inverse DCT tables
; Four 8x8-word coefficient tables; each row of the input block is
; multiplied (via pmaddwd in iMTX_MULT) against one of them.
;-----------------------------------------------------------------------------

ALIGN SECTION_ALIGN
iTab1:
  dw 0x4000, 0x539f, 0x4000, 0x22a3
  dw 0x4000, 0xdd5d, 0x4000, 0xac61
  dw 0x4000, 0x22a3, 0xc000, 0xac61
  dw 0xc000, 0x539f, 0x4000, 0xdd5d
  dw 0x58c5, 0x4b42, 0x4b42, 0xee58
  dw 0x3249, 0xa73b, 0x11a8, 0xcdb7
  dw 0x3249, 0x11a8, 0xa73b, 0xcdb7
  dw 0x11a8, 0x4b42, 0x4b42, 0xa73b

iTab2:
  dw 0x58c5, 0x73fc, 0x58c5, 0x300b
  dw 0x58c5, 0xcff5, 0x58c5, 0x8c04
  dw 0x58c5, 0x300b, 0xa73b, 0x8c04
  dw 0xa73b, 0x73fc, 0x58c5, 0xcff5
  dw 0x7b21, 0x6862, 0x6862, 0xe782
  dw 0x45bf, 0x84df, 0x187e, 0xba41
  dw 0x45bf, 0x187e, 0x84df, 0xba41
  dw 0x187e, 0x6862, 0x6862, 0x84df

iTab3:
  dw 0x539f, 0x6d41, 0x539f, 0x2d41
  dw 0x539f, 0xd2bf, 0x539f, 0x92bf
  dw 0x539f, 0x2d41, 0xac61, 0x92bf
  dw 0xac61, 0x6d41, 0x539f, 0xd2bf
  dw 0x73fc, 0x6254, 0x6254, 0xe8ee
  dw 0x41b3, 0x8c04, 0x1712, 0xbe4d
  dw 0x41b3, 0x1712, 0x8c04, 0xbe4d
  dw 0x1712, 0x6254, 0x6254, 0x8c04

iTab4:
  dw 0x4b42, 0x6254, 0x4b42, 0x28ba
  dw 0x4b42, 0xd746, 0x4b42, 0x9dac
  dw 0x4b42, 0x28ba, 0xb4be, 0x9dac
  dw 0xb4be, 0x6254, 0x4b42, 0xd746
  dw 0x6862, 0x587e, 0x587e, 0xeb3d
  dw 0x3b21, 0x979e, 0x14c3, 0xc4df
  dw 0x3b21, 0x14c3, 0x979e, 0xc4df
  dw 0x14c3, 0x587e, 0x587e, 0x979e

; Per-row idct rounders.
;  - first 8*16 bytes: dword rounders added before the >>11 descaling
;    of each row (rows 0..7, in that order);
;  - the trailing "times 8 dw" entries (at offset 16*8) are the
;    precomputed row results (rounder>>11) stored directly into a row
;    that tested all-zero, so the whole iMTX_MULT can be skipped.
;    Only rows 0..2 need a non-zero value; other rounders are zero.
ALIGN SECTION_ALIGN
Walken_Idct_Rounders:
  dd  65536, 65536, 65536, 65536
  dd   3597,  3597,  3597,  3597
  dd   2260,  2260,  2260,  2260
  dd   1203,  1203,  1203,  1203
  dd      0,     0,     0,     0
  dd    120,   120,   120,   120
  dd    512,   512,   512,   512
  dd    512,   512,   512,   512

  times 8 dw (65536>>11)
  times 8 dw ( 3597>>11)
  times 8 dw ( 2260>>11)
  ; other rounders are zero...

;-----------------------------------------------------------------------------
; Forward DCT tables
; Four 8x8-word coefficient tables for the horizontal pass (fMTX_MULT).
;-----------------------------------------------------------------------------

ALIGN SECTION_ALIGN
fTab1:
  dw 0x4000, 0x4000, 0x58c5, 0x4b42,
  dw 0xdd5d, 0xac61, 0xa73b, 0xcdb7,
  dw 0x4000, 0x4000, 0x3249, 0x11a8,
  dw 0x539f, 0x22a3, 0x4b42, 0xee58,
  dw 0x4000, 0xc000, 0x3249, 0xa73b,
  dw 0x539f, 0xdd5d, 0x4b42, 0xa73b,
  dw 0xc000, 0x4000, 0x11a8, 0x4b42,
  dw 0x22a3, 0xac61, 0x11a8, 0xcdb7

fTab2:
  dw 0x58c5, 0x58c5, 0x7b21, 0x6862,
  dw 0xcff5, 0x8c04, 0x84df, 0xba41,
  dw 0x58c5, 0x58c5, 0x45bf, 0x187e,
  dw 0x73fc, 0x300b, 0x6862, 0xe782,
  dw 0x58c5, 0xa73b, 0x45bf, 0x84df,
  dw 0x73fc, 0xcff5, 0x6862, 0x84df,
  dw 0xa73b, 0x58c5, 0x187e, 0x6862,
  dw 0x300b, 0x8c04, 0x187e, 0xba41

fTab3:
  dw 0x539f, 0x539f, 0x73fc, 0x6254,
  dw 0xd2bf, 0x92bf, 0x8c04, 0xbe4d,
  dw 0x539f, 0x539f, 0x41b3, 0x1712,
  dw 0x6d41, 0x2d41, 0x6254, 0xe8ee,
  dw 0x539f, 0xac61, 0x41b3, 0x8c04,
  dw 0x6d41, 0xd2bf, 0x6254, 0x8c04,
  dw 0xac61, 0x539f, 0x1712, 0x6254,
  dw 0x2d41, 0x92bf, 0x1712, 0xbe4d

fTab4:
  dw 0x4b42, 0x4b42, 0x6862, 0x587e,
  dw 0xd746, 0x9dac, 0x979e, 0xc4df,
  dw 0x4b42, 0x4b42, 0x3b21, 0x14c3,
  dw 0x6254, 0x28ba, 0x587e, 0xeb3d,
  dw 0x4b42, 0xb4be, 0x3b21, 0x979e,
  dw 0x6254, 0xd746, 0x587e, 0x979e,
  dw 0xb4be, 0x4b42, 0x14c3, 0x587e,
  dw 0x28ba, 0x9dac, 0x14c3, 0xc4df


; fdct rounders (added before the final >>4 in fMTX_MULT) and the
; all-ones word vector used by fLLM_PASS to force odd results.
ALIGN SECTION_ALIGN
Fdct_Rnd0: dw  6,8,8,8,  6,8,8,8
Fdct_Rnd1: dw  8,8,8,8,  8,8,8,8
Fdct_Rnd2: dw 10,8,8,8,  8,8,8,8
Rounder1:  dw  1,1,1,1,  1,1,1,1

;=============================================================================
; Code
;=============================================================================

TEXT

cglobal idct_sse2_skal
cglobal fdct_sse2_skal

;-----------------------------------------------------------------------------
; Helper macro iMTX_MULT
;
; Horizontal idct pass for one row: 8 input words are shuffled, then
; multiplied against a 4x8 coefficient table with pmaddwd, rounded and
; descaled. Operates in place on row %1 of the block pointed to by _ECX.
;   %1 = row index (0..7), %2 = coefficient table, %3 = dword rounder,
;   %4 = descale shift.
; Clobbers xmm0, xmm4-xmm7.
;-----------------------------------------------------------------------------

%macro iMTX_MULT 4   ; %1=src, %2 = Table to use, %3=rounder, %4=Shift

  movdqa  xmm0, [_ECX+%1*16]     ; xmm0 = [01234567]

  pshuflw xmm0, xmm0, 11011000b  ; [02134567]  ; these two shufflings could be
  pshufhw xmm0, xmm0, 11011000b  ; [02134657]  ; integrated in zig-zag orders

  pshufd  xmm4, xmm0, 00000000b  ; [02020202]
  pshufd  xmm5, xmm0, 10101010b  ; [46464646]
  pshufd  xmm6, xmm0, 01010101b  ; [13131313]
  pshufd  xmm7, xmm0, 11111111b  ; [57575757]

  pmaddwd xmm4, [%2+ 0]   ; dot [M00,M01][M04,M05][M08,M09][M12,M13]
  pmaddwd xmm5, [%2+16]   ; dot [M02,M03][M06,M07][M10,M11][M14,M15]
  pmaddwd xmm6, [%2+32]   ; dot [M16,M17][M20,M21][M24,M25][M28,M29]
  pmaddwd xmm7, [%2+48]   ; dot [M18,M19][M22,M23][M26,M27][M30,M31]
  paddd   xmm4, [%3]      ; Round

  paddd   xmm6, xmm7      ; [b0|b1|b2|b3]
  paddd   xmm4, xmm5      ; [a0|a1|a2|a3]

  movdqa  xmm7, xmm6
  paddd   xmm6, xmm4      ; mm6=a+b
  psubd   xmm4, xmm7      ; mm4=a-b
  psrad   xmm6, %4        ; => out [0123]
  psrad   xmm4, %4        ; => out [7654]

  packssdw xmm6, xmm4     ; [01237654] (with signed saturation to 16 bits)

  pshufhw xmm6, xmm6, 00011011b  ; [01234567]  (reverse the high half)

  movdqa  [_ECX+%1*16], xmm6

%endmacro

;-----------------------------------------------------------------------------
; Helper macro iLLM_PASS
;
; Vertical idct pass (Loeffler/Ligtenberg/Moschytz scheme) over the whole
; 8x8 block, columns in parallel (one column per word lane). Works in
; place on the block at %1 and performs the final >>6 descaling.
; Clobbers xmm0-xmm7; uses row 0 of the block as a temporary spill slot.
;-----------------------------------------------------------------------------

%macro iLLM_PASS 1  ; %1: src/dst

  movdqa xmm0, [tan3]      ; t3-1
  movdqa xmm3, [%1+16*3]   ; x3
  movdqa xmm1, xmm0        ; t3-1
  movdqa xmm5, [%1+16*5]   ; x5

  movdqa xmm4, [tan1]      ; t1
  movdqa xmm6, [%1+16*1]   ; x1
  movdqa xmm7, [%1+16*7]   ; x7
  movdqa xmm2, xmm4        ; t1

  pmulhw xmm0, xmm3        ; x3*(t3-1)
  pmulhw xmm1, xmm5        ; x5*(t3-1)
  paddsw xmm0, xmm3        ; x3*t3
  paddsw xmm1, xmm5        ; x5*t3
  psubsw xmm0, xmm5        ; x3*t3-x5 = tm35
  paddsw xmm1, xmm3        ; x3+x5*t3 = tp35

  pmulhw xmm4, xmm7        ; x7*t1
  pmulhw xmm2, xmm6        ; x1*t1
  paddsw xmm4, xmm6        ; x1+t1*x7 = tp17
  psubsw xmm2, xmm7        ; x1*t1-x7 = tm17


  movdqa xmm3, [sqrt2]
  movdqa xmm7, xmm4
  movdqa xmm6, xmm2
  psubsw xmm4, xmm1        ; tp17-tp35 = t1
  psubsw xmm2, xmm0        ; tm17-tm35 = b3
  paddsw xmm1, xmm7        ; tp17+tp35 = b0
  paddsw xmm0, xmm6        ; tm17+tm35 = t2

  ; xmm1 = b0, xmm2 = b3. preserved

  movdqa xmm6, xmm4
  psubsw xmm4, xmm0        ; t1-t2
  paddsw xmm0, xmm6        ; t1+t2

  pmulhw xmm4, xmm3        ; (t1-t2)/(2.sqrt2)
  pmulhw xmm0, xmm3        ; (t1+t2)/(2.sqrt2)

  paddsw xmm0, xmm0        ; 2.(t1+t2) = b1
  paddsw xmm4, xmm4        ; 2.(t1-t2) = b2

  movdqa xmm7, [tan2]      ; t2
  movdqa xmm3, [%1+2*16]   ; x2
  movdqa xmm6, [%1+6*16]   ; x6
  movdqa xmm5, xmm7        ; t2

  pmulhw xmm7, xmm6        ; x6*t2
  pmulhw xmm5, xmm3        ; x2*t2

  paddsw xmm7, xmm3        ; x2+x6*t2 = tp26
  psubsw xmm5, xmm6        ; x2*t2-x6 = tm26


  ; use:xmm3,xmm5,xmm6,xmm7   frozen: xmm0,xmm4,xmm1,xmm2

  movdqa xmm3, [%1+0*16]   ; x0
  movdqa xmm6, [%1+4*16]   ; x4

  movdqa [%1 ], xmm2       ; we spill 1 reg to perform safe butterflies

  movdqa xmm2, xmm3
  psubsw xmm3, xmm6        ; x0-x4 = tm04
  paddsw xmm6, xmm2        ; x0+x4 = tp04

  movdqa xmm2, xmm6
  psubsw xmm6, xmm7        ; tp04-tp26 = a3
  paddsw xmm7, xmm2        ; tp04+tp26 = a0
  movdqa xmm2, xmm3
  psubsw xmm3, xmm5        ; tm04-tm26
  paddsw xmm5, xmm2        ; tm04+tm26

  movdqa xmm2, xmm5
  psubsw xmm5, xmm0        ; - b1 => out6
  paddsw xmm0, xmm2        ; + b1 => out1
  movdqa xmm2, xmm3
  psubsw xmm3, xmm4        ; - b2 => out5
  paddsw xmm4, xmm2        ; + b2 => out2

  movdqa xmm2, [%1]        ; reload spilled b3

  psraw xmm5, 6            ; out6
  psraw xmm3, 6            ; out5
  psraw xmm0, 6            ; out1
  psraw xmm4, 6            ; out2

  movdqa [%1+6*16], xmm5
  movdqa [%1+5*16], xmm3
  movdqa [%1+1*16], xmm0
  movdqa [%1+2*16], xmm4

  ; reminder: xmm1=b0, xmm2=b3, xmm7=a0, xmm6=a3

  movdqa xmm0, xmm7
  movdqa xmm4, xmm6
  psubsw xmm7, xmm1        ; a0-b0
  psubsw xmm6, xmm2        ; a3-b3
  paddsw xmm1, xmm0        ; a0+b0
  paddsw xmm2, xmm4        ; a3+b3

  psraw xmm1, 6            ; out0
  psraw xmm7, 6            ; out7
  psraw xmm2, 6            ; out3
  psraw xmm6, 6            ; out4

  ; store result

  movdqa [%1+0*16], xmm1
  movdqa [%1+3*16], xmm2
  movdqa [%1+4*16], xmm6
  movdqa [%1+7*16], xmm7

%endmacro

;-----------------------------------------------------------------------------
; Helper macro TEST_ROW (test a null row)
;-----------------------------------------------------------------------------

; Tests whether the 16 bytes at %1 are all zero (one 8-word dct row);
; jumps to %2 if so. Clobbers _EAX and _EDX.
%macro TEST_ROW 2     ; %1:src, %2:label x8
  mov _EAX, [%1   ]
  mov _EDX, [%1+ 8]
  or  _EAX, [%1+ 4]
  or  _EDX, [%1+12]
  or  _EAX, _EDX
  jz near %2
%endmacro

;-----------------------------------------------------------------------------
; Function idct (this one skips null rows)
;-----------------------------------------------------------------------------
; IEEE1180 and Walken compatible version
;
; void idct_sse2_skal(int16_t block[64]);
; In:   prm1 = pointer to an 8x8 block of 16-bit coefficients,
;       assumed 16-byte aligned (all accesses use movdqa)
; Out:  block transformed in place
; Clobbers: _EAX, _EDX, _ECX, xmm0-xmm5; xmm6/xmm7 are preserved via
;       PUSH_XMM6_XMM7 / POP_XMM6_XMM7 (needed for the Win64 ABI)
;
; Rows 0-2 that test all-zero are replaced by their precomputed rounder
; output (only the DC rounder contributes); later all-zero rows already
; hold the correct value (their rounders descale to zero) and are simply
; skipped.

ALIGN SECTION_ALIGN
idct_sse2_skal:

  PUSH_XMM6_XMM7

  mov _ECX, prm1  ; Src

  TEST_ROW _ECX, .Row0_Round
  iMTX_MULT  0, iTab1, Walken_Idct_Rounders + 16*0, 11
  jmp .Row1
.Row0_Round:
  movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 8*0]
  movdqa [_ECX  ], xmm0

.Row1:
  TEST_ROW _ECX+16, .Row1_Round
  iMTX_MULT  1, iTab2, Walken_Idct_Rounders + 16*1, 11
  jmp .Row2
.Row1_Round:
  movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 16*1]
  movdqa [_ECX+16 ], xmm0

.Row2:
  TEST_ROW _ECX+32, .Row2_Round
  iMTX_MULT  2, iTab3, Walken_Idct_Rounders + 16*2, 11
  jmp .Row3
.Row2_Round:
  movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 16*2]
  movdqa [_ECX+32 ], xmm0

.Row3:
  TEST_ROW _ECX+48, .Row4
  iMTX_MULT  3, iTab4, Walken_Idct_Rounders + 16*3, 11

.Row4:
  TEST_ROW _ECX+64, .Row5
  iMTX_MULT  4, iTab1, Walken_Idct_Rounders + 16*4, 11

.Row5:
  TEST_ROW _ECX+80, .Row6
  iMTX_MULT  5, iTab4, Walken_Idct_Rounders + 16*5, 11

.Row6:
  TEST_ROW _ECX+96, .Row7
  iMTX_MULT  6, iTab3, Walken_Idct_Rounders + 16*6, 11

.Row7:
  TEST_ROW _ECX+112, .End
  iMTX_MULT  7, iTab2, Walken_Idct_Rounders + 16*7, 11
.End:

  ; vertical pass + final >>6 descaling, in place
  iLLM_PASS _ECX

  POP_XMM6_XMM7
  ret
ENDFUNC

;-----------------------------------------------------------------------------
; Helper macro fLLM_PASS
;-----------------------------------------------------------------------------
;
; Vertical fdct pass (Loeffler/Ligtenberg/Moschytz scheme) over the whole
; 8x8 block at %1, columns in parallel (one column per word lane), with a
; pre-scaling left shift of %2 bits for precision. Works in place.
; Clobbers xmm0-xmm7.
;-----------------------------------------------------------------------------

%macro fLLM_PASS 2  ; %1: src/dst, %2:Shift

  movdqa xmm0, [%1+0*16]   ; In0
  movdqa xmm2, [%1+2*16]   ; In2
  movdqa xmm3, xmm0
  movdqa xmm4, xmm2
  movdqa xmm7, [%1+7*16]   ; In7
  movdqa xmm5, [%1+5*16]   ; In5

  psubsw xmm0, xmm7        ; t7 = In0-In7
  paddsw xmm7, xmm3        ; t0 = In0+In7
  psubsw xmm2, xmm5        ; t5 = In2-In5
  paddsw xmm5, xmm4        ; t2 = In2+In5

  movdqa xmm3, [%1+3*16]   ; In3
  movdqa xmm4, [%1+4*16]   ; In4
  movdqa xmm1, xmm3
  psubsw xmm3, xmm4        ; t4 = In3-In4
  paddsw xmm4, xmm1        ; t3 = In3+In4
  movdqa xmm6, [%1+6*16]   ; In6
  movdqa xmm1, [%1+1*16]   ; In1
  psubsw xmm1, xmm6        ; t6 = In1-In6
  paddsw xmm6, [%1+1*16]   ; t1 = In1+In6

  psubsw xmm7, xmm4        ; tm03 = t0-t3
  psubsw xmm6, xmm5        ; tm12 = t1-t2
  paddsw xmm4, xmm4        ; 2.t3
  paddsw xmm5, xmm5        ; 2.t2
  paddsw xmm4, xmm7        ; tp03 = t0+t3
  paddsw xmm5, xmm6        ; tp12 = t1+t2

  psllw xmm2, %2+1         ; shift t5 (shift +1 to..
  psllw xmm1, %2+1         ; shift t6  ..compensate cos4/2)
  psllw xmm4, %2           ; shift t3
  psllw xmm5, %2           ; shift t2
  psllw xmm7, %2           ; shift t0
  psllw xmm6, %2           ; shift t1
  psllw xmm3, %2           ; shift t4
  psllw xmm0, %2           ; shift t7

  psubsw xmm4, xmm5        ; out4 = tp03-tp12
  psubsw xmm1, xmm2        ; xmm1: t6-t5
  paddsw xmm5, xmm5
  paddsw xmm2, xmm2
  paddsw xmm5, xmm4        ; out0 = tp03+tp12
  movdqa [%1+4*16], xmm4   ; => out4
  paddsw xmm2, xmm1        ; xmm2: t6+t5
  movdqa [%1+0*16], xmm5   ; => out0

  movdqa xmm4, [tan2]      ; xmm4 <= tan2
  pmulhw xmm4, xmm7        ; tm03*tan2
  movdqa xmm5, [tan2]      ; xmm5 <= tan2
  psubsw xmm4, xmm6        ; out6 = tm03*tan2 - tm12
  pmulhw xmm5, xmm6        ; tm12*tan2
  paddsw xmm5, xmm7        ; out2 = tm12*tan2 + tm03

  movdqa xmm6, [sqrt2]
  movdqa xmm7, [Rounder1]

  pmulhw xmm2, xmm6        ; xmm2: tp65 = (t6 + t5)*cos4
  por    xmm5, xmm7        ; correct out2 (force LSB to 1)
  por    xmm4, xmm7        ; correct out6 (force LSB to 1)
  pmulhw xmm1, xmm6        ; xmm1: tm65 = (t6 - t5)*cos4
  por    xmm2, xmm7        ; correct tp65

  movdqa [%1+2*16], xmm5   ; => out2
  movdqa xmm5, xmm3        ; save t4
  movdqa [%1+6*16], xmm4   ; => out6
  movdqa xmm4, xmm0        ; save t7

  psubsw xmm3, xmm1        ; xmm3: tm465 = t4 - tm65
  psubsw xmm0, xmm2        ; xmm0: tm765 = t7 - tp65
  paddsw xmm2, xmm4        ; xmm2: tp765 = t7 + tp65
  paddsw xmm1, xmm5        ; xmm1: tp465 = t4 + tm65

  movdqa xmm4, [tan3]      ; tan3 - 1
  movdqa xmm5, [tan1]      ; tan1

  movdqa xmm7, xmm3        ; save tm465
  pmulhw xmm3, xmm4        ; tm465*(tan3-1)
  movdqa xmm6, xmm1        ; save tp465
  pmulhw xmm1, xmm5        ; tp465*tan1

  paddsw xmm3, xmm7        ; tm465*tan3
  pmulhw xmm4, xmm0        ; tm765*(tan3-1)
  paddsw xmm4, xmm0        ; tm765*tan3
  pmulhw xmm5, xmm2        ; tp765*tan1

  paddsw xmm1, xmm2        ; out1 = tp765 + tp465*tan1
  psubsw xmm0, xmm3        ; out3 = tm765 - tm465*tan3
  paddsw xmm7, xmm4        ; out5 = tm465 + tm765*tan3
  psubsw xmm5, xmm6        ; out7 =-tp465 + tp765*tan1

  movdqa [%1+1*16], xmm1   ; => out1
  movdqa [%1+3*16], xmm0   ; => out3
  movdqa [%1+5*16], xmm7   ; => out5
  movdqa [%1+7*16], xmm5   ; => out7

%endmacro

;-----------------------------------------------------------------------------
; Helper macro fMTX_MULT
;
; Horizontal fdct pass for one row: folds the row into (a,b) butterfly
; halves, multiplies them against a coefficient table with pmaddwd, then
; rounds and descales by 4 into the [-2048, 2047] coefficient range.
; Operates in place on row %1 of the block pointed to by _ECX.
;   %1 = row index (0..7), %2 = coefficient table, %3 = word rounder.
; Clobbers xmm0-xmm3.
;-----------------------------------------------------------------------------

%macro fMTX_MULT 3   ; %1=src, %2 = Coeffs, %3=rounders

  movdqa  xmm0, [_ECX+%1*16+0]  ; xmm0 = [0123][4567]
  pshufhw xmm1, xmm0, 00011011b ; xmm1 = [----][7654]
  pshufd  xmm0, xmm0, 01000100b ; xmm0 = [0123][0123]
  pshufd  xmm1, xmm1, 11101110b ; xmm1 = [7654][7654]

  movdqa  xmm2, xmm0
  paddsw  xmm0, xmm1    ; xmm0 = [a0 a1 a2 a3]
  psubsw  xmm2, xmm1    ; xmm2 = [b0 b1 b2 b3]

  punpckldq xmm0, xmm2          ; xmm0 = [a0 a1 b0 b1][a2 a3 b2 b3]
  pshufd  xmm2, xmm0, 01001110b ; xmm2 = [a2 a3 b2 b3][a0 a1 b0 b1]

  ; [M00 M01 M16 M17] [M06 M07 M22 M23] x mm0 = [0 /1 /2'/3']
  ; [M02 M03 M18 M19] [M04 M05 M20 M21] x mm2 = [0'/1'/2 /3 ]
  ; [M08 M09 M24 M25] [M14 M15 M30 M31] x mm0 = [4 /5 /6'/7']
  ; [M10 M11 M26 M27] [M12 M13 M28 M29] x mm2 = [4'/5'/6 /7 ]

  movdqa  xmm1, [%2+16]
  movdqa  xmm3, [%2+32]
  pmaddwd xmm1, xmm2
  pmaddwd xmm3, xmm0
  pmaddwd xmm2, [%2+48]
  pmaddwd xmm0, [%2+ 0]

  paddd   xmm0, xmm1    ; [ out0 | out1 ][ out2 | out3 ]
  paddd   xmm2, xmm3    ; [ out4 | out5 ][ out6 | out7 ]
  psrad   xmm0, 16
  psrad   xmm2, 16

  packssdw xmm0, xmm2   ; [ out0 .. out7 ]
  paddsw  xmm0, [%3]    ; Round

  psraw   xmm0, 4       ; => [-2048, 2047]

  movdqa  [_ECX+%1*16+0], xmm0
%endmacro

;-----------------------------------------------------------------------------
; Function Forward DCT
;-----------------------------------------------------------------------------
; void fdct_sse2_skal(int16_t block[64]);
; In:   prm1 = pointer to an 8x8 block of 16-bit samples,
;       assumed 16-byte aligned (all accesses use movdqa)
; Out:  block transformed in place (vertical pass, then one horizontal
;       pass per row with per-row tables/rounders)
; Clobbers: _ECX, xmm0-xmm5; xmm6/xmm7 preserved via PUSH_XMM6_XMM7 /
;       POP_XMM6_XMM7 (needed for the Win64 ABI)

ALIGN SECTION_ALIGN
fdct_sse2_skal:
  PUSH_XMM6_XMM7
  mov _ECX, prm1
  fLLM_PASS _ECX+0, 3
  fMTX_MULT  0, fTab1, Fdct_Rnd0
  fMTX_MULT  1, fTab2, Fdct_Rnd2
  fMTX_MULT  2, fTab3, Fdct_Rnd1
  fMTX_MULT  3, fTab4, Fdct_Rnd1
  fMTX_MULT  4, fTab1, Fdct_Rnd0
  fMTX_MULT  5, fTab4, Fdct_Rnd1
  fMTX_MULT  6, fTab3, Fdct_Rnd1
  fMTX_MULT  7, fTab2, Fdct_Rnd1

  POP_XMM6_XMM7
  ret
ENDFUNC

; Mac-specific workaround for misaligned DCT tables
ALIGN SECTION_ALIGN
  times 8 dw 0

NON_EXEC_STACK