;/****************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  - SSE2 optimized SAD operators -
; *
; *  Copyright(C) 2003-2010 Pascal Massimino <skal@planet-d.net>
; *               2008-2010 Michael Militzer <michael@xvid.org>
; *
; *
; *  This program is free software; you can redistribute it and/or modify it
; *  under the terms of the GNU General Public License as published by
; *  the Free Software Foundation; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
; *
; * $Id: sad_sse2.asm,v 1.21 2010-11-28 15:18:21 Isibaar Exp $
; *
; ***************************************************************************/

%include "nasm.inc"

;=============================================================================
; Read only data
;=============================================================================

DATA

ALIGN SECTION_ALIGN
zero    times 4 dd 0

ALIGN SECTION_ALIGN
ones    times 8 dw 1

ALIGN SECTION_ALIGN
round32 times 4 dd 32

;=============================================================================
; Coeffs for MSE_H calculation
;=============================================================================

ALIGN SECTION_ALIGN
iMask_Coeff:
  dw     0, 29788, 32767, 20479, 13653,  8192,  6425,  5372,
  dw 27306, 27306, 23405, 17246, 12603,  5650,  5461,  5958,
  dw 23405, 25205, 20479, 13653,  8192,  5749,  4749,  5851,
  dw 23405, 19275, 14894, 11299,  6425,  3766,  4096,  5285,
  dw 18204, 14894,  8856,  5851,  4819,  3006,  3181,  4255,
  dw 13653,  9362,  5958,  5120,  4045,  3151,  2900,  3562,
  dw  6687,  5120,  4201,  3766,  3181,  2708,  2730,  3244,
  dw  4551,  3562,  3449,  3344,  2926,  3277,  3181,  3310

ALIGN SECTION_ALIGN
Inv_iMask_Coeff:
  dd    0,   155,   128,   328,   737,  2048,  3329,  4763,
  dd  184,   184,   251,   462,   865,  4306,  4608,  3872,
  dd  251,   216,   328,   737,  2048,  4159,  6094,  4014,
  dd  251,   370,   620,  1076,  3329,  9688,  8192,  4920,
  dd  415,   620,  1752,  4014,  5919, 15207, 13579,  7589,
  dd  737,  1568,  3872,  5243,  8398, 13844, 16345, 10834,
  dd 3073,  5243,  7787,  9688, 13579, 18741, 18433, 13057,
  dd 6636, 10834, 11552, 12294, 16056, 12800, 13579, 12545

ALIGN SECTION_ALIGN
iCSF_Coeff:
  dw 26353, 38331, 42164, 26353, 17568, 10541,  8268,  6912,
  dw 35137, 35137, 30117, 22192, 16217,  7270,  7027,  7666,
  dw 30117, 32434, 26353, 17568, 10541,  7397,  6111,  7529,
  dw 30117, 24803, 19166, 14539,  8268,  4846,  5271,  6801,
  dw 23425, 19166, 11396,  7529,  6201,  3868,  4094,  5476,
  dw 17568, 12047,  7666,  6588,  5205,  4054,  3731,  4583,
  dw  8605,  6588,  5406,  4846,  4094,  3485,  3514,  4175,
  dw  5856,  4583,  4438,  4302,  3765,  4216,  4094,  4259

ALIGN SECTION_ALIGN
iCSF_Round:
  dw 1, 1, 1, 1, 2, 3, 4, 5,
  dw 1, 1, 1, 1, 2, 5, 5, 4,
  dw 1, 1, 1, 2, 3, 4, 5, 4,
  dw 1, 1, 2, 2, 4, 7, 6, 5,
  dw 1, 2, 3, 4, 5, 8, 8, 6,
  dw 2, 3, 4, 5, 6, 8, 9, 7,
  dw 4, 5, 6, 7, 8, 9, 9, 8,
  dw 6, 7, 7, 8, 9, 8, 8, 8
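;-----------------------------------------------------------------------------
; Note on the tables above (commentary, not from the original source): the
; entries of iCSF_Round match round(32768 / iCSF_Coeff[i]), so the
; paddusw/pmulhuw pair applied to these tables in sseh8_16bit_sse2 below is
; an unsigned Q16 multiply with round-to-nearest. A minimal C sketch of that
; fixed-point step, assuming the tables were exported to C (the helper name
; csf_scale is hypothetical):
;
;   #include <stdint.h>
;
;   static inline uint16_t csf_scale(uint16_t x, uint16_t coeff,
;                                    uint16_t round)
;   {
;       uint32_t t = (uint32_t)x + round;      /* paddusw: saturating add  */
;       if (t > 65535) t = 65535;
;       return (uint16_t)((t * coeff) >> 16);  /* pmulhuw: unsigned high   */
;   }
;-----------------------------------------------------------------------------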
;=============================================================================
; Code
;=============================================================================

TEXT

cglobal sad16_sse2
cglobal dev16_sse2

cglobal sad16_sse3
cglobal dev16_sse3

cglobal sseh8_16bit_sse2
cglobal coeff8_energy_sse2
cglobal blocksum8_sse2

;-----------------------------------------------------------------------------
; uint32_t sad16_sse2 (const uint8_t * const cur, <- assumed aligned!
;                      const uint8_t * const ref,
;                      const uint32_t stride,
;                      const uint32_t /*ignored*/);
;-----------------------------------------------------------------------------


%macro SAD_16x16_SSE2 1
  %1 xmm0, [TMP1]
  %1 xmm1, [TMP1+TMP0]
  lea TMP1, [TMP1+2*TMP0]
  movdqa xmm2, [_EAX]
  movdqa xmm3, [_EAX+TMP0]
  lea _EAX, [_EAX+2*TMP0]
  psadbw xmm0, xmm2
  paddusw xmm4, xmm0
  psadbw xmm1, xmm3
  paddusw xmm4, xmm1
%endmacro

%macro SAD16_SSE2_SSE3 1
  mov _EAX, prm1 ; cur (assumed aligned)
  mov TMP1, prm2 ; ref
  mov TMP0, prm3 ; stride

  pxor xmm4, xmm4 ; accum

  SAD_16x16_SSE2 %1
  SAD_16x16_SSE2 %1
  SAD_16x16_SSE2 %1
  SAD_16x16_SSE2 %1
  SAD_16x16_SSE2 %1
  SAD_16x16_SSE2 %1
  SAD_16x16_SSE2 %1
  SAD_16x16_SSE2 %1

  pshufd xmm5, xmm4, 00000010b
  paddusw xmm4, xmm5
  pextrw eax, xmm4, 0

  ret
%endmacro

ALIGN SECTION_ALIGN
sad16_sse2:
  SAD16_SSE2_SSE3 movdqu
ENDFUNC


ALIGN SECTION_ALIGN
sad16_sse3:
  SAD16_SSE2_SSE3 lddqu
ENDFUNC


;-----------------------------------------------------------------------------
; uint32_t dev16_sse2(const uint8_t * const cur, const uint32_t stride);
;-----------------------------------------------------------------------------

%macro MEAN_16x16_SSE2 1 ; _EAX: src, TMP0: stride, xmm5: zero or mean => xmm4: result
  %1 xmm0, [_EAX]
  %1 xmm1, [_EAX+TMP0]
  lea _EAX, [_EAX+2*TMP0] ; + 2*stride
  psadbw xmm0, xmm5
  paddusw xmm4, xmm0
  psadbw xmm1, xmm5
  paddusw xmm4, xmm1
%endmacro


%macro MEAN16_SSE2_SSE3 1
  mov _EAX, prm1 ; src
  mov TMP0, prm2 ; stride

  pxor xmm4, xmm4 ; accum
  pxor xmm5, xmm5 ; zero

  MEAN_16x16_SSE2 %1
  MEAN_16x16_SSE2 %1
  MEAN_16x16_SSE2 %1
  MEAN_16x16_SSE2 %1

  MEAN_16x16_SSE2 %1
  MEAN_16x16_SSE2 %1
  MEAN_16x16_SSE2 %1
  MEAN_16x16_SSE2 %1

  mov _EAX, prm1 ; src again

  pshufd xmm5, xmm4, 10b
  paddusw xmm5, xmm4
  pxor xmm4, xmm4 ; zero accum
  psrlw xmm5, 8 ; => Mean
  pshuflw xmm5, xmm5, 0 ; replicate Mean
  packuswb xmm5, xmm5
  pshufd xmm5, xmm5, 00000000b

  MEAN_16x16_SSE2 %1
  MEAN_16x16_SSE2 %1
  MEAN_16x16_SSE2 %1
  MEAN_16x16_SSE2 %1

  MEAN_16x16_SSE2 %1
  MEAN_16x16_SSE2 %1
  MEAN_16x16_SSE2 %1
  MEAN_16x16_SSE2 %1

  pshufd xmm5, xmm4, 10b
  paddusw xmm5, xmm4
  pextrw eax, xmm5, 0

  ret
%endmacro

ALIGN SECTION_ALIGN
dev16_sse2:
  MEAN16_SSE2_SSE3 movdqu
ENDFUNC

ALIGN SECTION_ALIGN
dev16_sse3:
  MEAN16_SSE2_SSE3 lddqu
ENDFUNC
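;-----------------------------------------------------------------------------
; A minimal C reference sketch of the two primitives above (illustrative
; only, not the codec's own C fallback; the *_ref names are hypothetical).
; sad16 is a plain 16x16 sum of absolute differences; dev16 is the sum of
; absolute deviations from the block mean, where the mean is the truncating
; sum>>8, exactly as the psrlw xmm5, 8 above.
;
;   #include <stdint.h>
;   #include <stdlib.h>
;
;   uint32_t sad16_ref(const uint8_t *cur, const uint8_t *ref,
;                      uint32_t stride)
;   {
;       uint32_t sad = 0;
;       for (int y = 0; y < 16; y++, cur += stride, ref += stride)
;           for (int x = 0; x < 16; x++)
;               sad += abs((int)cur[x] - (int)ref[x]);
;       return sad;
;   }
;
;   uint32_t dev16_ref(const uint8_t *cur, uint32_t stride)
;   {
;       const uint8_t *p = cur;
;       uint32_t sum = 0, dev = 0;
;       for (int y = 0; y < 16; y++, p += stride)    /* pass 1: block sum  */
;           for (int x = 0; x < 16; x++)
;               sum += p[x];
;       uint8_t mean = (uint8_t)(sum >> 8);          /* truncating mean    */
;       for (int y = 0; y < 16; y++, cur += stride)  /* pass 2: deviation  */
;           for (int x = 0; x < 16; x++)
;               dev += abs((int)cur[x] - (int)mean);
;       return dev;
;   }
;-----------------------------------------------------------------------------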
;-----------------------------------------------------------------------------
; uint32_t coeff8_energy_sse2(const int16_t * dct);
;-----------------------------------------------------------------------------

%macro DCT_ENERGY_SSE2 4

  movdqa %1, [%3 + %4]
  movdqa %2, [%3 + %4 + 16]

  psllw %1, 4
  psllw %2, 4

  pmulhw %1, [iMask_Coeff + %4]
  pmulhw %2, [iMask_Coeff + %4 + 16]

  pmaddwd %1, %1
  pmaddwd %2, %2

  paddd %1, %2
  psrld %1, 3

%endmacro

ALIGN SECTION_ALIGN
coeff8_energy_sse2:

  mov TMP0, prm1 ; DCT_A

  DCT_ENERGY_SSE2 xmm0, xmm1, TMP0, 0
  DCT_ENERGY_SSE2 xmm1, xmm2, TMP0, 32

  DCT_ENERGY_SSE2 xmm2, xmm3, TMP0, 64
  DCT_ENERGY_SSE2 xmm3, xmm4, TMP0, 96

  paddd xmm0, xmm1
  paddd xmm2, xmm3

  paddd xmm0, xmm2 ; A B C D

  ; horizontal sum of the four dwords
  pshufd xmm1, xmm0, 238
  paddd xmm0, xmm1

  pshufd xmm2, xmm0, 85
  paddd xmm0, xmm2

  movd eax, xmm0

  ret
ENDFUNC
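;-----------------------------------------------------------------------------
; An illustrative C model of coeff8_energy_sse2 (a sketch, not the codec's
; own reference code; coeff8_energy_ref is a hypothetical name). Each
; DCT_ENERGY_SSE2 pass above handles two rows of the 8x8 block, and the
; pmaddwd / paddd / psrld 3 sequence applies the >>3 to groups of four
; squared coefficients (a word pair from each of the two rows), which this
; scalar version reproduces lane by lane.
;
;   #include <stdint.h>
;
;   extern const uint16_t iMask_Coeff[64]; /* the DATA table above */
;
;   uint32_t coeff8_energy_ref(const int16_t *dct)
;   {
;       uint32_t energy = 0;
;       for (int r = 0; r < 8; r += 2) {      /* one DCT_ENERGY_SSE2 pass */
;           for (int k = 0; k < 4; k++) {     /* one packed dword lane    */
;               int idx[4] = { 8*r + 2*k,     8*r + 2*k + 1,
;                              8*(r+1) + 2*k, 8*(r+1) + 2*k + 1 };
;               uint32_t e = 0;
;               for (int j = 0; j < 4; j++) {
;                   int16_t s = (int16_t)((uint16_t)dct[idx[j]] << 4);
;                   int16_t t = (int16_t)(((int32_t)s *            /* pmulhw */
;                                (int16_t)iMask_Coeff[idx[j]]) >> 16);
;                   e += (uint32_t)(t * t);                        /* pmaddwd */
;               }
;               energy += e >> 3;             /* psrld 3 */
;           }
;       }
;       return energy;
;   }
;-----------------------------------------------------------------------------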
;-----------------------------------------------------------------------------------
; uint32_t sseh8_16bit_sse2(const int16_t * cur, const int16_t * ref, uint16_t mask)
;-----------------------------------------------------------------------------------

%macro SSEH_SSE2 4
  movdqa xmm0, [%1 + %3]
  movdqa xmm1, [%2 + %3]

  movdqa xmm2, [%1 + %3 + 16]
  movdqa xmm3, [%2 + %3 + 16]


  movdqa xmm4, xmm7 ; MASK
  movdqa xmm5, xmm7

  psubsw xmm0, xmm1 ; A - B
  psubsw xmm2, xmm3


  ; ABS
  pxor xmm1, xmm1
  pxor xmm3, xmm3

  pcmpgtw xmm1, xmm0
  pcmpgtw xmm3, xmm2

  pxor xmm0, xmm1 ; change sign if negative
  pxor xmm2, xmm3 ;

  psubw xmm0, xmm1 ; ABS (A - B)
  psubw xmm2, xmm3 ; ABS (A - B)


  movdqa xmm1, xmm7 ; MASK
  movdqa xmm3, xmm7

  pmaddwd xmm4, [Inv_iMask_Coeff + 2*(%3)]
  pmaddwd xmm5, [Inv_iMask_Coeff + 2*(%3) + 16]

  pmaddwd xmm1, [Inv_iMask_Coeff + 2*(%3) + 32]
  pmaddwd xmm3, [Inv_iMask_Coeff + 2*(%3) + 48]

  psllw xmm0, 4
  psllw xmm2, 4

  paddd xmm4, [round32]
  paddd xmm5, [round32]

  paddd xmm1, [round32]
  paddd xmm3, [round32]

  psrad xmm4, 7
  psrad xmm5, 7

  psrad xmm1, 7
  psrad xmm3, 7

  packssdw xmm4, xmm5 ; Thresh
  packssdw xmm1, xmm3 ; Thresh


  psubusw xmm0, xmm4 ; Decimate by masking effect
  psubusw xmm2, xmm1

  paddusw xmm0, [iCSF_Round + %3]
  paddusw xmm2, [iCSF_Round + %3 + 16]

  pmulhuw xmm0, [iCSF_Coeff + %3]
  pmulhuw xmm2, [iCSF_Coeff + %3 + 16]

  pmaddwd xmm0, xmm0
  pmaddwd xmm2, xmm2

  paddd xmm0, xmm2
%endmacro


ALIGN SECTION_ALIGN
sseh8_16bit_sse2:

  PUSH_XMM6_XMM7

  mov TMP0, prm1 ; DCT_A
  mov TMP1, prm2 ; DCT_B
  mov _EAX, prm3 ; MASK

  movd xmm7, eax
  pshufd xmm7, xmm7, 0

  SSEH_SSE2 TMP0, TMP1, 0, xmm7
  movdqa xmm6, xmm0
  SSEH_SSE2 TMP0, TMP1, 32, xmm7
  paddd xmm6, xmm0
  SSEH_SSE2 TMP0, TMP1, 64, xmm7
  paddd xmm6, xmm0
  SSEH_SSE2 TMP0, TMP1, 96, xmm7
  paddd xmm6, xmm0

  ; horizontal sum of the four dwords
  pshufd xmm1, xmm6, 238
  paddd xmm6, xmm1

  pshufd xmm2, xmm6, 85
  paddd xmm6, xmm2


  movd eax, xmm6

  POP_XMM6_XMM7
  ret
ENDFUNC

;--------------------------------------------------------------------------------------------
; uint32_t blocksum8_sse2(const int8_t * cur, int stride, uint16_t sums[4], uint32_t squares[4])
;--------------------------------------------------------------------------------------------

%macro BLOCKSUM_SSE2 3
  movq xmm0, [%1]        ; 0 0 B A
  movq xmm2, [%1 + %2]   ; 0 0 B A
  movq xmm1, [%1 + 2*%2]
  movq xmm3, [%1 + %3]

  punpckldq xmm0, xmm2 ; B B A A
  punpckldq xmm1, xmm3 ; B B A A

  movdqa xmm2, xmm0
  movdqa xmm3, xmm1

  psadbw xmm0, xmm7 ; 000b000a
  psadbw xmm1, xmm7

  movdqa xmm4, xmm2
  movdqa xmm5, xmm3

  punpcklbw xmm2, xmm7 ; aaaaaaaa
  punpcklbw xmm3, xmm7

  punpckhbw xmm4, xmm7 ; bbbbbbbb
  punpckhbw xmm5, xmm7

  pmaddwd xmm2, xmm2 ; a*a+a*a a*a+a*a a*a+a*a a*a+a*a
  pmaddwd xmm3, xmm3

  pmaddwd xmm4, xmm4 ; b*b+b*b b*b+b*b b*b+b*b b*b+b*b
  pmaddwd xmm5, xmm5

  paddd xmm2, xmm3
  paddd xmm4, xmm5

  movdqa xmm3, xmm2
  punpckldq xmm2, xmm4 ; BABA
  punpckhdq xmm3, xmm4 ; BABA

  paddd xmm2, xmm3

  lea %1, [%1 + 4*%2]

  movdqa xmm4, xmm2
  punpckhqdq xmm4, xmm7 ; move high qword down (xmm7 is zero)

  paddd xmm2, xmm4

  ; second half: rows 4..7 (sub-blocks C and D)
  movq xmm3, [%1]        ; 0 0 D C
  movq xmm5, [%1 + %2]   ; 0 0 D C
  movq xmm4, [%1 + 2*%2]
  movq xmm6, [%1 + %3]

  punpckldq xmm3, xmm5 ; D D C C
  punpckldq xmm4, xmm6 ; D D C C

  movdqa xmm5, xmm3
  movdqa xmm6, xmm4

  psadbw xmm3, xmm7 ; 000d000c
  psadbw xmm4, xmm7

  packssdw xmm0, xmm3 ; 0d0c0b0a
  packssdw xmm1, xmm4

  paddusw xmm0, xmm1
  packssdw xmm0, xmm7 ; 0000dcba


  movdqa xmm3, xmm5
  movdqa xmm4, xmm6

  punpcklbw xmm3, xmm7
  punpcklbw xmm4, xmm7

  punpckhbw xmm5, xmm7
  punpckhbw xmm6, xmm7

  pmaddwd xmm3, xmm3 ; C*C+C*C
  pmaddwd xmm4, xmm4

  pmaddwd xmm5, xmm5 ; D*D+D*D
  pmaddwd xmm6, xmm6

  paddd xmm3, xmm4
  paddd xmm5, xmm6

  movdqa xmm1, xmm3
  punpckldq xmm3, xmm5 ; DCDC
  punpckhdq xmm1, xmm5 ; DCDC

  paddd xmm3, xmm1

  movdqa xmm4, xmm3
  punpckhqdq xmm4, xmm7 ; move high qword down (xmm7 is zero)

  paddd xmm3, xmm4
  punpcklqdq xmm2, xmm3
%endmacro


ALIGN SECTION_ALIGN
blocksum8_sse2:

  PUSH_XMM6_XMM7

  mov TMP0, prm1 ; cur
  mov TMP1, prm2 ; stride
  mov _EAX, prm3 ; sums

  push _EBP
  lea _EBP, [TMP1 + 2*TMP1]

  pxor xmm7, xmm7

  BLOCKSUM_SSE2 TMP0, TMP1, _EBP

  pop _EBP
  mov TMP0, prm4 ; squares

  movq [_EAX], xmm0 ; sums of the 4x4 sub-blocks
  movdqa [TMP0], xmm2 ; squares of the 4x4 sub-blocks

  pmaddwd xmm0, [ones]
  packssdw xmm0, xmm7

  pmaddwd xmm0, [ones]
  movd eax, xmm0

  POP_XMM6_XMM7
  ret
ENDFUNC

NON_EXEC_STACK
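;-----------------------------------------------------------------------------
; An illustrative C model of blocksum8_sse2 (a sketch; blocksum8_ref is a
; hypothetical name). The 8x8 block is split into four 4x4 sub-blocks,
; ordered top-left, top-right, bottom-left, bottom-right; psadbw treats the
; pixels as unsigned, hence uint8_t here. Note that the SSE2 version stores
; squares[] with movdqa, so that array must be 16-byte aligned.
;
;   #include <stdint.h>
;
;   uint32_t blocksum8_ref(const uint8_t *cur, int stride,
;                          uint16_t sums[4], uint32_t squares[4])
;   {
;       uint32_t total = 0;
;       for (int q = 0; q < 4; q++) {    /* q: 4x4 sub-block index */
;           const uint8_t *p = cur + (q >> 1) * 4 * stride + (q & 1) * 4;
;           uint32_t s = 0, sq = 0;
;           for (int y = 0; y < 4; y++, p += stride)
;               for (int x = 0; x < 4; x++) {
;                   s  += p[x];
;                   sq += (uint32_t)p[x] * p[x];
;               }
;           sums[q]    = (uint16_t)s;    /* fits: at most 16*255 */
;           squares[q] = sq;
;           total     += s;
;       }
;       return total;                    /* sum over the whole 8x8 block */
;   }
;-----------------------------------------------------------------------------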