1;***************************************************************************** 2;* sad-a.asm: x86 sad functions 3;***************************************************************************** 4;* Copyright (C) 2003-2013 x264 project 5;* 6;* Authors: Loren Merritt <lorenm@u.washington.edu> 7;* Fiona Glaser <fiona@x264.com> 8;* Laurent Aimar <fenrir@via.ecp.fr> 9;* Alex Izvorski <aizvorksi@gmail.com> 10;* Min Chen <chenm003@163.com> 11;* 12;* This program is free software; you can redistribute it and/or modify 13;* it under the terms of the GNU General Public License as published by 14;* the Free Software Foundation; either version 2 of the License, or 15;* (at your option) any later version. 16;* 17;* This program is distributed in the hope that it will be useful, 18;* but WITHOUT ANY WARRANTY; without even the implied warranty of 19;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20;* GNU General Public License for more details. 21;* 22;* You should have received a copy of the GNU General Public License 23;* along with this program; if not, write to the Free Software 24;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. 25;* 26;* This program is also available under a commercial proprietary license. 27;* For more information, contact us at license @ x265.com. 28;***************************************************************************** 29 30%include "x86inc.asm" 31%include "x86util.asm" 32 33SECTION_RODATA 32 34 35MSK: db 255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0 36 37SECTION .text 38 39cextern pb_3 40cextern pb_shuf8x8c 41cextern pw_8 42cextern pd_64 43 44;============================================================================= 45; SAD MMX 46;============================================================================= 47 48%macro SAD_INC_2x16P 0 49 movq mm1, [r0] 50 movq mm2, [r0+8] 51 movq mm3, [r0+r1] 52 movq mm4, [r0+r1+8] 53 psadbw mm1, [r2] 54 psadbw mm2, [r2+8] 55 psadbw mm3, [r2+r3] 56 psadbw mm4, [r2+r3+8] 57 lea r0, [r0+2*r1] 58 paddw mm1, mm2 59 paddw mm3, mm4 60 lea r2, [r2+2*r3] 61 paddw mm0, mm1 62 paddw mm0, mm3 63%endmacro 64 65%macro SAD_INC_2x8P 0 66 movq mm1, [r0] 67 movq mm2, [r0+r1] 68 psadbw mm1, [r2] 69 psadbw mm2, [r2+r3] 70 lea r0, [r0+2*r1] 71 paddw mm0, mm1 72 paddw mm0, mm2 73 lea r2, [r2+2*r3] 74%endmacro 75 76%macro SAD_INC_2x4P 0 77 movd mm1, [r0] 78 movd mm2, [r2] 79 punpckldq mm1, [r0+r1] 80 punpckldq mm2, [r2+r3] 81 psadbw mm1, mm2 82 paddw mm0, mm1 83 lea r0, [r0+2*r1] 84 lea r2, [r2+2*r3] 85%endmacro 86 87;----------------------------------------------------------------------------- 88; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t ) 89;----------------------------------------------------------------------------- 90%macro SAD 2 91cglobal pixel_sad_%1x%2_mmx2, 4,4 92 pxor mm0, mm0 93%rep %2/2 94 SAD_INC_2x%1P 95%endrep 96 movd eax, mm0 97 RET 98%endmacro 99 100SAD 16, 16 101SAD 16, 8 102SAD 8, 16 103SAD 8, 8 104SAD 8, 4 105SAD 4, 16 106SAD 4, 8 107SAD 4, 4 108 109 110 111;============================================================================= 112; SAD XMM 113;============================================================================= 114 115%macro SAD_END_SSE2 0 116 movhlps m1, m0 117 paddw m0, m1 118 movd eax, m0 119 RET 120%endmacro 121 122%macro PROCESS_SAD_12x4 0 123 movu m1, [r2] 124 movu m2, [r0] 125 pand m1, m4 126 pand m2, m4 127 psadbw m1, m2 128 paddd m0, m1 129 lea r2, [r2 + r3] 130 lea r0, [r0 + r1] 131 movu m1, [r2] 132 movu m2, [r0] 133 pand m1, m4 134 pand m2, m4 135 psadbw m1, m2 136 paddd m0, m1 137 lea r2, [r2 + r3] 138 lea r0, [r0 + r1] 139 movu m1, [r2] 140 movu m2, [r0] 141 pand m1, m4 142 pand m2, m4 143 psadbw m1, m2 144 paddd m0, m1 145 lea r2, [r2 + r3] 146 lea r0, [r0 + r1] 147 movu m1, [r2] 148 movu m2, [r0] 149 pand m1, m4 150 pand m2, m4 151 psadbw m1, m2 152 paddd m0, m1 153%endmacro 154 155%macro PROCESS_SAD_16x4 0 156 movu m1, [r2] 157 movu m2, [r2 + r3] 158 psadbw m1, [r0] 159 psadbw m2, [r0 + r1] 160 paddd m1, m2 161 paddd m0, m1 162 lea r2, [r2 + 2 * r3] 163 lea r0, [r0 + 2 * r1] 164 movu m1, [r2] 165 movu m2, [r2 + r3] 166 psadbw m1, [r0] 167 psadbw m2, [r0 + r1] 168 paddd m1, m2 169 paddd m0, m1 170 lea r2, [r2 + 2 * r3] 171 lea r0, [r0 + 2 * r1] 172%endmacro 173 174%macro PROCESS_SAD_24x4 0 175 movu m1, [r2] 176 movq m2, [r2 + 16] 177 lea r2, [r2 + r3] 178 movu m3, [r2] 179 movq m4, [r2 + 16] 180 psadbw m1, [r0] 181 psadbw m3, [r0 + r1] 182 paddd m0, m1 183 paddd m0, m3 184 movq m1, [r0 + 16] 185 lea r0, [r0 + r1] 186 movq m3, [r0 + 16] 187 punpcklqdq m2, m4 188 punpcklqdq m1, m3 189 psadbw m2, m1 190 paddd m0, m2 191 lea r2, [r2 + r3] 192 lea r0, [r0 + r1] 193 194 movu m1, [r2] 195 movq m2, [r2 + 16] 196 lea r2, [r2 + r3] 197 movu m3, [r2] 198 movq m4, [r2 + 16] 199 psadbw m1, [r0] 200 psadbw m3, [r0 + r1] 201 paddd m0, m1 202 paddd m0, m3 203 movq m1, [r0 + 16] 204 lea r0, [r0 + r1] 205 movq m3, [r0 + 16] 206 punpcklqdq m2, m4 207 punpcklqdq m1, m3 208 psadbw m2, m1 209 paddd m0, m2 210%endmacro 211 212%macro PROCESS_SAD_32x4 0 213 movu m1, [r2] 214 movu m2, [r2 + 16] 215 psadbw m1, [r0] 216 psadbw m2, [r0 + 16] 217 paddd m1, m2 218 paddd m0, m1 219 lea r2, [r2 + r3] 220 lea r0, [r0 + r1] 221 movu m1, [r2] 222 movu m2, [r2 + 16] 223 psadbw m1, [r0] 224 psadbw m2, [r0 + 16] 225 paddd m1, m2 226 paddd m0, m1 227 lea r2, [r2 + r3] 228 lea r0, [r0 + r1] 229 movu m1, [r2] 230 movu m2, [r2 + 16] 231 psadbw m1, [r0] 232 psadbw m2, [r0 + 16] 233 paddd m1, m2 234 paddd m0, m1 235 lea r2, [r2 + r3] 236 lea r0, [r0 + r1] 237 movu m1, [r2] 238 movu m2, [r2 + 16] 239 psadbw m1, [r0] 240 psadbw m2, [r0 + 16] 241 paddd m1, m2 242 paddd m0, m1 243 lea r2, [r2 + r3] 244 lea r0, [r0 + r1] 245%endmacro 246 247%macro PROCESS_SAD_48x4 0 248 movu m1, [r2] 249 movu m2, [r2 + 16] 250 movu m3, [r2 + 32] 251 psadbw m1, [r0] 252 psadbw m2, [r0 + 16] 253 psadbw m3, [r0 + 32] 254 paddd m1, m2 255 paddd m0, m1 256 paddd m0, m3 257 lea r2, [r2 + r3] 258 lea r0, [r0 + r1] 259 260 movu m1, [r2] 261 movu m2, [r2 + 16] 262 movu m3, [r2 + 32] 263 psadbw m1, [r0] 264 psadbw m2, [r0 + 16] 265 psadbw m3, [r0 + 32] 266 paddd m1, m2 267 paddd m0, m1 268 paddd m0, m3 269 lea r2, [r2 + r3] 270 lea r0, [r0 + r1] 271 272 movu m1, [r2] 273 movu m2, [r2 + 16] 274 movu m3, [r2 + 32] 275 psadbw m1, [r0] 276 psadbw m2, [r0 + 16] 277 psadbw m3, [r0 + 32] 278 paddd m1, m2 279 paddd m0, m1 280 paddd m0, m3 281 lea r2, [r2 + r3] 282 lea r0, [r0 + r1] 283 284 movu m1, [r2] 285 movu m2, [r2 + 16] 286 movu m3, [r2 + 32] 287 psadbw m1, [r0] 288 psadbw m2, [r0 + 16] 289 psadbw m3, [r0 + 32] 290 paddd m1, m2 291 paddd m0, m1 292 paddd m0, m3 293%endmacro 294 295%macro PROCESS_SAD_8x4 0 296 movq m1, [r2] 297 movq m2, [r2 + r3] 298 lea r2, [r2 + 2 * r3] 299 movq m3, [r0] 300 movq m4, [r0 + r1] 301 lea r0, [r0 + 2 * r1] 302 punpcklqdq m1, m2 303 punpcklqdq m3, m4 304 psadbw m1, m3 305 paddd m0, m1 306 movq m1, [r2] 307 movq m2, [r2 + r3] 308 lea r2, [r2 + 2 * r3] 309 movq m3, [r0] 310 movq m4, [r0 + r1] 311 lea r0, [r0 + 2 * r1] 312 punpcklqdq m1, m2 313 punpcklqdq m3, m4 314 psadbw m1, m3 315 paddd m0, m1 316%endmacro 317 318%macro PROCESS_SAD_64x4 0 319 movu m1, [r2] 320 movu m2, [r2 + 16] 321 movu m3, [r2 + 32] 322 movu m4, [r2 + 48] 323 psadbw m1, [r0] 324 psadbw m2, [r0 + 16] 325 psadbw m3, [r0 + 32] 326 psadbw m4, [r0 + 48] 327 paddd m1, m2 328 paddd m3, m4 329 paddd m0, m1 330 paddd m0, m3 331 lea r2, [r2 + r3] 332 lea r0, [r0 + r1] 333 334 movu m1, [r2] 335 movu m2, [r2 + 16] 336 movu m3, [r2 + 32] 337 movu m4, [r2 + 48] 338 psadbw m1, [r0] 339 psadbw m2, [r0 + 16] 340 psadbw m3, [r0 + 32] 341 psadbw m4, [r0 + 48] 342 paddd m1, m2 343 paddd m3, m4 344 paddd m0, m1 345 paddd m0, m3 346 lea r2, [r2 + r3] 347 lea r0, [r0 + r1] 348 349 movu m1, [r2] 350 movu m2, [r2 + 16] 351 movu m3, [r2 + 32] 352 movu m4, [r2 + 48] 353 psadbw m1, [r0] 354 psadbw m2, [r0 + 16] 355 psadbw m3, [r0 + 32] 356 psadbw m4, [r0 + 48] 357 paddd m1, m2 358 paddd m3, m4 359 paddd m0, m1 360 paddd m0, m3 361 lea r2, [r2 + r3] 362 lea r0, [r0 + r1] 363 364 movu m1, [r2] 365 movu m2, [r2 + 16] 366 movu m3, [r2 + 32] 367 movu m4, [r2 + 48] 368 psadbw m1, [r0] 369 psadbw m2, [r0 + 16] 370 psadbw m3, [r0 + 32] 371 psadbw m4, [r0 + 48] 372 paddd m1, m2 373 paddd m3, m4 374 paddd m0, m1 375 paddd m0, m3 376 lea r2, [r2 + r3] 377 lea r0, [r0 + r1] 378%endmacro 379 380%macro SAD_W16 0 381;----------------------------------------------------------------------------- 382; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t ) 383;----------------------------------------------------------------------------- 384cglobal pixel_sad_16x16, 4,4,8 385 movu m0, [r2] 386 movu m1, [r2+r3] 387 lea r2, [r2+2*r3] 388 movu m2, [r2] 389 movu m3, [r2+r3] 390 lea r2, [r2+2*r3] 391 psadbw m0, [r0] 392 psadbw m1, [r0+r1] 393 lea r0, [r0+2*r1] 394 movu m4, [r2] 395 paddw m0, m1 396 psadbw m2, [r0] 397 psadbw m3, [r0+r1] 398 lea r0, [r0+2*r1] 399 movu m5, [r2+r3] 400 lea r2, [r2+2*r3] 401 paddw m2, m3 402 movu m6, [r2] 403 movu m7, [r2+r3] 404 lea r2, [r2+2*r3] 405 paddw m0, m2 406 psadbw m4, [r0] 407 psadbw m5, [r0+r1] 408 lea r0, [r0+2*r1] 409 movu m1, [r2] 410 paddw m4, m5 411 psadbw m6, [r0] 412 psadbw m7, [r0+r1] 413 lea r0, [r0+2*r1] 414 movu m2, [r2+r3] 415 lea r2, [r2+2*r3] 416 paddw m6, m7 417 movu m3, [r2] 418 paddw m0, m4 419 movu m4, [r2+r3] 420 lea r2, [r2+2*r3] 421 paddw m0, m6 422 psadbw m1, [r0] 423 psadbw m2, [r0+r1] 424 lea r0, [r0+2*r1] 425 movu m5, [r2] 426 paddw m1, m2 427 psadbw m3, [r0] 428 psadbw m4, [r0+r1] 429 lea r0, [r0+2*r1] 430 movu m6, [r2+r3] 431 lea r2, [r2+2*r3] 432 paddw m3, m4 433 movu m7, [r2] 434 paddw m0, m1 435 movu m1, [r2+r3] 436 paddw m0, m3 437 psadbw m5, [r0] 438 psadbw m6, [r0+r1] 439 lea r0, [r0+2*r1] 440 paddw m5, m6 441 psadbw m7, [r0] 442 psadbw m1, [r0+r1] 443 paddw m7, m1 444 paddw m0, m5 445 paddw m0, m7 446 SAD_END_SSE2 447 448;----------------------------------------------------------------------------- 449; int pixel_sad_16x8( uint8_t *, intptr_t, uint8_t *, intptr_t ) 450;----------------------------------------------------------------------------- 451cglobal pixel_sad_16x8, 4,4 452 movu m0, [r2] 453 movu m2, [r2+r3] 454 lea r2, [r2+2*r3] 455 movu m3, [r2] 456 movu m4, [r2+r3] 457 psadbw m0, [r0] 458 psadbw m2, [r0+r1] 459 lea r0, [r0+2*r1] 460 psadbw m3, [r0] 461 psadbw m4, [r0+r1] 462 lea r0, [r0+2*r1] 463 lea r2, [r2+2*r3] 464 paddw m0, m2 465 paddw m3, m4 466 paddw m0, m3 467 movu m1, [r2] 468 movu m2, [r2+r3] 469 lea r2, [r2+2*r3] 470 movu m3, [r2] 471 movu m4, [r2+r3] 472 psadbw m1, [r0] 473 psadbw m2, [r0+r1] 474 lea r0, [r0+2*r1] 475 psadbw m3, [r0] 476 psadbw m4, [r0+r1] 477 lea r0, [r0+2*r1] 478 lea r2, [r2+2*r3] 479 paddw m1, m2 480 paddw m3, m4 481 paddw m0, m1 482 paddw m0, m3 483 SAD_END_SSE2 484 485;----------------------------------------------------------------------------- 486; int pixel_sad_16x12( uint8_t *, intptr_t, uint8_t *, intptr_t ) 487;----------------------------------------------------------------------------- 488cglobal pixel_sad_16x12, 4,4,3 489 pxor m0, m0 490 491 PROCESS_SAD_16x4 492 PROCESS_SAD_16x4 493 PROCESS_SAD_16x4 494 495 movhlps m1, m0 496 paddd m0, m1 497 movd eax, m0 498 RET 499 500;----------------------------------------------------------------------------- 501; int pixel_sad_16x32( uint8_t *, intptr_t, uint8_t *, intptr_t ) 502;----------------------------------------------------------------------------- 503cglobal pixel_sad_16x32, 4,5,3 504 pxor m0, m0 505 mov r4d, 4 506.loop: 507 PROCESS_SAD_16x4 508 PROCESS_SAD_16x4 509 dec r4d 510 jnz .loop 511 512 movhlps m1, m0 513 paddd m0, m1 514 movd eax, m0 515 RET 516 517;----------------------------------------------------------------------------- 518; int pixel_sad_16x64( uint8_t *, intptr_t, uint8_t *, intptr_t ) 519;----------------------------------------------------------------------------- 520cglobal pixel_sad_16x64, 4,5,3 521 pxor m0, m0 522 mov r4d, 8 523.loop: 524 PROCESS_SAD_16x4 525 PROCESS_SAD_16x4 526 dec r4d 527 jnz .loop 528 529 movhlps m1, m0 530 paddd m0, m1 531 movd eax, m0 532 RET 533 534;----------------------------------------------------------------------------- 535; int pixel_sad_16x4( uint8_t *, intptr_t, uint8_t *, intptr_t ) 536;----------------------------------------------------------------------------- 537cglobal pixel_sad_16x4, 4,4,3 538 539 movu m0, [r2] 540 movu m1, [r2 + r3] 541 psadbw m0, [r0] 542 psadbw m1, [r0 + r1] 543 paddd m0, m1 544 lea r2, [r2 + 2 * r3] 545 lea r0, [r0 + 2 * r1] 546 movu m1, [r2] 547 movu m2, [r2 + r3] 548 psadbw m1, [r0] 549 psadbw m2, [r0 + r1] 550 paddd m1, m2 551 paddd m0, m1 552 553 movhlps m1, m0 554 paddd m0, m1 555 movd eax, m0 556 RET 557 558;----------------------------------------------------------------------------- 559; int pixel_sad_32x8( uint8_t *, intptr_t, uint8_t *, intptr_t ) 560;----------------------------------------------------------------------------- 561cglobal pixel_sad_32x8, 4,4,3 562 pxor m0, m0 563 564 PROCESS_SAD_32x4 565 PROCESS_SAD_32x4 566 567 movhlps m1, m0 568 paddd m0, m1 569 movd eax, m0 570 RET 571 572;----------------------------------------------------------------------------- 573; int pixel_sad_32x24( uint8_t *, intptr_t, uint8_t *, intptr_t ) 574;----------------------------------------------------------------------------- 575cglobal pixel_sad_32x24, 4,5,3 576 pxor m0, m0 577 mov r4d, 3 578.loop: 579 PROCESS_SAD_32x4 580 PROCESS_SAD_32x4 581 dec r4d 582 jnz .loop 583 584 movhlps m1, m0 585 paddd m0, m1 586 movd eax, m0 587 RET 588 589;----------------------------------------------------------------------------- 590; int pixel_sad_32x32( uint8_t *, intptr_t, uint8_t *, intptr_t ) 591;----------------------------------------------------------------------------- 592cglobal pixel_sad_32x32, 4,5,3 593 pxor m0, m0 594 mov r4d, 4 595.loop: 596 PROCESS_SAD_32x4 597 PROCESS_SAD_32x4 598 dec r4d 599 jnz .loop 600 601 movhlps m1, m0 602 paddd m0, m1 603 movd eax, m0 604 RET 605 606;----------------------------------------------------------------------------- 607; int pixel_sad_32x16( uint8_t *, intptr_t, uint8_t *, intptr_t ) 608;----------------------------------------------------------------------------- 609cglobal pixel_sad_32x16, 4,4,3 610 pxor m0, m0 611 612 PROCESS_SAD_32x4 613 PROCESS_SAD_32x4 614 PROCESS_SAD_32x4 615 PROCESS_SAD_32x4 616 617 movhlps m1, m0 618 paddd m0, m1 619 movd eax, m0 620 RET 621 622;----------------------------------------------------------------------------- 623; int pixel_sad_32x64( uint8_t *, intptr_t, uint8_t *, intptr_t ) 624;----------------------------------------------------------------------------- 625cglobal pixel_sad_32x64, 4,5,3 626 pxor m0, m0 627 mov r4d, 8 628.loop: 629 PROCESS_SAD_32x4 630 PROCESS_SAD_32x4 631 dec r4d 632 jnz .loop 633 634 movhlps m1, m0 635 paddd m0, m1 636 movd eax, m0 637 RET 638 639;----------------------------------------------------------------------------- 640; int pixel_sad_8x32( uint8_t *, intptr_t, uint8_t *, intptr_t ) 641;----------------------------------------------------------------------------- 642cglobal pixel_sad_8x32, 4,5,3 643 pxor m0, m0 644 mov r4d, 4 645.loop: 646 PROCESS_SAD_8x4 647 PROCESS_SAD_8x4 648 dec r4d 649 jnz .loop 650 651 movhlps m1, m0 652 paddd m0, m1 653 movd eax, m0 654 RET 655 656;----------------------------------------------------------------------------- 657; int pixel_sad_64x16( uint8_t *, intptr_t, uint8_t *, intptr_t ) 658;----------------------------------------------------------------------------- 659cglobal pixel_sad_64x16, 4,4,5 660 pxor m0, m0 661 662 PROCESS_SAD_64x4 663 PROCESS_SAD_64x4 664 PROCESS_SAD_64x4 665 PROCESS_SAD_64x4 666 667 movhlps m1, m0 668 paddd m0, m1 669 movd eax, m0 670 RET 671 672;----------------------------------------------------------------------------- 673; int pixel_sad_64x32( uint8_t *, intptr_t, uint8_t *, intptr_t ) 674;----------------------------------------------------------------------------- 675cglobal pixel_sad_64x32, 4,5,5 676 pxor m0, m0 677 mov r4, 4 678 679.loop: 680 PROCESS_SAD_64x4 681 PROCESS_SAD_64x4 682 683 dec r4 684 jnz .loop 685 686 movhlps m1, m0 687 paddd m0, m1 688 movd eax, m0 689 RET 690 691;----------------------------------------------------------------------------- 692; int pixel_sad_64x48( uint8_t *, intptr_t, uint8_t *, intptr_t ) 693;----------------------------------------------------------------------------- 694cglobal pixel_sad_64x48, 4,5,5 695 pxor m0, m0 696 mov r4, 6 697 698.loop: 699 PROCESS_SAD_64x4 700 PROCESS_SAD_64x4 701 dec r4d 702 jnz .loop 703 704 movhlps m1, m0 705 paddd m0, m1 706 movd eax, m0 707 RET 708 709;----------------------------------------------------------------------------- 710; int pixel_sad_64x64( uint8_t *, intptr_t, uint8_t *, intptr_t ) 711;----------------------------------------------------------------------------- 712cglobal pixel_sad_64x64, 4,5,5 713 pxor m0, m0 714 mov r4, 8 715 716.loop: 717 PROCESS_SAD_64x4 718 PROCESS_SAD_64x4 719 dec r4 720 jnz .loop 721 722 movhlps m1, m0 723 paddd m0, m1 724 movd eax, m0 725 RET 726 727;----------------------------------------------------------------------------- 728; int pixel_sad_48x64( uint8_t *, intptr_t, uint8_t *, intptr_t ) 729;----------------------------------------------------------------------------- 730cglobal pixel_sad_48x64, 4,5,5 731 pxor m0, m0 732 mov r4, 64 733 734.loop: 735 PROCESS_SAD_48x4 736 lea r2, [r2 + r3] 737 lea r0, [r0 + r1] 738 739 PROCESS_SAD_48x4 740 lea r2, [r2 + r3] 741 lea r0, [r0 + r1] 742 743 sub r4, 8 744 cmp r4, 8 745 746jnz .loop 747 PROCESS_SAD_48x4 748 lea r2, [r2 + r3] 749 lea r0, [r0 + r1] 750 PROCESS_SAD_48x4 751 752 movhlps m1, m0 753 paddd m0, m1 754 movd eax, m0 755 RET 756 757;----------------------------------------------------------------------------- 758; int pixel_sad_24x32( uint8_t *, intptr_t, uint8_t *, intptr_t ) 759;----------------------------------------------------------------------------- 760cglobal pixel_sad_24x32, 4,5,4 761 pxor m0, m0 762 mov r4, 32 763 764.loop: 765 PROCESS_SAD_24x4 766 lea r2, [r2 + r3] 767 lea r0, [r0 + r1] 768 PROCESS_SAD_24x4 769 lea r2, [r2 + r3] 770 lea r0, [r0 + r1] 771 sub r4, 8 772 cmp r4, 8 773jnz .loop 774 PROCESS_SAD_24x4 775 lea r2, [r2 + r3] 776 lea r0, [r0 + r1] 777 PROCESS_SAD_24x4 778 779 movhlps m1, m0 780 paddd m0, m1 781 movd eax, m0 782 RET 783 784;----------------------------------------------------------------------------- 785; int pixel_sad_12x16( uint8_t *, intptr_t, uint8_t *, intptr_t ) 786;----------------------------------------------------------------------------- 787cglobal pixel_sad_12x16, 4,4,4 788 mova m4, [MSK] 789 pxor m0, m0 790 791 PROCESS_SAD_12x4 792 lea r2, [r2 + r3] 793 lea r0, [r0 + r1] 794 PROCESS_SAD_12x4 795 lea r2, [r2 + r3] 796 lea r0, [r0 + r1] 797 PROCESS_SAD_12x4 798 lea r2, [r2 + r3] 799 lea r0, [r0 + r1] 800 PROCESS_SAD_12x4 801 802 movhlps m1, m0 803 paddd m0, m1 804 movd eax, m0 805 RET 806 807%endmacro 808 809INIT_XMM sse2 810SAD_W16 811INIT_XMM sse3 812SAD_W16 813INIT_XMM sse2, aligned 814SAD_W16 815 816%macro SAD_INC_4x8P_SSE 1 817 movq m1, [r0] 818 movq m2, [r0+r1] 819 lea r0, [r0+2*r1] 820 movq m3, [r2] 821 movq m4, [r2+r3] 822 lea r2, [r2+2*r3] 823 movhps m1, [r0] 824 movhps m2, [r0+r1] 825 movhps m3, [r2] 826 movhps m4, [r2+r3] 827 lea r0, [r0+2*r1] 828 psadbw m1, m3 829 psadbw m2, m4 830 lea r2, [r2+2*r3] 831 ACCUM paddw, 0, 1, %1 832 paddw m0, m2 833%endmacro 834 835INIT_XMM 836;Even on Nehalem, no sizes other than 8x16 benefit from this method. 837cglobal pixel_sad_8x16_sse2, 4,4 838 SAD_INC_4x8P_SSE 0 839 SAD_INC_4x8P_SSE 1 840 SAD_INC_4x8P_SSE 1 841 SAD_INC_4x8P_SSE 1 842 SAD_END_SSE2 843 RET 844 845;============================================================================= 846; SAD x3/x4 MMX 847;============================================================================= 848 849%macro SAD_X3_START_1x8P 0 850 movq mm3, [r0] 851 movq mm0, [r1] 852 movq mm1, [r2] 853 movq mm2, [r3] 854 psadbw mm0, mm3 855 psadbw mm1, mm3 856 psadbw mm2, mm3 857%endmacro 858 859%macro SAD_X3_1x8P 2 860 movq mm3, [r0+%1] 861 movq mm4, [r1+%2] 862 movq mm5, [r2+%2] 863 movq mm6, [r3+%2] 864 psadbw mm4, mm3 865 psadbw mm5, mm3 866 psadbw mm6, mm3 867 paddw mm0, mm4 868 paddw mm1, mm5 869 paddw mm2, mm6 870%endmacro 871 872%macro SAD_X3_START_2x4P 3 873 movd mm3, [r0] 874 movd %1, [r1] 875 movd %2, [r2] 876 movd %3, [r3] 877 punpckldq mm3, [r0+FENC_STRIDE] 878 punpckldq %1, [r1+r4] 879 punpckldq %2, [r2+r4] 880 punpckldq %3, [r3+r4] 881 psadbw %1, mm3 882 psadbw %2, mm3 883 psadbw %3, mm3 884%endmacro 885 886%macro SAD_X3_2x16P 1 887%if %1 888 SAD_X3_START_1x8P 889%else 890 SAD_X3_1x8P 0, 0 891%endif 892 SAD_X3_1x8P 8, 8 893 SAD_X3_1x8P FENC_STRIDE, r4 894 SAD_X3_1x8P FENC_STRIDE+8, r4+8 895 add r0, 2*FENC_STRIDE 896 lea r1, [r1+2*r4] 897 lea r2, [r2+2*r4] 898 lea r3, [r3+2*r4] 899%endmacro 900 901%macro SAD_X3_2x8P 1 902%if %1 903 SAD_X3_START_1x8P 904%else 905 SAD_X3_1x8P 0, 0 906%endif 907 SAD_X3_1x8P FENC_STRIDE, r4 908 add r0, 2*FENC_STRIDE 909 lea r1, [r1+2*r4] 910 lea r2, [r2+2*r4] 911 lea r3, [r3+2*r4] 912%endmacro 913 914%macro SAD_X3_2x4P 1 915%if %1 916 SAD_X3_START_2x4P mm0, mm1, mm2 917%else 918 SAD_X3_START_2x4P mm4, mm5, mm6 919 paddw mm0, mm4 920 paddw mm1, mm5 921 paddw mm2, mm6 922%endif 923 add r0, 2*FENC_STRIDE 924 lea r1, [r1+2*r4] 925 lea r2, [r2+2*r4] 926 lea r3, [r3+2*r4] 927%endmacro 928 929%macro SAD_X4_START_1x8P 0 930 movq mm7, [r0] 931 movq mm0, [r1] 932 movq mm1, [r2] 933 movq mm2, [r3] 934 movq mm3, [r4] 935 psadbw mm0, mm7 936 psadbw mm1, mm7 937 psadbw mm2, mm7 938 psadbw mm3, mm7 939%endmacro 940 941%macro SAD_X4_1x8P 2 942 movq mm7, [r0+%1] 943 movq mm4, [r1+%2] 944 movq mm5, [r2+%2] 945 movq mm6, [r3+%2] 946 psadbw mm4, mm7 947 psadbw mm5, mm7 948 psadbw mm6, mm7 949 psadbw mm7, [r4+%2] 950 paddw mm0, mm4 951 paddw mm1, mm5 952 paddw mm2, mm6 953 paddw mm3, mm7 954%endmacro 955 956%macro SAD_X4_START_2x4P 0 957 movd mm7, [r0] 958 movd mm0, [r1] 959 movd mm1, [r2] 960 movd mm2, [r3] 961 movd mm3, [r4] 962 punpckldq mm7, [r0+FENC_STRIDE] 963 punpckldq mm0, [r1+r5] 964 punpckldq mm1, [r2+r5] 965 punpckldq mm2, [r3+r5] 966 punpckldq mm3, [r4+r5] 967 psadbw mm0, mm7 968 psadbw mm1, mm7 969 psadbw mm2, mm7 970 psadbw mm3, mm7 971%endmacro 972 973%macro SAD_X4_INC_2x4P 0 974 movd mm7, [r0] 975 movd mm4, [r1] 976 movd mm5, [r2] 977 punpckldq mm7, [r0+FENC_STRIDE] 978 punpckldq mm4, [r1+r5] 979 punpckldq mm5, [r2+r5] 980 psadbw mm4, mm7 981 psadbw mm5, mm7 982 paddw mm0, mm4 983 paddw mm1, mm5 984 movd mm4, [r3] 985 movd mm5, [r4] 986 punpckldq mm4, [r3+r5] 987 punpckldq mm5, [r4+r5] 988 psadbw mm4, mm7 989 psadbw mm5, mm7 990 paddw mm2, mm4 991 paddw mm3, mm5 992%endmacro 993 994%macro SAD_X4_2x16P 1 995%if %1 996 SAD_X4_START_1x8P 997%else 998 SAD_X4_1x8P 0, 0 999%endif 1000 SAD_X4_1x8P 8, 8 1001 SAD_X4_1x8P FENC_STRIDE, r5 1002 SAD_X4_1x8P FENC_STRIDE+8, r5+8 1003 add r0, 2*FENC_STRIDE 1004 lea r1, [r1+2*r5] 1005 lea r2, [r2+2*r5] 1006 lea r3, [r3+2*r5] 1007 lea r4, [r4+2*r5] 1008%endmacro 1009 1010%macro SAD_X4_2x8P 1 1011%if %1 1012 SAD_X4_START_1x8P 1013%else 1014 SAD_X4_1x8P 0, 0 1015%endif 1016 SAD_X4_1x8P FENC_STRIDE, r5 1017 add r0, 2*FENC_STRIDE 1018 lea r1, [r1+2*r5] 1019 lea r2, [r2+2*r5] 1020 lea r3, [r3+2*r5] 1021 lea r4, [r4+2*r5] 1022%endmacro 1023 1024%macro SAD_X4_2x4P 1 1025%if %1 1026 SAD_X4_START_2x4P 1027%else 1028 SAD_X4_INC_2x4P 1029%endif 1030 add r0, 2*FENC_STRIDE 1031 lea r1, [r1+2*r5] 1032 lea r2, [r2+2*r5] 1033 lea r3, [r3+2*r5] 1034 lea r4, [r4+2*r5] 1035%endmacro 1036 1037%macro SAD_X3_END 0 1038%if UNIX64 1039 movd [r5+0], mm0 1040 movd [r5+4], mm1 1041 movd [r5+8], mm2 1042%else 1043 mov r0, r5mp 1044 movd [r0+0], mm0 1045 movd [r0+4], mm1 1046 movd [r0+8], mm2 1047%endif 1048 RET 1049%endmacro 1050 1051%macro SAD_X4_END 0 1052 mov r0, r6mp 1053 movd [r0+0], mm0 1054 movd [r0+4], mm1 1055 movd [r0+8], mm2 1056 movd [r0+12], mm3 1057 RET 1058%endmacro 1059 1060%macro SAD_X3_12x4 0 1061 mova m3, [r0] 1062 movu m5, [r1] 1063 pand m3, m4 1064 pand m5, m4 1065 psadbw m5, m3 1066 paddd m0, m5 1067 movu m5, [r2] 1068 pand m5, m4 1069 psadbw m5, m3 1070 paddd m1, m5 1071 movu m5, [r3] 1072 pand m5, m4 1073 psadbw m5, m3 1074 paddd m2, m5 1075 mova m3, [r0 + FENC_STRIDE] 1076 movu m5, [r1 + r4] 1077 pand m3, m4 1078 pand m5, m4 1079 psadbw m5, m3 1080 paddd m0, m5 1081 movu m5, [r2 + r4] 1082 pand m5, m4 1083 psadbw m5, m3 1084 paddd m1, m5 1085 movu m5, [r3 + r4] 1086 pand m5, m4 1087 psadbw m5, m3 1088 paddd m2, m5 1089 mova m3, [r0 + FENC_STRIDE * 2] 1090 movu m5, [r1 + r4 * 2] 1091 pand m3, m4 1092 pand m5, m4 1093 psadbw m5, m3 1094 paddd m0, m5 1095 movu m5, [r2 + r4 * 2] 1096 pand m5, m4 1097 psadbw m5, m3 1098 paddd m1, m5 1099 movu m5, [r3 + r4 * 2] 1100 pand m5, m4 1101 psadbw m5, m3 1102 paddd m2, m5 1103 lea r1, [r1 + r4 * 2] 1104 lea r2, [r2 + r4 * 2] 1105 lea r3, [r3 + r4 * 2] 1106 mova m3, [r0 + FENC_STRIDE + FENC_STRIDE * 2] 1107 movu m5, [r1 + r4] 1108 pand m3, m4 1109 pand m5, m4 1110 psadbw m5, m3 1111 paddd m0, m5 1112 movu m5, [r2 + r4] 1113 pand m5, m4 1114 psadbw m5, m3 1115 paddd m1, m5 1116 movu m5, [r3 + r4] 1117 pand m5, m4 1118 psadbw m5, m3 1119 paddd m2, m5 1120 lea r0, [r0 + FENC_STRIDE * 4] 1121 lea r1, [r1 + r4 * 2] 1122 lea r2, [r2 + r4 * 2] 1123 lea r3, [r3 + r4 * 2] 1124%endmacro 1125 1126%macro SAD_X4_12x4 0 1127 mova m4, [r0] 1128 movu m5, [r1] 1129 pand m4, m6 1130 pand m5, m6 1131 psadbw m5, m4 1132 paddd m0, m5 1133 movu m5, [r2] 1134 pand m5, m6 1135 psadbw m5, m4 1136 paddd m1, m5 1137 movu m5, [r3] 1138 pand m5, m6 1139 psadbw m5, m4 1140 paddd m2, m5 1141 movu m5, [r4] 1142 pand m5, m6 1143 psadbw m5, m4 1144 paddd m3, m5 1145 mova m4, [r0 + FENC_STRIDE] 1146 movu m5, [r1 + r5] 1147 pand m4, m6 1148 pand m5, m6 1149 psadbw m5, m4 1150 paddd m0, m5 1151 movu m5, [r2 + r5] 1152 pand m5, m6 1153 psadbw m5, m4 1154 paddd m1, m5 1155 movu m5, [r3 + r5] 1156 pand m5, m6 1157 psadbw m5, m4 1158 paddd m2, m5 1159 movu m5, [r4 + r5] 1160 pand m5, m6 1161 psadbw m5, m4 1162 paddd m3, m5 1163 mova m4, [r0 + FENC_STRIDE * 2] 1164 movu m5, [r1 + r5 * 2] 1165 pand m4, m6 1166 pand m5, m6 1167 psadbw m5, m4 1168 paddd m0, m5 1169 movu m5, [r2 + r5 * 2] 1170 pand m5, m6 1171 psadbw m5, m4 1172 paddd m1, m5 1173 movu m5, [r3 + r5 * 2] 1174 pand m5, m6 1175 psadbw m5, m4 1176 paddd m2, m5 1177 movu m5, [r4 + r5 * 2] 1178 pand m5, m6 1179 psadbw m5, m4 1180 paddd m3, m5 1181 lea r1, [r1 + r5 * 2] 1182 lea r2, [r2 + r5 * 2] 1183 lea r3, [r3 + r5 * 2] 1184 lea r4, [r4 + r5 * 2] 1185 mova m4, [r0 + FENC_STRIDE + FENC_STRIDE * 2] 1186 movu m5, [r1 + r5] 1187 pand m4, m6 1188 pand m5, m6 1189 psadbw m5, m4 1190 paddd m0, m5 1191 movu m5, [r2 + r5] 1192 pand m5, m6 1193 psadbw m5, m4 1194 paddd m1, m5 1195 movu m5, [r3 + r5] 1196 pand m5, m6 1197 psadbw m5, m4 1198 paddd m2, m5 1199 movu m5, [r4 + r5] 1200 pand m5, m6 1201 psadbw m5, m4 1202 paddd m3, m5 1203 lea r0, [r0 + FENC_STRIDE * 4] 1204 lea r1, [r1 + r5 * 2] 1205 lea r2, [r2 + r5 * 2] 1206 lea r3, [r3 + r5 * 2] 1207 lea r4, [r4 + r5 * 2] 1208%endmacro 1209 1210%macro SAD_X3_24x4 0 1211 mova m3, [r0] 1212 mova m4, [r0 + 16] 1213 movu m5, [r1] 1214 movu m6, [r1 + 16] 1215 psadbw m5, m3 1216 psadbw m6, m4 1217 pshufd m6, m6, 84 1218 paddd m5, m6 1219 paddd m0, m5 1220 movu m5, [r2] 1221 movu m6, [r2 + 16] 1222 psadbw m5, m3 1223 psadbw m6, m4 1224 pshufd m6, m6, 84 1225 paddd m5, m6 1226 paddd m1, m5 1227 movu m5, [r3] 1228 movu m6, [r3 + 16] 1229 psadbw m5, m3 1230 psadbw m6, m4 1231 pshufd m6, m6, 84 1232 paddd m5, m6 1233 paddd m2, m5 1234 1235 mova m3, [r0 + FENC_STRIDE] 1236 mova m4, [r0 + 16 + FENC_STRIDE] 1237 movu m5, [r1 + r4] 1238 movu m6, [r1 + 16 + r4] 1239 psadbw m5, m3 1240 psadbw m6, m4 1241 pshufd m6, m6, 84 1242 paddd m5, m6 1243 paddd m0, m5 1244 movu m5, [r2 + r4] 1245 movu m6, [r2 + 16 + r4] 1246 psadbw m5, m3 1247 psadbw m6, m4 1248 pshufd m6, m6, 84 1249 paddd m5, m6 1250 paddd m1, m5 1251 movu m5, [r3 + r4] 1252 movu m6, [r3 + 16 + r4] 1253 psadbw m5, m3 1254 psadbw m6, m4 1255 pshufd m6, m6, 84 1256 paddd m5, m6 1257 paddd m2, m5 1258 1259 mova m3, [r0 + FENC_STRIDE * 2] 1260 mova m4, [r0 + 16 + FENC_STRIDE * 2] 1261 movu m5, [r1 + r4 * 2] 1262 movu m6, [r1 + 16 + r4 * 2] 1263 psadbw m5, m3 1264 psadbw m6, m4 1265 pshufd m6, m6, 84 1266 paddd m5, m6 1267 paddd m0, m5 1268 movu m5, [r2 + r4 * 2] 1269 movu m6, [r2 + 16 + r4 * 2] 1270 psadbw m5, m3 1271 psadbw m6, m4 1272 pshufd m6, m6, 84 1273 paddd m5, m6 1274 paddd m1, m5 1275 movu m5, [r3 + r4 * 2] 1276 movu m6, [r3 + 16 + r4 * 2] 1277 psadbw m5, m3 1278 psadbw m6, m4 1279 pshufd m6, m6, 84 1280 paddd m5, m6 1281 paddd m2, m5 1282 lea r0, [r0 + FENC_STRIDE * 2] 1283 lea r1, [r1 + r4 * 2] 1284 lea r2, [r2 + r4 * 2] 1285 lea r3, [r3 + r4 * 2] 1286 1287 mova m3, [r0 + FENC_STRIDE] 1288 mova m4, [r0 + 16 + FENC_STRIDE] 1289 movu m5, [r1 + r4] 1290 movu m6, [r1 + 16 + r4] 1291 psadbw m5, m3 1292 psadbw m6, m4 1293 pshufd m6, m6, 84 1294 paddd m5, m6 1295 paddd m0, m5 1296 movu m5, [r2 + r4] 1297 movu m6, [r2 + 16 + r4] 1298 psadbw m5, m3 1299 psadbw m6, m4 1300 pshufd m6, m6, 84 1301 paddd m5, m6 1302 paddd m1, m5 1303 movu m5, [r3 + r4] 1304 movu m6, [r3 + 16 + r4] 1305 psadbw m5, m3 1306 psadbw m6, m4 1307 pshufd m6, m6, 84 1308 paddd m5, m6 1309 paddd m2, m5 1310 lea r0, [r0 + FENC_STRIDE * 2] 1311 lea r1, [r1 + r4 * 2] 1312 lea r2, [r2 + r4 * 2] 1313 lea r3, [r3 + r4 * 2] 1314%endmacro 1315 1316%macro SAD_X4_24x4 0 1317 mova m4, [r0] 1318 mova m5, [r0 + 16] 1319 movu m6, [r1] 1320 movu m7, [r1 + 16] 1321 psadbw m6, m4 1322 psadbw m7, m5 1323 pshufd m7, m7, 84 1324 paddd m6, m7 1325 paddd m0, m6 1326 movu m6, [r2] 1327 movu m7, [r2 + 16] 1328 psadbw m6, m4 1329 psadbw m7, m5 1330 pshufd m7, m7, 84 1331 paddd m6, m7 1332 paddd m1, m6 1333 movu m6, [r3] 1334 movu m7, [r3 + 16] 1335 psadbw m6, m4 1336 psadbw m7, m5 1337 pshufd m7, m7, 84 1338 paddd m6, m7 1339 paddd m2, m6 1340 movu m6, [r4] 1341 movu m7, [r4 + 16] 1342 psadbw m6, m4 1343 psadbw m7, m5 1344 pshufd m7, m7, 84 1345 paddd m6, m7 1346 paddd m3, m6 1347 1348 mova m4, [r0 + FENC_STRIDE] 1349 mova m5, [r0 + 16 + FENC_STRIDE] 1350 movu m6, [r1 + r5] 1351 movu m7, [r1 + 16 + r5] 1352 psadbw m6, m4 1353 psadbw m7, m5 1354 pshufd m7, m7, 84 1355 paddd m6, m7 1356 paddd m0, m6 1357 movu m6, [r2 + r5] 1358 movu m7, [r2 + 16 + r5] 1359 psadbw m6, m4 1360 psadbw m7, m5 1361 pshufd m7, m7, 84 1362 paddd m6, m7 1363 paddd m1, m6 1364 movu m6, [r3 + r5] 1365 movu m7, [r3 + 16 + r5] 1366 psadbw m6, m4 1367 psadbw m7, m5 1368 pshufd m7, m7, 84 1369 paddd m6, m7 1370 paddd m2, m6 1371 movu m6, [r4 + r5] 1372 movu m7, [r4 + 16 + r5] 1373 psadbw m6, m4 1374 psadbw m7, m5 1375 pshufd m7, m7, 84 1376 paddd m6, m7 1377 paddd m3, m6 1378 1379 mova m4, [r0 + FENC_STRIDE * 2] 1380 mova m5, [r0 + 16 + FENC_STRIDE * 2] 1381 movu m6, [r1 + r5 * 2] 1382 movu m7, [r1 + 16 + r5 * 2] 1383 psadbw m6, m4 1384 psadbw m7, m5 1385 pshufd m7, m7, 84 1386 paddd m6, m7 1387 paddd m0, m6 1388 movu m6, [r2 + r5 * 2] 1389 movu m7, [r2 + 16 + r5 * 2] 1390 psadbw m6, m4 1391 psadbw m7, m5 1392 pshufd m7, m7, 84 1393 paddd m6, m7 1394 paddd m1, m6 1395 movu m6, [r3 + r5 * 2] 1396 movu m7, [r3 + 16 + r5 * 2] 1397 psadbw m6, m4 1398 psadbw m7, m5 1399 pshufd m7, m7, 84 1400 paddd m6, m7 1401 paddd m2, m6 1402 movu m6, [r4 + r5 * 2] 1403 movu m7, [r4 + 16 + r5 * 2] 1404 psadbw m6, m4 1405 psadbw m7, m5 1406 pshufd m7, m7, 84 1407 paddd m6, m7 1408 paddd m3, m6 1409 lea r0, [r0 + FENC_STRIDE * 2] 1410 lea r1, [r1 + r5 * 2] 1411 lea r2, [r2 + r5 * 2] 1412 lea r3, [r3 + r5 * 2] 1413 lea r4, [r4 + r5 * 2] 1414 mova m4, [r0 + FENC_STRIDE] 1415 mova m5, [r0 + 16 + FENC_STRIDE] 1416 movu m6, [r1 + r5] 1417 movu m7, [r1 + 16 + r5] 1418 psadbw m6, m4 1419 psadbw m7, m5 1420 pshufd m7, m7, 84 1421 paddd m6, m7 1422 paddd m0, m6 1423 movu m6, [r2 + r5] 1424 movu m7, [r2 + 16 + r5] 1425 psadbw m6, m4 1426 psadbw m7, m5 1427 pshufd m7, m7, 84 1428 paddd m6, m7 1429 paddd m1, m6 1430 movu m6, [r3 + r5] 1431 movu m7, [r3 + 16 + r5] 1432 psadbw m6, m4 1433 psadbw m7, m5 1434 pshufd m7, m7, 84 1435 paddd m6, m7 1436 paddd m2, m6 1437 movu m6, [r4 + r5] 1438 movu m7, [r4 + 16 + r5] 1439 psadbw m6, m4 1440 psadbw m7, m5 1441 pshufd m7, m7, 84 1442 paddd m6, m7 1443 paddd m3, m6 1444 lea r0, [r0 + FENC_STRIDE * 2] 1445 lea r1, [r1 + r5 * 2] 1446 lea r2, [r2 + r5 * 2] 1447 lea r3, [r3 + r5 * 2] 1448 lea r4, [r4 + r5 * 2] 1449%endmacro 1450 1451%macro SAD_X3_32x4 0 1452 mova m3, [r0] 1453 mova m4, [r0 + 16] 1454 movu m5, [r1] 1455 movu m6, [r1 + 16] 1456 psadbw m5, m3 1457 psadbw m6, m4 1458 paddd m5, m6 1459 paddd m0, m5 1460 movu m5, [r2] 1461 movu m6, [r2 + 16] 1462 psadbw m5, m3 1463 psadbw m6, m4 1464 paddd m5, m6 1465 paddd m1, m5 1466 movu m5, [r3] 1467 movu m6, [r3 + 16] 1468 psadbw m5, m3 1469 psadbw m6, m4 1470 paddd m5, m6 1471 paddd m2, m5 1472 lea r0, [r0 + FENC_STRIDE] 1473 lea r1, [r1 + r4] 1474 lea r2, [r2 + r4] 1475 lea r3, [r3 + r4] 1476 mova m3, [r0] 1477 mova m4, [r0 + 16] 1478 movu m5, [r1] 1479 movu m6, [r1 + 16] 1480 psadbw m5, m3 1481 psadbw m6, m4 1482 paddd m5, m6 1483 paddd m0, m5 1484 movu m5, [r2] 1485 movu m6, [r2 + 16] 1486 psadbw m5, m3 1487 psadbw m6, m4 1488 paddd m5, m6 1489 paddd m1, m5 1490 movu m5, [r3] 1491 movu m6, [r3 + 16] 1492 psadbw m5, m3 1493 psadbw m6, m4 1494 paddd m5, m6 1495 paddd m2, m5 1496 lea r0, [r0 + FENC_STRIDE] 1497 lea r1, [r1 + r4] 1498 lea r2, [r2 + r4] 1499 lea r3, [r3 + r4] 1500 mova m3, [r0] 1501 mova m4, [r0 + 16] 1502 movu m5, [r1] 1503 movu m6, [r1 + 16] 1504 psadbw m5, m3 1505 psadbw m6, m4 1506 paddd m5, m6 1507 paddd m0, m5 1508 movu m5, [r2] 1509 movu m6, [r2 + 16] 1510 psadbw m5, m3 1511 psadbw m6, m4 1512 paddd m5, m6 1513 paddd m1, m5 1514 movu m5, [r3] 1515 movu m6, [r3 + 16] 1516 psadbw m5, m3 1517 psadbw m6, m4 1518 paddd m5, m6 1519 paddd m2, m5 1520 lea r0, [r0 + FENC_STRIDE] 1521 lea r1, [r1 + r4] 1522 lea r2, [r2 + r4] 1523 lea r3, [r3 + r4] 1524 mova m3, [r0] 1525 mova m4, [r0 + 16] 1526 movu m5, [r1] 1527 movu m6, [r1 + 16] 1528 psadbw m5, m3 1529 psadbw m6, m4 1530 paddd m5, m6 1531 paddd m0, m5 1532 movu m5, [r2] 1533 movu m6, [r2 + 16] 1534 psadbw m5, m3 1535 psadbw m6, m4 1536 paddd m5, m6 1537 paddd m1, m5 1538 movu m5, [r3] 1539 movu m6, [r3 + 16] 1540 psadbw m5, m3 1541 psadbw m6, m4 1542 paddd m5, m6 1543 paddd m2, m5 1544 lea r0, [r0 + FENC_STRIDE] 1545 lea r1, [r1 + r4] 1546 lea r2, [r2 + r4] 1547 lea r3, [r3 + r4] 1548%endmacro 1549 1550%macro SAD_X4_32x4 0 1551 mova m4, [r0] 1552 mova m5, [r0 + 16] 1553 movu m6, [r1] 1554 movu m7, [r1 + 16] 1555 psadbw m6, m4 1556 psadbw m7, m5 1557 paddd m6, m7 1558 paddd m0, m6 1559 movu m6, [r2] 1560 movu m7, [r2 + 16] 1561 psadbw m6, m4 1562 psadbw m7, m5 1563 paddd m6, m7 1564 paddd m1, m6 1565 movu m6, [r3] 1566 movu m7, [r3 + 16] 1567 psadbw m6, m4 1568 psadbw m7, m5 1569 paddd m6, m7 1570 paddd m2, m6 1571 movu m6, [r4] 1572 movu m7, [r4 + 16] 1573 psadbw m6, m4 1574 psadbw m7, m5 1575 paddd m6, m7 1576 paddd m3, m6 1577 lea r0, [r0 + FENC_STRIDE] 1578 lea r1, [r1 + r5] 1579 lea r2, [r2 + r5] 1580 lea r3, [r3 + r5] 1581 lea r4, [r4 + r5] 1582 mova m4, [r0] 1583 mova m5, [r0 + 16] 1584 movu m6, [r1] 1585 movu m7, [r1 + 16] 1586 psadbw m6, m4 1587 psadbw m7, m5 1588 paddd m6, m7 1589 paddd m0, m6 1590 movu m6, [r2] 1591 movu m7, [r2 + 16] 1592 psadbw m6, m4 1593 psadbw m7, m5 1594 paddd m6, m7 1595 paddd m1, m6 1596 movu m6, [r3] 1597 movu m7, [r3 + 16] 1598 psadbw m6, m4 1599 psadbw m7, m5 1600 paddd m6, m7 1601 paddd m2, m6 1602 movu m6, [r4] 1603 movu m7, [r4 + 16] 1604 psadbw m6, m4 1605 psadbw m7, m5 1606 paddd m6, m7 1607 paddd m3, m6 1608 lea r0, [r0 + FENC_STRIDE] 1609 lea r1, [r1 + r5] 1610 lea r2, [r2 + r5] 1611 lea r3, [r3 + r5] 1612 lea r4, [r4 + r5] 1613 mova m4, [r0] 1614 mova m5, [r0 + 16] 1615 movu m6, [r1] 1616 movu m7, [r1 + 16] 1617 psadbw m6, m4 1618 psadbw m7, m5 1619 paddd m6, m7 1620 paddd m0, m6 1621 movu m6, [r2] 1622 movu m7, [r2 + 16] 1623 psadbw m6, m4 1624 psadbw m7, m5 1625 paddd m6, m7 1626 paddd m1, m6 1627 movu m6, [r3] 1628 movu m7, [r3 + 16] 1629 psadbw m6, m4 1630 psadbw m7, m5 1631 paddd m6, m7 1632 paddd m2, m6 1633 movu m6, [r4] 1634 movu m7, [r4 + 16] 1635 psadbw m6, m4 1636 psadbw m7, m5 1637 paddd m6, m7 1638 paddd m3, m6 1639 lea r0, [r0 + FENC_STRIDE] 1640 lea r1, [r1 + r5] 1641 lea r2, [r2 + r5] 1642 lea r3, [r3 + r5] 1643 lea r4, [r4 + r5] 1644 mova m4, [r0] 1645 mova m5, [r0 + 16] 1646 movu m6, [r1] 1647 movu m7, [r1 + 16] 1648 psadbw m6, m4 1649 psadbw m7, m5 1650 paddd m6, m7 1651 paddd m0, m6 1652 movu m6, [r2] 1653 movu m7, [r2 + 16] 1654 psadbw m6, m4 1655 psadbw m7, m5 1656 paddd m6, m7 1657 paddd m1, m6 1658 movu m6, [r3] 1659 movu m7, [r3 + 16] 1660 psadbw m6, m4 1661 psadbw m7, m5 1662 paddd m6, m7 1663 paddd m2, m6 1664 movu m6, [r4] 1665 movu m7, [r4 + 16] 1666 psadbw m6, m4 1667 psadbw m7, m5 1668 paddd m6, m7 1669 paddd m3, m6 1670 lea r0, [r0 + FENC_STRIDE] 1671 lea r1, [r1 + r5] 1672 lea r2, [r2 + r5] 1673 lea r3, [r3 + r5] 1674 lea r4, [r4 + r5] 1675%endmacro 1676 1677%macro SAD_X3_48x4 0 1678 mova m3, [r0] 1679 mova m4, [r0 + 16] 1680 mova m5, [r0 + 32] 1681 movu m6, [r1] 1682 psadbw m6, m3 1683 paddd m0, m6 1684 movu m6, [r1 + 16] 1685 psadbw m6, m4 1686 paddd m0, m6 1687 movu m6, [r1 + 32] 1688 psadbw m6, m5 1689 paddd m0, m6 1690 movu m6, [r2] 1691 psadbw m6, m3 1692 paddd m1, m6 1693 movu m6, [r2 + 16] 1694 psadbw m6, m4 1695 paddd m1, m6 1696 movu m6, [r2 + 32] 1697 psadbw m6, m5 1698 paddd m1, m6 1699 movu m6, [r3] 1700 psadbw m6, m3 1701 paddd m2, m6 1702 movu m6, [r3 + 16] 1703 psadbw m6, m4 1704 paddd m2, m6 1705 movu m6, [r3 + 32] 1706 psadbw m6, m5 1707 paddd m2, m6 1708 1709 mova m3, [r0 + FENC_STRIDE] 1710 mova m4, [r0 + 16 + FENC_STRIDE] 1711 mova m5, [r0 + 32 + FENC_STRIDE] 1712 movu m6, [r1 + r4] 1713 psadbw m6, m3 1714 paddd m0, m6 1715 movu m6, [r1 + 16 + r4] 1716 psadbw m6, m4 1717 paddd m0, m6 1718 movu m6, [r1 + 32 + r4] 1719 psadbw m6, m5 1720 paddd m0, m6 1721 movu m6, [r2 + r4] 1722 psadbw m6, m3 1723 paddd m1, m6 1724 movu m6, [r2 + 16 + r4] 1725 psadbw m6, m4 1726 paddd m1, m6 1727 movu m6, [r2 + 32 + r4] 1728 psadbw m6, m5 1729 paddd m1, m6 1730 movu m6, [r3 + r4] 1731 psadbw m6, m3 1732 paddd m2, m6 1733 movu m6, [r3 + 16 + r4] 1734 psadbw m6, m4 1735 paddd m2, m6 1736 movu m6, [r3 + 32 + r4] 1737 psadbw m6, m5 1738 paddd m2, m6 1739 1740 mova m3, [r0 + FENC_STRIDE * 2] 1741 mova m4, [r0 + 16 + FENC_STRIDE * 2] 1742 mova m5, [r0 + 32 + FENC_STRIDE * 2] 1743 movu m6, [r1 + r4 * 2] 1744 psadbw m6, m3 1745 paddd m0, m6 1746 movu m6, [r1 + 16 + r4 * 2] 1747 psadbw m6, m4 1748 paddd m0, m6 1749 movu m6, [r1 + 32 + r4 * 2] 1750 psadbw m6, m5 1751 paddd m0, m6 1752 movu m6, [r2 + r4 * 2] 1753 psadbw m6, m3 1754 paddd m1, m6 1755 movu m6, [r2 + 16 + r4 * 2] 1756 psadbw m6, m4 1757 paddd m1, m6 1758 movu m6, [r2 + 32 + r4 * 2] 1759 psadbw m6, m5 1760 paddd m1, m6 1761 movu m6, [r3 + r4 * 2] 1762 psadbw m6, m3 1763 paddd m2, m6 1764 movu m6, [r3 + 16 + r4 * 2] 1765 psadbw m6, m4 1766 paddd m2, m6 1767 movu m6, [r3 + 32 + r4 * 2] 1768 psadbw m6, m5 1769 paddd m2, m6 1770 1771 lea r0, [r0 + FENC_STRIDE * 2] 1772 lea r1, [r1 + r4 * 2] 1773 lea r2, [r2 + r4 * 2] 1774 lea r3, [r3 + r4 * 2] 1775 mova m3, [r0 + FENC_STRIDE] 1776 mova m4, [r0 + 16 + FENC_STRIDE] 1777 mova m5, [r0 + 32 + FENC_STRIDE] 1778 movu m6, [r1 + r4] 1779 psadbw m6, m3 1780 paddd m0, m6 1781 movu m6, [r1 + 16 + r4] 1782 psadbw m6, m4 1783 paddd m0, m6 1784 movu m6, [r1 + 32 + r4] 1785 psadbw m6, m5 1786 paddd m0, m6 1787 movu m6, [r2 + r4] 1788 psadbw m6, m3 1789 paddd m1, m6 1790 movu m6, [r2 + 16 + r4] 1791 psadbw m6, m4 1792 paddd m1, m6 1793 movu m6, [r2 + 32 + r4] 1794 psadbw m6, m5 1795 paddd m1, m6 1796 movu m6, [r3 + r4] 1797 psadbw m6, m3 1798 paddd m2, m6 1799 movu m6, [r3 + 16 + r4] 1800 psadbw m6, m4 1801 paddd m2, m6 1802 movu m6, [r3 + 32 + r4] 1803 psadbw m6, m5 1804 paddd m2, m6 1805 lea r0, [r0 + FENC_STRIDE * 2] 1806 lea r1, [r1 + r4 * 2] 1807 lea r2, [r2 + r4 * 2] 1808 lea r3, [r3 + r4 * 2] 1809%endmacro 1810 1811%macro SAD_X4_48x4 0 1812 mova m4, [r0] 1813 mova m5, [r0 + 16] 1814 mova m6, [r0 + 32] 1815 movu m7, [r1] 1816 psadbw m7, m4 1817 paddd m0, m7 1818 movu m7, [r1 + 16] 1819 psadbw m7, m5 1820 paddd m0, m7 1821 movu m7, [r1 + 32] 1822 psadbw m7, m6 1823 paddd m0, m7 1824 movu m7, [r2] 1825 psadbw m7, m4 1826 paddd m1, m7 1827 movu m7, [r2 + 16] 1828 psadbw m7, m5 1829 paddd m1, m7 1830 movu m7, [r2 + 32] 1831 psadbw m7, m6 1832 paddd m1, m7 1833 movu m7, [r3] 1834 psadbw m7, m4 1835 paddd m2, m7 1836 movu m7, [r3 + 16] 1837 psadbw m7, m5 1838 paddd m2, m7 1839 movu m7, [r3 + 32] 1840 psadbw m7, m6 1841 paddd m2, m7 1842 movu m7, [r4] 1843 psadbw m7, m4 1844 paddd m3, m7 1845 movu m7, [r4 + 16] 1846 psadbw m7, m5 1847 paddd m3, m7 1848 movu m7, [r4 + 32] 1849 psadbw m7, m6 1850 paddd m3, m7 1851 1852 mova m4, [r0 + FENC_STRIDE] 1853 mova m5, [r0 + 16 + FENC_STRIDE] 1854 mova m6, [r0 + 32 + FENC_STRIDE] 1855 movu m7, [r1 + r5] 1856 psadbw m7, m4 1857 paddd m0, m7 1858 movu m7, [r1 + 16 + r5] 1859 psadbw m7, m5 1860 paddd m0, m7 1861 movu m7, [r1 + 32 + r5] 1862 psadbw m7, m6 1863 paddd m0, m7 1864 movu m7, [r2 + r5] 1865 psadbw m7, m4 1866 paddd m1, m7 1867 movu m7, [r2 + 16 + r5] 1868 psadbw m7, m5 1869 paddd m1, m7 1870 movu m7, [r2 + 32 + r5] 1871 psadbw m7, m6 1872 paddd m1, m7 1873 movu m7, [r3 + r5] 1874 psadbw m7, m4 1875 paddd m2, m7 1876 movu m7, [r3 + 16 + r5] 1877 psadbw m7, m5 1878 paddd m2, m7 1879 movu m7, [r3 + 32 + r5] 1880 psadbw m7, m6 1881 paddd m2, m7 1882 movu m7, [r4 + r5] 1883 psadbw m7, m4 1884 paddd m3, m7 1885 movu m7, [r4 + 16 + r5] 1886 psadbw m7, m5 1887 paddd m3, m7 1888 movu m7, [r4 + 32 + r5] 1889 psadbw m7, m6 1890 paddd m3, m7 1891 1892 mova m4, [r0 + FENC_STRIDE * 2] 1893 mova m5, [r0 + 16 + FENC_STRIDE * 2] 1894 mova m6, [r0 + 32 + FENC_STRIDE * 2] 1895 movu m7, [r1 + r5 * 2] 1896 psadbw m7, m4 1897 paddd m0, m7 1898 movu m7, [r1 + 16 + r5 * 2] 1899 psadbw m7, m5 1900 paddd m0, m7 1901 movu m7, [r1 + 32 + r5 * 2] 1902 psadbw m7, m6 1903 paddd m0, m7 1904 movu m7, [r2 + r5 * 2] 1905 psadbw m7, m4 1906 paddd m1, m7 1907 movu m7, [r2 + 16 + r5 * 2] 1908 psadbw m7, m5 1909 paddd m1, m7 1910 movu m7, [r2 + 32 + r5 * 2] 1911 psadbw m7, m6 1912 paddd m1, m7 1913 movu m7, [r3 + r5 * 2] 1914 psadbw m7, m4 1915 paddd m2, m7 1916 movu m7, [r3 + 16 + r5 * 2] 1917 psadbw m7, m5 1918 paddd m2, m7 1919 movu m7, [r3 + 32 + r5 * 2] 1920 psadbw m7, m6 1921 paddd m2, m7 1922 movu m7, [r4 + r5 * 2] 1923 psadbw m7, m4 1924 paddd m3, m7 1925 movu m7, [r4 + 16 + r5 * 2] 1926 psadbw m7, m5 1927 paddd m3, m7 1928 movu m7, [r4 + 32 + r5 * 2] 1929 psadbw m7, m6 1930 paddd m3, m7 1931 1932 lea r0, [r0 + FENC_STRIDE * 2] 1933 lea r1, [r1 + r5 * 2] 1934 lea r2, [r2 + r5 * 2] 1935 lea r3, [r3 + r5 * 2] 1936 lea r4, [r4 + r5 * 2] 1937 mova m4, [r0 + FENC_STRIDE] 1938 mova m5, [r0 + 16 + FENC_STRIDE] 1939 mova m6, [r0 + 32 + FENC_STRIDE] 1940 movu m7, [r1 + r5] 1941 psadbw m7, m4 1942 paddd m0, m7 1943 movu m7, [r1 + 16 + r5] 1944 psadbw m7, m5 1945 paddd m0, m7 1946 movu m7, [r1 + 32 + r5] 1947 psadbw m7, m6 1948 paddd m0, m7 1949 movu m7, [r2 + r5] 1950 psadbw m7, m4 1951 paddd m1, m7 1952 movu m7, [r2 + 16 + r5] 1953 psadbw m7, m5 1954 paddd m1, m7 1955 movu m7, [r2 + 32 + r5] 1956 psadbw m7, m6 1957 paddd m1, m7 1958 movu m7, [r3 + r5] 1959 psadbw m7, m4 1960 paddd m2, m7 1961 movu m7, [r3 + 16 + r5] 1962 psadbw m7, m5 1963 paddd m2, m7 1964 movu m7, [r3 + 32 + r5] 1965 psadbw m7, m6 1966 paddd m2, m7 1967 movu m7, [r4 + r5] 1968 psadbw m7, m4 1969 paddd m3, m7 1970 movu m7, [r4 + 16 + r5] 1971 psadbw m7, m5 1972 paddd m3, m7 1973 movu m7, [r4 + 32 + r5] 1974 psadbw m7, m6 1975 paddd m3, m7 1976 lea r0, [r0 + FENC_STRIDE * 2] 1977 lea r1, [r1 + r5 * 2] 1978 lea r2, [r2 + r5 * 2] 1979 lea r3, [r3 + r5 * 2] 1980 lea r4, [r4 + r5 * 2] 1981%endmacro 1982 1983%macro SAD_X3_64x4 0 1984 mova m3, [r0] 1985 mova m4, [r0 + 16] 1986 movu m5, [r1] 1987 psadbw m5, m3 1988 paddd m0, m5 1989 movu m5, [r1 + 16] 1990 psadbw m5, m4 1991 paddd m0, m5 1992 movu m5, [r2] 1993 psadbw m5, m3 1994 paddd m1, m5 1995 movu m5, [r2 + 16] 1996 psadbw m5, m4 1997 paddd m1, m5 1998 movu m5, [r3] 1999 psadbw m5, m3 2000 paddd m2, m5 2001 movu m5, [r3 + 16] 2002 psadbw m5, m4 2003 paddd m2, m5 2004 mova m3, [r0 + 32] 2005 mova m4, [r0 + 48] 2006 movu m5, [r1 + 32] 2007 psadbw m5, m3 2008 paddd m0, m5 2009 movu m5, [r1 + 48] 2010 psadbw m5, m4 2011 paddd m0, m5 2012 movu m5, [r2 + 32] 2013 psadbw m5, m3 2014 paddd m1, m5 2015 movu m5, [r2 + 48] 2016 psadbw m5, m4 2017 paddd m1, m5 2018 movu m5, [r3 + 32] 2019 psadbw m5, m3 2020 paddd m2, m5 2021 movu m5, [r3 + 48] 2022 psadbw m5, m4 2023 paddd m2, m5 2024 2025 mova m3, [r0 + FENC_STRIDE] 2026 mova m4, [r0 + 16 + FENC_STRIDE] 2027 movu m5, [r1 + r4] 2028 psadbw m5, m3 2029 paddd m0, m5 2030 movu m5, [r1 + 16 + r4] 2031 psadbw m5, m4 2032 paddd m0, m5 2033 movu m5, [r2 + r4] 2034 psadbw m5, m3 2035 paddd m1, m5 2036 movu m5, [r2 + 16 + r4] 2037 psadbw m5, m4 2038 paddd m1, m5 2039 movu m5, [r3 + r4] 2040 psadbw m5, m3 2041 paddd m2, m5 2042 movu m5, [r3 + 16 + r4] 2043 psadbw m5, m4 2044 paddd m2, m5 2045 mova m3, [r0 + 32 + FENC_STRIDE] 2046 mova m4, [r0 + 48 + FENC_STRIDE] 2047 movu m5, [r1 + 32 + r4] 2048 psadbw m5, m3 2049 paddd m0, m5 2050 movu m5, [r1 + 48 + r4] 2051 psadbw m5, m4 2052 paddd m0, m5 2053 movu m5, [r2 + 32 + r4] 2054 psadbw m5, m3 2055 paddd m1, m5 2056 movu m5, [r2 + 48 + r4] 2057 psadbw m5, m4 2058 paddd m1, m5 2059 movu m5, [r3 + 32 + r4] 2060 psadbw m5, m3 2061 paddd m2, m5 2062 movu m5, [r3 + 48 + r4] 2063 psadbw m5, m4 2064 paddd m2, m5 2065 2066 mova m3, [r0 + FENC_STRIDE * 2] 2067 mova m4, [r0 + 16 + FENC_STRIDE * 2] 2068 movu m5, [r1 + r4 * 2] 2069 psadbw m5, m3 2070 paddd m0, m5 2071 movu m5, [r1 + 16 + r4 * 2] 2072 psadbw m5, m4 2073 paddd m0, m5 2074 movu m5, [r2 + r4 * 2] 2075 psadbw m5, m3 2076 paddd m1, m5 2077 movu m5, [r2 + 16 + r4 * 2] 2078 psadbw m5, m4 2079 paddd m1, m5 2080 movu m5, [r3 + r4 * 2] 2081 psadbw m5, m3 2082 paddd m2, m5 2083 movu m5, [r3 + 16 + r4 * 2] 2084 psadbw m5, m4 2085 paddd m2, m5 2086 mova m3, [r0 + 32 + FENC_STRIDE * 2] 2087 mova m4, [r0 + 48 + FENC_STRIDE * 2] 2088 movu m5, [r1 + 32 + r4 * 2] 2089 psadbw m5, m3 2090 paddd m0, m5 2091 movu m5, [r1 + 48 + r4 * 2] 2092 psadbw m5, m4 2093 paddd m0, m5 2094 movu m5, [r2 + 32 + r4 * 2] 2095 psadbw m5, m3 2096 paddd m1, m5 2097 movu m5, [r2 + 48 + r4 * 2] 2098 psadbw m5, m4 2099 paddd m1, m5 2100 movu m5, [r3 + 32 + r4 * 2] 2101 psadbw m5, m3 2102 paddd m2, m5 2103 movu m5, [r3 + 48 + r4 * 2] 2104 psadbw m5, m4 2105 paddd m2, m5 2106 2107 lea r0, [r0 + FENC_STRIDE * 2] 2108 lea r1, [r1 + r4 * 2] 2109 lea r2, [r2 + r4 * 2] 2110 lea r3, [r3 + r4 * 2] 2111 mova m3, [r0 + FENC_STRIDE] 2112 mova m4, [r0 + 16 + FENC_STRIDE] 2113 movu m5, [r1 + r4] 2114 psadbw m5, m3 2115 paddd m0, m5 2116 movu m5, [r1 + 16 + r4] 2117 psadbw m5, m4 2118 paddd m0, m5 2119 movu m5, [r2 + r4] 2120 psadbw m5, m3 2121 paddd m1, m5 2122 movu m5, [r2 + 16 + r4] 2123 psadbw m5, m4 2124 paddd m1, m5 2125 movu m5, [r3 + r4] 2126 psadbw m5, m3 2127 paddd m2, m5 2128 movu m5, [r3 + 16 + r4] 2129 psadbw m5, m4 2130 paddd m2, m5 2131 mova m3, [r0 + 32 + FENC_STRIDE] 2132 mova m4, [r0 + 48 + FENC_STRIDE] 2133 movu m5, [r1 + 32 + r4] 2134 psadbw m5, m3 2135 paddd m0, m5 2136 movu m5, [r1 + 48 + r4] 2137 psadbw m5, m4 2138 paddd m0, m5 2139 movu m5, [r2 + 32 + r4] 2140 psadbw m5, m3 2141 paddd m1, m5 2142 movu m5, [r2 + 48 + r4] 2143 psadbw m5, m4 2144 paddd m1, m5 2145 movu m5, [r3 + 32 + r4] 2146 psadbw m5, m3 2147 paddd m2, m5 2148 movu m5, [r3 + 48 + r4] 2149 psadbw m5, m4 2150 paddd m2, m5 2151 lea r0, [r0 + FENC_STRIDE * 2] 2152 lea r1, [r1 + r4 * 2] 2153 lea r2, [r2 + r4 * 2] 2154 lea r3, [r3 + r4 * 2] 2155%endmacro 2156 2157%macro SAD_X4_64x4 0 2158 mova m4, [r0] 2159 mova m5, [r0 + 16] 2160 movu m6, [r1] 2161 psadbw m6, m4 2162 paddd m0, m6 2163 movu m6, [r1 + 16] 2164 psadbw m6, m5 2165 paddd m0, m6 2166 movu m6, [r2] 2167 psadbw m6, m4 2168 paddd m1, m6 2169 movu m6, [r2 + 16] 2170 psadbw m6, m5 2171 paddd m1, m6 2172 movu m6, [r3] 2173 psadbw m6, m4 2174 paddd m2, m6 2175 movu m6, [r3 + 16] 2176 psadbw m6, m5 2177 paddd m2, m6 2178 movu m6, [r4] 2179 psadbw m6, m4 2180 paddd m3, m6 2181 movu m6, [r4 + 16] 2182 psadbw m6, m5 2183 paddd m3, m6 2184 mova m4, [r0 + 32] 2185 mova m5, [r0 + 48] 2186 movu m6, [r1 + 32] 2187 psadbw m6, m4 2188 paddd m0, m6 2189 movu m6, [r1 + 48] 2190 psadbw m6, m5 2191 paddd m0, m6 2192 movu m6, [r2 + 32] 2193 psadbw m6, m4 2194 paddd m1, m6 2195 movu m6, [r2 + 48] 2196 psadbw m6, m5 2197 paddd m1, m6 2198 movu m6, [r3 + 32] 2199 psadbw m6, m4 2200 paddd m2, m6 2201 movu m6, [r3 + 48] 2202 psadbw m6, m5 2203 paddd m2, m6 2204 movu m6, [r4 + 32] 2205 psadbw m6, m4 2206 paddd m3, m6 2207 movu m6, [r4 + 48] 2208 psadbw m6, m5 2209 paddd m3, m6 2210 2211 mova m4, [r0 + FENC_STRIDE] 2212 mova m5, [r0 + 16 + FENC_STRIDE] 2213 movu m6, [r1 + r5] 2214 psadbw m6, m4 2215 paddd m0, m6 2216 movu m6, [r1 + 16 + r5] 2217 psadbw m6, m5 2218 paddd m0, m6 2219 movu m6, [r2 + r5] 2220 psadbw m6, m4 2221 paddd m1, m6 2222 movu m6, [r2 + 16 + r5] 2223 psadbw m6, m5 2224 paddd m1, m6 2225 movu m6, [r3 + r5] 2226 psadbw m6, m4 2227 paddd m2, m6 2228 movu m6, [r3 + 16 + r5] 2229 psadbw m6, m5 2230 paddd m2, m6 2231 movu m6, [r4 + r5] 2232 psadbw m6, m4 2233 paddd m3, m6 2234 movu m6, [r4 + 16 + r5] 2235 psadbw m6, m5 2236 paddd m3, m6 2237 mova m4, [r0 + 32 + FENC_STRIDE] 2238 mova m5, [r0 + 48 + FENC_STRIDE] 2239 movu m6, [r1 + 32 + r5] 2240 psadbw m6, m4 2241 paddd m0, m6 2242 movu m6, [r1 + 48 + r5] 2243 psadbw m6, m5 2244 paddd m0, m6 2245 movu m6, [r2 + 32 + r5] 2246 psadbw m6, m4 2247 paddd m1, m6 2248 movu m6, [r2 + 48 + r5] 2249 psadbw m6, m5 2250 paddd m1, m6 2251 movu m6, [r3 + 32 + r5] 2252 psadbw m6, m4 2253 paddd m2, m6 2254 movu m6, [r3 + 48 + r5] 2255 psadbw m6, m5 2256 paddd m2, m6 2257 movu m6, [r4 + 32 + r5] 2258 psadbw m6, m4 2259 paddd m3, m6 2260 movu m6, [r4 + 48 + r5] 2261 psadbw m6, m5 2262 paddd m3, m6 2263 2264 mova m4, [r0 + FENC_STRIDE * 2] 2265 mova m5, [r0 + 16 + FENC_STRIDE * 2] 2266 movu m6, [r1 + r5 * 2] 2267 psadbw m6, m4 2268 paddd m0, m6 2269 movu m6, [r1 + 16 + r5 * 2] 2270 psadbw m6, m5 2271 paddd m0, m6 2272 movu m6, [r2 + r5 * 2] 2273 psadbw m6, m4 2274 paddd m1, m6 2275 movu m6, [r2 + 16 + r5 * 2] 2276 psadbw m6, m5 2277 paddd m1, m6 2278 movu m6, [r3 + r5 * 2] 2279 psadbw m6, m4 2280 paddd m2, m6 2281 movu m6, [r3 + 16 + r5 * 2] 2282 psadbw m6, m5 2283 paddd m2, m6 2284 movu m6, [r4 + r5 * 2] 2285 psadbw m6, m4 2286 paddd m3, m6 2287 movu m6, [r4 + 16 + r5 * 2] 2288 psadbw m6, m5 2289 paddd m3, m6 2290 mova m4, [r0 + 32 + FENC_STRIDE * 2] 2291 mova m5, [r0 + 48 + FENC_STRIDE * 2] 2292 movu m6, [r1 + 32 + r5 * 2] 2293 psadbw m6, m4 2294 paddd m0, m6 2295 movu m6, [r1 + 48 + r5 * 2] 2296 psadbw m6, m5 2297 paddd m0, m6 2298 movu m6, [r2 + 32 + r5 * 2] 2299 psadbw m6, m4 2300 paddd m1, m6 2301 movu m6, [r2 + 48 + r5 * 2] 2302 psadbw m6, m5 2303 paddd m1, m6 2304 movu m6, [r3 + 32 + r5 * 2] 2305 psadbw m6, m4 2306 paddd m2, m6 2307 movu m6, [r3 + 48 + r5 * 2] 2308 psadbw m6, m5 2309 paddd m2, m6 2310 movu m6, [r4 + 32 + r5 * 2] 2311 psadbw m6, m4 2312 paddd m3, m6 2313 movu m6, [r4 + 48 + r5 * 2] 2314 psadbw m6, m5 2315 paddd m3, m6 2316 2317 lea r0, [r0 + FENC_STRIDE * 2] 2318 lea r1, [r1 + r5 * 2] 2319 lea r2, [r2 + r5 * 2] 2320 lea r3, [r3 + r5 * 2] 2321 lea r4, [r4 + r5 * 2] 2322 mova m4, [r0 + FENC_STRIDE] 2323 mova m5, [r0 + 16 + FENC_STRIDE] 2324 movu m6, [r1 + r5] 2325 psadbw m6, m4 2326 paddd m0, m6 2327 movu m6, [r1 + 16 + r5] 2328 psadbw m6, m5 2329 paddd m0, m6 2330 movu m6, [r2 + r5] 2331 psadbw m6, m4 2332 paddd m1, m6 2333 movu m6, [r2 + 16 + r5] 2334 psadbw m6, m5 2335 paddd m1, m6 2336 movu m6, [r3 + r5] 2337 psadbw m6, m4 2338 paddd m2, m6 2339 movu m6, [r3 + 16 + r5] 2340 psadbw m6, m5 2341 paddd m2, m6 2342 movu m6, [r4 + r5] 2343 psadbw m6, m4 2344 paddd m3, m6 2345 movu m6, [r4 + 16 + r5] 2346 psadbw m6, m5 2347 paddd m3, m6 2348 mova m4, [r0 + 32 + FENC_STRIDE] 2349 mova m5, [r0 + 48 + FENC_STRIDE] 2350 movu m6, [r1 + 32 + r5] 2351 psadbw m6, m4 2352 paddd m0, m6 2353 movu m6, [r1 + 48 + r5] 2354 psadbw m6, m5 2355 paddd m0, m6 2356 movu m6, [r2 + 32 + r5] 2357 psadbw m6, m4 2358 paddd m1, m6 2359 movu m6, [r2 + 48 + r5] 2360 psadbw m6, m5 2361 paddd m1, m6 2362 movu m6, [r3 + 32 + r5] 2363 psadbw m6, m4 2364 paddd m2, m6 2365 movu m6, [r3 + 48 + r5] 2366 psadbw m6, m5 2367 paddd m2, m6 2368 movu m6, [r4 + 32 + r5] 2369 psadbw m6, m4 2370 paddd m3, m6 2371 movu m6, [r4 + 48 + r5] 2372 psadbw m6, m5 2373 paddd m3, m6 2374 lea r0, [r0 + FENC_STRIDE * 2] 2375 lea r1, [r1 + r5 * 2] 2376 lea r2, [r2 + r5 * 2] 2377 lea r3, [r3 + r5 * 2] 2378 lea r4, [r4 + r5 * 2] 2379%endmacro 2380 2381;----------------------------------------------------------------------------- 2382; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, 2383; uint8_t *pix2, intptr_t i_stride, int scores[3] ) 2384;----------------------------------------------------------------------------- 2385%macro SAD_X 3 2386cglobal pixel_sad_x%1_%2x%3_mmx2, %1+2, %1+2 2387 SAD_X%1_2x%2P 1 2388%rep %3/2-1 2389 SAD_X%1_2x%2P 0 2390%endrep 2391 SAD_X%1_END 2392%endmacro 2393 2394INIT_MMX 2395SAD_X 3, 16, 16 2396SAD_X 3, 16, 8 2397SAD_X 3, 8, 16 2398SAD_X 3, 8, 8 2399SAD_X 3, 8, 4 2400SAD_X 3, 4, 16 2401SAD_X 3, 4, 8 2402SAD_X 3, 4, 4 2403SAD_X 4, 16, 16 2404SAD_X 4, 16, 8 2405SAD_X 4, 8, 16 2406SAD_X 4, 8, 8 2407SAD_X 4, 8, 4 2408SAD_X 4, 4, 16 2409SAD_X 4, 4, 8 2410SAD_X 4, 4, 4 2411 2412 2413 2414;============================================================================= 2415; SAD x3/x4 XMM 2416;============================================================================= 2417 2418%macro SAD_X3_START_1x16P_SSE2 0 2419 mova m2, [r0] 2420%if cpuflag(avx) 2421 psadbw m0, m2, [r1] 2422 psadbw m1, m2, [r2] 2423 psadbw m2, [r3] 2424%else 2425 movu m0, [r1] 2426 movu m1, [r2] 2427 movu m3, [r3] 2428 psadbw m0, m2 2429 psadbw m1, m2 2430 psadbw m2, m3 2431%endif 2432%endmacro 2433 2434%macro SAD_X3_1x16P_SSE2 2 2435 mova m3, [r0+%1] 2436%if cpuflag(avx) 2437 psadbw m4, m3, [r1+%2] 2438 psadbw m5, m3, [r2+%2] 2439 psadbw m3, [r3+%2] 2440%else 2441 movu m4, [r1+%2] 2442 movu m5, [r2+%2] 2443 movu m6, [r3+%2] 2444 psadbw m4, m3 2445 psadbw m5, m3 2446 psadbw m3, m6 2447%endif 2448 paddd m0, m4 2449 paddd m1, m5 2450 paddd m2, m3 2451%endmacro 2452 2453%if ARCH_X86_64 2454 DECLARE_REG_TMP 6 2455%else 2456 DECLARE_REG_TMP 5 2457%endif 2458 2459%macro SAD_X3_4x16P_SSE2 2 2460%if %1==0 2461 lea t0, [r4*3] 2462 SAD_X3_START_1x16P_SSE2 2463%else 2464 SAD_X3_1x16P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0 2465%endif 2466 SAD_X3_1x16P_SSE2 FENC_STRIDE*(1+(%1&1)*4), r4*1 2467 SAD_X3_1x16P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2 2468 SAD_X3_1x16P_SSE2 FENC_STRIDE*(3+(%1&1)*4), t0 2469%if %1 != %2-1 2470%if (%1&1) != 0 2471 add r0, 8*FENC_STRIDE 2472%endif 2473 lea r1, [r1+4*r4] 2474 lea r2, [r2+4*r4] 2475 lea r3, [r3+4*r4] 2476%endif 2477%endmacro 2478 2479%macro SAD_X3_START_2x8P_SSE2 0 2480 movq m3, [r0] 2481 movq m0, [r1] 2482 movq m1, [r2] 2483 movq m2, [r3] 2484 movhps m3, [r0+FENC_STRIDE] 2485 movhps m0, [r1+r4] 2486 movhps m1, [r2+r4] 2487 movhps m2, [r3+r4] 2488 psadbw m0, m3 2489 psadbw m1, m3 2490 psadbw m2, m3 2491%endmacro 2492 2493%macro SAD_X3_2x8P_SSE2 4 2494 movq m6, [r0+%1] 2495 movq m3, [r1+%2] 2496 movq m4, [r2+%2] 2497 movq m5, [r3+%2] 2498 movhps m6, [r0+%3] 2499 movhps m3, [r1+%4] 2500 movhps m4, [r2+%4] 2501 movhps m5, [r3+%4] 2502 psadbw m3, m6 2503 psadbw m4, m6 2504 psadbw m5, m6 2505 paddd m0, m3 2506 paddd m1, m4 2507 paddd m2, m5 2508%endmacro 2509 2510%macro SAD_X4_START_2x8P_SSE2 0 2511 movq m4, [r0] 2512 movq m0, [r1] 2513 movq m1, [r2] 2514 movq m2, [r3] 2515 movq m3, [r4] 2516 movhps m4, [r0+FENC_STRIDE] 2517 movhps m0, [r1+r5] 2518 movhps m1, [r2+r5] 2519 movhps m2, [r3+r5] 2520 movhps m3, [r4+r5] 2521 psadbw m0, m4 2522 psadbw m1, m4 2523 psadbw m2, m4 2524 psadbw m3, m4 2525%endmacro 2526 2527%macro SAD_X4_2x8P_SSE2 4 2528 movq m6, [r0+%1] 2529 movq m4, [r1+%2] 2530 movq m5, [r2+%2] 2531 movhps m6, [r0+%3] 2532 movhps m4, [r1+%4] 2533 movhps m5, [r2+%4] 2534 psadbw m4, m6 2535 psadbw m5, m6 2536 paddd m0, m4 2537 paddd m1, m5 2538 movq m4, [r3+%2] 2539 movq m5, [r4+%2] 2540 movhps m4, [r3+%4] 2541 movhps m5, [r4+%4] 2542 psadbw m4, m6 2543 psadbw m5, m6 2544 paddd m2, m4 2545 paddd m3, m5 2546%endmacro 2547 2548%macro SAD_X4_START_1x16P_SSE2 0 2549 mova m3, [r0] 2550%if cpuflag(avx) 2551 psadbw m0, m3, [r1] 2552 psadbw m1, m3, [r2] 2553 psadbw m2, m3, [r3] 2554 psadbw m3, [r4] 2555%else 2556 movu m0, [r1] 2557 movu m1, [r2] 2558 movu m2, [r3] 2559 movu m4, [r4] 2560 psadbw m0, m3 2561 psadbw m1, m3 2562 psadbw m2, m3 2563 psadbw m3, m4 2564%endif 2565%endmacro 2566 2567%macro SAD_X4_1x16P_SSE2 2 2568 mova m6, [r0+%1] 2569%if cpuflag(avx) 2570 psadbw m4, m6, [r1+%2] 2571 psadbw m5, m6, [r2+%2] 2572%else 2573 movu m4, [r1+%2] 2574 movu m5, [r2+%2] 2575 psadbw m4, m6 2576 psadbw m5, m6 2577%endif 2578 paddd m0, m4 2579 paddd m1, m5 2580%if cpuflag(avx) 2581 psadbw m4, m6, [r3+%2] 2582 psadbw m5, m6, [r4+%2] 2583%else 2584 movu m4, [r3+%2] 2585 movu m5, [r4+%2] 2586 psadbw m4, m6 2587 psadbw m5, m6 2588%endif 2589 paddd m2, m4 2590 paddd m3, m5 2591%endmacro 2592 2593%macro SAD_X4_4x16P_SSE2 2 2594%if %1==0 2595 lea r6, [r5*3] 2596 SAD_X4_START_1x16P_SSE2 2597%else 2598 SAD_X4_1x16P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r5*0 2599%endif 2600 SAD_X4_1x16P_SSE2 FENC_STRIDE*(1+(%1&1)*4), r5*1 2601 SAD_X4_1x16P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r5*2 2602 SAD_X4_1x16P_SSE2 FENC_STRIDE*(3+(%1&1)*4), r6 2603%if %1 != %2-1 2604%if (%1&1) != 0 2605 add r0, 8*FENC_STRIDE 2606%endif 2607 lea r1, [r1+4*r5] 2608 lea r2, [r2+4*r5] 2609 lea r3, [r3+4*r5] 2610 lea r4, [r4+4*r5] 2611%endif 2612%endmacro 2613 2614%macro SAD_X3_4x8P_SSE2 2 2615%if %1==0 2616 lea t0, [r4*3] 2617 SAD_X3_START_2x8P_SSE2 2618%else 2619 SAD_X3_2x8P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0, FENC_STRIDE*(1+(%1&1)*4), r4*1 2620%endif 2621 SAD_X3_2x8P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2, FENC_STRIDE*(3+(%1&1)*4), t0 2622%if %1 != %2-1 2623%if (%1&1) != 0 2624 add r0, 8*FENC_STRIDE 2625%endif 2626 lea r1, [r1+4*r4] 2627 lea r2, [r2+4*r4] 2628 lea r3, [r3+4*r4] 2629%endif 2630%endmacro 2631 2632%macro SAD_X4_4x8P_SSE2 2 2633%if %1==0 2634 lea r6, [r5*3] 2635 SAD_X4_START_2x8P_SSE2 2636%else 2637 SAD_X4_2x8P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1 2638%endif 2639 SAD_X4_2x8P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6 2640%if %1 != %2-1 2641%if (%1&1) != 0 2642 add r0, 8*FENC_STRIDE 2643%endif 2644 lea r1, [r1+4*r5] 2645 lea r2, [r2+4*r5] 2646 lea r3, [r3+4*r5] 2647 lea r4, [r4+4*r5] 2648%endif 2649%endmacro 2650 2651%macro SAD_X3_END_SSE2 1 2652 movifnidn r5, r5mp 2653 movhlps m3, m0 2654 movhlps m4, m1 2655 movhlps m5, m2 2656 paddd m0, m3 2657 paddd m1, m4 2658 paddd m2, m5 2659 movd [r5+0], m0 2660 movd [r5+4], m1 2661 movd [r5+8], m2 2662 RET 2663%endmacro 2664 2665%macro SAD_X4_END_SSE2 1 2666 mov r0, r6mp 2667 psllq m1, 32 2668 psllq m3, 32 2669 paddd m0, m1 2670 paddd m2, m3 2671 movhlps m1, m0 2672 movhlps m3, m2 2673 paddd m0, m1 2674 paddd m2, m3 2675 movq [r0+0], m0 2676 movq [r0+8], m2 2677 RET 2678%endmacro 2679 2680%macro SAD_X3_START_2x16P_AVX2 0 2681 movu m3, [r0] ; assumes FENC_STRIDE == 16 2682 movu xm0, [r1] 2683 movu xm1, [r2] 2684 movu xm2, [r3] 2685 vinserti128 m0, m0, [r1+r4], 1 2686 vinserti128 m1, m1, [r2+r4], 1 2687 vinserti128 m2, m2, [r3+r4], 1 2688 psadbw m0, m3 2689 psadbw m1, m3 2690 psadbw m2, m3 2691%endmacro 2692 2693%macro SAD_X3_2x16P_AVX2 3 2694 movu m3, [r0+%1] ; assumes FENC_STRIDE == 16 2695 movu xm4, [r1+%2] 2696 movu xm5, [r2+%2] 2697 movu xm6, [r3+%2] 2698 vinserti128 m4, m4, [r1+%3], 1 2699 vinserti128 m5, m5, [r2+%3], 1 2700 vinserti128 m6, m6, [r3+%3], 1 2701 psadbw m4, m3 2702 psadbw m5, m3 2703 psadbw m6, m3 2704 paddw m0, m4 2705 paddw m1, m5 2706 paddw m2, m6 2707%endmacro 2708 2709%macro SAD_X3_4x16P_AVX2 2 2710%if %1==0 2711 lea t0, [r4*3] 2712 SAD_X3_START_2x16P_AVX2 2713%else 2714 SAD_X3_2x16P_AVX2 FENC_STRIDE*(0+(%1&1)*4), r4*0, r4*1 2715%endif 2716 SAD_X3_2x16P_AVX2 FENC_STRIDE*(2+(%1&1)*4), r4*2, t0 2717%if %1 != %2-1 2718%if (%1&1) != 0 2719 add r0, 8*FENC_STRIDE 2720%endif 2721 lea r1, [r1+4*r4] 2722 lea r2, [r2+4*r4] 2723 lea r3, [r3+4*r4] 2724%endif 2725%endmacro 2726 2727%macro SAD_X4_START_2x16P_AVX2 0 2728 vbroadcasti128 m4, [r0] 2729 vbroadcasti128 m5, [r0+FENC_STRIDE] 2730 movu xm0, [r1] 2731 movu xm1, [r2] 2732 movu xm2, [r1+r5] 2733 movu xm3, [r2+r5] 2734 vinserti128 m0, m0, [r3], 1 2735 vinserti128 m1, m1, [r4], 1 2736 vinserti128 m2, m2, [r3+r5], 1 2737 vinserti128 m3, m3, [r4+r5], 1 2738 psadbw m0, m4 2739 psadbw m1, m4 2740 psadbw m2, m5 2741 psadbw m3, m5 2742 paddw m0, m2 2743 paddw m1, m3 2744%endmacro 2745 2746%macro SAD_X4_2x16P_AVX2 4 2747 vbroadcasti128 m6, [r0+%1] 2748 vbroadcasti128 m7, [r0+%3] 2749 movu xm2, [r1+%2] 2750 movu xm3, [r2+%2] 2751 movu xm4, [r1+%4] 2752 movu xm5, [r2+%4] 2753 vinserti128 m2, m2, [r3+%2], 1 2754 vinserti128 m3, m3, [r4+%2], 1 2755 vinserti128 m4, m4, [r3+%4], 1 2756 vinserti128 m5, m5, [r4+%4], 1 2757 psadbw m2, m6 2758 psadbw m3, m6 2759 psadbw m4, m7 2760 psadbw m5, m7 2761 paddd m0, m2 2762 paddd m1, m3 2763 paddd m0, m4 2764 paddd m1, m5 2765%endmacro 2766 2767%macro SAD_X4_4x16P_AVX2 2 2768%if %1==0 2769 lea r6, [r5*3] 2770 SAD_X4_START_2x16P_AVX2 2771%else 2772 SAD_X4_2x16P_AVX2 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1 2773%endif 2774 SAD_X4_2x16P_AVX2 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6 2775%if %1 != %2-1 2776%if (%1&1) != 0 2777 add r0, 8*FENC_STRIDE 2778%endif 2779 lea r1, [r1+4*r5] 2780 lea r2, [r2+4*r5] 2781 lea r3, [r3+4*r5] 2782 lea r4, [r4+4*r5] 2783%endif 2784%endmacro 2785 2786%macro SAD_X4_START_2x32P_AVX2 0 2787 mova m4, [r0] 2788 movu m0, [r1] 2789 movu m2, [r2] 2790 movu m1, [r3] 2791 movu m3, [r4] 2792 psadbw m0, m4 2793 psadbw m2, m4 2794 psadbw m1, m4 2795 psadbw m3, m4 2796 packusdw m0, m2 2797 packusdw m1, m3 2798 2799 mova m6, [r0+FENC_STRIDE] 2800 movu m2, [r1+r5] 2801 movu m4, [r2+r5] 2802 movu m3, [r3+r5] 2803 movu m5, [r4+r5] 2804 psadbw m2, m6 2805 psadbw m4, m6 2806 psadbw m3, m6 2807 psadbw m5, m6 2808 packusdw m2, m4 2809 packusdw m3, m5 2810 paddd m0, m2 2811 paddd m1, m3 2812%endmacro 2813 2814%macro SAD_X4_2x32P_AVX2 4 2815 mova m6, [r0+%1] 2816 movu m2, [r1+%2] 2817 movu m4, [r2+%2] 2818 movu m3, [r3+%2] 2819 movu m5, [r4+%2] 2820 psadbw m2, m6 2821 psadbw m4, m6 2822 psadbw m3, m6 2823 psadbw m5, m6 2824 packusdw m2, m4 2825 packusdw m3, m5 2826 paddd m0, m2 2827 paddd m1, m3 2828 2829 mova m6, [r0+%3] 2830 movu m2, [r1+%4] 2831 movu m4, [r2+%4] 2832 movu m3, [r3+%4] 2833 movu m5, [r4+%4] 2834 psadbw m2, m6 2835 psadbw m4, m6 2836 psadbw m3, m6 2837 psadbw m5, m6 2838 packusdw m2, m4 2839 packusdw m3, m5 2840 paddd m0, m2 2841 paddd m1, m3 2842%endmacro 2843 2844%macro SAD_X4_4x32P_AVX2 2 2845%if %1==0 2846 lea r6, [r5*3] 2847 SAD_X4_START_2x32P_AVX2 2848%else 2849 SAD_X4_2x32P_AVX2 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1 2850%endif 2851 SAD_X4_2x32P_AVX2 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6 2852%if %1 != %2-1 2853%if (%1&1) != 0 2854 add r0, 8*FENC_STRIDE 2855%endif 2856 lea r1, [r1+4*r5] 2857 lea r2, [r2+4*r5] 2858 lea r3, [r3+4*r5] 2859 lea r4, [r4+4*r5] 2860%endif 2861%endmacro 2862 2863%macro SAD_X3_END_AVX2 0 2864 movifnidn r5, r5mp 2865 packssdw m0, m1 ; 0 0 1 1 0 0 1 1 2866 packssdw m2, m2 ; 2 2 _ _ 2 2 _ _ 2867 phaddd m0, m2 ; 0 1 2 _ 0 1 2 _ 2868 vextracti128 xm1, m0, 1 2869 paddd xm0, xm1 ; 0 1 2 _ 2870 mova [r5], xm0 2871 RET 2872%endmacro 2873 2874%macro SAD_X4_END_AVX2 0 2875 mov r0, r6mp 2876 pshufd m0, m0, 0x8 2877 pshufd m1, m1, 0x8 2878 vextracti128 xm2, m0, 1 2879 vextracti128 xm3, m1, 1 2880 punpcklqdq xm0, xm1 2881 punpcklqdq xm2, xm3 2882 phaddd xm0, xm2 ; 0 1 2 3 2883 mova [r0], xm0 2884 RET 2885%endmacro 2886 2887%macro SAD_X4_32P_END_AVX2 0 2888 mov r0, r6mp 2889 vextracti128 xm2, m0, 1 2890 vextracti128 xm3, m1, 1 2891 paddd xm0, xm2 2892 paddd xm1, xm3 2893 phaddd xm0, xm1 2894 mova [r0], xm0 2895 RET 2896%endmacro 2897 2898;----------------------------------------------------------------------------- 2899; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, 2900; uint8_t *pix2, intptr_t i_stride, int scores[3] ) 2901;----------------------------------------------------------------------------- 2902%macro SAD_X_SSE2 4 2903cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,%4 2904%assign x 0 2905%rep %3/4 2906 SAD_X%1_4x%2P_SSE2 x, %3/4 2907%assign x x+1 2908%endrep 2909%if %3 == 64 2910 SAD_X%1_END_SSE2 1 2911%else 2912 SAD_X%1_END_SSE2 0 2913%endif 2914%endmacro 2915 2916%macro SAD_X3_W12 0 2917cglobal pixel_sad_x3_12x16, 5, 7, 8 2918 mova m4, [MSK] 2919 pxor m0, m0 2920 pxor m1, m1 2921 pxor m2, m2 2922 2923 SAD_X3_12x4 2924 SAD_X3_12x4 2925 SAD_X3_12x4 2926 SAD_X3_12x4 2927 SAD_X3_END_SSE2 1 2928%endmacro 2929 2930%macro SAD_X4_W12 0 2931cglobal pixel_sad_x4_12x16, 6, 8, 8 2932 mova m6, [MSK] 2933 pxor m0, m0 2934 pxor m1, m1 2935 pxor m2, m2 2936 pxor m3, m3 2937 2938 SAD_X4_12x4 2939 SAD_X4_12x4 2940 SAD_X4_12x4 2941 SAD_X4_12x4 2942 SAD_X4_END_SSE2 1 2943%endmacro 2944 2945%macro SAD_X3_W24 0 2946cglobal pixel_sad_x3_24x32, 5, 7, 8 2947 pxor m0, m0 2948 pxor m1, m1 2949 pxor m2, m2 2950 mov r6, 32 2951 2952.loop: 2953 SAD_X3_24x4 2954 SAD_X3_24x4 2955 SAD_X3_24x4 2956 SAD_X3_24x4 2957 2958 sub r6, 16 2959 cmp r6, 0 2960jnz .loop 2961 SAD_X3_END_SSE2 1 2962%endmacro 2963 2964%macro SAD_X4_W24 0 2965%if ARCH_X86_64 == 1 2966cglobal pixel_sad_x4_24x32, 6, 8, 8 2967%define count r7 2968%else 2969cglobal pixel_sad_x4_24x32, 6, 7, 8, 0-4 2970%define count dword [rsp] 2971%endif 2972 pxor m0, m0 2973 pxor m1, m1 2974 pxor m2, m2 2975 pxor m3, m3 2976 mov count, 32 2977 2978.loop: 2979 SAD_X4_24x4 2980 SAD_X4_24x4 2981 SAD_X4_24x4 2982 SAD_X4_24x4 2983 2984 sub count, 16 2985 jnz .loop 2986 SAD_X4_END_SSE2 1 2987 2988%endmacro 2989 2990%macro SAD_X3_W32 0 2991cglobal pixel_sad_x3_32x8, 5, 6, 8 2992 pxor m0, m0 2993 pxor m1, m1 2994 pxor m2, m2 2995 2996 SAD_X3_32x4 2997 SAD_X3_32x4 2998 SAD_X3_END_SSE2 1 2999 3000cglobal pixel_sad_x3_32x16, 5, 6, 8 3001 pxor m0, m0 3002 pxor m1, m1 3003 pxor m2, m2 3004 3005 SAD_X3_32x4 3006 SAD_X3_32x4 3007 SAD_X3_32x4 3008 SAD_X3_32x4 3009 SAD_X3_END_SSE2 1 3010 3011cglobal pixel_sad_x3_32x24, 5, 6, 8 3012 pxor m0, m0 3013 pxor m1, m1 3014 pxor m2, m2 3015 3016 SAD_X3_32x4 3017 SAD_X3_32x4 3018 SAD_X3_32x4 3019 SAD_X3_32x4 3020 SAD_X3_32x4 3021 SAD_X3_32x4 3022 SAD_X3_END_SSE2 1 3023 3024cglobal pixel_sad_x3_32x32, 5, 7, 8 3025 pxor m0, m0 3026 pxor m1, m1 3027 pxor m2, m2 3028 mov r6, 32 3029 3030.loop: 3031 SAD_X3_32x4 3032 SAD_X3_32x4 3033 SAD_X3_32x4 3034 SAD_X3_32x4 3035 3036 sub r6, 16 3037 cmp r6, 0 3038jnz .loop 3039 SAD_X3_END_SSE2 1 3040 3041cglobal pixel_sad_x3_32x64, 5, 7, 8 3042 pxor m0, m0 3043 pxor m1, m1 3044 pxor m2, m2 3045 mov r6, 64 3046 3047.loop1: 3048 SAD_X3_32x4 3049 SAD_X3_32x4 3050 SAD_X3_32x4 3051 SAD_X3_32x4 3052 3053 sub r6, 16 3054 cmp r6, 0 3055jnz .loop1 3056 SAD_X3_END_SSE2 1 3057%endmacro 3058 3059%macro SAD_X4_W32 0 3060cglobal pixel_sad_x4_32x8, 6, 7, 8 3061 pxor m0, m0 3062 pxor m1, m1 3063 pxor m2, m2 3064 pxor m3, m3 3065 3066 SAD_X4_32x4 3067 SAD_X4_32x4 3068 SAD_X4_END_SSE2 1 3069 3070cglobal pixel_sad_x4_32x16, 6, 7, 8 3071 pxor m0, m0 3072 pxor m1, m1 3073 pxor m2, m2 3074 pxor m3, m3 3075 3076 SAD_X4_32x4 3077 SAD_X4_32x4 3078 SAD_X4_32x4 3079 SAD_X4_32x4 3080 SAD_X4_END_SSE2 1 3081 3082cglobal pixel_sad_x4_32x24, 6, 7, 8 3083 pxor m0, m0 3084 pxor m1, m1 3085 pxor m2, m2 3086 pxor m3, m3 3087 3088 SAD_X4_32x4 3089 SAD_X4_32x4 3090 SAD_X4_32x4 3091 SAD_X4_32x4 3092 SAD_X4_32x4 3093 SAD_X4_32x4 3094 SAD_X4_END_SSE2 1 3095 3096%if ARCH_X86_64 == 1 3097cglobal pixel_sad_x4_32x32, 6, 8, 8 3098%define count r7 3099%else 3100cglobal pixel_sad_x4_32x32, 6, 7, 8, 0-4 3101%define count dword [rsp] 3102%endif 3103 pxor m0, m0 3104 pxor m1, m1 3105 pxor m2, m2 3106 pxor m3, m3 3107 mov count, 32 3108 3109.loop: 3110 SAD_X4_32x4 3111 SAD_X4_32x4 3112 SAD_X4_32x4 3113 SAD_X4_32x4 3114 3115 sub count, 16 3116 jnz .loop 3117 SAD_X4_END_SSE2 1 3118 3119%if ARCH_X86_64 == 1 3120cglobal pixel_sad_x4_32x64, 6, 8, 8 3121%define count r7 3122%else 3123cglobal pixel_sad_x4_32x64, 6, 7, 8, 0-4 3124%define count dword [rsp] 3125%endif 3126 pxor m0, m0 3127 pxor m1, m1 3128 pxor m2, m2 3129 pxor m3, m3 3130 mov count, 64 3131 3132.loop: 3133 SAD_X4_32x4 3134 SAD_X4_32x4 3135 SAD_X4_32x4 3136 SAD_X4_32x4 3137 3138 sub count, 16 3139 jnz .loop 3140 SAD_X4_END_SSE2 1 3141 3142%endmacro 3143 3144%macro SAD_X3_W48 0 3145cglobal pixel_sad_x3_48x64, 5, 7, 8 3146 pxor m0, m0 3147 pxor m1, m1 3148 pxor m2, m2 3149 mov r6, 64 3150 3151.loop: 3152 SAD_X3_48x4 3153 SAD_X3_48x4 3154 SAD_X3_48x4 3155 SAD_X3_48x4 3156 3157 sub r6, 16 3158 jnz .loop 3159 SAD_X3_END_SSE2 1 3160%endmacro 3161 3162%macro SAD_X4_W48 0 3163%if ARCH_X86_64 == 1 3164cglobal pixel_sad_x4_48x64, 6, 8, 8 3165%define count r7 3166%else 3167cglobal pixel_sad_x4_48x64, 6, 7, 8, 0-4 3168%define count dword [rsp] 3169%endif 3170 pxor m0, m0 3171 pxor m1, m1 3172 pxor m2, m2 3173 pxor m3, m3 3174 mov count, 64 3175 3176.loop: 3177 SAD_X4_48x4 3178 SAD_X4_48x4 3179 SAD_X4_48x4 3180 SAD_X4_48x4 3181 3182 sub count, 16 3183 jnz .loop 3184 SAD_X4_END_SSE2 1 3185%endmacro 3186 3187%macro SAD_X3_W64 0 3188cglobal pixel_sad_x3_64x16, 5, 7, 7 3189 pxor m0, m0 3190 pxor m1, m1 3191 pxor m2, m2 3192 mov r6, 16 3193 3194.loop: 3195 SAD_X3_64x4 3196 SAD_X3_64x4 3197 3198 sub r6, 8 3199 jnz .loop 3200 SAD_X3_END_SSE2 1 3201 3202cglobal pixel_sad_x3_64x32, 5, 7, 7 3203 pxor m0, m0 3204 pxor m1, m1 3205 pxor m2, m2 3206 mov r6, 32 3207 3208.loop: 3209 SAD_X3_64x4 3210 SAD_X3_64x4 3211 3212 sub r6, 8 3213 jnz .loop 3214 SAD_X3_END_SSE2 1 3215 3216cglobal pixel_sad_x3_64x48, 5, 7, 7 3217 pxor m0, m0 3218 pxor m1, m1 3219 pxor m2, m2 3220 mov r6, 48 3221 3222.loop: 3223 SAD_X3_64x4 3224 SAD_X3_64x4 3225 3226 sub r6, 8 3227 jnz .loop 3228 SAD_X3_END_SSE2 1 3229 3230cglobal pixel_sad_x3_64x64, 5, 7, 7 3231 pxor m0, m0 3232 pxor m1, m1 3233 pxor m2, m2 3234 mov r6, 64 3235 3236.loop: 3237 SAD_X3_64x4 3238 SAD_X3_64x4 3239 3240 sub r6, 8 3241 jnz .loop 3242 SAD_X3_END_SSE2 1 3243%endmacro 3244 3245%macro SAD_X4_W64 0 3246%if ARCH_X86_64 == 1 3247cglobal pixel_sad_x4_64x16, 6, 8, 8 3248%define count r7 3249%else 3250cglobal pixel_sad_x4_64x16, 6, 7, 8, 0-4 3251%define count dword [rsp] 3252%endif 3253 pxor m0, m0 3254 pxor m1, m1 3255 pxor m2, m2 3256 pxor m3, m3 3257 mov count, 16 3258 3259.loop: 3260 SAD_X4_64x4 3261 SAD_X4_64x4 3262 3263 sub count, 8 3264 jnz .loop 3265 SAD_X4_END_SSE2 1 3266 3267%if ARCH_X86_64 == 1 3268cglobal pixel_sad_x4_64x32, 6, 8, 8 3269%define count r7 3270%else 3271cglobal pixel_sad_x4_64x32, 6, 7, 8, 0-4 3272%define count dword [rsp] 3273%endif 3274 pxor m0, m0 3275 pxor m1, m1 3276 pxor m2, m2 3277 pxor m3, m3 3278 mov count, 32 3279 3280.loop: 3281 SAD_X4_64x4 3282 SAD_X4_64x4 3283 3284 sub count, 8 3285 jnz .loop 3286 SAD_X4_END_SSE2 1 3287 3288%if ARCH_X86_64 == 1 3289cglobal pixel_sad_x4_64x48, 6, 8, 8 3290%define count r7 3291%else 3292cglobal pixel_sad_x4_64x48, 6, 7, 8, 0-4 3293%define count dword [rsp] 3294%endif 3295 pxor m0, m0 3296 pxor m1, m1 3297 pxor m2, m2 3298 pxor m3, m3 3299 mov count, 48 3300 3301.loop: 3302 SAD_X4_64x4 3303 SAD_X4_64x4 3304 3305 sub count, 8 3306 jnz .loop 3307 SAD_X4_END_SSE2 1 3308 3309%if ARCH_X86_64 == 1 3310cglobal pixel_sad_x4_64x64, 6, 8, 8 3311%define count r7 3312%else 3313cglobal pixel_sad_x4_64x64, 6, 7, 8, 0-4 3314%define count dword [rsp] 3315%endif 3316 pxor m0, m0 3317 pxor m1, m1 3318 pxor m2, m2 3319 pxor m3, m3 3320 mov count, 64 3321 3322.loop: 3323 SAD_X4_64x4 3324 SAD_X4_64x4 3325 3326 sub count, 8 3327 jnz .loop 3328 SAD_X4_END_SSE2 1 3329%endmacro 3330 3331INIT_XMM sse2 3332SAD_X_SSE2 3, 16, 16, 7 3333SAD_X_SSE2 3, 16, 8, 7 3334SAD_X_SSE2 3, 8, 16, 7 3335SAD_X_SSE2 3, 8, 8, 7 3336SAD_X_SSE2 3, 8, 4, 7 3337SAD_X_SSE2 4, 16, 16, 7 3338SAD_X_SSE2 4, 16, 8, 7 3339SAD_X_SSE2 4, 8, 16, 7 3340SAD_X_SSE2 4, 8, 8, 7 3341SAD_X_SSE2 4, 8, 4, 7 3342 3343INIT_XMM sse3 3344SAD_X_SSE2 3, 16, 16, 7 3345SAD_X_SSE2 3, 16, 8, 7 3346SAD_X_SSE2 3, 16, 4, 7 3347SAD_X_SSE2 4, 16, 16, 7 3348SAD_X_SSE2 4, 16, 8, 7 3349SAD_X_SSE2 4, 16, 4, 7 3350 3351INIT_XMM ssse3 3352SAD_X3_W12 3353SAD_X3_W32 3354SAD_X3_W24 3355SAD_X3_W48 3356SAD_X3_W64 3357SAD_X_SSE2 3, 16, 64, 7 3358SAD_X_SSE2 3, 16, 32, 7 3359SAD_X_SSE2 3, 16, 16, 7 3360SAD_X_SSE2 3, 16, 12, 7 3361SAD_X_SSE2 3, 16, 8, 7 3362SAD_X_SSE2 3, 8, 32, 7 3363SAD_X_SSE2 3, 8, 16, 7 3364SAD_X4_W12 3365SAD_X4_W24 3366SAD_X4_W32 3367SAD_X4_W48 3368SAD_X4_W64 3369SAD_X_SSE2 4, 16, 64, 7 3370SAD_X_SSE2 4, 16, 32, 7 3371SAD_X_SSE2 4, 16, 16, 7 3372SAD_X_SSE2 4, 16, 12, 7 3373SAD_X_SSE2 4, 16, 8, 7 3374SAD_X_SSE2 4, 8, 32, 7 3375SAD_X_SSE2 4, 8, 16, 7 3376SAD_X_SSE2 4, 8, 8, 7 3377SAD_X_SSE2 4, 8, 4, 7 3378 3379INIT_XMM avx 3380SAD_X3_W12 3381SAD_X3_W32 3382SAD_X3_W24 3383SAD_X3_W48 3384SAD_X3_W64 3385SAD_X_SSE2 3, 16, 64, 7 3386SAD_X_SSE2 3, 16, 32, 6 3387SAD_X_SSE2 3, 16, 16, 6 3388SAD_X_SSE2 3, 16, 12, 6 3389SAD_X_SSE2 3, 16, 8, 6 3390SAD_X_SSE2 3, 16, 4, 6 3391SAD_X4_W12 3392SAD_X4_W24 3393SAD_X4_W32 3394SAD_X4_W48 3395SAD_X4_W64 3396SAD_X_SSE2 4, 16, 64, 7 3397SAD_X_SSE2 4, 16, 32, 7 3398SAD_X_SSE2 4, 16, 16, 7 3399SAD_X_SSE2 4, 16, 12, 7 3400SAD_X_SSE2 4, 16, 8, 7 3401SAD_X_SSE2 4, 16, 4, 7 3402 3403%macro SAD_X_AVX2 4 3404cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,%4 3405%assign x 0 3406%rep %3/4 3407 SAD_X%1_4x%2P_AVX2 x, %3/4 3408%assign x x+1 3409%endrep 3410 3411 %if (%1==4) && (%2==32) 3412 SAD_X%1_32P_END_AVX2 3413 %else 3414 SAD_X%1_END_AVX2 3415 %endif 3416%endmacro 3417 3418INIT_YMM avx2 3419SAD_X_AVX2 3, 16, 32, 7 3420SAD_X_AVX2 3, 16, 16, 7 3421SAD_X_AVX2 3, 16, 12, 7 3422SAD_X_AVX2 3, 16, 8, 7 3423SAD_X_AVX2 4, 16, 32, 8 3424SAD_X_AVX2 4, 16, 16, 8 3425SAD_X_AVX2 4, 16, 12, 8 3426SAD_X_AVX2 4, 16, 8, 8 3427 3428SAD_X_AVX2 4, 32, 8, 8 3429SAD_X_AVX2 4, 32, 16, 8 3430SAD_X_AVX2 4, 32, 24, 8 3431SAD_X_AVX2 4, 32, 32, 8 3432SAD_X_AVX2 4, 32, 64, 8 3433 3434;============================================================================= 3435; SAD cacheline split 3436;============================================================================= 3437 3438; Core2 (Conroe) can load unaligned data just as quickly as aligned data... 3439; unless the unaligned data spans the border between 2 cachelines, in which 3440; case it's really slow. The exact numbers may differ, but all Intel cpus prior 3441; to Nehalem have a large penalty for cacheline splits. 3442; (8-byte alignment exactly half way between two cachelines is ok though.) 3443; LDDQU was supposed to fix this, but it only works on Pentium 4. 3444; So in the split case we load aligned data and explicitly perform the 3445; alignment between registers. Like on archs that have only aligned loads, 3446; except complicated by the fact that PALIGNR takes only an immediate, not 3447; a variable alignment. 3448; It is also possible to hoist the realignment to the macroblock level (keep 3449; 2 copies of the reference frame, offset by 32 bytes), but the extra memory 3450; needed for that method makes it often slower. 3451 3452; sad 16x16 costs on Core2: 3453; good offsets: 49 cycles (50/64 of all mvs) 3454; cacheline split: 234 cycles (14/64 of all mvs. ammortized: +40 cycles) 3455; page split: 3600 cycles (14/4096 of all mvs. ammortized: +11.5 cycles) 3456; cache or page split with palignr: 57 cycles (ammortized: +2 cycles) 3457 3458; computed jump assumes this loop is exactly 80 bytes 3459%macro SAD16_CACHELINE_LOOP_SSE2 1 ; alignment 3460ALIGN 16 3461sad_w16_align%1_sse2: 3462 movdqa xmm1, [r2+16] 3463 movdqa xmm2, [r2+r3+16] 3464 movdqa xmm3, [r2] 3465 movdqa xmm4, [r2+r3] 3466 pslldq xmm1, 16-%1 3467 pslldq xmm2, 16-%1 3468 psrldq xmm3, %1 3469 psrldq xmm4, %1 3470 por xmm1, xmm3 3471 por xmm2, xmm4 3472 psadbw xmm1, [r0] 3473 psadbw xmm2, [r0+r1] 3474 paddw xmm0, xmm1 3475 paddw xmm0, xmm2 3476 lea r0, [r0+2*r1] 3477 lea r2, [r2+2*r3] 3478 dec r4 3479 jg sad_w16_align%1_sse2 3480 ret 3481%endmacro 3482 3483; computed jump assumes this loop is exactly 64 bytes 3484%macro SAD16_CACHELINE_LOOP_SSSE3 1 ; alignment 3485ALIGN 16 3486sad_w16_align%1_ssse3: 3487 movdqa xmm1, [r2+16] 3488 movdqa xmm2, [r2+r3+16] 3489 palignr xmm1, [r2], %1 3490 palignr xmm2, [r2+r3], %1 3491 psadbw xmm1, [r0] 3492 psadbw xmm2, [r0+r1] 3493 paddw xmm0, xmm1 3494 paddw xmm0, xmm2 3495 lea r0, [r0+2*r1] 3496 lea r2, [r2+2*r3] 3497 dec r4 3498 jg sad_w16_align%1_ssse3 3499 ret 3500%endmacro 3501 3502%macro SAD16_CACHELINE_FUNC 2 ; cpu, height 3503cglobal pixel_sad_16x%2_cache64_%1 3504 mov eax, r2m 3505 and eax, 0x37 3506 cmp eax, 0x30 3507 jle pixel_sad_16x%2_sse2 3508 PROLOGUE 4,6 3509 mov r4d, r2d 3510 and r4d, 15 3511%ifidn %1, ssse3 3512 shl r4d, 6 ; code size = 64 3513%else 3514 lea r4, [r4*5] 3515 shl r4d, 4 ; code size = 80 3516%endif 3517%define sad_w16_addr (sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1)) 3518%ifdef PIC 3519 lea r5, [sad_w16_addr] 3520 add r5, r4 3521%else 3522 lea r5, [sad_w16_addr + r4] 3523%endif 3524 and r2, ~15 3525 mov r4d, %2/2 3526 pxor xmm0, xmm0 3527 call r5 3528 movhlps xmm1, xmm0 3529 paddw xmm0, xmm1 3530 movd eax, xmm0 3531 RET 3532%endmacro 3533 3534%macro SAD_CACHELINE_START_MMX2 4 ; width, height, iterations, cacheline 3535 mov eax, r2m 3536 and eax, 0x17|%1|(%4>>1) 3537 cmp eax, 0x10|%1|(%4>>1) 3538 jle pixel_sad_%1x%2_mmx2 3539 and eax, 7 3540 shl eax, 3 3541 movd mm6, [pd_64] 3542 movd mm7, eax 3543 psubw mm6, mm7 3544 PROLOGUE 4,5 3545 and r2, ~7 3546 mov r4d, %3 3547 pxor mm0, mm0 3548%endmacro 3549 3550%macro SAD16_CACHELINE_FUNC_MMX2 2 ; height, cacheline 3551cglobal pixel_sad_16x%1_cache%2_mmx2 3552 SAD_CACHELINE_START_MMX2 16, %1, %1, %2 3553.loop: 3554 movq mm1, [r2] 3555 movq mm2, [r2+8] 3556 movq mm3, [r2+16] 3557 movq mm4, mm2 3558 psrlq mm1, mm7 3559 psllq mm2, mm6 3560 psllq mm3, mm6 3561 psrlq mm4, mm7 3562 por mm1, mm2 3563 por mm3, mm4 3564 psadbw mm1, [r0] 3565 psadbw mm3, [r0+8] 3566 paddw mm0, mm1 3567 paddw mm0, mm3 3568 add r2, r3 3569 add r0, r1 3570 dec r4 3571 jg .loop 3572 movd eax, mm0 3573 RET 3574%endmacro 3575 3576%macro SAD8_CACHELINE_FUNC_MMX2 2 ; height, cacheline 3577cglobal pixel_sad_8x%1_cache%2_mmx2 3578 SAD_CACHELINE_START_MMX2 8, %1, %1/2, %2 3579.loop: 3580 movq mm1, [r2+8] 3581 movq mm2, [r2+r3+8] 3582 movq mm3, [r2] 3583 movq mm4, [r2+r3] 3584 psllq mm1, mm6 3585 psllq mm2, mm6 3586 psrlq mm3, mm7 3587 psrlq mm4, mm7 3588 por mm1, mm3 3589 por mm2, mm4 3590 psadbw mm1, [r0] 3591 psadbw mm2, [r0+r1] 3592 paddw mm0, mm1 3593 paddw mm0, mm2 3594 lea r2, [r2+2*r3] 3595 lea r0, [r0+2*r1] 3596 dec r4 3597 jg .loop 3598 movd eax, mm0 3599 RET 3600%endmacro 3601 3602; sad_x3/x4_cache64: check each mv. 3603; if they're all within a cacheline, use normal sad_x3/x4. 3604; otherwise, send them individually to sad_cache64. 3605%macro CHECK_SPLIT 3 ; pix, width, cacheline 3606 mov eax, %1 3607 and eax, 0x17|%2|(%3>>1) 3608 cmp eax, 0x10|%2|(%3>>1) 3609 jg .split 3610%endmacro 3611 3612%macro SADX3_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name 3613cglobal pixel_sad_x3_%1x%2_cache%3_%6 3614 CHECK_SPLIT r1m, %1, %3 3615 CHECK_SPLIT r2m, %1, %3 3616 CHECK_SPLIT r3m, %1, %3 3617 jmp pixel_sad_x3_%1x%2_%4 3618.split: 3619%if ARCH_X86_64 3620 PROLOGUE 6,9 3621 push r3 3622 push r2 3623%if WIN64 3624 movsxd r4, r4d 3625 sub rsp, 40 ; shadow space and alignment 3626%endif 3627 mov r2, r1 3628 mov r1, FENC_STRIDE 3629 mov r3, r4 3630 mov r7, r0 3631 mov r8, r5 3632 call pixel_sad_%1x%2_cache%3_%5 3633 mov [r8], eax 3634%if WIN64 3635 mov r2, [rsp+40+0*8] 3636%else 3637 pop r2 3638%endif 3639 mov r0, r7 3640 call pixel_sad_%1x%2_cache%3_%5 3641 mov [r8+4], eax 3642%if WIN64 3643 mov r2, [rsp+40+1*8] 3644%else 3645 pop r2 3646%endif 3647 mov r0, r7 3648 call pixel_sad_%1x%2_cache%3_%5 3649 mov [r8+8], eax 3650%if WIN64 3651 add rsp, 40+2*8 3652%endif 3653 RET 3654%else 3655 push edi 3656 mov edi, [esp+28] 3657 push dword [esp+24] 3658 push dword [esp+16] 3659 push dword 16 3660 push dword [esp+20] 3661 call pixel_sad_%1x%2_cache%3_%5 3662 mov ecx, [esp+32] 3663 mov [edi], eax 3664 mov [esp+8], ecx 3665 call pixel_sad_%1x%2_cache%3_%5 3666 mov ecx, [esp+36] 3667 mov [edi+4], eax 3668 mov [esp+8], ecx 3669 call pixel_sad_%1x%2_cache%3_%5 3670 mov [edi+8], eax 3671 add esp, 16 3672 pop edi 3673 ret 3674%endif 3675%endmacro 3676 3677%macro SADX4_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name 3678cglobal pixel_sad_x4_%1x%2_cache%3_%6 3679 CHECK_SPLIT r1m, %1, %3 3680 CHECK_SPLIT r2m, %1, %3 3681 CHECK_SPLIT r3m, %1, %3 3682 CHECK_SPLIT r4m, %1, %3 3683 jmp pixel_sad_x4_%1x%2_%4 3684.split: 3685%if ARCH_X86_64 3686 PROLOGUE 6,9 3687 mov r8, r6mp 3688 push r4 3689 push r3 3690 push r2 3691%if WIN64 3692 sub rsp, 32 ; shadow space 3693%endif 3694 mov r2, r1 3695 mov r1, FENC_STRIDE 3696 mov r3, r5 3697 mov r7, r0 3698 call pixel_sad_%1x%2_cache%3_%5 3699 mov [r8], eax 3700%if WIN64 3701 mov r2, [rsp+32+0*8] 3702%else 3703 pop r2 3704%endif 3705 mov r0, r7 3706 call pixel_sad_%1x%2_cache%3_%5 3707 mov [r8+4], eax 3708%if WIN64 3709 mov r2, [rsp+32+1*8] 3710%else 3711 pop r2 3712%endif 3713 mov r0, r7 3714 call pixel_sad_%1x%2_cache%3_%5 3715 mov [r8+8], eax 3716%if WIN64 3717 mov r2, [rsp+32+2*8] 3718%else 3719 pop r2 3720%endif 3721 mov r0, r7 3722 call pixel_sad_%1x%2_cache%3_%5 3723 mov [r8+12], eax 3724%if WIN64 3725 add rsp, 32+3*8 3726%endif 3727 RET 3728%else 3729 push edi 3730 mov edi, [esp+32] 3731 push dword [esp+28] 3732 push dword [esp+16] 3733 push dword 16 3734 push dword [esp+20] 3735 call pixel_sad_%1x%2_cache%3_%5 3736 mov ecx, [esp+32] 3737 mov [edi], eax 3738 mov [esp+8], ecx 3739 call pixel_sad_%1x%2_cache%3_%5 3740 mov ecx, [esp+36] 3741 mov [edi+4], eax 3742 mov [esp+8], ecx 3743 call pixel_sad_%1x%2_cache%3_%5 3744 mov ecx, [esp+40] 3745 mov [edi+8], eax 3746 mov [esp+8], ecx 3747 call pixel_sad_%1x%2_cache%3_%5 3748 mov [edi+12], eax 3749 add esp, 16 3750 pop edi 3751 ret 3752%endif 3753%endmacro 3754 3755%macro SADX34_CACHELINE_FUNC 1+ 3756 SADX3_CACHELINE_FUNC %1 3757 SADX4_CACHELINE_FUNC %1 3758%endmacro 3759 3760 3761; instantiate the aligned sads 3762 3763INIT_MMX 3764%if ARCH_X86_64 == 0 3765SAD16_CACHELINE_FUNC_MMX2 8, 32 3766SAD16_CACHELINE_FUNC_MMX2 16, 32 3767SAD8_CACHELINE_FUNC_MMX2 4, 32 3768SAD8_CACHELINE_FUNC_MMX2 8, 32 3769SAD8_CACHELINE_FUNC_MMX2 16, 32 3770SAD16_CACHELINE_FUNC_MMX2 8, 64 3771SAD16_CACHELINE_FUNC_MMX2 16, 64 3772%endif ; !ARCH_X86_64 3773SAD8_CACHELINE_FUNC_MMX2 4, 64 3774SAD8_CACHELINE_FUNC_MMX2 8, 64 3775SAD8_CACHELINE_FUNC_MMX2 16, 64 3776 3777%if ARCH_X86_64 == 0 3778SADX34_CACHELINE_FUNC 16, 16, 32, mmx2, mmx2, mmx2 3779SADX34_CACHELINE_FUNC 16, 8, 32, mmx2, mmx2, mmx2 3780SADX34_CACHELINE_FUNC 8, 16, 32, mmx2, mmx2, mmx2 3781SADX34_CACHELINE_FUNC 8, 8, 32, mmx2, mmx2, mmx2 3782SADX34_CACHELINE_FUNC 16, 16, 64, mmx2, mmx2, mmx2 3783SADX34_CACHELINE_FUNC 16, 8, 64, mmx2, mmx2, mmx2 3784%endif ; !ARCH_X86_64 3785SADX34_CACHELINE_FUNC 8, 16, 64, mmx2, mmx2, mmx2 3786SADX34_CACHELINE_FUNC 8, 8, 64, mmx2, mmx2, mmx2 3787 3788%if ARCH_X86_64 == 0 3789SAD16_CACHELINE_FUNC sse2, 8 3790SAD16_CACHELINE_FUNC sse2, 16 3791%assign i 1 3792%rep 15 3793SAD16_CACHELINE_LOOP_SSE2 i 3794%assign i i+1 3795%endrep 3796SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2, sse2 3797SADX34_CACHELINE_FUNC 16, 8, 64, sse2, sse2, sse2 3798%endif ; !ARCH_X86_64 3799SADX34_CACHELINE_FUNC 8, 16, 64, sse2, mmx2, sse2 3800 3801SAD16_CACHELINE_FUNC ssse3, 8 3802SAD16_CACHELINE_FUNC ssse3, 16 3803%assign i 1 3804%rep 15 3805SAD16_CACHELINE_LOOP_SSSE3 i 3806%assign i i+1 3807%endrep 3808SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3, ssse3 3809SADX34_CACHELINE_FUNC 16, 8, 64, sse2, ssse3, ssse3 3810 3811%if HIGH_BIT_DEPTH==0 3812INIT_YMM avx2 3813cglobal pixel_sad_x3_8x4, 6,6,5 3814 xorps m0, m0 3815 xorps m1, m1 3816 3817 sub r2, r1 ; rebase on pointer r1 3818 sub r3, r1 3819 3820 ; row 0 3821 vpbroadcastq xm2, [r0 + 0 * FENC_STRIDE] 3822 movq xm3, [r1] 3823 movhps xm3, [r1 + r2] 3824 movq xm4, [r1 + r3] 3825 psadbw xm3, xm2 3826 psadbw xm4, xm2 3827 paddd xm0, xm3 3828 paddd xm1, xm4 3829 add r1, r4 3830 3831 ; row 1 3832 vpbroadcastq xm2, [r0 + 1 * FENC_STRIDE] 3833 movq xm3, [r1] 3834 movhps xm3, [r1 + r2] 3835 movq xm4, [r1 + r3] 3836 psadbw xm3, xm2 3837 psadbw xm4, xm2 3838 paddd xm0, xm3 3839 paddd xm1, xm4 3840 add r1, r4 3841 3842 ; row 2 3843 vpbroadcastq xm2, [r0 + 2 * FENC_STRIDE] 3844 movq xm3, [r1] 3845 movhps xm3, [r1 + r2] 3846 movq xm4, [r1 + r3] 3847 psadbw xm3, xm2 3848 psadbw xm4, xm2 3849 paddd xm0, xm3 3850 paddd xm1, xm4 3851 add r1, r4 3852 3853 ; row 3 3854 vpbroadcastq xm2, [r0 + 3 * FENC_STRIDE] 3855 movq xm3, [r1] 3856 movhps xm3, [r1 + r2] 3857 movq xm4, [r1 + r3] 3858 psadbw xm3, xm2 3859 psadbw xm4, xm2 3860 paddd xm0, xm3 3861 paddd xm1, xm4 3862 3863 pshufd xm0, xm0, q0020 3864 movq [r5 + 0], xm0 3865 movd [r5 + 8], xm1 3866 RET 3867 3868INIT_YMM avx2 3869cglobal pixel_sad_x3_8x8, 6,6,5 3870 xorps m0, m0 3871 xorps m1, m1 3872 3873 sub r2, r1 ; rebase on pointer r1 3874 sub r3, r1 3875%assign x 0 3876%rep 4 3877 ; row 0 3878 vpbroadcastq xm2, [r0 + 0 * FENC_STRIDE] 3879 movq xm3, [r1] 3880 movhps xm3, [r1 + r2] 3881 movq xm4, [r1 + r3] 3882 psadbw xm3, xm2 3883 psadbw xm4, xm2 3884 paddd xm0, xm3 3885 paddd xm1, xm4 3886 add r1, r4 3887 3888 ; row 1 3889 vpbroadcastq xm2, [r0 + 1 * FENC_STRIDE] 3890 movq xm3, [r1] 3891 movhps xm3, [r1 + r2] 3892 movq xm4, [r1 + r3] 3893 psadbw xm3, xm2 3894 psadbw xm4, xm2 3895 paddd xm0, xm3 3896 paddd xm1, xm4 3897 3898%assign x x+1 3899 %if x < 4 3900 add r1, r4 3901 add r0, 2 * FENC_STRIDE 3902 %endif 3903%endrep 3904 3905 pshufd xm0, xm0, q0020 3906 movq [r5 + 0], xm0 3907 movd [r5 + 8], xm1 3908 RET 3909 3910INIT_YMM avx2 3911cglobal pixel_sad_x3_8x16, 6,6,5 3912 xorps m0, m0 3913 xorps m1, m1 3914 3915 sub r2, r1 ; rebase on pointer r1 3916 sub r3, r1 3917%assign x 0 3918%rep 8 3919 ; row 0 3920 vpbroadcastq xm2, [r0 + 0 * FENC_STRIDE] 3921 movq xm3, [r1] 3922 movhps xm3, [r1 + r2] 3923 movq xm4, [r1 + r3] 3924 psadbw xm3, xm2 3925 psadbw xm4, xm2 3926 paddd xm0, xm3 3927 paddd xm1, xm4 3928 add r1, r4 3929 3930 ; row 1 3931 vpbroadcastq xm2, [r0 + 1 * FENC_STRIDE] 3932 movq xm3, [r1] 3933 movhps xm3, [r1 + r2] 3934 movq xm4, [r1 + r3] 3935 psadbw xm3, xm2 3936 psadbw xm4, xm2 3937 paddd xm0, xm3 3938 paddd xm1, xm4 3939 3940%assign x x+1 3941 %if x < 8 3942 add r1, r4 3943 add r0, 2 * FENC_STRIDE 3944 %endif 3945%endrep 3946 3947 pshufd xm0, xm0, q0020 3948 movq [r5 + 0], xm0 3949 movd [r5 + 8], xm1 3950 RET 3951 3952INIT_YMM avx2 3953cglobal pixel_sad_x4_8x8, 7,7,5 3954 xorps m0, m0 3955 xorps m1, m1 3956 3957 sub r2, r1 ; rebase on pointer r1 3958 sub r3, r1 3959 sub r4, r1 3960%assign x 0 3961%rep 4 3962 ; row 0 3963 vpbroadcastq xm2, [r0 + 0 * FENC_STRIDE] 3964 movq xm3, [r1] 3965 movhps xm3, [r1 + r2] 3966 movq xm4, [r1 + r3] 3967 movhps xm4, [r1 + r4] 3968 psadbw xm3, xm2 3969 psadbw xm4, xm2 3970 paddd xm0, xm3 3971 paddd xm1, xm4 3972 add r1, r5 3973 3974 ; row 1 3975 vpbroadcastq xm2, [r0 + 1 * FENC_STRIDE] 3976 movq xm3, [r1] 3977 movhps xm3, [r1 + r2] 3978 movq xm4, [r1 + r3] 3979 movhps xm4, [r1 + r4] 3980 psadbw xm3, xm2 3981 psadbw xm4, xm2 3982 paddd xm0, xm3 3983 paddd xm1, xm4 3984 3985%assign x x+1 3986 %if x < 4 3987 add r1, r5 3988 add r0, 2 * FENC_STRIDE 3989 %endif 3990%endrep 3991 3992 pshufd xm0, xm0, q0020 3993 pshufd xm1, xm1, q0020 3994 movq [r6 + 0], xm0 3995 movq [r6 + 8], xm1 3996 RET 3997 3998INIT_YMM avx2 3999cglobal pixel_sad_32x8, 4,4,6 4000 xorps m0, m0 4001 xorps m5, m5 4002 4003 movu m1, [r0] ; row 0 of pix0 4004 movu m2, [r2] ; row 0 of pix1 4005 movu m3, [r0 + r1] ; row 1 of pix0 4006 movu m4, [r2 + r3] ; row 1 of pix1 4007 4008 psadbw m1, m2 4009 psadbw m3, m4 4010 paddd m0, m1 4011 paddd m5, m3 4012 4013 lea r2, [r2 + 2 * r3] 4014 lea r0, [r0 + 2 * r1] 4015 4016 movu m1, [r0] ; row 2 of pix0 4017 movu m2, [r2] ; row 2 of pix1 4018 movu m3, [r0 + r1] ; row 3 of pix0 4019 movu m4, [r2 + r3] ; row 3 of pix1 4020 4021 psadbw m1, m2 4022 psadbw m3, m4 4023 paddd m0, m1 4024 paddd m5, m3 4025 4026 lea r2, [r2 + 2 * r3] 4027 lea r0, [r0 + 2 * r1] 4028 4029 movu m1, [r0] ; row 4 of pix0 4030 movu m2, [r2] ; row 4 of pix1 4031 movu m3, [r0 + r1] ; row 5 of pix0 4032 movu m4, [r2 + r3] ; row 5 of pix1 4033 4034 psadbw m1, m2 4035 psadbw m3, m4 4036 paddd m0, m1 4037 paddd m5, m3 4038 4039 lea r2, [r2 + 2 * r3] 4040 lea r0, [r0 + 2 * r1] 4041 4042 movu m1, [r0] ; row 6 of pix0 4043 movu m2, [r2] ; row 6 of pix1 4044 movu m3, [r0 + r1] ; row 7 of pix0 4045 movu m4, [r2 + r3] ; row 7 of pix1 4046 4047 psadbw m1, m2 4048 psadbw m3, m4 4049 paddd m0, m1 4050 paddd m5, m3 4051 4052 paddd m0, m5 4053 vextracti128 xm1, m0, 1 4054 paddd xm0, xm1 4055 pshufd xm1, xm0, 2 4056 paddd xm0,xm1 4057 movd eax, xm0 4058 RET 4059 4060INIT_YMM avx2 4061cglobal pixel_sad_32x16, 4,5,6 4062 xorps m0, m0 4063 xorps m5, m5 4064 mov r4d, 4 4065 4066.loop 4067 movu m1, [r0] ; row 0 of pix0 4068 movu m2, [r2] ; row 0 of pix1 4069 movu m3, [r0 + r1] ; row 1 of pix0 4070 movu m4, [r2 + r3] ; row 1 of pix1 4071 4072 psadbw m1, m2 4073 psadbw m3, m4 4074 paddd m0, m1 4075 paddd m5, m3 4076 4077 lea r2, [r2 + 2 * r3] 4078 lea r0, [r0 + 2 * r1] 4079 4080 movu m1, [r0] ; row 2 of pix0 4081 movu m2, [r2] ; row 2 of pix1 4082 movu m3, [r0 + r1] ; row 3 of pix0 4083 movu m4, [r2 + r3] ; row 3 of pix1 4084 4085 psadbw m1, m2 4086 psadbw m3, m4 4087 paddd m0, m1 4088 paddd m5, m3 4089 4090 lea r2, [r2 + 2 * r3] 4091 lea r0, [r0 + 2 * r1] 4092 4093 dec r4d 4094 jnz .loop 4095 4096 paddd m0, m5 4097 vextracti128 xm1, m0, 1 4098 paddd xm0, xm1 4099 pshufd xm1, xm0, 2 4100 paddd xm0,xm1 4101 movd eax, xm0 4102 RET 4103 4104INIT_YMM avx2 4105cglobal pixel_sad_32x24, 4,7,6 4106 xorps m0, m0 4107 xorps m5, m5 4108 mov r4d, 6 4109 lea r5, [r1 * 3] 4110 lea r6, [r3 * 3] 4111.loop 4112 movu m1, [r0] ; row 0 of pix0 4113 movu m2, [r2] ; row 0 of pix1 4114 movu m3, [r0 + r1] ; row 1 of pix0 4115 movu m4, [r2 + r3] ; row 1 of pix1 4116 4117 psadbw m1, m2 4118 psadbw m3, m4 4119 paddd m0, m1 4120 paddd m5, m3 4121 4122 movu m1, [r0 + 2 * r1] ; row 2 of pix0 4123 movu m2, [r2 + 2 * r3] ; row 2 of pix1 4124 movu m3, [r0 + r5] ; row 3 of pix0 4125 movu m4, [r2 + r6] ; row 3 of pix1 4126 4127 psadbw m1, m2 4128 psadbw m3, m4 4129 paddd m0, m1 4130 paddd m5, m3 4131 4132 lea r2, [r2 + 4 * r3] 4133 lea r0, [r0 + 4 * r1] 4134 4135 dec r4d 4136 jnz .loop 4137 4138 paddd m0, m5 4139 vextracti128 xm1, m0, 1 4140 paddd xm0, xm1 4141 pshufd xm1, xm0, 2 4142 paddd xm0,xm1 4143 movd eax, xm0 4144 RET 4145 4146INIT_YMM avx2 4147cglobal pixel_sad_32x32, 4,7,5 4148 xorps m0, m0 4149 mov r4d, 32/4 4150 lea r5, [r1 * 3] 4151 lea r6, [r3 * 3] 4152 4153.loop 4154 movu m1, [r0] ; row 0 of pix0 4155 movu m2, [r2] ; row 0 of pix1 4156 movu m3, [r0 + r1] ; row 1 of pix0 4157 movu m4, [r2 + r3] ; row 1 of pix1 4158 4159 psadbw m1, m2 4160 psadbw m3, m4 4161 paddd m0, m1 4162 paddd m0, m3 4163 4164 movu m1, [r0 + 2 * r1] ; row 2 of pix0 4165 movu m2, [r2 + 2 * r3] ; row 2 of pix1 4166 movu m3, [r0 + r5] ; row 3 of pix0 4167 movu m4, [r2 + r6] ; row 3 of pix1 4168 4169 psadbw m1, m2 4170 psadbw m3, m4 4171 paddd m0, m1 4172 paddd m0, m3 4173 4174 lea r2, [r2 + 4 * r3] 4175 lea r0, [r0 + 4 * r1] 4176 4177 dec r4d 4178 jnz .loop 4179 4180 vextracti128 xm1, m0, 1 4181 paddd xm0, xm1 4182 pshufd xm1, xm0, 2 4183 paddd xm0,xm1 4184 movd eax, xm0 4185 RET 4186 4187 INIT_YMM avx2 4188cglobal pixel_sad_32x64, 4,7,5 4189 xorps m0, m0 4190 mov r4d, 64/8 4191 lea r5, [r1 * 3] 4192 lea r6, [r3 * 3] 4193 4194.loop 4195 movu m1, [r0] ; row 0 of pix0 4196 movu m2, [r2] ; row 0 of pix1 4197 movu m3, [r0 + r1] ; row 1 of pix0 4198 movu m4, [r2 + r3] ; row 1 of pix1 4199 4200 psadbw m1, m2 4201 psadbw m3, m4 4202 paddd m0, m1 4203 paddd m0, m3 4204 4205 movu m1, [r0 + 2 * r1] ; row 2 of pix0 4206 movu m2, [r2 + 2 * r3] ; row 2 of pix1 4207 movu m3, [r0 + r5] ; row 3 of pix0 4208 movu m4, [r2 + r6] ; row 3 of pix1 4209 4210 psadbw m1, m2 4211 psadbw m3, m4 4212 paddd m0, m1 4213 paddd m0, m3 4214 4215 lea r2, [r2 + 4 * r3] 4216 lea r0, [r0 + 4 * r1] 4217 4218 movu m1, [r0] ; row 4 of pix0 4219 movu m2, [r2] ; row 4 of pix1 4220 movu m3, [r0 + r1] ; row 5 of pix0 4221 movu m4, [r2 + r3] ; row 5 of pix1 4222 4223 psadbw m1, m2 4224 psadbw m3, m4 4225 paddd m0, m1 4226 paddd m0, m3 4227 4228 movu m1, [r0 + 2 * r1] ; row 6 of pix0 4229 movu m2, [r2 + 2 * r3] ; row 6 of pix1 4230 movu m3, [r0 + r5] ; row 7 of pix0 4231 movu m4, [r2 + r6] ; row 7 of pix1 4232 4233 psadbw m1, m2 4234 psadbw m3, m4 4235 paddd m0, m1 4236 paddd m0, m3 4237 4238 lea r2, [r2 + 4 * r3] 4239 lea r0, [r0 + 4 * r1] 4240 4241 dec r4d 4242 jnz .loop 4243 4244 vextracti128 xm1, m0, 1 4245 paddd xm0, xm1 4246 pshufd xm1, xm0, 2 4247 paddd xm0,xm1 4248 movd eax, xm0 4249 RET 4250 4251INIT_YMM avx2 4252cglobal pixel_sad_48x64, 4,7,7 4253 xorps m0, m0 4254 mov r4d, 64/4 4255 lea r5, [r1 * 3] 4256 lea r6, [r3 * 3] 4257.loop 4258 movu m1, [r0] ; row 0 of pix0 4259 movu m2, [r2] ; row 0 of pix1 4260 movu m3, [r0 + r1] ; row 1 of pix0 4261 movu m4, [r2 + r3] ; row 1 of pix1 4262 movu xm5, [r0 +32] ; last 16 of row 0 of pix0 4263 vinserti128 m5, m5, [r0 + r1 + 32], 1 4264 movu xm6, [r2 +32] ; last 16 of row 0 of pix1 4265 vinserti128 m6, m6, [r2 + r3 + 32], 1 4266 4267 psadbw m1, m2 4268 psadbw m3, m4 4269 psadbw m5, m6 4270 paddd m0, m1 4271 paddd m0, m3 4272 paddd m0, m5 4273 4274 movu m1, [r0 + 2 * r1] ; row 2 of pix0 4275 movu m2, [r2 + 2 * r3] ; row 2 of pix1 4276 movu m3, [r0 + r5] ; row 3 of pix0 4277 movu m4, [r2 + r6] ; row 3 of pix1 4278 movu xm5, [r0 +32 + 2 * r1] 4279 vinserti128 m5, m5, [r0 + r5 + 32], 1 4280 movu xm6, [r2 +32 + 2 * r3] 4281 vinserti128 m6, m6, [r2 + r6 + 32], 1 4282 4283 psadbw m1, m2 4284 psadbw m3, m4 4285 psadbw m5, m6 4286 paddd m0, m1 4287 paddd m0, m3 4288 paddd m0, m5 4289 4290 lea r2, [r2 + 4 * r3] 4291 lea r0, [r0 + 4 * r1] 4292 4293 dec r4d 4294 jnz .loop 4295 4296 vextracti128 xm1, m0, 1 4297 paddd xm0, xm1 4298 pshufd xm1, xm0, 2 4299 paddd xm0,xm1 4300 movd eax, xm0 4301 RET 4302 4303INIT_YMM avx2 4304cglobal pixel_sad_64x16, 4,5,6 4305 xorps m0, m0 4306 xorps m5, m5 4307 mov r4d, 4 4308.loop 4309 movu m1, [r0] ; first 32 of row 0 of pix0 4310 movu m2, [r2] ; first 32 of row 0 of pix1 4311 movu m3, [r0 + 32] ; second 32 of row 0 of pix0 4312 movu m4, [r2 + 32] ; second 32 of row 0 of pix1 4313 4314 psadbw m1, m2 4315 psadbw m3, m4 4316 paddd m0, m1 4317 paddd m5, m3 4318 4319 movu m1, [r0 + r1] ; first 32 of row 1 of pix0 4320 movu m2, [r2 + r3] ; first 32 of row 1 of pix1 4321 movu m3, [r0 + 32 + r1] ; second 32 of row 1 of pix0 4322 movu m4, [r2 + 32 + r3] ; second 32 of row 1 of pix1 4323 4324 psadbw m1, m2 4325 psadbw m3, m4 4326 paddd m0, m1 4327 paddd m5, m3 4328 4329 lea r2, [r2 + 2 * r3] 4330 lea r0, [r0 + 2 * r1] 4331 4332 movu m1, [r0] ; first 32 of row 2 of pix0 4333 movu m2, [r2] ; first 32 of row 2 of pix1 4334 movu m3, [r0 + 32] ; second 32 of row 2 of pix0 4335 movu m4, [r2 + 32] ; second 32 of row 2 of pix1 4336 4337 psadbw m1, m2 4338 psadbw m3, m4 4339 paddd m0, m1 4340 paddd m5, m3 4341 4342 movu m1, [r0 + r1] ; first 32 of row 3 of pix0 4343 movu m2, [r2 + r3] ; first 32 of row 3 of pix1 4344 movu m3, [r0 + 32 + r1] ; second 32 of row 3 of pix0 4345 movu m4, [r2 + 32 + r3] ; second 32 of row 3 of pix1 4346 4347 psadbw m1, m2 4348 psadbw m3, m4 4349 paddd m0, m1 4350 paddd m5, m3 4351 4352 lea r2, [r2 + 2 * r3] 4353 lea r0, [r0 + 2 * r1] 4354 4355 dec r4d 4356 jnz .loop 4357 4358 paddd m0, m5 4359 vextracti128 xm1, m0, 1 4360 paddd xm0, xm1 4361 pshufd xm1, xm0, 2 4362 paddd xm0,xm1 4363 movd eax, xm0 4364 RET 4365 4366INIT_YMM avx2 4367cglobal pixel_sad_64x32, 4,5,6 4368 xorps m0, m0 4369 xorps m5, m5 4370 mov r4d, 16 4371.loop 4372 movu m1, [r0] ; first 32 of row 0 of pix0 4373 movu m2, [r2] ; first 32 of row 0 of pix1 4374 movu m3, [r0 + 32] ; second 32 of row 0 of pix0 4375 movu m4, [r2 + 32] ; second 32 of row 0 of pix1 4376 4377 psadbw m1, m2 4378 psadbw m3, m4 4379 paddd m0, m1 4380 paddd m5, m3 4381 4382 movu m1, [r0 + r1] ; first 32 of row 1 of pix0 4383 movu m2, [r2 + r3] ; first 32 of row 1 of pix1 4384 movu m3, [r0 + 32 + r1] ; second 32 of row 1 of pix0 4385 movu m4, [r2 + 32 + r3] ; second 32 of row 1 of pix1 4386 4387 psadbw m1, m2 4388 psadbw m3, m4 4389 paddd m0, m1 4390 paddd m5, m3 4391 4392 lea r2, [r2 + 2 * r3] 4393 lea r0, [r0 + 2 * r1] 4394 4395 dec r4d 4396 jnz .loop 4397 4398 paddd m0, m5 4399 vextracti128 xm1, m0, 1 4400 paddd xm0, xm1 4401 pshufd xm1, xm0, 2 4402 paddd xm0,xm1 4403 movd eax, xm0 4404 RET 4405 4406INIT_YMM avx2 4407cglobal pixel_sad_64x48, 4,7,6 4408 xorps m0, m0 4409 xorps m5, m5 4410 mov r4d, 12 4411 lea r5, [r1 * 3] 4412 lea r6, [r3 * 3] 4413.loop 4414 movu m1, [r0] ; first 32 of row 0 of pix0 4415 movu m2, [r2] ; first 32 of row 0 of pix1 4416 movu m3, [r0 + 32] ; second 32 of row 0 of pix0 4417 movu m4, [r2 + 32] ; second 32 of row 0 of pix1 4418 4419 psadbw m1, m2 4420 psadbw m3, m4 4421 paddd m0, m1 4422 paddd m5, m3 4423 4424 movu m1, [r0 + r1] ; first 32 of row 1 of pix0 4425 movu m2, [r2 + r3] ; first 32 of row 1 of pix1 4426 movu m3, [r0 + 32 + r1] ; second 32 of row 1 of pix0 4427 movu m4, [r2 + 32 + r3] ; second 32 of row 1 of pix1 4428 4429 psadbw m1, m2 4430 psadbw m3, m4 4431 paddd m0, m1 4432 paddd m5, m3 4433 4434 movu m1, [r0 + 2 * r1] ; first 32 of row 0 of pix0 4435 movu m2, [r2 + 2 * r3] ; first 32 of row 0 of pix1 4436 movu m3, [r0 + 2 * r1 + 32] ; second 32 of row 0 of pix0 4437 movu m4, [r2 + 2 * r3 + 32] ; second 32 of row 0 of pix1 4438 4439 psadbw m1, m2 4440 psadbw m3, m4 4441 paddd m0, m1 4442 paddd m5, m3 4443 4444 movu m1, [r0 + r5] ; first 32 of row 1 of pix0 4445 movu m2, [r2 + r6] ; first 32 of row 1 of pix1 4446 movu m3, [r0 + 32 + r5] ; second 32 of row 1 of pix0 4447 movu m4, [r2 + 32 + r6] ; second 32 of row 1 of pix1 4448 4449 psadbw m1, m2 4450 psadbw m3, m4 4451 paddd m0, m1 4452 paddd m5, m3 4453 4454 lea r2, [r2 + 4 * r3] 4455 lea r0, [r0 + 4 * r1] 4456 4457 dec r4d 4458 jnz .loop 4459 4460 paddd m0, m5 4461 vextracti128 xm1, m0, 1 4462 paddd xm0, xm1 4463 pshufd xm1, xm0, 2 4464 paddd xm0,xm1 4465 movd eax, xm0 4466 RET 4467 4468INIT_YMM avx2 4469cglobal pixel_sad_64x64, 4,7,6 4470 xorps m0, m0 4471 xorps m5, m5 4472 mov r4d, 8 4473 lea r5, [r1 * 3] 4474 lea r6, [r3 * 3] 4475.loop 4476 movu m1, [r0] ; first 32 of row 0 of pix0 4477 movu m2, [r2] ; first 32 of row 0 of pix1 4478 movu m3, [r0 + 32] ; second 32 of row 0 of pix0 4479 movu m4, [r2 + 32] ; second 32 of row 0 of pix1 4480 4481 psadbw m1, m2 4482 psadbw m3, m4 4483 paddd m0, m1 4484 paddd m5, m3 4485 4486 movu m1, [r0 + r1] ; first 32 of row 1 of pix0 4487 movu m2, [r2 + r3] ; first 32 of row 1 of pix1 4488 movu m3, [r0 + 32 + r1] ; second 32 of row 1 of pix0 4489 movu m4, [r2 + 32 + r3] ; second 32 of row 1 of pix1 4490 4491 psadbw m1, m2 4492 psadbw m3, m4 4493 paddd m0, m1 4494 paddd m5, m3 4495 4496 movu m1, [r0 + 2 * r1] ; first 32 of row 2 of pix0 4497 movu m2, [r2 + 2 * r3] ; first 32 of row 2 of pix1 4498 movu m3, [r0 + 2 * r1 + 32] ; second 32 of row 2 of pix0 4499 movu m4, [r2 + 2 * r3 + 32] ; second 32 of row 2 of pix1 4500 4501 psadbw m1, m2 4502 psadbw m3, m4 4503 paddd m0, m1 4504 paddd m5, m3 4505 4506 movu m1, [r0 + r5] ; first 32 of row 3 of pix0 4507 movu m2, [r2 + r6] ; first 32 of row 3 of pix1 4508 movu m3, [r0 + 32 + r5] ; second 32 of row 3 of pix0 4509 movu m4, [r2 + 32 + r6] ; second 32 of row 3 of pix1 4510 4511 psadbw m1, m2 4512 psadbw m3, m4 4513 paddd m0, m1 4514 paddd m5, m3 4515 4516 lea r2, [r2 + 4 * r3] 4517 lea r0, [r0 + 4 * r1] 4518 4519 movu m1, [r0] ; first 32 of row 4 of pix0 4520 movu m2, [r2] ; first 32 of row 4 of pix1 4521 movu m3, [r0 + 32] ; second 32 of row 4 of pix0 4522 movu m4, [r2 + 32] ; second 32 of row 4 of pix1 4523 4524 psadbw m1, m2 4525 psadbw m3, m4 4526 paddd m0, m1 4527 paddd m5, m3 4528 4529 movu m1, [r0 + r1] ; first 32 of row 5 of pix0 4530 movu m2, [r2 + r3] ; first 32 of row 5 of pix1 4531 movu m3, [r0 + 32 + r1] ; second 32 of row 5 of pix0 4532 movu m4, [r2 + 32 + r3] ; second 32 of row 5 of pix1 4533 4534 psadbw m1, m2 4535 psadbw m3, m4 4536 paddd m0, m1 4537 paddd m5, m3 4538 4539 movu m1, [r0 + 2 * r1] ; first 32 of row 6 of pix0 4540 movu m2, [r2 + 2 * r3] ; first 32 of row 6 of pix1 4541 movu m3, [r0 + 2 * r1 + 32] ; second 32 of row 6 of pix0 4542 movu m4, [r2 + 2 * r3 + 32] ; second 32 of row 6 of pix1 4543 4544 psadbw m1, m2 4545 psadbw m3, m4 4546 paddd m0, m1 4547 paddd m5, m3 4548 4549 movu m1, [r0 + r5] ; first 32 of row 7 of pix0 4550 movu m2, [r2 + r6] ; first 32 of row 7 of pix1 4551 movu m3, [r0 + 32 + r5] ; second 32 of row 7 of pix0 4552 movu m4, [r2 + 32 + r6] ; second 32 of row 7 of pix1 4553 4554 psadbw m1, m2 4555 psadbw m3, m4 4556 paddd m0, m1 4557 paddd m5, m3 4558 4559 lea r2, [r2 + 4 * r3] 4560 lea r0, [r0 + 4 * r1] 4561 4562 dec r4d 4563 jnz .loop 4564 4565 paddd m0, m5 4566 vextracti128 xm1, m0, 1 4567 paddd xm0, xm1 4568 pshufd xm1, xm0, 2 4569 paddd xm0,xm1 4570 movd eax, xm0 4571 RET 4572 4573%endif 4574