1;***************************************************************************** 2;* sad16-a.asm: x86 high depth sad functions 3;***************************************************************************** 4;* Copyright (C) 2010-2013 x264 project 5;* 6;* Authors: Oskar Arvidsson <oskar@irock.se> 7;* Henrik Gramner <henrik@gramner.com> 8;* Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com> 9;* Min Chen <chenm003@163.com> 10;* 11;* This program is free software; you can redistribute it and/or modify 12;* it under the terms of the GNU General Public License as published by 13;* the Free Software Foundation; either version 2 of the License, or 14;* (at your option) any later version. 15;* 16;* This program is distributed in the hope that it will be useful, 17;* but WITHOUT ANY WARRANTY; without even the implied warranty of 18;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19;* GNU General Public License for more details. 20;* 21;* You should have received a copy of the GNU General Public License 22;* along with this program; if not, write to the Free Software 23;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. 24;* 25;* This program is also available under a commercial proprietary license. 26;* For more information, contact us at license @ x265.com. 
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION .text

; pw_1: vector of word 1s; pmaddwd against it performs a horizontal
; pairwise word->dword add (the standard widening step for SAD sums).
cextern pw_1

; Register convention for all pixel_sad_NxM below (see C prototype):
;   r0 = fenc pixels (uint16_t*), r1 = fenc stride (in PIXELS, hence 2*r1 bytes)
;   r2 = ref  pixels (uint16_t*), r3 = ref  stride (in pixels)
;   return: SAD in eax
; Word (16-bit) accumulators are only safe while the per-lane running sum
; fits in 16 bits; the BIT_DEPTH checks and the HADDUW/HADDUWD widenings
; below exist precisely to respect that limit.

;=============================================================================
; SAD MMX
;=============================================================================

; Accumulate one row of 16 pixels into m0 (words for <=10 bit, dwords above).
%macro SAD_INC_1x16P_MMX 0
    movu    m1, [r0+ 0]
    movu    m2, [r0+ 8]
    movu    m3, [r0+16]
    movu    m4, [r0+24]
    psubw   m1, [r2+ 0]
    psubw   m2, [r2+ 8]
    psubw   m3, [r2+16]
    psubw   m4, [r2+24]
    ABSW2   m1, m2, m1, m2, m5, m6
    ABSW2   m3, m4, m3, m4, m7, m5
    lea     r0, [r0+2*r1]                 ; advance one row (stride is in pixels)
    lea     r2, [r2+2*r3]
    paddw   m1, m2
    paddw   m3, m4
%if BIT_DEPTH <= 10
    ; word accumulation cannot overflow for the block heights used here
    paddw   m0, m1
    paddw   m0, m3
%else
    ; >10 bit: widen to dwords every row to avoid word overflow
    paddw   m1, m3
    pmaddwd m1, [pw_1]
    paddd   m0, m1
%endif
%endmacro

; Accumulate two rows of 8 pixels into m0.
%macro SAD_INC_2x8P_MMX 0
    movu    m1, [r0+0]
    movu    m2, [r0+8]
    movu    m3, [r0+2*r1+0]
    movu    m4, [r0+2*r1+8]
    psubw   m1, [r2+0]
    psubw   m2, [r2+8]
    psubw   m3, [r2+2*r3+0]
    psubw   m4, [r2+2*r3+8]
    ABSW2   m1, m2, m1, m2, m5, m6
    ABSW2   m3, m4, m3, m4, m7, m5
    lea     r0, [r0+4*r1]                 ; advance two rows
    lea     r2, [r2+4*r3]
    paddw   m1, m2
    paddw   m3, m4
%if BIT_DEPTH <= 10
    paddw   m0, m1
    paddw   m0, m3
%else
    paddw   m1, m3
    pmaddwd m1, [pw_1]
    paddd   m0, m1
%endif
%endmacro

; Accumulate two rows of 4 pixels into m0.
%macro SAD_INC_2x4P_MMX 0
    movu    m1, [r0]
    movu    m2, [r0+2*r1]
    psubw   m1, [r2]
    psubw   m2, [r2+2*r3]
    ABSW2   m1, m2, m1, m2, m3, m4
    lea     r0, [r0+4*r1]
    lea     r2, [r2+4*r3]
%if BIT_DEPTH <= 10
    paddw   m0, m1
    paddw   m0, m2
%else
    paddw   m1, m2
    pmaddwd m1, [pw_1]
    paddd   m0, m1
%endif
%endmacro

;-----------------------------------------------------------------------------
; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t )
;-----------------------------------------------------------------------------
; %1 = width, %2 = height, %3 = rows handled per SAD_INC_* step.
%macro SAD_MMX 3
; NOTE(review): NASM parses %2&4/4 as %2&(4/4) == %2&1, which is 0 for every
; even height, so 5 registers are always requested; presumably (%2&4)/4 was
; intended to drop the loop counter for height-4 blocks. Harmless — confirm
; against upstream before changing.
cglobal pixel_sad_%1x%2, 4,5-(%2&4/4)
    pxor    m0, m0
%if %2 == 4
    ; height 4: fully unrolled, no loop counter needed
    SAD_INC_%3x%1P_MMX
    SAD_INC_%3x%1P_MMX
%else
    mov     r4d, %2/%3
.loop:
    SAD_INC_%3x%1P_MMX
    dec     r4d
    jg .loop
%endif
%if %1*%2 == 256
  %if BIT_DEPTH <= 10
    ; 16x16 word sums can exceed signed-word range -> unsigned horizontal add
    HADDUW  m0, m1
  %else
    HADDD   m0, m1
  %endif
%else
  %if BIT_DEPTH <= 10
    HADDW   m0, m1
  %else
    HADDD   m0, m1
  %endif
%endif
    movd    eax, m0
    RET
%endmacro

INIT_MMX mmx2
SAD_MMX 16, 16, 1
SAD_MMX 16,  8, 1
SAD_MMX  8, 16, 2
SAD_MMX  8,  8, 2
SAD_MMX  8,  4, 2
SAD_MMX  4,  8, 2
SAD_MMX  4,  4, 2
SAD_MMX  4, 16, 2
INIT_MMX ssse3
SAD_MMX  4,  8, 2
SAD_MMX  4,  4, 2

;=============================================================================
; SAD XMM
;=============================================================================
; The SAD_1xN helpers each consume ONE row of N pixels, widening to dword
; sums in m0 immediately (so they are bit-depth independent), and advance
; both row pointers.

%macro SAD_1x32 0
    movu    m1, [r2+ 0]
    movu    m2, [r2+16]
    movu    m3, [r2+32]
    movu    m4, [r2+48]
    psubw   m1, [r0+0]
    psubw   m2, [r0+16]
    psubw   m3, [r0+32]
    psubw   m4, [r0+48]
    ABSW2   m1, m2, m1, m2, m5, m6
    pmaddwd m1, [pw_1]
    pmaddwd m2, [pw_1]
    lea     r0, [r0+2*r1]
    lea     r2, [r2+2*r3]
    ABSW2   m3, m4, m3, m4, m7, m5
    pmaddwd m3, [pw_1]
    pmaddwd m4, [pw_1]
    paddd   m1, m2
    paddd   m3, m4
    paddd   m0, m1
    paddd   m0, m3
%endmacro

%macro SAD_1x24 0
    movu    m1, [r2+ 0]
    movu    m2, [r2+16]
    movu    m3, [r2+32]
    psubw   m1, [r0+0]
    psubw   m2, [r0+16]
    psubw   m3, [r0+32]
    ABSW2   m1, m2, m1, m2, m4, m6
    pmaddwd m1, [pw_1]
    pmaddwd m2, [pw_1]
    lea     r0, [r0+2*r1]
    lea     r2, [r2+2*r3]
    ; inline abs(m3) = max(m3, -m3); ABSW would need a second free temp
    pxor    m4, m4
    psubw   m4, m3
    pmaxsw  m3, m4
    pmaddwd m3, [pw_1]
    paddd   m1, m2
    paddd   m0, m1
    paddd   m0, m3
%endmacro

%macro SAD_1x48 0
    movu    m1, [r2+ 0]
    movu    m2, [r2+16]
    movu    m3, [r2+32]
    movu    m4, [r2+48]
    psubw   m1, [r0+0]
    psubw   m2, [r0+16]
    psubw   m3, [r0+32]
    psubw   m4, [r0+48]
    ABSW2   m1, m2, m1, m2, m5, m6
    pmaddwd m1, [pw_1]
    pmaddwd m2, [pw_1]
    ABSW2   m3, m4, m3, m4, m7, m5
    pmaddwd m3, [pw_1]
    pmaddwd m4, [pw_1]
    paddd   m1, m2
    paddd   m3, m4
    paddd   m0, m1
    paddd   m0, m3
    movu    m1, [r2+64]
    movu    m2, [r2+80]
    psubw   m1, [r0+64]
    psubw   m2, [r0+80]
    ABSW2   m1, m2, m1, m2, m3, m4
    pmaddwd m1, [pw_1]
    pmaddwd m2, [pw_1]
    lea     r0, [r0+2*r1]
    lea     r2, [r2+2*r3]
    paddd   m0, m1
    paddd   m0, m2
%endmacro

%macro SAD_1x64 0
    movu    m1, [r2+ 0]
    movu    m2, [r2+16]
    movu    m3, [r2+32]
    movu    m4, [r2+48]
    psubw   m1, [r0+0]
    psubw   m2, [r0+16]
    psubw   m3, [r0+32]
    psubw   m4, [r0+48]
    ABSW2   m1, m2, m1, m2, m5, m6
    pmaddwd m1, [pw_1]
    pmaddwd m2, [pw_1]
    ABSW2   m3, m4, m3, m4, m7, m5
    pmaddwd m3, [pw_1]
    pmaddwd m4, [pw_1]
    paddd   m1, m2
    paddd   m3, m4
    paddd   m0, m1
    paddd   m0, m3
    movu    m1, [r2+64]
    movu    m2, [r2+80]
    movu    m3, [r2+96]
    movu    m4, [r2+112]
    psubw   m1, [r0+64]
    psubw   m2, [r0+80]
    psubw   m3, [r0+96]
    psubw   m4, [r0+112]
    ABSW2   m1, m2, m1, m2, m5, m6
    pmaddwd m1, [pw_1]
    pmaddwd m2, [pw_1]
    ABSW2   m3, m4, m3, m4, m7, m5
    pmaddwd m3, [pw_1]
    pmaddwd m4, [pw_1]
    paddd   m1, m2
    paddd   m3, m4
    paddd   m0, m1
    paddd   m0, m3
    lea     r0, [r0+2*r1]
    lea     r2, [r2+2*r3]
%endmacro

%macro SAD_1x12 0
    ; 12 pixels = one full 16-byte load + one 8-byte (movh) tail
    movu    m1, [r2+0]
    movh    m2, [r2+16]
    psubw   m1, [r0+0]
    movh    m3, [r0+16]
    psubw   m2, m3
    ABSW2   m1, m2, m1, m2, m4, m6
    pmaddwd m1, [pw_1]
    pmaddwd m2, [pw_1]
    lea     r0, [r0+2*r1]
    lea     r2, [r2+2*r3]
    paddd   m1, m2
    paddd   m0, m1
%endmacro

; Accumulate TWO rows of %1 pixels into dword sums in m0.
; %1 > mmsize/2 pixels needs two loads per row, otherwise one.
%macro SAD_INC_2ROW 1
%if 2*%1 > mmsize
    movu    m1, [r2+ 0]
    movu    m2, [r2+16]
    movu    m3, [r2+2*r3+ 0]
    movu    m4, [r2+2*r3+16]
    psubw   m1, [r0+ 0]
    psubw   m2, [r0+16]
    psubw   m3, [r0+2*r1+ 0]
    psubw   m4, [r0+2*r1+16]
    ABSW2   m1, m2, m1, m2, m5, m6
    lea     r0, [r0+4*r1]
    lea     r2, [r2+4*r3]
    ABSW2   m3, m4, m3, m4, m7, m5
    paddw   m1, m2
    paddw   m3, m4
    paddw   m1, m3                        ; <=4 diffs per lane: no word overflow
    pmaddwd m1, [pw_1]
    paddd   m0, m1
%else
    movu    m1, [r2]
    movu    m2, [r2+2*r3]
    psubw   m1, [r0]
    psubw   m2, [r0+2*r1]
    ABSW2   m1, m2, m1, m2, m3, m4
    lea     r0, [r0+4*r1]
    lea     r2, [r2+4*r3]
    paddw   m1, m2
    pmaddwd m1, [pw_1]
    paddd   m0, m1
%endif
%endmacro

; Same operation as SAD_INC_2ROW (kept as a separate macro for the Nx64
; entry points; the two bodies are functionally identical).
%macro SAD_INC_2ROW_Nx64 1
%if 2*%1 > mmsize
    movu    m1, [r2 + 0]
    movu    m2, [r2 + 16]
    movu    m3, [r2 + 2 * r3 + 0]
    movu    m4, [r2 + 2 * r3 + 16]
    psubw   m1, [r0 + 0]
    psubw   m2, [r0 + 16]
    psubw   m3, [r0 + 2 * r1 + 0]
    psubw   m4, [r0 + 2 * r1 + 16]
    ABSW2   m1, m2, m1, m2, m5, m6
    lea     r0, [r0 + 4 * r1]
    lea     r2, [r2 + 4 * r3]
    ABSW2   m3, m4, m3, m4, m7, m5
    paddw   m1, m2
    paddw   m3, m4
    paddw   m1, m3
    pmaddwd m1, [pw_1]
    paddd   m0, m1
%else
    movu    m1, [r2]
    movu    m2, [r2 + 2 * r3]
    psubw   m1, [r0]
    psubw   m2, [r0 + 2 * r1]
    ABSW2   m1, m2, m1, m2, m3, m4
    lea     r0, [r0 + 4 * r1]
    lea     r2, [r2 + 4 * r3]
    paddw   m1, m2
    pmaddwd m1, [pw_1]
    paddd   m0, m1
%endif
%endmacro

;-----------------------------------------------------------------------------
; int pixel_sad_NxM(uint16_t *, intptr_t, uint16_t *, intptr_t)
;-----------------------------------------------------------------------------
%macro SAD 2
cglobal pixel_sad_%1x%2, 4,5,8
    pxor    m0, m0
%if %2 == 4
    SAD_INC_2ROW %1
    SAD_INC_2ROW %1
%else
    mov     r4d, %2/2
.loop:
    SAD_INC_2ROW %1
    dec     r4d
    jg .loop
%endif
    HADDD   m0, m1
    movd    eax, xm0
    RET
%endmacro

;-----------------------------------------------------------------------------
; int pixel_sad_Nx64(uint16_t *, intptr_t, uint16_t *, intptr_t)
;-----------------------------------------------------------------------------
%macro SAD_Nx64 1
cglobal pixel_sad_%1x64, 4,5, 8
    pxor    m0, m0
    mov     r4d, 64 / 2
.loop:
    SAD_INC_2ROW_Nx64 %1
    dec     r4d
    jg .loop

    HADDD   m0, m1
    movd    eax, xm0
    RET
%endmacro

INIT_XMM sse2
SAD  16,  4
SAD  16,  8
SAD  16, 12
SAD  16, 16
SAD  16, 32
SAD_Nx64 16

INIT_XMM sse2
SAD  8,  4
SAD  8,  8
SAD  8, 16
SAD  8, 32

INIT_YMM avx2
SAD  16,  4
SAD  16,  8
SAD  16, 12
SAD  16, 16
SAD  16, 32

; 16x64: two word accumulators (m0/m3) take 32 rows each, so per-lane sums
; stay within 16 bits at 10-bit depth; widened unsigned before the final add.
; NOTE(review): no BIT_DEPTH guard here — presumably only built for <=10 bit;
; confirm for 12-bit configurations.
INIT_YMM avx2
cglobal pixel_sad_16x64, 4,7,4
    pxor    m0, m0
    pxor    m3, m3
    mov     r4d, 64 / 8
    add     r3d, r3d                      ; convert strides to bytes
    add     r1d, r1d
    lea     r5, [r1 * 3]
    lea     r6, [r3 * 3]
.loop:                                    ; 8 rows per iteration
    movu    m1, [r2]
    movu    m2, [r2 + r3]
    psubw   m1, [r0]
    psubw   m2, [r0 + r1]
    pabsw   m1, m1
    pabsw   m2, m2
    paddw   m0, m1
    paddw   m3, m2

    movu    m1, [r2 + 2 * r3]
    movu    m2, [r2 + r6]
    psubw   m1, [r0 + 2 * r1]
    psubw   m2, [r0 + r5]
    pabsw   m1, m1
    pabsw   m2, m2
    paddw   m0, m1
    paddw   m3, m2

    lea     r0, [r0 + 4 * r1]
    lea     r2, [r2 + 4 * r3]

    movu    m1, [r2]
    movu    m2, [r2 + r3]
    psubw   m1, [r0]
    psubw   m2, [r0 + r1]
    pabsw   m1, m1
    pabsw   m2, m2
    paddw   m0, m1
    paddw   m3, m2

    movu    m1, [r2 + 2 * r3]
    movu    m2, [r2 + r6]
    psubw   m1, [r0 + 2 * r1]
    psubw   m2, [r0 + r5]
    pabsw   m1, m1
    pabsw   m2, m2
    paddw   m0, m1
    paddw   m3, m2

    lea     r0, [r0 + 4 * r1]
    lea     r2, [r2 + 4 * r3]

    dec     r4d
    jg .loop

    HADDUWD m0, m1                        ; unsigned word->dword widen
    HADDUWD m3, m1
    HADDD   m0, m1
    HADDD   m3, m1
    paddd   m0, m3

    movd    eax, xm0
    RET

; 32x8: 8 rows in a single word accumulator; sums stay well within signed
; word range at 10-bit, so a plain HADDW suffices.
INIT_YMM avx2
cglobal pixel_sad_32x8, 4,7,5
    pxor    m0, m0
    mov     r4d, 8/4
    add     r3d, r3d
    add     r1d, r1d
    lea     r5, [r1 * 3]
    lea     r6, [r3 * 3]
.loop:                                    ; 4 rows per iteration
    movu    m1, [r2]
    movu    m2, [r2 + 32]
    movu    m3, [r2 + r3]
    movu    m4, [r2 + r3 + 32]
    psubw   m1, [r0]
    psubw   m2, [r0 + 32]
    psubw   m3, [r0 + r1]
    psubw   m4, [r0 + r1 + 32]
    pabsw   m1, m1
    pabsw   m2, m2
    pabsw   m3, m3
    pabsw   m4, m4
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3

    movu    m1, [r2 + 2 * r3]
    movu    m2, [r2 + 2 * r3 + 32]
    movu    m3, [r2 + r6]
    movu    m4, [r2 + r6 + 32]
    psubw   m1, [r0 + 2 * r1]
    psubw   m2, [r0 + 2 * r1 + 32]
    psubw   m3, [r0 + r5]
    psubw   m4, [r0 + r5 + 32]
    pabsw   m1, m1
    pabsw   m2, m2
    lea     r0, [r0 + 4 * r1]
    lea     r2, [r2 + 4 * r3]
    pabsw   m3, m3
    pabsw   m4, m4
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3

    dec     r4d
    jg .loop

    HADDW   m0, m1
    movd    eax, xm0
    RET

; 32x16: same scheme as 32x8 but the loop body covers 8 rows (two 4-row
; groups); 16 rows per lane still fits signed word range at 10-bit.
INIT_YMM avx2
cglobal pixel_sad_32x16, 4,7,5
    pxor    m0, m0
    mov     r4d, 16/8
    add     r3d, r3d
    add     r1d, r1d
    lea     r5, [r1 * 3]
    lea     r6, [r3 * 3]
.loop:
    movu    m1, [r2]
    movu    m2, [r2 + 32]
    movu    m3, [r2 + r3]
    movu    m4, [r2 + r3 + 32]
    psubw   m1, [r0]
    psubw   m2, [r0 + 32]
    psubw   m3, [r0 + r1]
    psubw   m4, [r0 + r1 + 32]
    pabsw   m1, m1
    pabsw   m2, m2
    pabsw   m3, m3
    pabsw   m4, m4
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3

    movu    m1, [r2 + 2 * r3]
    movu    m2, [r2 + 2 * r3 + 32]
    movu    m3, [r2 + r6]
    movu    m4, [r2 + r6 + 32]
    psubw   m1, [r0 + 2 * r1]
    psubw   m2, [r0 + 2 * r1 + 32]
    psubw   m3, [r0 + r5]
    psubw   m4, [r0 + r5 + 32]
    pabsw   m1, m1
    pabsw   m2, m2
    lea     r0, [r0 + 4 * r1]
    lea     r2, [r2 + 4 * r3]
    pabsw   m3, m3
    pabsw   m4, m4
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3

    movu    m1, [r2]
    movu    m2, [r2 + 32]
    movu    m3, [r2 + r3]
    movu    m4, [r2 + r3 + 32]
    psubw   m1, [r0]
    psubw   m2, [r0 + 32]
    psubw   m3, [r0 + r1]
    psubw   m4, [r0 + r1 + 32]
    pabsw   m1, m1
    pabsw   m2, m2
    pabsw   m3, m3
    pabsw   m4, m4
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3

    movu    m1, [r2 + 2 * r3]
    movu    m2, [r2 + 2 * r3 + 32]
    movu    m3, [r2 + r6]
    movu    m4, [r2 + r6 + 32]
    psubw   m1, [r0 + 2 * r1]
    psubw   m2, [r0 + 2 * r1 + 32]
    psubw   m3, [r0 + r5]
    psubw   m4, [r0 + r5 + 32]
    pabsw   m1, m1
    pabsw   m2, m2
    lea     r0, [r0 + 4 * r1]
    lea     r2, [r2 + 4 * r3]
    pabsw   m3, m3
    pabsw   m4, m4
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3

    dec     r4d
    jg .loop

    HADDW   m0, m1
    movd    eax, xm0
    RET

; 32x24: 24 rows in one accumulator can exceed signed word range, so the
; final reduction goes through the unsigned HADDUWD widen.
INIT_YMM avx2
cglobal pixel_sad_32x24, 4,7,5
    pxor    m0, m0
    mov     r4d, 24/4
    add     r3d, r3d
    add     r1d, r1d
    lea     r5, [r1 * 3]
    lea     r6, [r3 * 3]
.loop:
    movu    m1, [r2]
    movu    m2, [r2 + 32]
    movu    m3, [r2 + r3]
    movu    m4, [r2 + r3 + 32]
    psubw   m1, [r0]
    psubw   m2, [r0 + 32]
    psubw   m3, [r0 + r1]
    psubw   m4, [r0 + r1 + 32]
    pabsw   m1, m1
    pabsw   m2, m2
    pabsw   m3, m3
    pabsw   m4, m4
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3

    movu    m1, [r2 + 2 * r3]
    movu    m2, [r2 + 2 * r3 + 32]
    movu    m3, [r2 + r6]
    movu    m4, [r2 + r6 + 32]
    psubw   m1, [r0 + 2 * r1]
    psubw   m2, [r0 + 2 * r1 + 32]
    psubw   m3, [r0 + r5]
    psubw   m4, [r0 + r5 + 32]
    pabsw   m1, m1
    pabsw   m2, m2
    pabsw   m3, m3
    pabsw   m4, m4
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3

    lea     r0, [r0 + 4 * r1]
    lea     r2, [r2 + 4 * r3]

    dec     r4d
    jg .loop

    HADDUWD m0, m1
    HADDD   m0, m1
    movd    eax, xm0
    RET


; 32x32: 32 rows per lane -> up to 32*1023 = 32736, just over signed word
; range, hence HADDUWD (unsigned) at the end.
INIT_YMM avx2
cglobal pixel_sad_32x32, 4,7,5
    pxor    m0, m0
    mov     r4d, 32/4
    add     r3d, r3d
    add     r1d, r1d
    lea     r5, [r1 * 3]
    lea     r6, [r3 * 3]
.loop:
    movu    m1, [r2]
    movu    m2, [r2 + 32]
    movu    m3, [r2 + r3]
    movu    m4, [r2 + r3 + 32]
    psubw   m1, [r0]
    psubw   m2, [r0 + 32]
    psubw   m3, [r0 + r1]
    psubw   m4, [r0 + r1 + 32]
    pabsw   m1, m1
    pabsw   m2, m2
    pabsw   m3, m3
    pabsw   m4, m4
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3

    movu    m1, [r2 + 2 * r3]
    movu    m2, [r2 + 2 * r3 + 32]
    movu    m3, [r2 + r6]
    movu    m4, [r2 + r6 + 32]
    psubw   m1, [r0 + 2 * r1]
    psubw   m2, [r0 + 2 * r1 + 32]
    psubw   m3, [r0 + r5]
    psubw   m4, [r0 + r5 + 32]
    pabsw   m1, m1
    pabsw   m2, m2
    pabsw   m3, m3
    pabsw   m4, m4
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3

    lea     r0, [r0 + 4 * r1]
    lea     r2, [r2 + 4 * r3]

    dec     r4d
    jg .loop

    HADDUWD m0, m1
    HADDD   m0, m1
    movd    eax, xm0
    RET

; 32x64: split across two accumulators (m0/m5), 32 rows each, to keep word
; lanes within 16-bit unsigned range at 10-bit depth.
INIT_YMM avx2
cglobal pixel_sad_32x64, 4,7,6
    pxor    m0, m0
    pxor    m5, m5
    mov     r4d, 64 / 4
    add     r3d, r3d
    add     r1d, r1d
    lea     r5, [r1 * 3]
    lea     r6, [r3 * 3]
.loop:
    movu    m1, [r2]
    movu    m2, [r2 + 32]
    movu    m3, [r2 + r3]
    movu    m4, [r2 + r3 + 32]
    psubw   m1, [r0]
    psubw   m2, [r0 + 32]
    psubw   m3, [r0 + r1]
    psubw   m4, [r0 + r1 + 32]
    pabsw   m1, m1
    pabsw   m2, m2
    pabsw   m3, m3
    pabsw   m4, m4
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m5, m3

    movu    m1, [r2 + 2 * r3]
    movu    m2, [r2 + 2 * r3 + 32]
    movu    m3, [r2 + r6]
    movu    m4, [r2 + r6 + 32]
    psubw   m1, [r0 + 2 * r1]
    psubw   m2, [r0 + 2 * r1 + 32]
    psubw   m3, [r0 + r5]
    psubw   m4, [r0 + r5 + 32]
    pabsw   m1, m1
    pabsw   m2, m2
    pabsw   m3, m3
    pabsw   m4, m4
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m5, m3
    lea     r0, [r0 + 4 * r1]
    lea     r2, [r2 + 4 * r3]

    dec     r4d
    jg .loop

    HADDUWD m0, m1
    HADDUWD m5, m1
    HADDD   m0, m1
    HADDD   m5, m1
    paddd   m0, m5

    movd    eax, xm0
    RET

; 48x64: one accumulator per 16-pixel column strip (m0/m5/m6); each lane
; collects 64 diffs -> max 64*1023 = 65472, which just fits unsigned 16-bit.
INIT_YMM avx2
cglobal pixel_sad_48x64, 4, 5, 7
    pxor    m0, m0
    pxor    m5, m5
    pxor    m6, m6
    mov     r4d, 64/2
    add     r3d, r3d
    add     r1d, r1d
.loop:
    movu    m1, [r2 + 0 * mmsize]
    movu    m2, [r2 + 1 * mmsize]
    movu    m3, [r2 + 2 * mmsize]
    psubw   m1, [r0 + 0 * mmsize]
    psubw   m2, [r0 + 1 * mmsize]
    psubw   m3, [r0 + 2 * mmsize]
    pabsw   m1, m1
    pabsw   m2, m2
    pabsw   m3, m3
    paddw   m0, m1
    paddw   m5, m2
    paddw   m6, m3

    movu    m1, [r2 + r3 + 0 * mmsize]
    movu    m2, [r2 + r3 + 1 * mmsize]
    movu    m3, [r2 + r3 + 2 * mmsize]
    psubw   m1, [r0 + r1 + 0 * mmsize]
    psubw   m2, [r0 + r1 + 1 * mmsize]
    psubw   m3, [r0 + r1 + 2 * mmsize]
    pabsw   m1, m1
    pabsw   m2, m2
    pabsw   m3, m3
    paddw   m0, m1
    paddw   m5, m2
    paddw   m6, m3

    lea     r0, [r0 + 2 * r1]
    lea     r2, [r2 + 2 * r3]

    dec     r4d
    jg .loop

    HADDUWD m0, m1
    HADDUWD m5, m1
    HADDUWD m6, m1
    paddd   m0, m5
    paddd   m0, m6
    HADDD   m0, m1
    movd    eax, xm0
    RET

; 64x16: single accumulator; each lane sees 2 diffs/row * 16 rows = 32736 max.
INIT_YMM avx2
cglobal pixel_sad_64x16, 4, 5, 5
    pxor    m0, m0
    mov     r4d, 16 / 2
    add     r3d, r3d
    add     r1d, r1d
.loop:
    movu    m1, [r2 + 0]
    movu    m2, [r2 + 32]
    movu    m3, [r2 + 2 * 32]
    movu    m4, [r2 + 3 * 32]
    psubw   m1, [r0 + 0]
    psubw   m2, [r0 + 32]
    psubw   m3, [r0 + 2 * 32]
    psubw   m4, [r0 + 3 * 32]
    pabsw   m1, m1
    pabsw   m2, m2
    pabsw   m3, m3
    pabsw   m4, m4
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3
    movu    m1, [r2 + r3]
    movu    m2, [r2 + r3 + 32]
    movu    m3, [r2 + r3 + 64]
    movu    m4, [r2 + r3 + 96]
    psubw   m1, [r0 + r1]
    psubw   m2, [r0 + r1 + 32]
    psubw   m3, [r0 + r1 + 64]
    psubw   m4, [r0 + r1 + 96]
    pabsw   m1, m1
    pabsw   m2, m2
    pabsw   m3, m3
    pabsw   m4, m4
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3
    lea     r0, [r0 + 2 * r1]
    lea     r2, [r2 + 2 * r3]

    dec     r4d
    jg .loop

    HADDUWD m0, m1
    HADDD   m0, m1
    movd    eax, xm0
    RET

; 64x32: two accumulators, each taking 2 diffs/row * 32 rows = 65472 max
; (fits unsigned 16-bit at 10-bit depth).
INIT_YMM avx2
cglobal pixel_sad_64x32, 4, 5, 6
    pxor    m0, m0
    pxor    m5, m5
    mov     r4d, 32 / 2
    add     r3d, r3d
    add     r1d, r1d
.loop:
    movu    m1, [r2 + 0]
    movu    m2, [r2 + 32]
    movu    m3, [r2 + 2 * 32]
    movu    m4, [r2 + 3 * 32]
    psubw   m1, [r0 + 0]
    psubw   m2, [r0 + 32]
    psubw   m3, [r0 + 2 * 32]
    psubw   m4, [r0 + 3 * 32]
    pabsw   m1, m1
    pabsw   m2, m2
    pabsw   m3, m3
    pabsw   m4, m4
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m5, m3

    movu    m1, [r2 + r3]
    movu    m2, [r2 + r3 + 32]
    movu    m3, [r2 + r3 + 64]
    movu    m4, [r2 + r3 + 96]
    psubw   m1, [r0 + r1]
    psubw   m2, [r0 + r1 + 32]
    psubw   m3, [r0 + r1 + 64]
    psubw   m4, [r0 + r1 + 96]
    pabsw   m1, m1
    pabsw   m2, m2
    pabsw   m3, m3
    pabsw   m4, m4
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m5, m3
    lea     r0, [r0 + 2 * r1]
    lea     r2, [r2 + 2 * r3]

    dec     r4d
    jg .loop

    HADDUWD m0, m1
    HADDUWD m5, m1
    paddd   m0, m5
    HADDD   m0, m1

    movd    eax, xm0
    RET

; 64x48: four accumulators, one per 16-pixel strip; 48 diffs/lane max.
INIT_YMM avx2
cglobal pixel_sad_64x48, 4, 5, 8
    pxor    m0, m0
    pxor    m5, m5
    pxor    m6, m6
    pxor    m7, m7
    mov     r4d, 48 / 2
    add     r3d, r3d
    add     r1d, r1d
.loop:
    movu    m1, [r2 + 0]
    movu    m2, [r2 + 32]
    movu    m3, [r2 + 64]
    movu    m4, [r2 + 96]
    psubw   m1, [r0 + 0]
    psubw   m2, [r0 + 32]
    psubw   m3, [r0 + 64]
    psubw   m4, [r0 + 96]
    pabsw   m1, m1
    pabsw   m2, m2
    pabsw   m3, m3
    pabsw   m4, m4
    paddw   m0, m1
    paddw   m5, m2
    paddw   m6, m3
    paddw   m7, m4

    movu    m1, [r2 + r3]
    movu    m2, [r2 + r3 + 32]
    movu    m3, [r2 + r3 + 64]
    movu    m4, [r2 + r3 + 96]
    psubw   m1, [r0 + r1]
    psubw   m2, [r0 + r1 + 32]
    psubw   m3, [r0 + r1 + 64]
    psubw   m4, [r0 + r1 + 96]
    pabsw   m1, m1
    pabsw   m2, m2
    pabsw   m3, m3
    pabsw   m4, m4
    paddw   m0, m1
    paddw   m5, m2
    paddw   m6, m3
    paddw   m7, m4

    lea     r0, [r0 + 2 * r1]
    lea     r2, [r2 + 2 * r3]

    dec     r4d
    jg .loop

    HADDUWD m0, m1
    HADDUWD m5, m1
    HADDUWD m6, m1
    HADDUWD m7, m1
    paddd   m0, m5
    paddd   m0, m6
    paddd   m0, m7
    HADDD   m0, m1
    movd    eax, xm0
    RET

; 64x64: four accumulators, 64 diffs/lane max (65472, fits unsigned word).
INIT_YMM avx2
cglobal pixel_sad_64x64, 4, 5, 8
    pxor    m0, m0
    pxor    m5, m5
    pxor    m6, m6
    pxor    m7, m7
    mov     r4d, 64 / 2
    add     r3d, r3d
    add     r1d, r1d
.loop:
    movu    m1, [r2 + 0]
    movu    m2, [r2 + 32]
    movu    m3, [r2 + 64]
    movu    m4, [r2 + 96]
    psubw   m1, [r0 + 0]
    psubw   m2, [r0 + 32]
    psubw   m3, [r0 + 64]
    psubw   m4, [r0 + 96]
    pabsw   m1, m1
    pabsw   m2, m2
    pabsw   m3, m3
    pabsw   m4, m4
    paddw   m0, m1
    paddw   m5, m2
    paddw   m6, m3
    paddw   m7, m4

    movu    m1, [r2 + r3]
    movu    m2, [r2 + r3 + 32]
    movu    m3, [r2 + r3 + 64]
    movu    m4, [r2 + r3 + 96]
    psubw   m1, [r0 + r1]
    psubw   m2, [r0 + r1 + 32]
    psubw   m3, [r0 + r1 + 64]
    psubw   m4, [r0 + r1 + 96]
    pabsw   m1, m1
    pabsw   m2, m2
    pabsw   m3, m3
    pabsw   m4, m4
    paddw   m0, m1
    paddw   m5, m2
    paddw   m6, m3
    paddw   m7, m4

    lea     r0, [r0 + 2 * r1]
    lea     r2, [r2 + 2 * r3]

    dec     r4d
    jg .loop

    HADDUWD m0, m1
    HADDUWD m5, m1
    HADDUWD m6, m1
    HADDUWD m7, m1
    paddd   m0, m5
    paddd   m0, m6
    paddd   m0, m7
    HADDD   m0, m1
    movd    eax, xm0
    RET

;------------------------------------------------------------------
; int pixel_sad_32xN( uint16_t *, intptr_t, uint16_t *, intptr_t )
;------------------------------------------------------------------
%macro SAD_32 2
cglobal pixel_sad_%1x%2, 4,5,8
    pxor    m0, m0
    mov     r4d, %2/4
.loop:
    SAD_1x32
    SAD_1x32
    SAD_1x32
    SAD_1x32
    dec     r4d
    jnz .loop

    HADDD   m0, m1
    movd    eax, xm0
    RET
%endmacro

INIT_XMM sse2
SAD_32  32,  8
SAD_32  32, 16
SAD_32  32, 24
SAD_32  32, 32
SAD_32  32, 64

;------------------------------------------------------------------
; int pixel_sad_64xN( uint16_t *, intptr_t, uint16_t *, intptr_t )
;------------------------------------------------------------------
%macro SAD_64 2
cglobal pixel_sad_%1x%2, 4,5,8
    pxor    m0, m0
    mov     r4d, %2/4
.loop:
    SAD_1x64
    SAD_1x64
    SAD_1x64
    SAD_1x64
    dec     r4d
    jnz .loop

    HADDD   m0, m1
    movd    eax, xm0                      ; was xmm0: use the size-generic alias like the rest of the file
    RET
%endmacro

INIT_XMM sse2
SAD_64  64, 16
SAD_64  64, 32
SAD_64  64, 48
SAD_64  64, 64

;------------------------------------------------------------------
; int pixel_sad_48xN( uint16_t *, intptr_t, uint16_t *, intptr_t )
;------------------------------------------------------------------
%macro SAD_48 2
cglobal pixel_sad_%1x%2, 4,5,8
    pxor    m0, m0
    mov     r4d, %2/4
.loop:
    SAD_1x48
    SAD_1x48
    SAD_1x48
    SAD_1x48
    dec     r4d
    jnz .loop

    HADDD   m0, m1
    movd    eax, xm0                      ; was xmm0: use the size-generic alias like the rest of the file
    RET
%endmacro

INIT_XMM sse2
SAD_48  48, 64

;------------------------------------------------------------------
; int pixel_sad_24xN( uint16_t *, intptr_t, uint16_t *, intptr_t )
;------------------------------------------------------------------
%macro SAD_24 2
cglobal pixel_sad_%1x%2, 4,5,8
    pxor    m0, m0
    mov     r4d, %2/4
.loop:
    SAD_1x24
    SAD_1x24
    SAD_1x24
    SAD_1x24
    dec     r4d
    jnz .loop

    HADDD   m0, m1
    movd    eax, xm0                      ; was xmm0: use the size-generic alias like the rest of the file
    RET
%endmacro

INIT_XMM sse2
SAD_24  24, 32

;------------------------------------------------------------------
; int pixel_sad_12xN( uint16_t *, intptr_t, uint16_t *, intptr_t )
;------------------------------------------------------------------
%macro SAD_12 2
cglobal pixel_sad_%1x%2, 4,5,8
    pxor    m0, m0
    mov     r4d, %2/4
.loop:
    SAD_1x12
    SAD_1x12
    SAD_1x12
    SAD_1x12
    dec     r4d
    jnz .loop

    HADDD   m0, m1
    movd    eax, xm0                      ; was xmm0: use the size-generic alias like the rest of the file
    RET
%endmacro

INIT_XMM sse2
SAD_12  12, 16


;=============================================================================
; SAD x3/x4
;=============================================================================
; pixel_sad_x3/x4 compare one fixed-stride fenc block against 3/4 candidate
; references sharing a common stride, accumulating per-candidate dword sums
; in m0..m2 (x3) or m0..m3 (x4).

; Advance fenc (fixed FENC_STRIDE) and the three reference pointers by 4 rows.
%macro SAD_X3_INC_P 0
    add     r0, 4*FENC_STRIDE
    lea     r1, [r1+4*r4]
    lea     r2, [r2+4*r4]
    lea     r3, [r3+4*r4]
%endmacro

; First row: initializes the three accumulators (no paddd needed).
%macro SAD_X3_ONE_START 0
    mova    m3, [r0]                      ; fenc is FENC_STRIDE-aligned -> mova
    movu    m0, [r1]
    movu    m1, [r2]
    movu    m2, [r3]
    psubw   m0, m3
    psubw   m1, m3
    psubw   m2, m3
    ABSW2   m0, m1, m0, m1, m4, m5
    ABSW    m2, m2, m6
    pmaddwd m0, [pw_1]
    pmaddwd m1, [pw_1]
    pmaddwd m2, [pw_1]
%endmacro

; One vector of one row: %1 = fenc byte offset, %2 = ref byte offset.
%macro SAD_X3_ONE 2
    mova    m6, [r0+%1]
    movu    m3, [r1+%2]
    movu    m4, [r2+%2]
    movu    m5, [r3+%2]
    psubw   m3, m6
    psubw   m4, m6
    psubw   m5, m6
    ABSW2   m3, m4, m3, m4, m7, m6
    ABSW    m5, m5, m6
    pmaddwd m3, [pw_1]
    pmaddwd m4, [pw_1]
    pmaddwd m5, [pw_1]
    paddd   m0, m3
    paddd   m1, m4
    paddd   m2, m5
%endmacro

; Reduce the three accumulators and store to scores[0..2].
%macro SAD_X3_END 2
%if mmsize == 8 && %1*%2 == 256
    HADDUW  m0, m3
    HADDUW  m1, m4
    HADDUW  m2, m5
%else
    HADDD   m0, m3
    HADDD   m1, m4
    HADDD   m2, m5
%endif
%if UNIX64
    movd    [r5+0], xm0                   ; scores pointer already in r5
    movd    [r5+4], xm1
    movd    [r5+8], xm2
%else
    mov     r0, r5mp                      ; reload scores pointer from stack
    movd    [r0+0], xm0
    movd    [r0+4], xm1
    movd    [r0+8], xm2
%endif
    RET
%endmacro

; Advance fenc and the four reference pointers by 4 rows.
%macro SAD_X4_INC_P 0
    add     r0, 4*FENC_STRIDE
    lea     r1, [r1+4*r5]
    lea     r2, [r2+4*r5]
    lea     r3, [r3+4*r5]
    lea     r4, [r4+4*r5]
%endmacro

%macro SAD_X4_ONE_START 0
    mova    m4, [r0]
    movu    m0, [r1]
    movu    m1, [r2]
    movu    m2, [r3]
    movu    m3, [r4]
    psubw   m0, m4
    psubw   m1, m4
    psubw   m2, m4
    psubw   m3, m4
    ABSW2   m0, m1, m0, m1, m5, m6
    ABSW2   m2, m3, m2, m3, m4, m7
    pmaddwd m0, [pw_1]
    pmaddwd m1, [pw_1]
    pmaddwd m2, [pw_1]
    pmaddwd m3, [pw_1]
%endmacro

; One vector of one row for x4; three codepaths depending on how many
; scratch registers the target ISA provides.
%macro SAD_X4_ONE 2
    mova    m4, [r0+%1]
    movu    m5, [r1+%2]
    movu    m6, [r2+%2]
%if num_mmregs > 8
    ; x86-64 XMM+: enough registers to process all 4 candidates at once
    movu    m7, [r3+%2]
    movu    m8, [r4+%2]
    psubw   m5, m4
    psubw   m6, m4
    psubw   m7, m4
    psubw   m8, m4
    ABSW2   m5, m6, m5, m6, m9, m10
    ABSW2   m7, m8, m7, m8, m9, m10
    pmaddwd m5, [pw_1]
    pmaddwd m6, [pw_1]
    pmaddwd m7, [pw_1]
    pmaddwd m8, [pw_1]
    paddd   m0, m5
    paddd   m1, m6
    paddd   m2, m7
    paddd   m3, m8
%elif cpuflag(ssse3)
    ; pabsw needs no temp, so m4 can be reused for the 4th candidate
    movu    m7, [r3+%2]
    psubw   m5, m4
    psubw   m6, m4
    psubw   m7, m4
    movu    m4, [r4+%2]
    pabsw   m5, m5
    psubw   m4, [r0+%1]                   ; reload fenc as memory operand (m4 was clobbered)
    pabsw   m6, m6
    pabsw   m7, m7
    pabsw   m4, m4
    pmaddwd m5, [pw_1]
    pmaddwd m6, [pw_1]
    pmaddwd m7, [pw_1]
    pmaddwd m4, [pw_1]
    paddd   m0, m5
    paddd   m1, m6
    paddd   m2, m7
    paddd   m3, m4
%else ; num_mmregs == 8 && !ssse3
    ; register-starved path: two candidates at a time
    psubw   m5, m4
    psubw   m6, m4
    ABSW    m5, m5, m7
    ABSW    m6, m6, m7
    pmaddwd m5, [pw_1]
    pmaddwd m6, [pw_1]
    paddd   m0, m5
    paddd   m1, m6
    movu    m5, [r3+%2]
    movu    m6, [r4+%2]
    psubw   m5, m4
    psubw   m6, m4
    ABSW2   m5, m6, m5, m6, m7, m4
    pmaddwd m5, [pw_1]
    pmaddwd m6, [pw_1]
    paddd   m2, m5
    paddd   m3, m6
%endif
%endmacro

; Reduce the four accumulators and store to scores[0..3] (pointer in r6m).
%macro SAD_X4_END 2
%if mmsize == 8 && %1*%2 == 256
    HADDUW  m0, m4
    HADDUW  m1, m5
    HADDUW  m2, m6
    HADDUW  m3, m7
%else
    HADDD   m0, m4
    HADDD   m1, m5
    HADDD   m2, m6
    HADDD   m3, m7
%endif
    mov     r0, r6mp
    movd    [r0+ 0], xm0
    movd    [r0+ 4], xm1
    movd    [r0+ 8], xm2
    movd    [r0+12], xm3
    RET
%endmacro

; Emit %4 vector-column passes over a 2-row strip:
; %1 = 3 or 4 (x3/x4), %2 = STRIDE reg, %3 = starting column index.
%macro SAD_X_2xNP 4
    %assign x %3
%rep %4
    SAD_X%1_ONE x*mmsize, x*mmsize
    SAD_X%1_ONE 2*FENC_STRIDE+x*mmsize, 2*%2+x*mmsize
    %assign x x+1
%endrep
%endmacro

; int pixel_vsad( uint16_t *src, intptr_t stride, int height )
; Sum of absolute differences between vertically adjacent rows.
%macro PIXEL_VSAD 0
cglobal pixel_vsad, 3,3,8
    mova    m0, [r0]
    mova    m1, [r0+16]
    mova    m2, [r0+2*r1]
    mova    m3, [r0+2*r1+16]
    lea     r0, [r0+4*r1]
    psubw   m0, m2
    psubw   m1, m3
    ABSW2   m0, m1, m0, m1, m4, m5
    paddw   m0, m1
    sub     r2d, 2
    je .end
.loop:
    mova    m4, [r0]
    mova    m5, [r0+16]
    mova    m6, [r0+2*r1]
    mova    m7, [r0+2*r1+16]
    lea     r0, [r0+4*r1]
    psubw   m2, m4                        ; m2/m3 carry the previous row pair
    psubw   m3, m5
    psubw   m4, m6
    psubw   m5, m7
    ABSW    m2, m2, m1
    ABSW    m3, m3, m1
    ABSW    m4, m4, m1
    ABSW    m5, m5, m1
    paddw   m0, m2
    paddw   m0, m3
    paddw   m0, m4
    paddw   m0, m5
    mova    m2, m6
    mova    m3, m7
    sub     r2d, 2
    jg .loop
.end:
%if BIT_DEPTH == 9
    HADDW   m0, m1                        ; max sum: 62(pixel diffs)*511(pixel_max)=31682
%else
    HADDUW  m0, m1                        ; max sum: 62(pixel diffs)*1023(pixel_max)=63426
%endif
    movd    eax, m0
    RET
%endmacro
INIT_XMM sse2
PIXEL_VSAD
INIT_XMM ssse3
PIXEL_VSAD
INIT_XMM xop
PIXEL_VSAD

INIT_YMM avx2
cglobal pixel_vsad, 3,3
    mova    m0, [r0]
    mova    m1, [r0+2*r1]
    lea     r0, [r0+4*r1]
    psubw   m0, m1
    pabsw   m0, m0
    sub     r2d, 2
    je .end
.loop:
    mova    m2, [r0]
    mova    m3, [r0+2*r1]
    lea     r0, [r0+4*r1]
    psubw   m1, m2                        ; previous bottom row vs new top row
    psubw   m2, m3
    pabsw   m1, m1
    pabsw   m2, m2
    paddw   m0, m1
    paddw   m0, m2
    mova    m1, m3
    sub     r2d, 2
    jg .loop
.end:
%if BIT_DEPTH == 9
    HADDW   m0, m1
%else
    HADDUW  m0, m1
%endif
    movd    eax, xm0
    RET
;-----------------------------------------------------------------------------
; void pixel_sad_xN_WxH( uint16_t *fenc, uint16_t *pix0, uint16_t *pix1,
;                        uint16_t *pix2, intptr_t i_stride, int scores[3] )
;-----------------------------------------------------------------------------
; %1 = number of candidates (3 or 4), %2 = width, %3 = height.
; The shared reference stride lives in r4 (x3) or r5 (x4): r(%1+1).
%macro SAD_X 3
cglobal pixel_sad_x%1_%2x%3, 6,7,XMM_REGS
    %assign regnum %1+1
    %xdefine STRIDE r %+ regnum
    mov     r6, %3/2-1
    ; first 2 rows are peeled off to initialize the accumulators
    SAD_X%1_ONE_START
    SAD_X%1_ONE 2*FENC_STRIDE, 2*STRIDE
    SAD_X_2xNP %1, STRIDE, 1, %2/(mmsize/2)-1
.loop:
    SAD_X%1_INC_P
    SAD_X_2xNP %1, STRIDE, 0, %2/(mmsize/2)
    dec     r6
    jg .loop
%if %1 == 4
    mov     r6, r6m                       ; reload scores pointer clobbered as loop counter
%endif
    SAD_X%1_END %2, %3
%endmacro

INIT_MMX mmx2
%define XMM_REGS 0
SAD_X 3, 16, 16
SAD_X 3, 16,  8
SAD_X 3, 12, 16
SAD_X 3,  8, 16
SAD_X 3,  8,  8
SAD_X 3,  8,  4
SAD_X 3,  4, 16
SAD_X 3,  4,  8
SAD_X 3,  4,  4
SAD_X 4, 16, 16
SAD_X 4, 16,  8
SAD_X 4, 12, 16
SAD_X 4,  8, 16
SAD_X 4,  8,  8
SAD_X 4,  8,  4
SAD_X 4,  4, 16
SAD_X 4,  4,  8
SAD_X 4,  4,  4
INIT_MMX ssse3
SAD_X 3,  4,  8
SAD_X 3,  4,  4
SAD_X 4,  4,  8
SAD_X 4,  4,  4
INIT_XMM ssse3
%define XMM_REGS 7
SAD_X 3, 16, 16
SAD_X 3, 16,  8
SAD_X 3,  8, 16
SAD_X 3,  8,  8
SAD_X 3,  8,  4
%define XMM_REGS 9
SAD_X 4, 16, 16
SAD_X 4, 16,  8
SAD_X 4,  8, 16
SAD_X 4,  8,  8
SAD_X 4,  8,  4
INIT_XMM sse2
%define XMM_REGS 8
SAD_X 3, 64, 64
SAD_X 3, 64, 48
SAD_X 3, 64, 32
SAD_X 3, 64, 16
SAD_X 3, 48, 64
SAD_X 3, 32, 64
SAD_X 3, 32, 32
SAD_X 3, 32, 24
SAD_X 3, 32, 16
SAD_X 3, 32,  8
SAD_X 3, 24, 32
SAD_X 3, 16, 64
SAD_X 3, 16, 32
SAD_X 3, 16, 16
SAD_X 3, 16, 12
SAD_X 3, 16,  8
SAD_X 3, 16,  4
SAD_X 3,  8, 32
SAD_X 3,  8, 16
SAD_X 3,  8,  8
SAD_X 3,  8,  4
%define XMM_REGS 11
SAD_X 4, 64, 64
SAD_X 4, 64, 48
SAD_X 4, 64, 32
SAD_X 4, 64, 16
SAD_X 4, 48, 64
SAD_X 4, 32, 64
SAD_X 4, 32, 32
SAD_X 4, 32, 24
SAD_X 4, 32, 16
SAD_X 4, 32,  8
SAD_X 4, 24, 32
SAD_X 4, 16, 64
SAD_X 4, 16, 32
SAD_X 4, 16, 16
SAD_X 4, 16, 12
SAD_X 4, 16,  8
SAD_X 4, 16,  4
SAD_X 4,  8, 32
SAD_X 4,  8, 16
SAD_X 4,  8,  8
SAD_X 4,  8,  4
INIT_YMM avx2
%define XMM_REGS 7
SAD_X 3, 16,  4
SAD_X 3, 16,  8
SAD_X 3, 16, 12
SAD_X 3, 16, 16
SAD_X 3, 16, 32
SAD_X 3, 16, 64
SAD_X 3, 32,  8
SAD_X 3, 32, 16
SAD_X 3, 32, 24
SAD_X 3, 32, 32
SAD_X 3, 32, 64
SAD_X 3, 48, 64
SAD_X 3, 64, 16
SAD_X 3, 64, 32
SAD_X 3, 64, 48
SAD_X 3, 64, 64
%define XMM_REGS 9
SAD_X 4, 16,  4
SAD_X 4, 16,  8
SAD_X 4, 16, 12
SAD_X 4, 16, 16
SAD_X 4, 16, 32
SAD_X 4, 16, 64
SAD_X 4, 32,  8
SAD_X 4, 32, 16
SAD_X 4, 32, 24
SAD_X 4, 32, 32
SAD_X 4, 32, 64
SAD_X 4, 48, 64
SAD_X 4, 64, 16
SAD_X 4, 64, 32
SAD_X 4, 64, 48
SAD_X 4, 64, 64