;*****************************************************************************
;* ssd-a.asm: x86 ssd functions
;*****************************************************************************
;* Copyright (C) 2003-2013 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Alex Izvorski <aizvorksi@gmail.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at license @ x265.com.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA 32

SECTION .text

cextern pw_00ff
cextern hsub_mul

;=============================================================================
; SSD
;=============================================================================

%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; int pixel_ssd_WxH( uint16_t *, intptr_t, uint16_t *, intptr_t )
;-----------------------------------------------------------------------------

; Register roles shared by every kernel in this section:
;   r0 / r2 = the two source pointers, r1 / r3 = their strides,
;   r4d     = loop counter, m0 = running accumulator of dword partial sums.
; FIX_STRIDES and HADDD come from the included helper files; FIX_STRIDES
; presumably scales the strides to bytes and HADDD presumably leaves the
; horizontal dword sum in the low lane -- TODO confirm against x86util.asm.

; SSD_HBD_DIFF4 a0, a1, a2, a3, b0, b1, b2, b3
; m1..m4 = [a0..a3] - [b0..b3] as packed words.  Clobbers m6 and m7.
%macro SSD_HBD_DIFF4 8
    movu        m1, [%1]
    movu        m2, [%2]
    movu        m3, [%3]
    movu        m4, [%4]
    movu        m6, [%5]
    movu        m7, [%6]
    psubw       m1, m6
    psubw       m2, m7
    movu        m6, [%7]
    movu        m7, [%8]
    psubw       m3, m6
    psubw       m4, m7
%endmacro

; Square the word differences held in m1..m4 (pmaddwd of a register with
; itself) and add the resulting dwords into m0.
%macro SSD_HBD_SQSUM4 0
    pmaddwd     m1, m1
    pmaddwd     m2, m2
    pmaddwd     m3, m3
    pmaddwd     m4, m4
    paddd       m1, m2
    paddd       m3, m4
    paddd       m0, m1
    paddd       m0, m3
%endmacro

; SSD_HBD_SQDIFF2 ra, rb, a0, a1, b0, b1
; Two-vector variant: m<ra>/m<rb> = [a0]/[a1] - [b0]/[b1], squared and added
; into m0.  Clobbers m6 and m7.
%macro SSD_HBD_SQDIFF2 6
    movu        m%1, [%3]
    movu        m%2, [%4]
    movu        m6, [%5]
    movu        m7, [%6]
    psubw       m%1, m6
    psubw       m%2, m7
    pmaddwd     m%1, m%1
    pmaddwd     m%2, m%2
    paddd       m%1, m%2
    paddd       m0, m%1
%endmacro

; SSD_ONE width, height
; Emits pixel_ssd_ss_<width>x<height>.  Every loop iteration consumes four
; vectors per source; how those four vectors map onto rows depends on the
; ratio between mmsize and the row size, selected by the offsetN_M defines.
%macro SSD_ONE 2
cglobal pixel_ssd_ss_%1x%2, 4,7,8
    FIX_STRIDES r1, r3
%if mmsize == %1*2
    ; one vector per row: four consecutive rows per iteration
    %define offset0_1 r1
    %define offset0_2 r1*2
    %define offset0_3 r5
    %define offset1_1 r3
    %define offset1_2 r3*2
    %define offset1_3 r6
    lea         r5, [3*r1]
    lea         r6, [3*r3]
%elif mmsize == %1
    ; two vectors per row: two rows per iteration
    %define offset0_1 mmsize
    %define offset0_2 r1
    %define offset0_3 r1+mmsize
    %define offset1_1 mmsize
    %define offset1_2 r3
    %define offset1_3 r3+mmsize
%elif mmsize == %1/2
    ; four vectors per row: one row per iteration
    %define offset0_1 mmsize
    %define offset0_2 mmsize*2
    %define offset0_3 mmsize*3
    %define offset1_1 mmsize
    %define offset1_2 mmsize*2
    %define offset1_3 mmsize*3
%endif
    %assign %%n %2/(2*mmsize/%1)            ; number of loop iterations
%if %%n > 1
    mov         r4d, %%n
%endif
    pxor        m0, m0
.loop:
    SSD_HBD_DIFF4 r0, r0+offset0_1, r0+offset0_2, r0+offset0_3, r2, r2+offset1_1, r2+offset1_2, r2+offset1_3
%if %%n > 1
    ; step both pointers by the number of rows handled per iteration
    lea         r0, [r0+r1*(%2/%%n)]
    lea         r2, [r2+r3*(%2/%%n)]
%endif
    SSD_HBD_SQSUM4
%if %%n > 1
    dec         r4d
    jg          .loop
%endif

%if BIT_DEPTH == 12 && mmsize == 16
    ; Widen the four dword lanes to qwords before the final reduction.
    movu        m5, m0
    pxor        m6, m6
    punpckldq   m0, m6
    punpckhdq   m5, m6
    paddq       m0, m5
    movhlps     m5, m0
    paddq       m0, m5
    ; NOTE(review): the 64-bit sum is written to r6 rather than eax; this
    ; presumably relies on x86inc mapping r6 to the return register on
    ; x86-64 -- confirm.
    movq        r6, xm0
%else
    HADDD       m0, m5
    movd        eax,xm0
%endif
%ifidn movu,movq ; detect MMX
    EMMS
%endif
    RET
%endmacro

; SSD_TWO width, height
; Emits pixel_ssd_ss_<width>x<height> for 48- and 64-wide blocks using fixed
; 16-byte steps.  Each iteration handles two rows; the last two vectors of a
; row are only processed when width == 64.
%macro SSD_TWO 2
cglobal pixel_ssd_ss_%1x%2, 4,7,8
    FIX_STRIDES r1, r3
    pxor        m0, m0
    mov         r4d, %2/2
    lea         r5, [r1 * 2]
    lea         r6, [r3 * 2]
.loop:
    ; first row of the pair
    SSD_HBD_DIFF4 r0, r0 + 16, r0 + 32, r0 + 48, r2, r2 + 16, r2 + 32, r2 + 48
    SSD_HBD_SQSUM4
    SSD_HBD_SQDIFF2 1, 2, r0 + 64, r0 + 80, r2 + 64, r2 + 80
%if %1 == 64
    SSD_HBD_SQDIFF2 3, 4, r0 + 96, r0 + 112, r2 + 96, r2 + 112
%endif
    ; second row of the pair
    SSD_HBD_DIFF4 r0 + r1, r0 + r1 + 16, r0 + r1 + 32, r0 + r1 + 48, r2 + r3, r2 + r3 + 16, r2 + r3 + 32, r2 + r3 + 48
    SSD_HBD_SQSUM4
    SSD_HBD_SQDIFF2 1, 2, r0 + r1 + 64, r0 + r1 + 80, r2 + r3 + 64, r2 + r3 + 80
%if %1 == 64
    SSD_HBD_SQDIFF2 3, 4, r0 + r1 + 96, r0 + r1 + 112, r2 + r3 + 96, r2 + r3 + 112
%endif
    lea         r0, [r0 + r5]
    lea         r2, [r2 + r6]
    dec         r4d
    jnz         .loop
    HADDD       m0, m5
    movd        eax, xm0
    RET
%endmacro

; SSD_24 width, height
; Three 16-byte vectors per row, two rows per iteration.  The squared third
; vector of the first row stays in m3 and is folded in together with the
; second row's third vector (m4).
%macro SSD_24 2
cglobal pixel_ssd_ss_%1x%2, 4,7,8
    FIX_STRIDES r1, r3
    pxor        m0, m0
    mov         r4d, %2/2
    lea         r5, [r1 * 2]
    lea         r6, [r3 * 2]
.loop:
    movu        m1, [r0]
    movu        m2, [r0 + 16]
    movu        m3, [r0 + 32]
    movu        m5, [r2]
    movu        m6, [r2 + 16]
    movu        m7, [r2 + 32]
    psubw       m1, m5
    psubw       m2, m6
    psubw       m3, m7
    pmaddwd     m1, m1
    pmaddwd     m2, m2
    pmaddwd     m3, m3
    paddd       m1, m2
    paddd       m0, m1
    movu        m1, [r0 + r1]
    movu        m2, [r0 + r1 + 16]
    movu        m4, [r0 + r1 + 32]
    movu        m5, [r2 + r3]
    movu        m6, [r2 + r3 + 16]
    movu        m7, [r2 + r3 + 32]
    psubw       m1, m5
    psubw       m2, m6
    psubw       m4, m7
    pmaddwd     m1, m1
    pmaddwd     m2, m2
    pmaddwd     m4, m4
    paddd       m1, m2
    paddd       m3, m4
    paddd       m0, m1
    paddd       m0, m3
    lea         r0, [r0 + r5]
    lea         r2, [r2 + r6]
    dec         r4d
    jnz         .loop
    HADDD       m0, m5
    movd        eax, xm0
    RET
%endmacro

; SSD_12 width, height
; Four rows per iteration.  The 8-byte tails of two neighbouring rows are
; packed into one register (movh + punpcklqdq) so that three vectors cover
; two rows.
; NOTE(review): punpcklqdq is used with a memory operand at [.. + 16];
; whether callers guarantee the alignment/readable size this needs cannot be
; determined from this file -- confirm.
%macro SSD_12 2
cglobal pixel_ssd_ss_%1x%2, 4,7,8
    FIX_STRIDES r1, r3
    pxor        m0, m0
    mov         r4d, %2/4
    lea         r5, [r1 * 2]
    lea         r6, [r3 * 2]
.loop:
    ; rows 0 and 1
    movu        m1, [r0]
    movh        m2, [r0 + 16]
    movu        m3, [r0 + r1]
    punpcklqdq  m2, [r0 + r1 + 16]
    movu        m7, [r2]
    psubw       m1, m7
    movh        m4, [r2 + 16]
    movu        m7, [r2 + r3]
    psubw       m3, m7
    punpcklqdq  m4, [r2 + r3 + 16]
    psubw       m2, m4
    pmaddwd     m1, m1
    pmaddwd     m2, m2
    pmaddwd     m3, m3
    paddd       m1, m2
    paddd       m0, m1                      ; m3 is added below, with m6

    ; rows 2 and 3 (both pointers move down two rows half-way through)
    movu        m1, [r0 + r5]
    movh        m2, [r0 + r5 + 16]
    lea         r0, [r0 + r5]
    movu        m6, [r0 + r1]
    punpcklqdq  m2, [r0 + r1 + 16]
    movu        m7, [r2 + r6]
    psubw       m1, m7
    movh        m4, [r2 + r6 + 16]
    lea         r2, [r2 + r6]
    movu        m7, [r2 + r3]
    psubw       m6, m7
    punpcklqdq  m4, [r2 + r3 + 16]
    psubw       m2, m4
    pmaddwd     m1, m1
    pmaddwd     m2, m2
    pmaddwd     m6, m6
    paddd       m1, m2
    paddd       m3, m6
    paddd       m0, m1
    paddd       m0, m3
    lea         r0, [r0 + r5]
    lea         r2, [r2 + r6]
    dec         r4d
    jnz         .loop
    HADDD       m0, m5
    movd        eax, xm0
    RET
%endmacro

; AVX2 kernels: 32-byte vectors, four vectors per source and iteration.

; 16x16: one vector per row, four rows per iteration, four iterations.
INIT_YMM avx2
cglobal pixel_ssd_16x16, 4,7,8
    FIX_STRIDES r1, r3
    lea         r5, [3 * r1]
    lea         r6, [3 * r3]
    mov         r4d, 4
    pxor        m0, m0
.loop:
    SSD_HBD_DIFF4 r0, r0 + r1, r0 + r1 * 2, r0 + r5, r2, r2 + r3, r2 + r3 * 2, r2 + r6

    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]

    SSD_HBD_SQSUM4

    dec         r4d
    jg          .loop

    HADDD       m0, m5
    movd        eax, xm0
    RET

; 32x32: two vectors per row, two rows per iteration, sixteen iterations.
; The strides are doubled explicitly here instead of through FIX_STRIDES.
INIT_YMM avx2
cglobal pixel_ssd_32x32, 4,7,8
    add         r1, r1
    add         r3, r3
    mov         r4d, 16
    pxor        m0, m0
.loop:
    SSD_HBD_DIFF4 r0, r0 + 32, r0 + r1, r0 + r1 + 32, r2, r2 + 32, r2 + r3, r2 + r3 + 32

    lea         r0, [r0 + r1 * 2]
    lea         r2, [r2 + r3 * 2]

    SSD_HBD_SQSUM4

    dec         r4d
    jg          .loop

    HADDD       m0, m5
    movd        eax, xm0
    RET

; 64x64: four vectors per row, one row per iteration, 64 iterations.
INIT_YMM avx2
cglobal pixel_ssd_64x64, 4,7,8
    FIX_STRIDES r1, r3
    mov         r4d, 64
    pxor        m0, m0
.loop:
    SSD_HBD_DIFF4 r0, r0+32, r0+32*2, r0+32*3, r2, r2+32, r2+32*2, r2+32*3

    lea         r0, [r0+r1]
    lea         r2, [r2+r3]

    SSD_HBD_SQSUM4

    dec         r4d
    jg          .loop

    HADDD       m0, m5
    movd        eax, xm0
    RET

; Instantiate the pixel_ssd_ss_WxH kernels per instruction set.
INIT_MMX mmx2
SSD_ONE     4,  4
SSD_ONE     4,  8
SSD_ONE     4, 16
SSD_ONE     8,  4
SSD_ONE     8,  8
SSD_ONE     8, 16
SSD_ONE    16,  8
SSD_ONE    16, 16
INIT_XMM sse2
SSD_ONE     8,  4
SSD_ONE     8,  8
SSD_ONE     8, 16
SSD_ONE     8, 32
SSD_12     12, 16
SSD_ONE    16,  4
SSD_ONE    16,  8
SSD_ONE    16, 12
SSD_ONE    16, 16
SSD_ONE    16, 32
SSD_ONE    16, 64
SSD_24     24, 32
SSD_ONE    32,  8
SSD_ONE    32, 16
SSD_ONE    32, 24
SSD_ONE    32, 32
SSD_ONE    32, 64
SSD_TWO    48, 64
SSD_TWO    64, 16
SSD_TWO    64, 32
SSD_TWO    64, 48
SSD_TWO    64, 64
INIT_YMM avx2
SSD_ONE    16,  8
SSD_ONE    16, 16
SSD_ONE    32, 32
SSD_ONE    64, 64
SSD_ONE    16, 32
SSD_ONE    32, 64
%endif ; HIGH_BIT_DEPTH

;-----------------------------------------------------------------------------
; int pixel_ssd_WxH( uint16_t *, intptr_t, uint16_t *, intptr_t )
;-----------------------------------------------------------------------------
%if HIGH_BIT_DEPTH == 0

; Kernels below emit pixel_ssd_ss_WxH and work on 16-bit words.  Row
; addresses are formed with the stride register scaled by two.
;   r0 / r2 = sources, r1 / r3 = strides, r4d = loop counter,
;   m0 = dword accumulator, m1 / m2 = scratch.

; SSD_SS_STEP width, addr0, addr1
; Accumulate one vector of squared word differences into m0.  Only half a
; register (movh) is loaded when the block is four words wide.
%macro SSD_SS_STEP 3
%if %1 == 4
    movh        m1, [%2]
    movh        m2, [%3]
%else
    movu        m1, [%2]
    movu        m2, [%3]
%endif
    psubw       m1, m2
    pmaddwd     m1, m1
    paddd       m0, m1
%endmacro

; SSD_SS width, height
; Generic kernel for widths 4, 8 and 16; four SSD_SS_STEPs per iteration.
%macro SSD_SS 2
cglobal pixel_ssd_ss_%1x%2, 4,7,6
    FIX_STRIDES r1, r3
%if mmsize == %1*4 || mmsize == %1*2
    ; one (partial) vector per row: rows 0..3, with r5/r6 = 6 * stride
    %define offset0_1 r1*2
    %define offset0_2 r1*4
    %define offset0_3 r5
    %define offset1_1 r3*2
    %define offset1_2 r3*4
    %define offset1_3 r6
    lea         r5, [4*r1]
    lea         r6, [4*r3]
    lea         r5, [r5 + 2*r1]
    lea         r6, [r6 + 2*r3]
%elif mmsize == %1
    ; two vectors per row: rows 0..1
    %define offset0_1 16
    %define offset0_2 r1*2
    %define offset0_3 r1*2+16
    %define offset1_1 16
    %define offset1_2 r3*2
    %define offset1_3 r3*2+16
%endif
%if %1 == 4
    %assign %%n %2/(mmsize/%1)
%else
    %assign %%n %2/(2*mmsize/%1)
%endif
%if %%n > 1
    mov         r4d, %%n
%endif
    pxor        m0, m0
.loop:
    SSD_SS_STEP %1, r0, r2
    SSD_SS_STEP %1, r0 + offset0_1, r2 + offset1_1
    SSD_SS_STEP %1, r0 + offset0_2, r2 + offset1_2
    SSD_SS_STEP %1, r0 + offset0_3, r2 + offset1_3
    ; the pointers are advanced even when the loop runs only once
    lea         r0, [r0+r1*(%2/%%n)*2]
    lea         r2, [r2+r3*(%2/%%n)*2]
%if %%n > 1
    dec         r4d
    jg          .loop
%endif
%if %1 == 4
    ; only the two low dwords of m0 carry data for 4-wide blocks
  %if notcpuflag(ssse3)
    pshufd      m1, m0, 1
    paddd       m0, m1
  %else
    phaddd      m0, m0
  %endif
%else
    HADDD       m0, m1
%endif
    movd        eax, m0
    RET
%endmacro

%macro SSD_SS_ONE 0
SSD_SS      4,  4
SSD_SS      4,  8
SSD_SS      4, 16
SSD_SS      8,  4
SSD_SS      8,  8
SSD_SS      8, 16
SSD_SS      8, 32
SSD_SS     16,  4
SSD_SS     16,  8
SSD_SS     16, 12
SSD_SS     16, 16
SSD_SS     16, 32
SSD_SS     16, 64
%endmacro

; 12x16: each row is one full vector plus a second vector of which only the
; low 8 bytes of the squared result are kept (pslldq/psrldq by 8 clears the
; upper half).  Two rows per iteration, eight iterations.
%macro SSD_SS_12x16 0
cglobal pixel_ssd_ss_12x16, 4,7,6
    FIX_STRIDES r1, r3
    mov         r4d, 8
    pxor        m0, m0
.loop:
%rep 2
    movu        m1, [r0]
    movu        m2, [r2]
    psubw       m1, m2
    pmaddwd     m1, m1
    paddd       m0, m1
    movu        m1, [r0 + 16]
    movu        m2, [r2 + 16]
    psubw       m1, m2
    pmaddwd     m1, m1
    pslldq      m1, 8
    psrldq      m1, 8
    paddd       m0, m1
    lea         r0, [r0 + 2*r1]
    lea         r2, [r2 + 2*r3]
%endrep
    dec         r4d
    jnz         .loop
    HADDD       m0, m1
    movd        eax, m0
    RET
%endmacro

; SSD_SS_ROW chunks
; Accumulate one row made of <chunks> consecutive 16-byte vectors, then move
; both pointers to the next row.
%macro SSD_SS_ROW 1
%assign %%off 0
%rep %1
    movu        m1, [r0 + %%off]
    movu        m2, [r2 + %%off]
    psubw       m1, m2
    pmaddwd     m1, m1
    paddd       m0, m1
%assign %%off %%off + 16
%endrep
    lea         r0, [r0 + 2*r1]
    lea         r2, [r2 + 2*r3]
%endmacro

; SSD_SS_WIDE width, height
; Shared body of the 24/32/48/64-wide kernels: two rows per iteration.
%macro SSD_SS_WIDE 2
cglobal pixel_ssd_ss_%1x%2, 4,7,6
    FIX_STRIDES r1, r3
    mov         r4d, %2/2
    pxor        m0, m0
.loop:
    SSD_SS_ROW  %1/8
    SSD_SS_ROW  %1/8
    dec         r4d
    jnz         .loop
    HADDD       m0, m1
    movd        eax, m0
    RET
%endmacro

; SSD_SS_32 height
%macro SSD_SS_32 1
SSD_SS_WIDE 32, %1
%endmacro

%macro SSD_SS_32xN 0
SSD_SS_32 8
SSD_SS_32 16
SSD_SS_32 24
SSD_SS_32 32
SSD_SS_32 64
%endmacro

%macro SSD_SS_24 0
SSD_SS_WIDE 24, 32
%endmacro

%macro SSD_SS_48 0
SSD_SS_WIDE 48, 64
%endmacro

; SSD_SS_64 height
%macro SSD_SS_64 1
SSD_SS_WIDE 64, %1
%endmacro

%macro SSD_SS_64xN 0
SSD_SS_64 16
SSD_SS_64 32
SSD_SS_64 48
SSD_SS_64 64
%endmacro

INIT_XMM sse2
SSD_SS_ONE
SSD_SS_12x16
SSD_SS_24
SSD_SS_32xN
SSD_SS_48
SSD_SS_64xN
INIT_XMM sse4
SSD_SS_ONE
SSD_SS_12x16
SSD_SS_24
SSD_SS_32xN
SSD_SS_48
SSD_SS_64xN
INIT_XMM avx
SSD_SS_ONE
SSD_SS_12x16
SSD_SS_24
SSD_SS_32xN
SSD_SS_48
SSD_SS_64xN
%endif ; !HIGH_BIT_DEPTH

%if HIGH_BIT_DEPTH == 0
; 8-bit pixel SSD building blocks.  t0 / t2 are the two source pointers and
; t1 / t3 their strides (temporaries are bound in the SSD macro below).

; SSD_LOAD_FULL off0a, off1a, off0b, off1b, advance
; Load two full vectors per source into m1..m4, then advance the pointers by
; one row (advance == 1) or two rows (advance == 2).
%macro SSD_LOAD_FULL 5
    movu        m1, [t0+%1]
    movu        m2, [t2+%2]
    movu        m3, [t0+%3]
    movu        m4, [t2+%4]
%if %5==1
    add         t0, t1
    add         t2, t3
%elif %5==2
    lea         t0, [t0+2*t1]
    lea         t2, [t2+2*t3]
%endif
%endmacro

; LOAD ra, rb, mem_a, mem_b, advance
; Half-vector loads from the first source; optionally step t0 by two rows.
%macro LOAD 5
    movh        m%1, %3
    movh        m%2, %4
%if %5
    lea         t0, [t0+2*t1]
%endif
%endmacro

; JOIN ra, rb, rc, rd, mem_c, mem_d, advance
; Load the matching second-source halves and leave word differences in
; m<ra> / m<rb>.  Both operands of each psubw are interleaved with the same
; m7, so m7's contribution cancels in the subtraction.
%macro JOIN 7
    movh        m%3, %5
    movh        m%4, %6
%if %7
    lea         t2, [t2+2*t3]
%endif
    punpcklbw   m%1, m7
    punpcklbw   m%3, m7
    psubw       m%1, m%3
    punpcklbw   m%2, m7
    punpcklbw   m%4, m7
    psubw       m%2, m%4
%endmacro

; SSE2 flavour: merges the two halves of each source into one register and
; splits them with DEINTB (x86util.asm; presumably a byte de-interleave
; using the mask in m7 -- TODO confirm) before subtracting.
%macro JOIN_SSE2 7
    movh        m%3, %5
    movh        m%4, %6
%if %7
    lea         t2, [t2+2*t3]
%endif
    punpcklqdq  m%1, m%2
    punpcklqdq  m%3, m%4
    DEINTB %2, %1, %4, %3, 7
    psubw       m%2, m%4
    psubw       m%1, m%3
%endmacro

; SSSE3 flavour: only interleaves the bytes of the two sources; the
; subtraction is done by pmaddubsw in SSD_CORE_SSSE3.
%macro JOIN_SSSE3 7
    movh        m%3, %5
    movh        m%4, %6
%if %7
    lea         t2, [t2+2*t3]
%endif
    punpcklbw   m%1, m%3
    punpcklbw   m%2, m%4
%endmacro

; AVX2 flavour of LOAD: two 16-byte rows end up in the low and high lane of
; m<ra>.  Operand 2 is unused here.
%macro LOAD_AVX2 5
    mova        xm%1, %3
    vinserti128 m%1, m%1, %4, 1
%if %5
    lea         t0, [t0+2*t1]
%endif
%endmacro

; AVX2 flavour of JOIN: loads the second source the same way and interleaves
; it with the first through SBUTTERFLY (x86util.asm).
%macro JOIN_AVX2 7
    mova        xm%2, %5
    vinserti128 m%2, m%2, %6, 1
%if %7
    lea         t2, [t2+2*t3]
%endif
    SBUTTERFLY bw, %1, %2, %3
%endmacro

; SSD_LOAD_HALF off0a, off1a, off0b, off1b, advance
; Four rows of half-width data: the first LOAD/JOIN pair always steps down
; two rows, the second pair steps according to <advance>.
%macro SSD_LOAD_HALF 5
    LOAD        1, 2, [t0+%1], [t0+%3], 1
    JOIN        1, 2, 3, 4, [t2+%2], [t2+%4], 1
    LOAD        3, 4, [t0+%1], [t0+%3], %5
    JOIN        3, 4, 5, 6, [t2+%2], [t2+%4], %5
%endmacro

; SSD_CORE r1, r2, r3, r4, aux, tmp1, tmp2 [, FULL]
; With FULL, m<r1>/m<r2> and m<r3>/m<r4> hold byte vectors of the two
; sources: the absolute difference is built from two saturating subtractions
; OR-ed together and then unpacked to words against m<aux>.  Without FULL
; the registers already hold word differences.  In both cases the words are
; squared and pair-summed with pmaddwd.
%macro SSD_CORE 7-8
%ifidn %8, FULL
    mova        m%6, m%2
    mova        m%7, m%4
    psubusb     m%2, m%1
    psubusb     m%4, m%3
    psubusb     m%1, m%6
    psubusb     m%3, m%7
    por         m%1, m%2
    por         m%3, m%4
    punpcklbw   m%2, m%1, m%5
    punpckhbw   m%1, m%5
    punpcklbw   m%4, m%3, m%5
    punpckhbw   m%3, m%5
%endif
    pmaddwd     m%1, m%1
    pmaddwd     m%2, m%2
    pmaddwd     m%3, m%3
    pmaddwd     m%4, m%4
%endmacro

; SSE2 flavour: DEINTB-based split with the mask in m<aux>, subtraction on
; words, then register renaming through SWAP so the results sit in
; m<r1>..m<r4>.
%macro SSD_CORE_SSE2 7-8
%ifidn %8, FULL
    DEINTB %6, %1, %7, %2, %5
    psubw       m%6, m%7
    psubw       m%1, m%2
    SWAP %6, %2, %1
    DEINTB %6, %3, %7, %4, %5
    psubw       m%6, m%7
    psubw       m%3, m%4
    SWAP %6, %4, %3
%endif
    pmaddwd     m%1, m%1
    pmaddwd     m%2, m%2
    pmaddwd     m%3, m%3
    pmaddwd     m%4, m%4
%endmacro

; SSSE3 flavour: interleave the bytes of both sources and multiply-add them
; against m<aux> with pmaddubsw (m<aux> is loaded from hsub_mul by SSD;
; presumably a +1/-1 pattern yielding differences -- TODO confirm), then
; square with pmaddwd.
%macro SSD_CORE_SSSE3 7-8
%ifidn %8, FULL
    punpckhbw   m%6, m%1, m%2
    punpckhbw   m%7, m%3, m%4
    punpcklbw   m%1, m%2
    punpcklbw   m%3, m%4
    SWAP %6, %2, %3
    SWAP %7, %4
%endif
    pmaddubsw   m%1, m%5
    pmaddubsw   m%2, m%5
    pmaddubsw   m%3, m%5
    pmaddubsw   m%4, m%5
    pmaddwd     m%1, m%1
    pmaddwd     m%2, m%2
    pmaddwd     m%3, m%3
    pmaddwd     m%4, m%4
%endmacro

; SSD_ITER FULL|HALF, off0a, off1a, off0b, off1b, advance
; One loop iteration: load, square, and add everything into m0.
%macro SSD_ITER 6
    SSD_LOAD_%1 %2,%3,%4,%5,%6
    SSD_CORE    1, 2, 3, 4, 7, 5, 6, %1
    paddd       m1, m2
    paddd       m3, m4
    paddd       m0, m1
    paddd       m0, m3
%endmacro

;-----------------------------------------------------------------------------
; int pixel_ssd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
; SSD width, height
; Emits pixel_ssd_<width>x<height>.  The iteration count is placed in al at
; function entry.  Non-square sizes consist only of that load plus a jump to
; the .startloop label of the square kernel of the same width, which holds
; the real prologue and loop.
%macro SSD 2
%if %1 != %2
    %assign function_align 8
%else
    %assign function_align 16
%endif
cglobal pixel_ssd_%1x%2, 0,0,0
    mov         al, %1*%2/mmsize/2

%if %1 != %2
    jmp mangle(private_prefix %+ _ %+ pixel_ssd_%1x%1 %+ SUFFIX %+ .startloop)
%else

.startloop:
%if ARCH_X86_64
    DECLARE_REG_TMP 0,1,2,3
    PROLOGUE 0,0,8
%else
    PROLOGUE 0,5
    DECLARE_REG_TMP 1,2,3,4
    mov         t0, r0m
    mov         t1, r1m
    mov         t2, r2m
    mov         t3, r3m
%endif

    ; m7 is the auxiliary operand of the active SSD_CORE / JOIN variant.
    ; It is left untouched for plain MMX/sse2slow blocks narrower than
    ; mmsize, where JOIN cancels its contribution.
%if cpuflag(ssse3)
    mova        m7, [hsub_mul]
%elifidn cpuname, sse2
    mova        m7, [pw_00ff]
%elif %1 >= mmsize
    pxor        m7, m7
%endif
    pxor        m0, m0

ALIGN 16
.loop:
%if %1 > mmsize
    SSD_ITER FULL, 0, 0, mmsize, mmsize, 1
%elif %1 == mmsize
    SSD_ITER FULL, 0, 0, t1, t3, 2
%else
    SSD_ITER HALF, 0, 0, t1, t3, 2
%endif
    dec         al
    jg          .loop
%if mmsize==32
    ; fold the upper 128-bit lane into the lower one first
    vextracti128 xm1, m0, 1
    paddd       xm0, xm1
    HADDD       xm0, xm1
    movd        eax, xm0
%else
    HADDD       m0, m1
    movd        eax, m0
%endif
%if (mmsize == 8)
    emms
%endif
    RET
%endif
%endmacro

; Block sizes emitted for each of the sse2 / ssse3 / avx instantiations.
%macro HEVC_SSD 0
SSD 32, 64
SSD 16, 64
SSD 32, 32
SSD 32, 16
SSD 16, 32
SSD 32, 8
SSD 8,  32
SSD 32, 24
SSD 24, 24 ; not used, but resolves x265_pixel_ssd_24x24_sse2.startloop symbol
SSD 8,  4
SSD 8,  8
SSD 16, 16
SSD 16, 12
SSD 16, 8
SSD 8,  16
SSD 16, 4
%endmacro

INIT_MMX mmx
SSD 16, 16
SSD 16,  8
SSD  8,  8
SSD  8, 16
SSD  4,  4
SSD  8,  4
SSD  4,  8
SSD  4, 16
INIT_XMM sse2slow
SSD 16, 16
SSD  8,  8
SSD 16,  8
SSD  8, 16
SSD  8,  4
INIT_XMM sse2
; From here on SSD_ITER / SSD_LOAD_HALF pick up the SSE2 variants.
%define SSD_CORE SSD_CORE_SSE2
%define JOIN JOIN_SSE2
HEVC_SSD
INIT_XMM ssse3
%define SSD_CORE SSD_CORE_SSSE3
%define JOIN JOIN_SSSE3
HEVC_SSD
INIT_XMM avx
HEVC_SSD
INIT_MMX ssse3
SSD  4,  4
SSD  4,  8
SSD  4, 16
INIT_XMM xop
SSD 16, 16
SSD  8,  8
SSD 16,  8
SSD  8, 16
SSD  8,  4
%define LOAD LOAD_AVX2
%define JOIN JOIN_AVX2
INIT_YMM avx2
SSD 16, 16
SSD 16,  8
SSD 32, 32
SSD 64, 64
%assign function_align 16
%endif ; !HIGH_BIT_DEPTH

; SSD_12x2_ACCUM
; In:  m0 / m2 = two rows of the first source, m1 / m3 = the same rows of the
;      second source (16 bytes loaded per row).
; Bytes 8..11 of both rows are gathered with punpckhdq, everything is widened
; to words with pmovzxbw, and the squared differences are added into m6.
; Clobbers m0..m5.
%macro SSD_12x2_ACCUM 0
    punpckhdq   m4, m0, m2
    punpckhdq   m5, m1, m3

    pmovzxbw    m0, m0
    pmovzxbw    m1, m1
    pmovzxbw    m2, m2
    pmovzxbw    m3, m3
    pmovzxbw    m4, m4
    pmovzxbw    m5, m5

    psubw       m0, m1
    psubw       m2, m3
    psubw       m4, m5

    pmaddwd     m0, m0
    pmaddwd     m2, m2
    pmaddwd     m4, m4

    paddd       m0, m2
    paddd       m6, m4
    paddd       m6, m0
%endmacro

;-----------------------------------------------------------------------------
; int pixel_ssd_12x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
; Four rows per iteration, four iterations; m6 is the accumulator.
INIT_XMM sse4
cglobal pixel_ssd_12x16, 4, 5, 7, src1, stride1, src2, stride2

    pxor        m6, m6
    mov         r4d, 4

.loop:
    movu        m0, [r0]
    movu        m1, [r2]
    movu        m2, [r0 + r1]
    movu        m3, [r2 + r3]
    SSD_12x2_ACCUM

    movu        m0, [r0 + 2 * r1]
    movu        m1, [r2 + 2 * r3]
    lea         r0, [r0 + 2 * r1]
    lea         r2, [r2 + 2 * r3]
    movu        m2, [r0 + r1]
    movu        m3, [r2 + r3]
    SSD_12x2_ACCUM

    dec         r4d                         ; flags survive the two lea below
    lea         r0, [r0 + 2 * r1]
    lea         r2, [r2 + 2 * r3]
    jnz         .loop

    HADDD       m6, m1
    movd        eax, m6

    RET

; SSD_24_ROW addr0, addr1
; One 24-pixel row: 16 bytes split into low/high words plus 8 more bytes
; widened straight from memory.  Expects m6 == 0; accumulates into m7.
; Clobbers m0..m5.
%macro SSD_24_ROW 2
    movu        m1, [%1]
    pmovzxbw    m0, m1
    punpckhbw   m1, m6
    pmovzxbw    m2, [%1 + 16]
    movu        m4, [%2]
    pmovzxbw    m3, m4
    punpckhbw   m4, m6
    pmovzxbw    m5, [%2 + 16]

    psubw       m0, m3
    psubw       m1, m4
    psubw       m2, m5

    pmaddwd     m0, m0
    pmaddwd     m1, m1
    pmaddwd     m2, m2

    paddd       m0, m1
    paddd       m7, m2
    paddd       m7, m0
%endmacro

;-----------------------------------------------------------------------------
; int pixel_ssd_24x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
; Two rows per iteration, sixteen iterations.
INIT_XMM sse4
cglobal pixel_ssd_24x32, 4, 5, 8, src1, stride1, src2, stride2

    pxor        m7, m7
    pxor        m6, m6
    mov         r4d, 16

.loop:
    SSD_24_ROW  r0, r2
    SSD_24_ROW  r0 + r1, r2 + r3

    dec         r4d                         ; flags survive the two lea below
    lea         r0, [r0 + 2 * r1]
    lea         r2, [r2 + 2 * r3]
    jnz         .loop

    HADDD       m7, m1
    movd        eax, m7

    RET

; SSD_16_ROWDIFF lo, hi, addr0, addr1
; m<lo> / m<hi> = word differences of the low / high eight pixels of one
; 16-pixel row.  Expects m6 == 0; clobbers m2 and m3.
%macro SSD_16_ROWDIFF 4
    movu        m%2, [%3]
    pmovzxbw    m%1, m%2
    punpckhbw   m%2, m6
    movu        m3, [%4]
    pmovzxbw    m2, m3
    punpckhbw   m3, m6

    psubw       m%1, m2
    psubw       m%2, m3
%endmacro

; Square the differences in m0, m1, m4, m5 and add them into m7.
%macro SSD_16_SQACC 0
    pmaddwd     m0, m0
    pmaddwd     m1, m1
    pmaddwd     m4, m4
    pmaddwd     m5, m5

    paddd       m0, m1
    paddd       m4, m5
    paddd       m4, m0
    paddd       m7, m4
%endmacro

; PIXEL_SSD_16x4
; Accumulates four 16-pixel rows into m7.  r6 must hold 2 * r1 and m6 must
; be zero.  On exit r0 and r2 have been moved down by two rows only; the
; caller adds the remaining two.
%macro PIXEL_SSD_16x4 0
    SSD_16_ROWDIFF 0, 1, r0, r2
    SSD_16_ROWDIFF 4, 5, r0 + r1, r2 + r3
    SSD_16_SQACC

    SSD_16_ROWDIFF 0, 1, r0 + r6, r2 + 2 * r3
    lea         r0, [r0 + r6]
    lea         r2, [r2 + 2 * r3]
    SSD_16_ROWDIFF 4, 5, r0 + r1, r2 + r3
    SSD_16_SQACC
%endmacro

; Internal 16x16 tile helper, reached through `call` from the wide kernels.
; Same register contract as PIXEL_SSD_16x4; leaves r0 / r2 on row 14 of the
; tile.
cglobal pixel_ssd_16x16_internal
    PIXEL_SSD_16x4
%rep 3
    lea         r0, [r0 + r6]
    lea         r2, [r2 + 2 * r3]
    PIXEL_SSD_16x4
%endrep
    ret

; SSD_16_TILE_COLUMN tiles
; Sum <tiles> vertically stacked 16x16 tiles starting at r0 / r2.
%macro SSD_16_TILE_COLUMN 1
    call        pixel_ssd_16x16_internal
%rep %1 - 1
    lea         r0, [r0 + r6]
    lea         r2, [r2 + 2 * r3]
    call        pixel_ssd_16x16_internal
%endrep
%endmacro

;-----------------------------------------------------------------------------
; int pixel_ssd_48x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
; Three 16-pixel columns of four tiles each.  r4 / r5 keep the original
; source pointers so every column can restart from the top row.
INIT_XMM sse4
cglobal pixel_ssd_48x64, 4, 7, 8, src1, stride1, src2, stride2

    pxor        m7, m7
    pxor        m6, m6
    mov         r4, r0
    mov         r5, r2
    lea         r6, [r1 * 2]

    SSD_16_TILE_COLUMN 4
    lea         r0, [r4 + 16]
    lea         r2, [r5 + 16]
    SSD_16_TILE_COLUMN 4
    lea         r0, [r4 + 32]
    lea         r2, [r5 + 32]
    SSD_16_TILE_COLUMN 4

    HADDD       m7, m1
    movd        eax, m7

    RET

;-----------------------------------------------------------------------------
; int pixel_ssd_64x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
; Four 16-pixel columns of one tile each.
INIT_XMM sse4
cglobal pixel_ssd_64x16, 4, 7, 8, src1, stride1, src2, stride2

    pxor        m7, m7
    pxor        m6, m6
    mov         r4, r0
    mov         r5, r2
    lea         r6, [r1 * 2]

    call        pixel_ssd_16x16_internal
    lea         r0, [r4 + 16]
    lea         r2, [r5 + 16]
    call        pixel_ssd_16x16_internal
    lea         r0, [r4 + 32]
    lea         r2, [r5 + 32]
    call        pixel_ssd_16x16_internal
    lea         r0, [r4 + 48]
    lea         r2, [r5 + 48]
    call        pixel_ssd_16x16_internal

    HADDD       m7, m1
    movd        eax, m7

    RET

;-----------------------------------------------------------------------------
; int pixel_ssd_64x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_64x32, 4, 7, 8, src1, stride1, src2, stride2
    ; Register roles shared by the three 64-wide entry points below:
    ;   r4 / r5 = original src1 / src2, used to step to the next 16-byte column
    ;   r6      = 2 * stride1
    ;   m7      = accumulator, reduced to a scalar by HADDD at the end
    ;   m6      = cleared before the helper is called
    ; NOTE(review): pixel_ssd_16x16_internal is defined outside this chunk.
    ; It presumably adds into m7, expects m6 == 0 and leaves r0/r2 two rows
    ; short of the next 16-row band -- confirm against its definition.
    mov         r4, r0
    mov         r5, r2
    lea         r6, [r1 * 2]
    pxor        m6, m6
    pxor        m7, m7

    ; column at byte offset 0
    call        pixel_ssd_16x16_internal
    lea         r0, [r0 + r6]
    lea         r2, [r2 + 2 * r3]
    call        pixel_ssd_16x16_internal

    ; column at byte offset 16
    lea         r0, [r4 + 16]
    lea         r2, [r5 + 16]
    call        pixel_ssd_16x16_internal
    lea         r0, [r0 + r6]
    lea         r2, [r2 + 2 * r3]
    call        pixel_ssd_16x16_internal

    ; column at byte offset 32
    lea         r0, [r4 + 32]
    lea         r2, [r5 + 32]
    call        pixel_ssd_16x16_internal
    lea         r0, [r0 + r6]
    lea         r2, [r2 + 2 * r3]
    call        pixel_ssd_16x16_internal

    ; column at byte offset 48
    lea         r0, [r4 + 48]
    lea         r2, [r5 + 48]
    call        pixel_ssd_16x16_internal
    lea         r0, [r0 + r6]
    lea         r2, [r2 + 2 * r3]
    call        pixel_ssd_16x16_internal

    HADDD       m7, m1
    movd        eax, m7

    RET

;-----------------------------------------------------------------------------
; int pixel_ssd_64x48( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_64x48, 4, 7, 8, src1, stride1, src2, stride2
    ; Same register roles as pixel_ssd_64x32; three helper calls per column.
    mov         r4, r0
    mov         r5, r2
    lea         r6, [r1 * 2]
    pxor        m6, m6
    pxor        m7, m7

    ; column at byte offset 0
    call        pixel_ssd_16x16_internal
%rep 2
    lea         r0, [r0 + r6]
    lea         r2, [r2 + 2 * r3]
    call        pixel_ssd_16x16_internal
%endrep

    ; column at byte offset 16
    lea         r0, [r4 + 16]
    lea         r2, [r5 + 16]
    call        pixel_ssd_16x16_internal
%rep 2
    lea         r0, [r0 + r6]
    lea         r2, [r2 + 2 * r3]
    call        pixel_ssd_16x16_internal
%endrep

    ; column at byte offset 32
    lea         r0, [r4 + 32]
    lea         r2, [r5 + 32]
    call        pixel_ssd_16x16_internal
%rep 2
    lea         r0, [r0 + r6]
    lea         r2, [r2 + 2 * r3]
    call        pixel_ssd_16x16_internal
%endrep

    ; column at byte offset 48
    lea         r0, [r4 + 48]
    lea         r2, [r5 + 48]
    call        pixel_ssd_16x16_internal
%rep 2
    lea         r0, [r0 + r6]
    lea         r2, [r2 + 2 * r3]
    call        pixel_ssd_16x16_internal
%endrep

    HADDD       m7, m1
    movd        eax, m7

    RET

;-----------------------------------------------------------------------------
; int pixel_ssd_64x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_64x64, 4, 7, 8, src1, stride1, src2, stride2
    ; Same register roles as pixel_ssd_64x32; four helper calls per column.
    mov         r4, r0
    mov         r5, r2
    lea         r6, [r1 * 2]
    pxor        m6, m6
    pxor        m7, m7

    ; column at byte offset 0
    call        pixel_ssd_16x16_internal
%rep 3
    lea         r0, [r0 + r6]
    lea         r2, [r2 + 2 * r3]
    call        pixel_ssd_16x16_internal
%endrep

    ; column at byte offset 16
    lea         r0, [r4 + 16]
    lea         r2, [r5 + 16]
    call        pixel_ssd_16x16_internal
%rep 3
    lea         r0, [r0 + r6]
    lea         r2, [r2 + 2 * r3]
    call        pixel_ssd_16x16_internal
%endrep

    ; column at byte offset 32
    lea         r0, [r4 + 32]
    lea         r2, [r5 + 32]
    call        pixel_ssd_16x16_internal
%rep 3
    lea         r0, [r0 + r6]
    lea         r2, [r2 + 2 * r3]
    call        pixel_ssd_16x16_internal
%endrep

    ; column at byte offset 48
    lea         r0, [r4 + 48]
    lea         r2, [r5 + 48]
    call        pixel_ssd_16x16_internal
%rep 3
    lea         r0, [r0 + r6]
    lea         r2, [r2 + 2 * r3]
    call        pixel_ssd_16x16_internal
%endrep

    HADDD       m7, m1
    movd        eax, m7

    RET

;-----------------------------------------------------------------------------
; int pixel_ssd_sp ( int16_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------

; pixel_ssd_sp_4x4_internal: adds the squared differences of a 4x4 block
; (int16_t src1 minus zero-extended uint8_t src2) into m7.
;   In:    r0 = src1, r1 = src1 stride in bytes, r4 = 3 * r1
;          r2 = src2, r3 = src2 stride in bytes
;   Out:   m7 += per-lane partial sums; r2 advanced by two rows; r0 unchanged
;   Clobb: m0-m6
cglobal pixel_ssd_sp_4x4_internal
    ; rows 0 and 1
    movh        m0, [r0]
    movh        m1, [r0 + r1]
    punpcklqdq  m0, m1                          ; m0 = src1 row0 | row1 (8 words)
    movd        m2, [r2]
    movd        m3, [r2 + r3]
    punpckldq   m2, m3                          ; m2 = src2 row0 | row1 (8 bytes)
    pmovzxbw    m2, m2                          ; widen bytes to words
    psubw       m0, m2

    ; rows 2 and 3
    movh        m4, [r0 + 2 * r1]
    movh        m5, [r0 + r4]
    punpcklqdq  m4, m5
    movd        m6, [r2 + 2 * r3]
    lea         r2, [r2 + 2 * r3]               ; no 3*stride2 register here: step r2 instead
    movd        m1, [r2 + r3]
    punpckldq   m6, m1
    pmovzxbw    m6, m6
    psubw       m4, m6

    ; square, pair-sum and accumulate
    pmaddwd     m0, m0
    pmaddwd     m4, m4
    paddd       m0, m4
    paddd       m7, m0
    ret

;-----------------------------------------------------------------------------
; int pixel_ssd_sp_4x4( int16_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_4x4, 4, 5, 8, src1, stride1, src2, stride2
    pxor        m7, m7                          ; clear the accumulator
    add         r1, r1                          ; src1 is int16_t: double stride1 to get bytes
    lea         r4, [r1 * 3]                    ; r4 = 3 * stride1 (bytes), read by the helper
    call        pixel_ssd_sp_4x4_internal
    HADDD       m7, m1
    movd        eax, m7
    RET

;-----------------------------------------------------------------------------
; int pixel_ssd_sp_4x8( int16_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_4x8, 4, 5, 8, src1, stride1, src2, stride2
    pxor        m7, m7                          ; clear the accumulator
    add         r1, r1                          ; src1 is int16_t: double stride1 to get bytes
    lea         r4, [r1 * 3]                    ; r4 = 3 * stride1 (bytes), read by the helper
    call        pixel_ssd_sp_4x4_internal
    lea         r0, [r0 + 4 * r1]               ; helper left r0 untouched: skip 4 rows
    lea         r2, [r2 + 2 * r3]               ; helper already moved r2 by 2 rows
    call        pixel_ssd_sp_4x4_internal
    HADDD       m7, m1
    movd        eax, m7
    RET

;-----------------------------------------------------------------------------
; int pixel_ssd_sp_4x16( int16_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_4x16, 4, 5, 8, src1, stride1, src2, stride2
    pxor        m7, m7                          ; clear the accumulator
    add         r1, r1                          ; src1 is int16_t: double stride1 to get bytes
    lea         r4, [r1 * 3]                    ; r4 = 3 * stride1 (bytes), read by the helper
    call        pixel_ssd_sp_4x4_internal
%rep 3
    ; the 4x4 helper advances r2 by two rows itself and leaves r0 untouched
    lea         r0, [r0 + 4 * r1]
    lea         r2, [r2 + 2 * r3]
    call        pixel_ssd_sp_4x4_internal
%endrep
    HADDD       m7, m1
    movd        eax, m7
    RET

; pixel_ssd_sp_8x4_internal: adds the squared differences of an 8x4 block
; (int16_t src1 minus zero-extended uint8_t src2) into m7.
;   In:    r0 = src1, r1 = src1 stride in bytes, r4 = 3 * r1
;          r2 = src2, r3 = src2 stride in bytes, r5 = 3 * r3
;   Out:   m7 += per-lane partial sums; r0 and r2 are not modified
;   Clobb: m0-m5
cglobal pixel_ssd_sp_8x4_internal
    ; row 0
    movu        m0, [r0]
    movh        m2, [r2]
    pmovzxbw    m2, m2
    psubw       m0, m2

    ; row 1
    movu        m1, [r0 + r1]
    movh        m3, [r2 + r3]
    pmovzxbw    m3, m3
    psubw       m1, m3

    ; row 2
    movu        m4, [r0 + 2 * r1]
    movh        m2, [r2 + 2 * r3]
    pmovzxbw    m2, m2
    psubw       m4, m2

    ; row 3
    movu        m5, [r0 + r4]
    movh        m3, [r2 + r5]
    pmovzxbw    m3, m3
    psubw       m5, m3

    ; square, pair-sum and accumulate
    pmaddwd     m0, m0
    pmaddwd     m1, m1
    pmaddwd     m4, m4
    pmaddwd     m5, m5

    paddd       m0, m1
    paddd       m4, m5
    paddd       m4, m0
    paddd       m7, m4
    ret

;-----------------------------------------------------------------------------
; int pixel_ssd_sp_8x4( int16_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_8x4, 4, 6, 8, src1, stride1, src2, stride2
    pxor        m7, m7                          ; clear the accumulator
    add         r1, r1                          ; src1 is int16_t: double stride1 to get bytes
    lea         r5, [r3 * 3]                    ; r5 = 3 * stride2
    lea         r4, [r1 * 3]                    ; r4 = 3 * stride1 (bytes)
    call        pixel_ssd_sp_8x4_internal
    HADDD       m7, m1
    movd        eax, m7
    RET

;-----------------------------------------------------------------------------
; int pixel_ssd_sp_8x8( int16_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_8x8, 4, 6, 8, src1, stride1, src2, stride2
    ; Register roles shared by the 8-wide entry points below:
    ;   r4 = 3 * stride1 (bytes), r5 = 3 * stride2 -- both read by
    ;   pixel_ssd_sp_8x4_internal, which does not move r0/r2 itself.
    ;   m7 = accumulator, reduced to a scalar by HADDD at the end.
    pxor        m7, m7
    add         r1, r1                          ; src1 is int16_t: double stride1 to get bytes
    lea         r4, [r1 * 3]
    lea         r5, [r3 * 3]
    call        pixel_ssd_sp_8x4_internal
    lea         r0, [r0 + 4 * r1]
    lea         r2, [r2 + 4 * r3]
    call        pixel_ssd_sp_8x4_internal
    HADDD       m7, m1
    movd        eax, m7
    RET

;-----------------------------------------------------------------------------
; int pixel_ssd_sp_8x16( int16_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_8x16, 4, 6, 8, src1, stride1, src2, stride2
    pxor        m7, m7
    add         r1, r1                          ; src1 stride in bytes
    lea         r4, [r1 * 3]
    lea         r5, [r3 * 3]
    call        pixel_ssd_sp_8x4_internal
%rep 3                                          ; remaining three 8x4 bands
    lea         r0, [r0 + 4 * r1]
    lea         r2, [r2 + 4 * r3]
    call        pixel_ssd_sp_8x4_internal
%endrep
    HADDD       m7, m1
    movd        eax, m7
    RET

;-----------------------------------------------------------------------------
; int pixel_ssd_sp_8x32( int16_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_8x32, 4, 6, 8, src1, stride1, src2, stride2
    pxor        m7, m7
    add         r1, r1                          ; src1 stride in bytes
    lea         r4, [r1 * 3]
    lea         r5, [r3 * 3]
    call        pixel_ssd_sp_8x4_internal
%rep 7                                          ; remaining seven 8x4 bands
    lea         r0, [r0 + 4 * r1]
    lea         r2, [r2 + 4 * r3]
    call        pixel_ssd_sp_8x4_internal
%endrep
    HADDD       m7, m1
    movd        eax, m7
    RET

;-----------------------------------------------------------------------------
; int pixel_ssd_sp_12x16( int16_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_12x16, 4, 7, 8, src1, stride1, src2, stride2
    ; Split as a 4-wide column (4x4 helper) followed by an 8-wide column
    ; (8x4 helper). r5/r6 hold the block origin until the second column
    ; starts; r5 is then reused as 3 * stride2 for the 8x4 helper.
    pxor        m7, m7
    add         r1, r1                          ; src1 stride in bytes
    lea         r4, [r1 * 3]
    mov         r5, r0
    mov         r6, r2

    ; columns 0-3, sixteen rows
    call        pixel_ssd_sp_4x4_internal
%rep 3
    lea         r0, [r0 + 4 * r1]
    lea         r2, [r2 + 2 * r3]               ; 4x4 helper already advanced r2 by 2 rows
    call        pixel_ssd_sp_4x4_internal
%endrep

    ; columns 4-11, sixteen rows
    lea         r0, [r5 + 8]                    ; 4 int16_t = 8 bytes
    lea         r2, [r6 + 4]                    ; 4 uint8_t = 4 bytes
    lea         r5, [r3 * 3]
    call        pixel_ssd_sp_8x4_internal
%rep 3
    lea         r0, [r0 + 4 * r1]
    lea         r2, [r2 + 4 * r3]
    call        pixel_ssd_sp_8x4_internal
%endrep
    HADDD       m7, m1
    movd        eax, m7
    RET

;-----------------------------------------------------------------------------
; PIXEL_SSD_SP_16x4: adds the squared differences of a 16x4 block
; (int16_t src1 minus zero-extended uint8_t src2) into m7.
;   In:    r0 = src1, r1 = src1 stride in bytes
;          r2 = src2, r3 = src2 stride in bytes
;          m6 = 0 (zero source for punpckhbw when widening the high 8 bytes)
;   Out:   m7 += per-lane partial sums
;          r0 and r2 are left advanced by TWO rows, so callers add another
;          two rows to reach the next 16x4 band
;   Clobb: m0-m5
;-----------------------------------------------------------------------------
%macro PIXEL_SSD_SP_16x4 0
    ; row 0
    movu        m0, [r0]
    movu        m1, [r0 + 16]
    movu        m3, [r2]
    pmovzxbw    m2, m3                          ; low 8 bytes -> words
    punpckhbw   m3, m6                          ; high 8 bytes -> words (m6 == 0)

    psubw       m0, m2
    psubw       m1, m3

    ; row 1
    movu        m4, [r0 + r1]
    movu        m5, [r0 + r1 +16]
    movu        m3, [r2 + r3]
    pmovzxbw    m2, m3
    punpckhbw   m3, m6

    psubw       m4, m2
    psubw       m5, m3

    pmaddwd     m0, m0
    pmaddwd     m1, m1
    pmaddwd     m4, m4
    pmaddwd     m5, m5

    paddd       m0, m1
    paddd       m4, m5
    paddd       m4, m0
    paddd       m7, m4

    ; row 2
    movu        m0, [r0 + 2 * r1]
    movu        m1, [r0 + 2 * r1 + 16]
    movu        m3, [r2 + 2 * r3]
    pmovzxbw    m2, m3
    punpckhbw   m3, m6

    psubw       m0, m2
    psubw       m1, m3

    ; row 3: no 3*stride registers are assumed, so step the pointers instead
    lea         r0, [r0 + 2 * r1]
    lea         r2, [r2 + 2 * r3]
    movu        m4, [r0 + r1]
    movu        m5, [r0 + r1 + 16]
    movu        m3, [r2 + r3]
    pmovzxbw    m2, m3
    punpckhbw   m3, m6

    psubw       m4, m2
    psubw       m5, m3

    pmaddwd     m0, m0
    pmaddwd     m1, m1
    pmaddwd     m4, m4
    pmaddwd     m5, m5

    paddd       m0, m1
    paddd       m4, m5
    paddd       m4, m0
    paddd       m7, m4
%endmacro

;-----------------------------------------------------------------------------
; int pixel_ssd_sp_16x4( int16_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
INIT_XMM sse4
; Only r0-r3 are touched (PIXEL_SSD_SP_16x4 uses no other GPR), so request
; 4 GPRs, matching pixel_ssd_sp_16x8 below.
cglobal pixel_ssd_sp_16x4, 4, 4, 8, src1, stride1, src2, stride2

    pxor        m6, m6                          ; zero register required by the macro
    pxor        m7, m7                          ; accumulator
    add         r1, r1                          ; src1 stride in bytes
    PIXEL_SSD_SP_16x4
    HADDD       m7, m1
    movd        eax, m7

    RET

;-----------------------------------------------------------------------------
; int pixel_ssd_sp_16x8( int16_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_16x8, 4, 4, 8, src1, stride1, src2, stride2

    pxor        m6, m6                          ; zero register required by the macro
    pxor        m7, m7                          ; accumulator
    add         r1, r1                          ; src1 stride in bytes
    PIXEL_SSD_SP_16x4
    lea         r0, [r0 + 2 * r1]               ; macro advanced 2 rows; add the other 2
    lea         r2, [r2 + 2 * r3]
    PIXEL_SSD_SP_16x4
    HADDD       m7, m1
    movd        eax, m7
    RET

;-----------------------------------------------------------------------------
; int pixel_ssd_sp_16x12( int16_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_16x12, 4, 6, 8, src1, stride1, src2, stride2

    pxor        m6, m6                          ; zero register required by the macro
    pxor        m7, m7                          ; accumulator
    add         r1, r1                          ; src1 stride in bytes
    lea         r4, [r1 * 2]                    ; two src1 rows
    lea         r5, [r3 * 2]                    ; two src2 rows
    PIXEL_SSD_SP_16x4
%rep 2
    lea         r0, [r0 + r4]                   ; macro advanced 2 rows; add the other 2
    lea         r2, [r2 + r5]
    PIXEL_SSD_SP_16x4
%endrep
    HADDD       m7, m1
    movd        eax, m7
    RET

;-----------------------------------------------------------------------------
; int pixel_ssd_sp_16x16( int16_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_16x16, 4, 6, 8, src1, stride1, src2, stride2

    pxor        m6, m6                          ; zero register required by the macro
    pxor        m7, m7                          ; accumulator
    add         r1, r1                          ; src1 stride in bytes
    lea         r4, [r1 * 2]                    ; two src1 rows
    lea         r5, [r3 * 2]                    ; two src2 rows
    PIXEL_SSD_SP_16x4
%rep 3
    lea         r0, [r0 + r4]                   ; macro advanced 2 rows; add the other 2
    lea         r2, [r2 + r5]
    PIXEL_SSD_SP_16x4
%endrep
    HADDD       m7, m1
    movd        eax, m7
    RET

; pixel_ssd_sp_16x16_internal: four stacked PIXEL_SSD_SP_16x4 bands.
;   In:    r0/r1/r2/r3 and m6 == 0 as for PIXEL_SSD_SP_16x4,
;          r4 = 2 * r1 (two src1 rows, in bytes)
;   Out:   m7 += partial sums; r0/r2 end up 14 rows past their entry value
;          (the last band only advances by two), so callers add two more
;          rows to reach the next 16-row band
;   Clobb: m0-m5
cglobal pixel_ssd_sp_16x16_internal
    PIXEL_SSD_SP_16x4
%rep 3
    lea         r0, [r0 + r4]
    lea         r2, [r2 + 2 * r3]
    PIXEL_SSD_SP_16x4
%endrep
    ret

;-----------------------------------------------------------------------------
; int pixel_ssd_sp_16x32( int16_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_16x32, 4, 5, 8, src1, stride1, src2, stride2

    pxor        m6, m6                          ; zero register required by the helper
    pxor        m7, m7                          ; accumulator
    add         r1, r1                          ; src1 stride in bytes
    lea         r4, [r1 * 2]                    ; two src1 rows, read by the helper
    call        pixel_ssd_sp_16x16_internal
    lea         r0, [r0 + r4]                   ; helper stops 2 rows short of the next band
    lea         r2, [r2 + 2 * r3]
    call        pixel_ssd_sp_16x16_internal
    HADDD       m7, m1
    movd        eax, m7
    RET

;-----------------------------------------------------------------------------
; int pixel_ssd_sp_16x64( int16_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_16x64, 4, 6, 8, src1, stride1, src2, stride2

    pxor        m6, m6                          ; zero register required by the helper
    pxor        m7, m7                          ; accumulator
    add         r1, r1                          ; src1 is int16_t: double stride1 to get bytes
    lea         r4, [r1 * 2]                    ; two src1 rows, read by the helper
    lea         r5, [r3 * 2]                    ; two src2 rows
    call        pixel_ssd_sp_16x16_internal
%rep 3
    lea         r0, [r0 + r4]                   ; helper stops 2 rows short of the next band
    lea         r2, [r2 + r5]
    call        pixel_ssd_sp_16x16_internal
%endrep

    HADDD       m7, m1
    movd        eax, m7
    RET

;-----------------------------------------------------------------------------
; int pixel_ssd_sp_24x32( int16_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_24x32, 4, 7, 8, src1, stride1, src2, stride2
    ; Split as a 16-wide column (two 16x16 helper calls) followed by an
    ; 8-wide column (eight 8x4 helper calls). r5/r6 hold the block origin
    ; until the second column starts; r4/r5 are then reloaded with the
    ; 3 * stride values the 8x4 helper reads.
    pxor        m6, m6
    pxor        m7, m7
    add         r1, r1                          ; src1 stride in bytes
    lea         r4, [r1 * 2]
    mov         r5, r0
    mov         r6, r2

    ; columns 0-15
    call        pixel_ssd_sp_16x16_internal
    lea         r0, [r0 + r4]
    lea         r2, [r2 + 2 * r3]
    call        pixel_ssd_sp_16x16_internal

    ; columns 16-23
    lea         r0, [r5 + 32]                   ; 16 int16_t = 32 bytes
    lea         r2, [r6 + 16]                   ; 16 uint8_t = 16 bytes
    lea         r4, [r1 * 3]
    lea         r5, [r3 * 3]
    call        pixel_ssd_sp_8x4_internal
%rep 7
    lea         r0, [r0 + 4 * r1]
    lea         r2, [r2 + 4 * r3]
    call        pixel_ssd_sp_8x4_internal
%endrep
    HADDD       m7, m1
    movd        eax, m7
    RET

;-----------------------------------------------------------------------------
; Helpers for the 32/48/64-wide pixel_ssd_sp entry points.
;
; All of them share one register layout:
;   r5 / r6 = original src1 / src2 (block origin)
;   r1      = src1 stride in bytes, r4 = 2 * r1
;   m6      = 0, m7 = accumulator
;-----------------------------------------------------------------------------

; PIXEL_SSD_SP_COLUMN col, bands
; Processes one 16-pixel-wide column made of `bands` stacked 16x16 blocks.
; col is the column index; column 0 starts from the incoming r0/r2, later
; columns are addressed from the origin kept in r5/r6 (32 bytes of int16_t
; and 16 bytes of uint8_t per column).
%macro PIXEL_SSD_SP_COLUMN 2
%if %1 > 0
    lea         r0, [r5 + %1 * 32]
    lea         r2, [r6 + %1 * 16]
%endif
    call        pixel_ssd_sp_16x16_internal
%rep (%2) - 1
    lea         r0, [r0 + r4]                   ; helper stops 2 rows short of the next band
    lea         r2, [r2 + 2 * r3]
    call        pixel_ssd_sp_16x16_internal
%endrep
%endmacro

; PIXEL_SSD_SP_WxH width, height
; Emits pixel_ssd_sp_<width>x<height> for width in {32, 48, 64} and a height
; that is a multiple of 16.
%macro PIXEL_SSD_SP_WxH 2
cglobal pixel_ssd_sp_%1x%2, 4, 7, 8, src1, stride1, src2, stride2

    pxor        m7, m7
    pxor        m6, m6
    mov         r5, r0
    mov         r6, r2
    add         r1, r1                          ; src1 stride in bytes
    lea         r4, [r1 * 2]
    PIXEL_SSD_SP_COLUMN 0, %2 / 16
    PIXEL_SSD_SP_COLUMN 1, %2 / 16
%if %1 > 32
    PIXEL_SSD_SP_COLUMN 2, %2 / 16
%endif
%if %1 > 48
    PIXEL_SSD_SP_COLUMN 3, %2 / 16
%endif
    HADDD       m7, m1
    movd        eax, m7
    RET
%endmacro

;-----------------------------------------------------------------------------
; int pixel_ssd_sp_32x8( int16_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_32x8, 4, 7, 8, src1, stride1, src2, stride2

    pxor        m7, m7
    pxor        m6, m6
    mov         r5, r0
    mov         r6, r2
    add         r1, r1                          ; src1 stride in bytes
    lea         r4, [r1 * 2]

    ; columns 0-15, eight rows
    PIXEL_SSD_SP_16x4
    lea         r0, [r0 + r4]                   ; macro advanced 2 rows; add the other 2
    lea         r2, [r2 + 2 * r3]
    PIXEL_SSD_SP_16x4

    ; columns 16-31, eight rows
    lea         r0, [r5 + 32]
    lea         r2, [r6 + 16]
    PIXEL_SSD_SP_16x4
    lea         r0, [r0 + r4]
    lea         r2, [r2 + 2 * r3]
    PIXEL_SSD_SP_16x4
    HADDD       m7, m1
    movd        eax, m7
    RET

;-----------------------------------------------------------------------------
; int pixel_ssd_sp_32x16( int16_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
INIT_XMM sse4
PIXEL_SSD_SP_WxH 32, 16

;-----------------------------------------------------------------------------
; int pixel_ssd_sp_32x24( int16_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_32x24, 4, 7, 8, src1, stride1, src2, stride2

    pxor        m7, m7
    pxor        m6, m6
    mov         r5, r0
    mov         r6, r2
    add         r1, r1                          ; src1 stride in bytes
    lea         r4, [r1 * 2]

    ; columns 0-15: one 16x16 block plus two 16x4 bands
    call        pixel_ssd_sp_16x16_internal
%rep 2
    lea         r0, [r0 + r4]
    lea         r2, [r2 + 2 * r3]
    PIXEL_SSD_SP_16x4
%endrep

    ; columns 16-31: same pattern
    lea         r0, [r5 + 32]
    lea         r2, [r6 + 16]
    call        pixel_ssd_sp_16x16_internal
%rep 2
    lea         r0, [r0 + r4]
    lea         r2, [r2 + 2 * r3]
    PIXEL_SSD_SP_16x4
%endrep
    HADDD       m7, m1
    movd        eax, m7
    RET

;-----------------------------------------------------------------------------
; int pixel_ssd_sp_32x32( int16_t *, intptr_t, uint8_t *, intptr_t )
; int pixel_ssd_sp_32x64( int16_t *, intptr_t, uint8_t *, intptr_t )
; int pixel_ssd_sp_48x64( int16_t *, intptr_t, uint8_t *, intptr_t )
; int pixel_ssd_sp_64x16( int16_t *, intptr_t, uint8_t *, intptr_t )
; int pixel_ssd_sp_64x32( int16_t *, intptr_t, uint8_t *, intptr_t )
; int pixel_ssd_sp_64x48( int16_t *, intptr_t, uint8_t *, intptr_t )
; int pixel_ssd_sp_64x64( int16_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
INIT_XMM sse4
PIXEL_SSD_SP_WxH 32, 32
PIXEL_SSD_SP_WxH 32, 64
PIXEL_SSD_SP_WxH 48, 64
PIXEL_SSD_SP_WxH 64, 16
PIXEL_SSD_SP_WxH 64, 32
PIXEL_SSD_SP_WxH 64, 48
PIXEL_SSD_SP_WxH 64, 64


;-----------------------------------------------------------------------------
; int pixel_ssd_s( int16_t *ref, intptr_t i_stride )
;
; The pixel_ssd_s_N functions return the sum of the squares of an NxN block
; of int16_t values. i_stride is doubled on entry to obtain a byte stride.
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pixel_ssd_s_4, 2,2,2
    add         r1, r1                          ; stride in bytes
    movh        m0, [r0]                        ; row 0 -> low half
    movhps      m0, [r0 + r1]                   ; row 1 -> high half

    lea         r0, [r0 + r1 * 2]
    movh        m1, [r0]                        ; row 2
    movhps      m1, [r0 + r1]                   ; row 3

    pmaddwd     m0, m0
    pmaddwd     m1, m1
    paddd       m0, m1

    ; calculate sum and return
    HADDD       m0, m1
    movd        eax, m0
    RET


INIT_XMM sse2
cglobal pixel_ssd_s_8, 2,3,5
    add         r1, r1                          ; stride in bytes
    lea         r2, [r1 * 3]                    ; r2 = 3 * stride

    ; rows 0-3
    movu        m0, [r0]
    movu        m1, [r0 + r1]
    movu        m2, [r0 + r1 * 2]
    movu        m3, [r0 + r2]

    pmaddwd     m0, m0
    pmaddwd     m1, m1
    pmaddwd     m2, m2
    pmaddwd     m3, m3
    paddd       m0, m1
    paddd       m2, m3
    paddd       m0, m2

    ; rows 4-7
    lea         r0, [r0 + r1 * 4]
    movu        m4, [r0]
    movu        m1, [r0 + r1]
    movu        m2, [r0 + r1 * 2]
    movu        m3, [r0 + r2]

    pmaddwd     m4, m4
    pmaddwd     m1, m1
    pmaddwd     m2, m2
    pmaddwd     m3, m3
    paddd       m4, m1
    paddd       m2, m3
    paddd       m4, m2
    paddd       m0, m4

    ; calculate sum and return
    HADDD       m0, m1
    movd        eax, m0
    RET


INIT_XMM sse2
cglobal pixel_ssd_s_16, 2,3,5
    add         r1, r1                          ; stride in bytes

    mov         r2d, 4                          ; 4 iterations x 4 rows = 16 rows
    pxor        m0, m0                          ; accumulator
.loop:
%rep 2                                          ; two rows per repetition
    movu        m1, [r0]
    movu        m2, [r0 + mmsize]
    movu        m3, [r0 + r1]
    movu        m4, [r0 + r1 + mmsize]
    lea         r0, [r0 + r1 * 2]

    pmaddwd     m1, m1
    pmaddwd     m2, m2
    pmaddwd     m3, m3
    pmaddwd     m4, m4
    paddd       m1, m2
    paddd       m3, m4
    paddd       m1, m3
    paddd       m0, m1

%endrep
    dec         r2d
    jnz         .loop

    ; calculate sum and return
    HADDD       m0, m1
    movd        eax, m0
    RET


INIT_XMM sse2
cglobal pixel_ssd_s_32, 2,3,5
    add         r1, r1                          ; stride in bytes

    mov         r2d, 16                         ; 16 iterations x 2 rows = 32 rows
    pxor        m0, m0                          ; accumulator
.loop:
%rep 2                                          ; one 32-element row per repetition
    movu        m1, [r0 + 0 * mmsize]
    movu        m2, [r0 + 1 * mmsize]
    movu        m3, [r0 + 2 * mmsize]
    movu        m4, [r0 + 3 * mmsize]
    add         r0, r1

    pmaddwd     m1, m1
    pmaddwd     m2, m2
    pmaddwd     m3, m3
    pmaddwd     m4, m4
    paddd       m1, m2
    paddd       m3, m4
    paddd       m1, m3
    paddd       m0, m1

%endrep
    dec         r2d
    jnz         .loop

    ; calculate sum and return
    HADDD       m0, m1
    movd        eax, m0
    RET

INIT_YMM avx2
cglobal pixel_ssd_s_16, 2,4,5
    add         r1, r1                          ; stride in bytes
    lea         r3, [r1 * 3]                    ; r3 = 3 * stride
    mov         r2d, 16/4                       ; four rows per iteration
    pxor        m0, m0                          ; accumulator
.loop:
    movu        m1, [r0]                        ; one ymm load = one 16-element row
    movu        m2, [r0 + r1]
    movu        m3, [r0 + 2 * r1]
    movu        m4, [r0 + r3]

    lea         r0, [r0 + r1 * 4]
    pmaddwd     m1, m1
    pmaddwd     m2, m2
    pmaddwd     m3, m3
    pmaddwd     m4, m4
    paddd       m1, m2
    paddd       m3, m4
    paddd       m1, m3
    paddd       m0, m1

    dec         r2d
    jnz         .loop

    ; calculate sum and return
    HADDD       m0, m1
    movd        eax, xm0
    RET

INIT_YMM avx2
cglobal pixel_ssd_s_32, 2,4,5
    add         r1, r1                          ; stride in bytes
    lea         r3, [r1 * 3]                    ; r3 = 3 * stride

    mov         r2d, 8                          ; 8 iterations x 4 rows = 32 rows
    pxor        m0, m0                          ; accumulator
.loop:
    ; rows 0 and 1 of this group (two ymm loads per 32-element row)
    movu        m1, [r0 + 0 * mmsize]
    movu        m2, [r0 + 1 * mmsize]
    movu        m3, [r0 + r1 + 0 * mmsize]
    movu        m4, [r0 + r1 + 1 * mmsize]

    pmaddwd     m1, m1
    pmaddwd     m2, m2
    pmaddwd     m3, m3
    pmaddwd     m4, m4
    paddd       m1, m2
    paddd       m3, m4
    paddd       m1, m3
    paddd       m0, m1

    ; rows 2 and 3 of this group
    movu        m1, [r0 + r1 * 2 + 0 * mmsize]
    movu        m2, [r0 + r1 * 2 + 1 * mmsize]
    movu        m3, [r0 + r3 + 0 * mmsize]
    movu        m4, [r0 + r3 + 1 * mmsize]
    lea         r0, [r0 + 4 * r1]

    pmaddwd     m1, m1
    pmaddwd     m2, m2
    pmaddwd     m3, m3
    pmaddwd     m4, m4
    paddd       m1, m2
    paddd       m3, m4
    paddd       m1, m3
    paddd       m0, m1

    dec         r2d
    jnz         .loop

    ; calculate sum and return
    HADDD       m0, m1
    movd        eax, xm0
    RET